In [1]:
import os
import pandas as pd
import numpy as np

In [2]:
def get_data_dict(path, query_list):
    """Parse one qrels file into a per-document dictionary of positive judgements.

    Each non-blank line must hold four whitespace-separated fields:
    ``query  <ignored>  doc_id  score``. Only lines whose query contains
    ``'enwiki'`` and whose integer score is greater than zero are kept.

    Args:
        path: Path of the qrels file to read.
        query_list: Dict mapping the document part of a query (the text before
            the first '/') to the list of full queries seen for it. It is
            updated in place, so it can be shared across several files.

    Returns:
        A ``(data, query_list)`` tuple. ``data`` maps each document key to
        ``{'count': int, 'query': {query: [doc_id, ...]}}``; ``query_list`` is
        the same object that was passed in.

    Raises:
        ValueError: If a non-blank line does not have exactly four fields, or
            if the score of an 'enwiki' line is not an integer.
    """
    data = {}
    with open(path, 'r') as f:
        for line_number, line in enumerate(f, start=1):

            # Split on any run of whitespace so tabs, repeated spaces and the
            # trailing newline are all handled; skip blank lines outright.
            fields = line.split()
            if not fields:
                continue
            if len(fields) != 4:
                raise ValueError(
                    'Expected 4 fields on line {} of {}, found {}: {!r}'.format(
                        line_number, path, len(fields), line))

            query, _, doc_id, score = fields
            if ('enwiki' in query) and int(score) > 0:
                # Number of '/' separators, i.e. the tree depth of the query.
                count = query.count('/')
                # Text before the first '/' (the whole query when there is none).
                query_doc = query.split('/')[0]

                # 'count' is only set the first time a document key is seen, so
                # it holds the depth of the first positive query for that key.
                entry = data.setdefault(query_doc, {'count': count, 'query': {}})

                # Add doc id to query.
                entry['query'].setdefault(query, []).append(doc_id)

                # Add query to query_list, keeping first-seen order without duplicates.
                known_queries = query_list.setdefault(query_doc, [])
                if query not in known_queries:
                    known_queries.append(query)

    return data, query_list


def build_data_dicts():
    """Load the four qrels files found under ``<cwd>/data``.

    Returns:
        A ``(data_list, query_list)`` tuple. ``data_list`` holds one
        ``(name, data)`` pair per qrels file, in the order manual, synthetic,
        tagme, annotated. ``query_list`` is the query index accumulated by
        ``get_data_dict`` across all four files.
    """
    data_dir = os.path.join(os.getcwd(), 'data')
    qrels_files = [
        ('manual', 'testY2.custom.tree.manual.qrels'),
        ('synthetic', 'testY2.custom.tree.synthetic.qrels'),
        ('tagme', 'benchmarkY2test-entity-automatic.qrels'),
        ('annotated', 'benchmarkY2test-entity-manual.qrels'),
    ]

    query_list = {}
    data_list = []
    for name, file_name in qrels_files:
        # The same query_list is threaded through every call so it accumulates.
        data, query_list = get_data_dict(
            path=os.path.join(data_dir, file_name),
            query_list=query_list,
        )
        data_list.append((name, data))

    return data_list, query_list
    

In [3]:
# data_list: one (name, data) pair per qrels file; query_list: document key -> list of queries.
data_list, query_list = build_data_dicts()

In [4]:
# NOTE(review): this displays the entire nested structure; consider showing a small slice instead.
data_list

[('manual',
  {'enwiki:Aerobic%20fermentation': {'count': 0,
    'query': {'enwiki:Aerobic%20fermentation': ['enwiki:Adenosine%20triphosphate',
      'enwiki:Alcohol%20dehydrogenase',
      'enwiki:Cytoplasmic%20male%20sterility',
      'enwiki:Glucose%20transporter',
      'enwiki:Glycolysis',
      'enwiki:Maize',
      'enwiki:Nicotinamide%20adenine%20dinucleotide',
      'enwiki:Paleopolyploidy',
      'enwiki:Pyruvate%20decarboxylase',
      'enwiki:Pyruvate%20dehydrogenase',
      'enwiki:Saccharomyces%20eubayanus',
      'enwiki:Saccharomyces%20paradoxus',
      'enwiki:Saccharomyces%20pastorianus',
      'enwiki:Schizosaccharomyces%20pombe',
      'enwiki:Tobacco',
      'enwiki:Trypanosomatida',
      'enwiki:Warburg%20effect'],
     'enwiki:Aerobic%20fermentation/Aerobic%20fermentation%20in%20non-yeast%20species': ['enwiki:Adenosine%20triphosphate',
      'enwiki:Cytoplasmic%20male%20sterility',
      'enwiki:Maize',
      'enwiki:Nicotinamide%20adenine%20dinucleotide',
     

In [5]:
from document_parsing.trec_car_parsing import TrecCarParser
from collections import namedtuple
from building_qrels import get_query

# Parser whose get_protobuf_message(path=..., doc_id=...) is called once per document below.
parser = TrecCarParser()
# NOTE(review): absolute, user-specific path - the notebook will not run on another machine
# as written; consider deriving it from a configurable data directory.
path = '/Users/iain/LocalStorage/coding/github/entity-linking-with-pyspark/data/testY2_custom.bin'


In [6]:
# One row per (document, query): [doc_id, query, text, <links per qrels source...>].
row_list = []

for doc_id, doc_queries in query_list.items():

    # Fetch the document once and reuse it for all of its queries.
    document = parser.get_protobuf_message(path=path, doc_id=doc_id)

    for query in doc_queries:

        # Join, with newlines, the text of every section whose built query contains `query`.
        # NOTE(review): `in` is a substring test, not a path-prefix test - confirm this is intended.
        text = ''
        for section in document.document_contents:
            section_query = get_query(doc_id=doc_id, section_heading_names=section.section_heading_names)
            if query not in section_query:
                continue
            separator = '\n' if text else ''
            text += separator + section.text

        row = [doc_id, query, text]

        # Append one sorted, de-duplicated link list per qrels source, in data_list order.
        for _, qrels in data_list:
            links = []
            if doc_id in qrels:
                for qrels_query, qrels_links in qrels[doc_id]['query'].items():
                    # Same substring semantics as above: links of any matching query are pooled.
                    if query in qrels_query:
                        links += qrels_links
            row.append(sorted(list(np.unique(links))))

        row_list.append(row)

In [7]:

# One row per (doc_id, query) with its text and the link list from each qrels source.
df = pd.DataFrame(row_list, columns=['doc_id', 'query', 'text', 'manual', 'synthetic', 'tagme', 'annotated'])

# Union of the tagme and synthetic links. Sorted so the stored order is deterministic
# across kernel restarts (set iteration order of strings is not) and consistent with
# the other link columns, which are already sorted.
df['synthetic_&_tagme'] = df.apply(lambda x: sorted(set(x['tagme'] + x['synthetic'])), axis=1)

# Number of links in each link column, stored as '<column>_count'.
for link_col in ['manual', 'synthetic', 'tagme', 'annotated', 'synthetic_&_tagme']:
    df['{}_count'.format(link_col)] = [len(r) for r in df[link_col]]

def get_recall(gt, links):
    """Return the fraction of entries in ``gt`` that also occur in ``links``.

    Every entry of ``gt`` is counted, including repeats. Returns 0.0 when
    ``gt`` is empty.
    """
    # 1 for a ground-truth entry found in links (true positive), 0 otherwise (false negative).
    outcomes = [1 if true_link in links else 0 for true_link in gt]
    if not outcomes:
        return 0.0
    return sum(outcomes) / len(outcomes)

def get_precision(gt, links):
    """Return the fraction of entries in ``links`` that also occur in ``gt``.

    Every entry of ``links`` is counted, including repeats. Returns 0.0 when
    ``links`` is empty.
    """
    # 1 for a proposed link present in gt (true positive), 0 otherwise (false positive).
    outcomes = [1 if possible_link in gt else 0 for possible_link in links]
    if not outcomes:
        return 0.0
    return sum(outcomes) / len(outcomes)
    
def get_F1(P, R):
    """Return the harmonic mean of precision ``P`` and recall ``R``.

    Returns 0.0 if either value equals zero.
    """
    if (P != 0) and (R != 0):
        return (2 * P * R) / (P + R)
    return 0.0

    
def jaccard_similarity(list1, list2):
    """Return |A & B| / |A | B| for the sets of items in the two lists.

    Duplicates are ignored. Returns 0.0 when both inputs are empty.
    """
    first, second = set(list1), set(list2)
    union_size = len(first | second)
    if union_size == 0:
        return 0.0
    return float(len(first & second)) / float(union_size)

    
link_columns = ['manual', 'synthetic', 'tagme', 'annotated', 'synthetic_&_tagme']

# For every ordered pair of link columns, score `col` against `qrels_type` as ground truth.
for qrels_type in link_columns:
    for col in link_columns:

        # Column names: '<col>_<metric>_<qrels_type>_gt'.
        recall_col = '{}_recall_{}_gt'.format(col, qrels_type)
        precision_col = '{}_precision_{}_gt'.format(col, qrels_type)
        f1_col = '{}_f1_{}_gt'.format(col, qrels_type)
        jaccard_col = '{}_jaccard_{}_gt'.format(col, qrels_type)

        # Recall and precision must be written first: F1 reads both columns back.
        df[recall_col] = df.apply(
            lambda row: get_recall(gt=row[qrels_type], links=row[col]), axis=1)
        df[precision_col] = df.apply(
            lambda row: get_precision(gt=row[qrels_type], links=row[col]), axis=1)
        df[f1_col] = df.apply(
            lambda row: get_F1(P=row[precision_col], R=row[recall_col]), axis=1)
        df[jaccard_col] = df.apply(
            lambda row: jaccard_similarity(list1=row[qrels_type], list2=row[col]), axis=1)


print(len(df))

777


In [8]:
# Keep only rows where all four individual qrels sources contributed at least one link.
required_counts = ['manual_count', 'synthetic_count', 'annotated_count', 'tagme_count']
has_all_sources = (df[required_counts] > 0).all(axis=1)
df_no_blank = df[has_all_sources]
print(len(df_no_blank))

131


In [9]:
# A query is kept as a leaf when no *other* query in the frame contains it.
# NOTE(review): containment is a plain substring test, not a '/'-delimited prefix test -
# confirm this is the intended notion of "leaf".
candidate_queries = df_no_blank['query'].to_list()
leaf_list = [
    query
    for query in candidate_queries
    if not any(query != sub and query in sub for sub in candidate_queries)
]

df_no_blank_leaf = df_no_blank[df_no_blank['query'].isin(leaf_list)]
print(len(df_no_blank_leaf))

95


In [11]:
# Persist the filtered leaf rows to <cwd>/data/df_new_3.csv (the index is written too, as no
# index argument is passed).
write_path = os.path.join(os.getcwd(), 'data', 'df_new_3.csv')
df_no_blank_leaf.to_csv(write_path)

In [None]:

# Scratch check of get_recall: only 'a' of the three ground-truth items appears in links,
# so the displayed value should be 1/3.
gt =['a', 'b', 'c']
links = ['d', 'a', 'w']



get_recall(gt, links)

In [24]:
# Manual links for one example query. The mask is built from the same frame that is being
# indexed, so pandas does not have to reindex a mask taken from the larger `df`.
df_no_blank_leaf.loc[df_no_blank_leaf['query'] == 'enwiki:Blockchain/Uses/The%20Big%20Four', 'manual'].iloc[0]

  """Entry point for launching an IPython kernel.


['enwiki:Big%20Four%20accounting%20firms',
 'enwiki:Cryptocurrency%20wallet',
 'enwiki:Deloitte',
 'enwiki:Ernst%20&%20Young',
 'enwiki:KPMG',
 'enwiki:PricewaterhouseCoopers']

In [39]:
# Score each qrels source against a hand-written ground truth for a single query.
query = 'enwiki:Salmon%20run/Background'
gt = [
    'enwiki:Animal%20migration',
    'enwiki:Atlantic%20salmon',
    'enwiki:Egg',
    'enwiki:Fish%20migration',
    'enwiki:Salmon',
    'enwiki:Shoaling%20and%20schooling',
    'enwiki:Gravel',
    'enwiki:Spawn%20(biology)',
    'enwiki:Plankton',
    'enwiki:Fish',
]

for qrels_type in ['manual', 'synthetic', 'tagme', 'annotated', 'synthetic_&_tagme']:
    query_rows = df_no_blank_leaf[df_no_blank_leaf['query'] == query]
    links = query_rows[qrels_type].iloc[0]

    R = get_recall(gt, links)
    P = get_precision(gt, links)
    f1 = get_F1(P, R)
    jaccard = jaccard_similarity(gt, links)

    # Source name on one line, then tab-separated recall, precision, F1, Jaccard.
    print(qrels_type, '{}\t{}\t{}\t{}'.format(R, P, f1, jaccard), sep='\n')

manual
0.8	0.6153846153846154	0.6956521739130435	0.5333333333333333
synthetic
0.8	0.6153846153846154	0.6956521739130435	0.5333333333333333
tagme
0.7	0.25	0.3684210526315789	0.22580645161290322
annotated
0.7	0.4375	0.5384615384615384	0.3684210526315789
synthetic_&_tagme
1.0	0.3225806451612903	0.4878048780487805	0.3225806451612903
