In [1]:
import os
import requests
import json
import pandas as pd
import ast
import math

**Helper functions from notebooks/explorations/nel__spacy_entity_linker.ipynb**

In [2]:
def call_semantic_similarity(input_file, url):
    """Upload a file to the similarity service and return the scores as a DataFrame.

    Parameters:
    - input_file: path of the file to upload. It is sent as the multipart field 'file'
    under its base name.
    - url: endpoint that receives the POST request.

    Returns:
    - A DataFrame built from the decoded response.
    """
    file_name = os.path.basename(input_file)
    # Open the upload in a context manager so the handle is closed even if the request raises.
    with open(input_file, mode='rb') as upload:
        files = {
            'file': (file_name, upload, 'application/octet-stream')
        }
        resp = requests.post(url, files=files, params={'similarity_types': 'all'})
    # json.loads needs text, so resp.json() is expected to yield a JSON-encoded string here.
    s = json.loads(resp.json())
    return pd.DataFrame(s)

In [3]:
def get_wikidata_description(qid):
    """Fetch the English description of a Wikidata item from the Wikidata Query Service.

    Parameters:
    - qid: identifier of the item, interpolated into the SPARQL query after the 'wd:' prefix.

    Returns:
    - The description text of the first result binding,
    - "No description found." when the response holds no bindings,
    - "Failed to fetch data." when the HTTP status is not 200.
    """
    # SPARQL query selecting the item's schema:description, restricted to English
    sparql = f"""
    SELECT ?itemDescription WHERE {{
        wd:{qid} schema:description ?itemDescription.
        FILTER(LANG(?itemDescription) = "en")
    }}
    """

    # Ask for the SPARQL JSON results format via the Accept header
    response = requests.get(
        "https://query.wikidata.org/sparql",
        headers={"Accept": "application/sparql-results+json"},
        params={'query': sparql},
    )

    if response.status_code != 200:
        return "Failed to fetch data."

    bindings = response.json().get("results", {}).get("bindings", [])
    if not bindings:
        return "No description found."

    return bindings[0]["itemDescription"]["value"]

# Example usage
# Replace 'QID' with the actual QID you want to query, for example, 'Q42' for Douglas Adams
# print(get_wikidata_description('QID'))

**Example with test file**

In [4]:
# Endpoint passed to call_semantic_similarity for this example
SIM_API = 'https://kgtk.isi.edu/similarity_api'

# Upload the test file and collect the returned similarity scores
df = call_semantic_similarity('../../notebooks/explorations/test_file.csv', SIM_API)

# Look up a description for every q2 value (one Wikidata request per row)
df['q2_description'] = df['q2'].apply(get_wikidata_description)

# Show the QID pair, their labels, the 'class' and 'jc' columns, and the q2 description
df[['q1', 'q2', 'q1_label', 'q2_label', 'class',  'jc', 'q2_description']]

Unnamed: 0,q1,q2,q1_label,q2_label,class,jc,q2_description
0,Q1875633,Q1875633,aviation fuel,aviation fuel,1.0,1.0,propellents used to power aircraft or aviation...
1,Q1875633,Q42501,aviation fuel,combustible matter,0.684539,0.885428,any material that stores energy that can later...
2,Q1875633,Q15766923,aviation fuel,Fuel,0.029833,0.062413,scientific journal
3,Q1875633,Q5507117,aviation fuel,Fuel,0.0,0.0,short-lived Bay Area post-hardcore musical act
4,Q1875633,Q35120,aviation fuel,entity,0.003065,0.042554,"anything that can be considered, discussed, or..."


**Get own data**

In [45]:
# Load the processed gold standard; 'entity' and 'qid' are read as strings holding list literals
# (they are parsed with ast.literal_eval by the scoring functions below).
gold_df = pd.read_csv('../../OMIn_dataset/gold_standard/processed/nel.csv')
gold_df.head()

Unnamed: 0,id,sample,entity,qid
0,19990213001379A,ACFT WAS TAXIING FOR TAKE OFF WHEN IT LOST CON...,"['ACFT', None, None]","['Q11436', None, None]"
1,19990213001379A,ACFT WAS TAXIING FOR TAKE OFF WHEN IT LOST CON...,"['DITCH', None, None]","['Q2048319', None, None]"
2,19990213001379A,ACFT WAS TAXIING FOR TAKE OFF WHEN IT LOST CON...,"['TREE', None, None]","['Q10884', None, None]"
3,19990213001379A,ACFT WAS TAXIING FOR TAKE OFF WHEN IT LOST CON...,"['LOST CONTROL', None, None]","[None, None, None]"
4,19800217031649I,"AFTER TAKEOFF, ENGINE QUIT. WING FUEL TANK SUM...","['TAKEOFF', None, None]","['Q854248', None, None]"


In [199]:
# Inspect every gold-standard row recorded for a single document id
gold_df[gold_df['id']=='19890723054159I']

Unnamed: 0,id,sample,entity,qid
192,19890723054159I,AFTER LANDING IT WAS DETERMINED THERE WAS INSU...,"['LANDING', None, None]","['Q844947', None, None]"
193,19890723054159I,AFTER LANDING IT WAS DETERMINED THERE WAS INSU...,"['SEATS', None, None]","['Q2207370', None, None]"
194,19890723054159I,AFTER LANDING IT WAS DETERMINED THERE WAS INSU...,"['OXYGEN MASKS', None, None]","['Q1890958', None, None]"
195,19890723054159I,AFTER LANDING IT WAS DETERMINED THERE WAS INSU...,"['PASSENGERS', None, None]","['Q319604', None, None]"


Paths (assign one of these to `result_data_path` in the next cell):\
result_data_path = '../../tool_results/spacy_entity_linker/spacy_entitylinker.csv'\
result_data_path = '../../tool_results/refined/refined.csv'\
result_data_path = '../../tool_results/genre/genre_independent.csv'\
result_data_path = '../../tool_results/blink/blink_results_new.csv'

In [2]:
# Tool output to evaluate; swap in one of the other listed paths to score a different tool
result_data_path = '../../tool_results/genre/genre_independent.csv'
result_df = pd.read_csv(result_data_path)
result_df.head()

Unnamed: 0,c5_unique_id,c119_text,c119_output,c119_score,entities,titles,ids
0,19900425011659A,RAN OUT OF FUEL ON FERRY FLIGHT. LEFT PONTOON ...,RAN OUT OF { FUEL } [ El Salvador ] ON FERRY F...,-0.200809,FUEL,El Salvador,Q792
1,19900425011659A,RAN OUT OF FUEL ON FERRY FLIGHT. LEFT PONTOON ...,RAN OUT OF { FUEL } [ El Salvador ] ON FERRY F...,-0.200809,FUEL,El Salvador,Q792
2,19900425011659A,RAN OUT OF FUEL ON FERRY FLIGHT. LEFT PONTOON ...,RAN OUT OF { FUEL } [ El Salvador ] ON FERRY F...,-0.200809,FUEL,El Salvador,Q792
3,20000625032189I,(-23)NORTHWEST AIRLINES DC-10 BOEING PUSHED BA...,(-23)NORTHWEST AIRLINES { DC-10 } [ Douglas DC...,-0.109136,DC-10,Douglas DC-10,Q208075
4,20000625032189I,(-23)NORTHWEST AIRLINES DC-10 BOEING PUSHED BA...,(-23)NORTHWEST AIRLINES { DC-10 } [ Douglas DC...,-0.109136,B-52,Boeing B-52 Stratofortress,Q174534


Note that:
- BLINK and spaCy Entity Linker list entities by their Wikidata titles and keep the literal mention from the text that was recognized as an entity in a separate mentions column. Therefore, when processing their results, `ent_col` should be set to the mentions column.
- ReFinED and GENRE list the literal mention from the text as the entity and keep the Wikidata title in a separate title column. When processing their results, `ent_col` should be set to the entity column.

**Define Functions**

In [7]:
def is_match(ent1, ent2, matching):
    ''' Decide whether two entity strings match.

    matching selects the comparison:
    - "STRONG": the two strings must be equal.
    - "WEAK": one string must be contained in the other.

    Returns False when either argument is not exactly of type str. For any other
    value of matching an error message is printed and None is returned.'''

    both_strings = type(ent1) == str and type(ent2) == str
    if not both_strings:
        return False

    if matching == "STRONG":
        return ent1 == ent2

    if matching == "WEAK":
        return ent1 in ent2 or ent2 in ent1

    print("Error: matching must be 'STRONG' or 'WEAK'")
    return None

In [93]:
def find_match(gs_entities, tool_entities, matching, gold_set):
    ''' Find the first gold-standard entity that also appears among the tool's entities.

    Parameters:
    - gs_entities: the gold-standard versions of one entity, in priority order.
    - tool_entities: the entities the tool produced for the same document.
    - matching: "STRONG" or "WEAK"; passed through to is_match. With any other value
    is_match reports the error and nothing is treated as a match.
    - gold_set: "EXTENDED" tries every entry of gs_entities; any other value tries only the first.

    Returns:
    - (gold_idx, tool_idx): the position in gs_entities of the entity that matched and the
    position in tool_entities of the first entity matching it.
    - (-1, -1) if no considered version of the gold entity is present in tool_entities.'''

    tool_entities = list(tool_entities)
    # min() keeps the primary-only case from indexing into an empty gold list.
    stop_idx = len(gs_entities) if gold_set == "EXTENDED" else min(1, len(gs_entities))

    for gold_idx in range(stop_idx):
        gold_entity = gs_entities[gold_idx]
        for tool_idx, tool_entity in enumerate(tool_entities):
            # Same argument order as before: tool entity first, gold entity second.
            if is_match(tool_entity, gold_entity, matching):
                return (gold_idx, tool_idx)

    return (-1, -1)

In [89]:
def prune_gold_set(gs_entities, gs_qids, gold_set, fill_in_qids):
    ''' Reduce one gold-standard row to the entity/QID pairs that can be evaluated.

    Parameters:
    - gs_entities: gold-standard entities for one row (primary, secondary, tertiary).
    - gs_qids: the QIDs aligned with gs_entities; entries may be None.
    - gold_set: "EXTENDED" keeps up to the first three pairs; any other value keeps only the first.
    - fill_in_qids: if True, a QID of None (within the first three slots) is replaced by the
    next non-None QID after it, when there is one.

    Returns:
    - (entities, qids): two aligned lists holding only the kept pairs in which neither the
    entity nor the QID is missing.

    The input lists are not modified.'''

    extended_size = 3  # primary, secondary and tertiary slots
    qids = list(gs_qids)  # work on a copy so the caller's list is left untouched

    if fill_in_qids:
        # Walk right-to-left so each missing QID inherits the nearest QID that follows it.
        next_qid = None
        for pos in range(min(len(qids), extended_size) - 1, -1, -1):
            if qids[pos] is None:
                qids[pos] = next_qid  # stays None when nothing follows
            else:
                next_qid = qids[pos]

    stop_idx = extended_size if gold_set == "EXTENDED" else 1
    # dropna() discards any pair whose entity or QID is missing.
    valid_data = pd.DataFrame({'ents': gs_entities, 'qids': qids}).iloc[:stop_idx].dropna()

    return (valid_data['ents'].to_list(), valid_data['qids'].to_list())

In [148]:
def calculate_precision_recall_f1(gs, df_tool, id_col, ent_col, qid_col, matching='WEAK', gold_set='PRIMARY', fill_in_qids=False):
    """
    Calculate precision, recall and F1 by comparing the tool's entity-link pairs with the gold standard.

    Parameters:
    - gs: DataFrame with columns 'id', 'entity' and 'qid' representing the ground truth. 'entity' and
    'qid' hold list literals as strings (parsed with ast.literal_eval): the first element is the
    primary entity/QID, later elements are the secondary and tertiary alternatives.
    - df_tool: DataFrame with the tool's answers.
    - id_col, ent_col, and qid_col are the column names used in df_tool for the docid, entities (the mentions
    from the text, not Wikidata entity titles), and the QIDs, respectively.
    - matching may be "WEAK" or "STRONG". Strong matching counts an entity-link pair as correct if the entity
    exactly matches the entity in the gold standard, and the links are the same. Weak matching counts it as
    correct if the entity overlaps with the entity in the gold standard, and the links are the same.
    - gold_set may be "PRIMARY" or "EXTENDED". The primary set uses only the first entity-link pair of each
    gold-standard row. The extended set also uses the secondary and tertiary pairs, which attempt to
    account for variability in entity-tagging by providing correct links for other possible spans.
    - fill_in_qids: if True, a gold QID of None is replaced by the next QID in the row, if there is one.

    Counting (one decision per gold-standard row that still has a usable pair after pruning):
    - FN: no considered gold entity matches a tool entity of the same document. Tool rows without
    a QID are ignored, so a mention that was found but not linked also ends up here.
    - TP: a match exists and the tool's QID is one of the accepted gold QIDs.
    - FP: a match exists but the tool's QID is not accepted.
    Tool entities that match no gold entity are not counted at all.

    Returns:
    - A tuple (precision, recall, f1_score).
    """
    TP = 0  # True Positives
    FP = 0  # False Positives
    FN = 0  # False Negatives

    # Only tool rows that carry a QID take part in the evaluation; computed once for all gold rows.
    has_qid = df_tool[qid_col].notna()

    for _, gs_row in gs.iterrows():
        gs_id = gs_row['id']
        gs_entities = ast.literal_eval(gs_row['entity'])
        gs_qids = ast.literal_eval(gs_row['qid'])
        # Remove gs ent/qid pairs in which the ent or the qid is None.
        gs_entities, gs_qids = prune_gold_set(gs_entities, gs_qids, gold_set, fill_in_qids)
        if len(gs_entities) == 0:
            continue

        # Tool rows for the same document that have a QID.
        tool_rows = df_tool[(df_tool[id_col] == gs_id) & has_qid]
        # Upper-case string mentions for comparison; leave non-strings (e.g. NaN) alone
        # instead of crashing on .upper() -- is_match rejects them anyway.
        tool_entities = [entity.upper() if isinstance(entity, str) else entity for entity in tool_rows[ent_col]]
        tool_qids = tool_rows[qid_col].to_list()

        # Check for False Negative (Gold Standard ent does not appear in tool output)
        gs_match_idx, tool_match_idx = find_match(gs_entities, tool_entities, matching, gold_set)
        if gs_match_idx == -1:
            FN += 1
            continue

        # Check for True and False Positives (based on link correctness).
        # With the extended gold set, if the matching entity is a secondary or tertiary entity,
        # the more primary QIDs are still correct since they are simply more specific and context-aware.
        start_idx = 0 if gold_set == "EXTENDED" else gs_match_idx
        if tool_qids[tool_match_idx] in gs_qids[start_idx:gs_match_idx + 1]:
            TP += 1
        else:
            FP += 1

    # Calculate precision and recall (0.0 when the denominator is empty)
    print(f"TP ={TP}, FP={FP}, FN={FN}")
    precision = TP / (TP + FP) if (TP + FP) > 0 else 0.0
    recall = TP / (TP + FN) if (TP + FN) > 0 else 0.0

    # Calculating the F1 score
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0

    return precision, recall, f1_score

**Get Scores**

In [205]:
# Score result_df against gold_df: exact entity match, extended gold set, missing gold QIDs filled in
calculate_precision_recall_f1(gold_df, result_df, 'c5_unique_id', 'entities', 'ids', matching="STRONG", gold_set="EXTENDED", fill_in_qids=True)

TP =0, FP=5, FN=444


(0.0, 0.0, 0)

### Semantic Similarity Eval

In [170]:
def match_gold_pred(gs, df_tool, id_col, ent_col, qid_col, matching, gold_set, fill_in_qids):
    """
    Collect, for every gold-standard row whose entity the tool also produced, the gold and predicted pair.

    Parameters:
    - gs: DataFrame with columns 'id', 'entity' and 'qid'; 'entity' and 'qid' hold list literals
    as strings (parsed with ast.literal_eval).
    - df_tool: DataFrame with the tool's answers.
    - id_col, ent_col, qid_col: column names in df_tool for the docid, the entity mentions and the QIDs.
    - matching: "STRONG" or "WEAK", as understood by find_match.
    - gold_set: "PRIMARY" or "EXTENDED", as understood by prune_gold_set and find_match.
    - fill_in_qids: passed to prune_gold_set.

    Returns:
    - Five aligned lists (ids, tool entities, gold entities, gold QIDs, predicted QIDs), one entry
    per gold-standard row for which a matching tool entity was found.
    """
    doc_ids = []
    tool_ent = []
    gold_ent = []
    q1_gold = []
    q2_pred = []

    # Only tool rows that carry a QID take part; computed once for all gold rows.
    has_qid = df_tool[qid_col].notna()

    for _, gs_row in gs.iterrows():
        gs_id = gs_row['id']
        gs_entities = ast.literal_eval(gs_row['entity'])
        gs_qids = ast.literal_eval(gs_row['qid'])
        # Remove gs ent/qid pairs in which the ent or the qid is None.
        gs_entities, gs_qids = prune_gold_set(gs_entities, gs_qids, gold_set, fill_in_qids)
        if len(gs_entities) == 0:
            continue

        # Tool rows for the same document that have a QID.
        tool_rows = df_tool[(df_tool[id_col] == gs_id) & has_qid]
        # Upper-case string mentions for comparison; leave non-strings (e.g. NaN) alone
        # instead of crashing on .upper() -- is_match rejects them anyway.
        tool_entities = [entity.upper() if isinstance(entity, str) else entity for entity in tool_rows[ent_col]]
        tool_qids = tool_rows[qid_col].to_list()

        # Find matching gold standard and output entity-link pair if present
        gs_match_idx, tool_match_idx = find_match(gs_entities, tool_entities, matching, gold_set)
        if gs_match_idx == -1:
            continue

        doc_ids.append(gs_id)
        gold_ent.append(gs_entities[gs_match_idx])
        tool_ent.append(tool_entities[tool_match_idx])
        q1_gold.append(gs_qids[gs_match_idx])
        q2_pred.append(tool_qids[tool_match_idx])

    return doc_ids, tool_ent, gold_ent, q1_gold, q2_pred

In [116]:
#id, tool_ent, gold_ent, q1_gold, q2_pred = match_gold_pred(gold_df, result_df, 'c5_id','mentions','qids')

In [117]:
def make_temp(q1_gold, q2_pred, path='temp.csv'):
    """Write the gold/predicted QID pairs to a file to feed to call_semantic_similarity().

    Parameters:
    - q1_gold: gold-standard QIDs.
    - q2_pred: predicted QIDs, aligned with q1_gold by position.
    - path: where to write the file; defaults to 'temp.csv' in the working directory.

    The file has a single column headed 'q1<TAB>q2' whose values are 'gold<TAB>pred'.
    """
    pairs = [f"{gold_qid}\t{q2_pred[i]}" for i, gold_qid in enumerate(q1_gold)]
    temp = pd.DataFrame({'q1\tq2': pairs})
    temp.to_csv(path, index=False)

In [104]:
def retrieve_score_vals(i, col, score_df, eval_df):
    """Look up one value in score_df for the QID pair found at position i of eval_df.

    The pair is eval_df's 'gold_qid' / 'pred_qid' at position i; it is matched against
    score_df's 'q1' / 'q2'. Returns the value of column col from the first matching
    row, or None when no row matches.
    """
    gold_qid = eval_df['gold_qid'].iat[i]
    pred_qid = eval_df['pred_qid'].iat[i]

    same_pair = (score_df['q1'] == gold_qid) & (score_df['q2'] == pred_qid)
    values = score_df.loc[same_pair, col].tolist()

    return values[0] if len(values) > 0 else None

In [128]:
def get_class_score(score_df, id, tool_ent, gold_ent, q1_gold, q2_pred):
    """Average the 'class' score over the best gold/predicted pair of each predicted entity.

    The five lists (id, tool_ent, gold_ent, q1_gold, q2_pred) are aligned by position.
    Labels and 'class' values are looked up in score_df with retrieve_score_vals. For every
    (id, predicted entity) group the row with the highest usable 'class' value is kept, and
    the mean of the kept values is returned.
    """
    # organize results; the range columns only supply row positions and are overwritten below
    positions = range(len(id))
    eval_df = pd.DataFrame({
        'id': id,
        'pred_ent': tool_ent,
        'gold_ent': gold_ent,
        'gold_qid': q1_gold,
        'pred_qid': q2_pred,
        'gold_label': positions,
        'pred_label': positions,
        'class': positions,
        'jc': positions,
    })
    for target_col, source_col in (('gold_label', 'q1_label'), ('pred_label', 'q2_label'), ('class', 'class')):
        eval_df[target_col] = eval_df[target_col].apply(
            retrieve_score_vals, col=source_col, score_df=score_df, eval_df=eval_df
        )

    # Get highest scoring gold_qid-pred_qid for each pred_ent
    best_rows = []

    for doc_id in eval_df['id'].unique():
        in_doc = eval_df['id'] == doc_id
        for mention in eval_df[in_doc]['pred_ent'].unique():
            group = eval_df[in_doc & (eval_df['pred_ent'] == mention)]
            group_scores = list(group['class'])

            # skip missing, empty and NaN scores
            usable = []
            for score in group_scores:
                if score != None and score != "" and not math.isnan(score):
                    usable.append(score)

            if usable:
                best_pos = group_scores.index(max(usable))
                best_rows.append(group.index.tolist()[best_pos])

    return eval_df.loc[best_rows, 'class'].dropna().mean()

In [132]:
def get_jc_score(score_df, id, tool_ent, gold_ent, q1_gold, q2_pred):
    """Average the 'jc' score over the best gold/predicted pair of each predicted entity.

    The five lists (id, tool_ent, gold_ent, q1_gold, q2_pred) are aligned by position.
    Labels and 'jc' values are looked up in score_df with retrieve_score_vals. For every
    (id, predicted entity) group the row with the highest usable 'jc' value is kept, and
    the mean of the kept values is returned.
    """
    # organize results; the range columns only supply row positions and are overwritten below
    positions = range(len(id))
    eval_df = pd.DataFrame({
        'id': id,
        'pred_ent': tool_ent,
        'gold_ent': gold_ent,
        'gold_qid': q1_gold,
        'pred_qid': q2_pred,
        'gold_label': positions,
        'pred_label': positions,
        'class': positions,
        'jc': positions,
    })
    for target_col, source_col in (('gold_label', 'q1_label'), ('pred_label', 'q2_label'), ('jc', 'jc')):
        eval_df[target_col] = eval_df[target_col].apply(
            retrieve_score_vals, col=source_col, score_df=score_df, eval_df=eval_df
        )

    # Get highest scoring gold_qid-pred_qid for each pred_ent
    best_rows = []

    for doc_id in eval_df['id'].unique():
        in_doc = eval_df['id'] == doc_id
        for mention in eval_df[in_doc]['pred_ent'].unique():
            group = eval_df[in_doc & (eval_df['pred_ent'] == mention)]
            group_scores = list(group['jc'])

            # skip missing, empty and NaN scores
            usable = []
            for score in group_scores:
                if score != None and score != "" and not math.isnan(score):
                    usable.append(score)

            if usable:
                best_pos = group_scores.index(max(usable))
                best_rows.append(group.index.tolist()[best_pos])

    return eval_df.loc[best_rows, 'jc'].dropna().mean()

In [157]:
def calculate_class_jc(gold_df, result_df, id_col, ent_col, qid_col, matching="STRONG",gold_set="PRIMARY",fill_in_qids=False, url='https://kgtk.isi.edu/similarity_api'):
    """Compute the mean 'class' and 'jc' similarity between gold and predicted QIDs.

    Parameters:
    - gold_df, result_df: gold standard and tool output, as expected by match_gold_pred.
    - id_col, ent_col, qid_col: column names in result_df for the docid, entity mentions and QIDs.
    - matching, gold_set, fill_in_qids: passed through to match_gold_pred.
    - url: similarity endpoint passed to call_semantic_similarity.

    Returns:
    - A tuple (class_score, jc_score).

    Side effect: writes 'temp.csv' in the working directory for the upload and removes it afterwards.
    """
    # Get all entity-link pair candidates for evaluation
    doc_ids, tool_ent, gold_ent, q1_gold, q2_pred = match_gold_pred(gold_df, result_df, id_col, ent_col, qid_col, matching, gold_set, fill_in_qids)

    # Call API
    make_temp(q1_gold, q2_pred)
    try:
        score_df = call_semantic_similarity('temp.csv', url)
    finally:
        # Remove the scratch file even when the request or its decoding fails.
        os.remove('temp.csv')

    # Get Scores
    class_score = get_class_score(score_df, doc_ids, tool_ent, gold_ent, q1_gold, q2_pred)
    jc_score = get_jc_score(score_df, doc_ids, tool_ent, gold_ent, q1_gold, q2_pred)

    return class_score, jc_score

In [126]:
#id, tool_ent, gold_ent, q1_gold, q2_pred = match_gold_pred(gold_df, result_df, 'c5_id','mentions','qids')
#make_temp(q1_gold, q2_pred)
#score_df = call_semantic_similarity('temp.csv', 'https://kgtk.isi.edu/similarity_api')
#score_df

In [208]:
# Mean (class, jc) similarity for result_df: exact entity match, primary gold set only, no QID fill-in
calculate_class_jc(gold_df, result_df, 'c5_unique_id','entities','ids', matching="STRONG",gold_set="PRIMARY",fill_in_qids=False)

(0.113201797375, 0.2440078905)