In [1]:
import os
import requests
import json
import pandas as pd
import ast

**Helper functions copied from notebooks/explorations/nel__spacy_entity_linker.ipynb**

In [2]:
def call_semantic_similarity(input_file, url):
    """Upload a file of QID pairs to the similarity service and return its scores.

    Parameters:
    - input_file: path of the file to upload; it is sent as the multipart field 'file'
      under its base name.
    - url: endpoint to POST to; all similarity types are requested.

    Returns:
    - A DataFrame built from the decoded response.

    Raises:
    - Whatever `open`, `requests.post`, or the JSON decoding raise.
    """
    file_name = os.path.basename(input_file)

    # The context manager closes the upload handle even when the request raises
    # (e.g. on a connection timeout), so repeated calls do not leak file descriptors.
    with open(input_file, mode='rb') as upload:
        files = {
            'file': (file_name, upload, 'application/octet-stream')
        }
        resp = requests.post(url, files=files, params={'similarity_types': 'all'})

    # The response body is decoded twice: resp.json() is expected to yield a
    # JSON-encoded string, which json.loads then turns into the actual records.
    s = json.loads(resp.json())
    return pd.DataFrame(s)

In [3]:
def get_wikidata_description(qid):
    """Look up the English description of a Wikidata item via the SPARQL endpoint.

    Parameters:
    - qid: Wikidata identifier interpolated into the query as `wd:<qid>` (e.g. 'Q42').

    Returns:
    - The first English description returned by the endpoint,
    - "No description found." when the endpoint answers 200 without any binding,
    - "Failed to fetch data." for any non-200 status code.
    """
    # Ask only for the English-language description of the item
    sparql = f"""
    SELECT ?itemDescription WHERE {{
        wd:{qid} schema:description ?itemDescription.
        FILTER(LANG(?itemDescription) = "en")
    }}
    """

    # The Accept header requests SPARQL results in JSON form
    reply = requests.get(
        "https://query.wikidata.org/sparql",
        headers={"Accept": "application/sparql-results+json"},
        params={'query': sparql},
    )

    # Anything other than HTTP 200 is reported as a fetch failure
    if reply.status_code != 200:
        return "Failed to fetch data."

    bindings = reply.json().get("results", {}).get("bindings", [])
    if not bindings:
        return "No description found."

    # Only the first binding is used
    return bindings[0]["itemDescription"]["value"]

# Example usage
# Replace 'QID' with the actual QID you want to query, for example, 'Q42' for Douglas Adams
# print(get_wikidata_description('QID'))

**Example with test file**

In [4]:
# Endpoint of the similarity service that the example file is posted to
SIM_API = 'https://kgtk.isi.edu/similarity_api'

# Upload the example pair file and collect the returned scores as a DataFrame (network call)
df = call_semantic_similarity('../../notebooks/explorations/test_file.csv', SIM_API)

# Add a description for every q2 item; this issues one Wikidata request per row
df['q2_description'] = df['q2'].apply(get_wikidata_description)

# Show the pair columns, the two scores used later ('class', 'jc') and the fetched description
df[['q1', 'q2', 'q1_label', 'q2_label', 'class',  'jc', 'q2_description']]

ConnectTimeout: HTTPSConnectionPool(host='kgtk.isi.edu', port=443): Max retries exceeded with url: /similarity_api?similarity_types=all (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7ff41f619dd0>, 'Connection to kgtk.isi.edu timed out. (connect timeout=None)'))

**Get own data**

In [354]:
# Load the gold-standard entity-linking annotations.
# The 'entity' and 'qid' columns hold stringified Python lists; the evaluation
# functions below parse them with ast.literal_eval.
gold_df = pd.read_csv('../../gold_standard/processed/nel.csv')
gold_df.head()

Unnamed: 0,id,sample,entity,qid
0,19990213001379A,ACFT WAS TAXIING FOR TAKE OFF WHEN IT LOST CON...,"['ACFT', None, None]","['Q11436', None, None]"
1,19990213001379A,ACFT WAS TAXIING FOR TAKE OFF WHEN IT LOST CON...,"['DITCH', None, None]","['Q2048319', None, None]"
2,19990213001379A,ACFT WAS TAXIING FOR TAKE OFF WHEN IT LOST CON...,"['TREE', None, None]","['Q10884', None, None]"
3,19990213001379A,ACFT WAS TAXIING FOR TAKE OFF WHEN IT LOST CON...,"['LOST CONTROL', None, None]","[None, None, None]"
4,19800217031649I,"AFTER TAKEOFF, ENGINE QUIT. WING FUEL TANK SUM...","['TAKEOFF', None, None]","['Q854248', None, None]"


Paths:\
result_data_path = '../../data/results/spacy_entity_linker/spacy_entitylinker.csv'\
result_data_path = '../../data/results/refined/refined.csv'\
result_data_path = '../../data/results/genre/genre_independent.csv'\
result_data_path = '../../data/results/blink/blink_results_new.csv'

In [355]:
# Tool output to evaluate; replace the path to score a different linker's results
result_data_path = '../../data/results/spacy_entity_linker/spacy_entitylinker.csv'
result_df = pd.read_csv(result_data_path)
result_df.head()

Unnamed: 0,c5_id,c119_input,raw_results,mentions,entities,qids,descriptions
0,19750315005389A,TAILWHEEL COCKED RIGHT PRIOR TO TKOF. ...,"[{'mention': PRIOR, 'identifier': 8015236, 'la...",PRIOR,William Matthew Prior,8015236,American painter
1,19750419011349A,TOW PLANE BECAME AIRBORNE THEN SETTLED.STUDENT...,"[{'mention': TOW, 'identifier': 6588629, 'labe...",TOW,Toledo Airport (Brazil),6588629,
2,19750419011349A,TOW PLANE BECAME AIRBORNE THEN SETTLED.STUDENT...,"[{'mention': TOW, 'identifier': 6588629, 'labe...",AIRBORNE,Airborne,4698432,controversial dietary supplement
3,19750419011349A,TOW PLANE BECAME AIRBORNE THEN SETTLED.STUDENT...,"[{'mention': TOW, 'identifier': 6588629, 'labe...",THOUGHT,agency,3951828,capacity of an agent to act in a world
4,19750419011349A,TOW PLANE BECAME AIRBORNE THEN SETTLED.STUDENT...,"[{'mention': TOW, 'identifier': 6588629, 'labe...",TOW,Toledo Airport (Brazil),6588629,


Note that:
- BLINK and spaCy Entity Linker list entities as their Wikidata titles and have a separate mentions column that contains the literal mention from the text that was recognized as an entity. Therefore, when processing these, ent_col should be set to mentions
- ReFinED and GENRE list the actual mention from the text as the entity and have a title column with the Wikidata title

**Define Functions**

In [153]:
def find_pair(gs_entities, tool_entities):
    '''Locate the first gold-standard entity variant that also occurs in the tool output.

    The variants in gs_entities are tried in order. For the first one found in
    tool_entities the result is the tuple
    (index of the variant in gs_entities, index of its first occurrence in tool_entities).
    When no variant occurs in tool_entities the result is (-1, -1).'''
    for gs_idx, candidate in enumerate(gs_entities):
        try:
            # list.index raises ValueError when the candidate is absent
            tool_idx = tool_entities.index(candidate)
        except ValueError:
            continue
        return (gs_idx, tool_idx)

    return (-1, -1)

In [154]:
def calculate_precision_recall_f1(gs, df_tool, id_col, ent_col, qid_col, strict=False):
    """
    Calculate precision, recall and F1 by comparing the tool's entity links with the gold standard.

    Parameters:
    - gs: gold-standard DataFrame. The columns 'id', 'entity' and 'qid' are read; 'entity'
      and 'qid' must hold stringified Python lists (parsed with ast.literal_eval) whose
      positions correspond to each other.
    - df_tool: DataFrame with the tool's answers, one row per linked mention.
    - id_col: column of df_tool holding the sample id (compared with gs['id']).
    - ent_col: column of df_tool holding the mention text (compared case-insensitively
      by upper-casing the tool side).
    - qid_col: column of df_tool holding the linked identifier.
    - strict: if True, only the first entity/qid of each gold row is considered;
      otherwise every entity variant in the gold row may match.

    Returns:
    - A tuple (precision, recall, f1_score). Each value is 0 when its denominator is 0.

    Counting (one decision per gold row):
    - FN: no gold entity variant appears among the tool's mentions for that sample.
    - TP: a variant appears and the tool's qid equals the gold qid at the matched position.
    - FP: a variant appears but the qids differ.
    Only entity-link pairs whose entity is present in the gold standard are evaluated.
    The counts are also printed.
    """
    TP = 0  # True Positives
    FP = 0  # False Positives
    FN = 0  # False Negatives

    for _index, gs_row in gs.iterrows():
        gs_id = gs_row['id']
        gs_entities = ast.literal_eval(gs_row['entity'])
        gs_qids = ast.literal_eval(gs_row['qid'])

        # Select the tool's rows for this sample once and keep mention/qid aligned.
        # Rows whose mention is not a string (e.g. NaN from an empty CSV cell) cannot
        # match any gold entity, so they are dropped together with their qid instead
        # of crashing on .upper().
        in_sample = df_tool[id_col] == gs_id
        tool_pairs = [
            (mention.upper(), qid)
            for mention, qid in zip(df_tool.loc[in_sample, ent_col].tolist(),
                                    df_tool.loc[in_sample, qid_col].tolist())
            if isinstance(mention, str)
        ]
        tool_entities = [mention for mention, _qid in tool_pairs]
        tool_qids = [qid for _mention, qid in tool_pairs]

        if strict:
            # Strict evaluation: only the primary entity-link pair of the gold row counts.
            try:
                gs_match_idx, tool_match_idx = 0, tool_entities.index(gs_entities[0])
            except (ValueError, IndexError):
                # ValueError: primary entity absent from the tool output;
                # IndexError: the gold row has no entity at all. Both are misses.
                gs_match_idx, tool_match_idx = -1, -1
        else:
            # Non-strict evaluation: every entity variant in the gold row is equally valid.
            gs_match_idx, tool_match_idx = find_pair(gs_entities, tool_entities)

        if gs_match_idx == -1:
            FN += 1
        elif tool_qids[tool_match_idx] == gs_qids[gs_match_idx]:
            TP += 1
        else:
            FP += 1

    # Calculate precision and recall
    print(f"TP ={TP}, FP={FP}, FN={FN}")
    precision = TP / (TP + FP) if (TP + FP) > 0 else 0
    recall = TP / (TP + FN) if (TP + FN) > 0 else 0

    # Calculating the F1 score
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    return precision, recall, f1_score

**Get Scores**

In [246]:
# Non-strict evaluation (strict defaults to False) of the loaded tool results;
# 'mentions' is the column holding the literal text mentions for this tool.
calculate_precision_recall_f1(gold_df, result_df, 'c5_id', 'mentions', 'qids')

TP =30, FP=228, FN=241


(0.11627906976744186, 0.11070110701107011, 0.11342155009451796)

### Semantic Similarity Eval

In [368]:
def find_next_qid(gs_qids, gs_match_idx):
    '''Return the index of the gold qid to use for the entity matched at gs_match_idx.

    If a tool matches a gold entity, but that entity does not have a qid, we do not want to
    penalize the tool for coming up with the next best id. In that case the index of the first
    later position in gs_qids that holds a qid (a non-empty string) is returned.

    gs_match_idx itself is returned when it already holds a qid, or when no later position does.'''

    def _has_qid(idx):
        # A usable qid is a non-empty string; None and '' mean "no link"
        value = gs_qids[idx]
        return isinstance(value, str) and len(value) > 0

    if _has_qid(gs_match_idx):
        return gs_match_idx

    # Scan forward for the first position that actually carries a qid
    for later_idx in range(gs_match_idx + 1, len(gs_qids)):
        if _has_qid(later_idx):
            return later_idx

    return gs_match_idx

In [361]:
def match_gold_pred(gs, df_tool, id_col, ent_col, qid_col):
    """Collect the gold/predicted qid pairs to be scored for semantic similarity.

    Parameters:
    - gs: gold-standard DataFrame; the columns 'id', 'entity' and 'qid' are read, the latter
      two as stringified Python lists (parsed with ast.literal_eval).
    - df_tool: DataFrame with the tool's answers, one row per linked mention.
    - id_col / ent_col / qid_col: columns of df_tool holding the sample id, the mention text
      (upper-cased before comparison) and the linked identifier.

    Returns:
    - Five parallel lists (id, tool_ent, gold_ent, q1_gold, q2_pred). For every gold row with
      a matching tool mention, one entry is emitted per gold position from 0 up to and
      including the index returned by find_next_qid (capped at the list length): the sample
      id, the matched gold entity, the gold entity and gold qid at that position, and the
      tool's qid for the matched mention. Gold rows without a match contribute nothing.
    """
    ids = []
    tool_ent = []
    gold_ent = []
    q1_gold = []
    q2_pred = []

    for _index, gs_row in gs.iterrows():
        gs_id = gs_row['id']
        gs_entities = ast.literal_eval(gs_row['entity'])
        gs_qids = ast.literal_eval(gs_row['qid'])

        # Select the tool's rows for this sample once and keep mention/qid aligned.
        # Rows whose mention is not a string (e.g. NaN from an empty CSV cell) cannot
        # match any gold entity, so they are dropped together with their qid instead
        # of crashing on .upper().
        in_sample = df_tool[id_col] == gs_id
        tool_pairs = [
            (mention.upper(), qid)
            for mention, qid in zip(df_tool.loc[in_sample, ent_col].tolist(),
                                    df_tool.loc[in_sample, qid_col].tolist())
            if isinstance(mention, str)
        ]
        tool_entities = [mention for mention, _qid in tool_pairs]
        tool_qids = [qid for _mention, qid in tool_pairs]

        # Find matching gold standard and output entity-link pair if present
        gs_match_idx, tool_match_idx = find_pair(gs_entities, tool_entities)
        if gs_match_idx == -1:
            continue

        # Append to q1 and q2 as appropriate
        next_qid_idx = find_next_qid(gs_qids, gs_match_idx)

        for qid_idx in range(min(len(gs_qids), next_qid_idx + 1)):
            ids.append(gs_id)
            tool_ent.append(gs_entities[gs_match_idx])
            gold_ent.append(gs_entities[qid_idx])
            q1_gold.append(gs_qids[qid_idx])
            q2_pred.append(tool_qids[tool_match_idx])

    return ids, tool_ent, gold_ent, q1_gold, q2_pred

In [362]:
def make_temp(q1_gold, q2_pred):
    """Write the gold/predicted qid pairs to 'temp.csv' in the working directory.

    The file has a single column whose header is 'q1<TAB>q2' and whose rows are
    '<gold qid><TAB><predicted qid>', one per position of q1_gold.
    It is the upload file fed to call_semantic_similarity().
    """
    pair_lines = []
    for pos, gold_qid in enumerate(q1_gold):
        pair_lines.append(f"{gold_qid}\t{q2_pred[pos]}")

    pairs_frame = pd.DataFrame({'q1\tq2': pair_lines})
    pairs_frame.to_csv('temp.csv', index=False)

In [373]:
def retrieve_score_vals(i, col, score_df, eval_df):
    """Fetch the value of `col` that score_df reports for the i-th qid pair of eval_df.

    The pair is (eval_df['gold_qid'], eval_df['pred_qid']) at position i; it is matched
    against score_df's 'q1' and 'q2' columns. The value from the first matching row is
    returned, or None when no row matches.
    """
    gold_qid = eval_df['gold_qid'].iat[i]
    pred_qid = eval_df['pred_qid'].iat[i]

    same_pair = (score_df['q1'] == gold_qid) & (score_df['q2'] == pred_qid)
    matches = list(score_df[same_pair][col])

    return matches[0] if len(matches) > 0 else None

In [380]:
def get_class_score(score_df, id, tool_ent, gold_ent, q1_gold, q2_pred):
    """Average, over matched entities, of the best 'class' similarity score.

    Parameters:
    - score_df: DataFrame of similarity results; the columns 'q1', 'q2' and 'class' are read.
    - id, tool_ent, gold_ent, q1_gold, q2_pred: parallel lists describing the candidate
      pairs (sample id, matched entity, gold entity, gold qid, predicted qid).
      gold_ent is only checked for length here.

    Returns:
    - For every (id, tool_ent) group the highest 'class' score among its (gold qid,
      predicted qid) pairs is taken; the mean of these maxima is returned. Pairs without
      a usable score are ignored. The result is NaN when no pair has a score.
    """
    # Building the frame from all five lists keeps the original length validation
    eval_df = pd.DataFrame({'id': id, 'pred_ent': tool_ent, 'gold_ent': gold_ent,
                            'gold_qid': q1_gold, 'pred_qid': q2_pred})
    if eval_df.empty:
        return float('nan')

    # First score reported for each (q1, q2) pair; pairs with a missing key never match
    first_score = {}
    for q1, q2, value in zip(score_df['q1'], score_df['q2'], score_df['class']):
        if pd.isna(q1) or pd.isna(q2):
            continue
        first_score.setdefault((q1, q2), value)

    raw_scores = [first_score.get(pair) for pair in zip(eval_df['gold_qid'], eval_df['pred_qid'])]

    # Missing (None/NaN) and non-numeric (e.g. '') scores all become NaN so they can be
    # dropped uniformly; a NaN must never take part in the max() below.
    eval_df['class'] = pd.to_numeric(pd.Series(raw_scores, index=eval_df.index, dtype=object),
                                     errors='coerce')
    scored = eval_df.dropna(subset=['class'])

    # Highest scoring gold_qid-pred_qid pair for each predicted entity of each sample
    best_per_entity = scored.groupby(['id', 'pred_ent'])['class'].max()

    return best_per_entity.mean()

In [384]:
def get_jc_score(score_df, id, tool_ent, gold_ent, q1_gold, q2_pred):
    """Average, over matched entities, of the best 'jc' similarity score.

    Parameters:
    - score_df: DataFrame of similarity results; the columns 'q1', 'q2' and 'jc' are read.
    - id, tool_ent, gold_ent, q1_gold, q2_pred: parallel lists describing the candidate
      pairs (sample id, matched entity, gold entity, gold qid, predicted qid).
      gold_ent is only checked for length here.

    Returns:
    - For every (id, tool_ent) group the highest 'jc' score among its (gold qid,
      predicted qid) pairs is taken; the mean of these maxima is returned. Pairs without
      a usable score are ignored. The result is NaN when no pair has a score.
    """
    # Building the frame from all five lists keeps the original length validation
    eval_df = pd.DataFrame({'id': id, 'pred_ent': tool_ent, 'gold_ent': gold_ent,
                            'gold_qid': q1_gold, 'pred_qid': q2_pred})
    if eval_df.empty:
        return float('nan')

    # First score reported for each (q1, q2) pair; pairs with a missing key never match
    first_score = {}
    for q1, q2, value in zip(score_df['q1'], score_df['q2'], score_df['jc']):
        if pd.isna(q1) or pd.isna(q2):
            continue
        first_score.setdefault((q1, q2), value)

    raw_scores = [first_score.get(pair) for pair in zip(eval_df['gold_qid'], eval_df['pred_qid'])]

    # Missing (None/NaN) and non-numeric (e.g. '') scores all become NaN so they can be
    # dropped uniformly; a NaN must never take part in the max() below.
    eval_df['jc'] = pd.to_numeric(pd.Series(raw_scores, index=eval_df.index, dtype=object),
                                  errors='coerce')
    scored = eval_df.dropna(subset=['jc'])

    # Highest scoring gold_qid-pred_qid pair for each predicted entity of each sample
    best_per_entity = scored.groupby(['id', 'pred_ent'])['jc'].max()

    return best_per_entity.mean()

In [382]:
def calculate_class_jc(gold_df, result_df, id_col, ent_col, qid_col, url='https://kgtk.isi.edu/similarity_api'):
    """Compute the 'class' and 'jc' semantic-similarity scores of a tool against the gold standard.

    Parameters:
    - gold_df: gold-standard DataFrame passed on to match_gold_pred.
    - result_df: DataFrame with the tool's answers.
    - id_col / ent_col / qid_col: columns of result_df holding the sample id, the mention
      text and the linked identifier.
    - url: similarity endpoint the qid pairs are posted to.

    Returns:
    - A tuple (class_score, jc_score).

    Side effects:
    - Writes 'temp.csv' to the working directory for the upload and removes it again,
      also when the API call fails.
    """
    # Get all entity-link pair candidates for evaluation
    ids, tool_ent, gold_ent, q1_gold, q2_pred = match_gold_pred(gold_df, result_df, id_col, ent_col, qid_col)

    # Call API; the upload file must not be left behind if the request raises
    # (timeouts and manual interrupts are common with a remote service)
    make_temp(q1_gold, q2_pred)
    try:
        score_df = call_semantic_similarity('temp.csv', url)
    finally:
        os.remove('temp.csv')

    # Get Scores
    class_score = get_class_score(score_df, ids, tool_ent, gold_ent, q1_gold, q2_pred)
    jc_score = get_jc_score(score_df, ids, tool_ent, gold_ent, q1_gold, q2_pred)

    return class_score, jc_score

In [385]:
# Posts the matched gold/predicted qid pairs to the default similarity endpoint
# (network call) and returns (class_score, jc_score).
calculate_class_jc(gold_df, result_df, 'c5_id','mentions','qids')

KeyboardInterrupt: 