## Tool - spaCy EntityLinker
---

In [8]:
import os, ast
import pandas as pd

# task identifier; used in the gold standard file name and in the output file names
nlp_task = 'nel'
# tool identifier; used in the output file names
tool_name = 'spacy_entity_linker'
# CSV file holding the raw output of the tool under evaluation
tool_result_path = '../../data/results/spacy_entity_linker/FAA_DataModel_20240104104402.csv'
# CSV file with the gold standard samples; its 'id' column selects the records to evaluate
gs_samples_path = '../../gold_standard/processed/samples.csv'
# directory where the processed tool output and the match tables are written
output_dir = '../../tool_results/'

### Load and standardize the output data


In [9]:
# load output from the tool to be evaluated
data = pd.read_csv(tool_result_path)

# rename the generic column names of the tool output to the names used in this notebook
data = data.rename(columns={"c5": "id", "c119": "sample", "c119_entity_linking": "entity_linking"})

# keep only the records whose 'id' is part of the gold standard samples
gs_samples = pd.read_csv(gs_samples_path)
data = data[data['id'].isin(gs_samples['id'])].copy()

# convert the string representation of a list of dictionaries to an actual list of dictionaries
data['entity_linking'] = data['entity_linking'].apply(ast.literal_eval)

# explode the entity_linking column: one row per linked entity
data = data.explode('entity_linking').reset_index(drop=True)

# expand every entity dictionary into one column per key
attributes_df = data['entity_linking'].apply(pd.Series)

# store the identifier as a nullable integer ('indentifier' is the key spelling used by the tool output)
attributes_df['indentifier'] = attributes_df['indentifier'].astype('Int64')

# Concatenate with the original DataFrame (minus the dictionary column)
data = pd.concat([data.drop('entity_linking', axis=1), attributes_df], axis=1)

# rename the entity attributes to the column names used by the gold standard
data = data.rename(columns={"label": 'entity', "indentifier": "qid"})

# save the processed tool output to a file (create the output directory on a fresh checkout)
os.makedirs(output_dir, exist_ok=True)
output_path = os.path.join(output_dir, f"{nlp_task}__{tool_name}.csv")
data.to_csv(output_path, index=False)

data

Unnamed: 0,id,sample,qid,entity,description
0,19760606015529A,SUFFICIENT OPPORTUNITY EXISTED TO RELEASE WHEN...,193538,Opportunity,NASA Mars rover
1,19760606015529A,SUFFICIENT OPPORTUNITY EXISTED TO RELEASE WHEN...,3785514,High,song by Lighthouse Family
2,19780111000459A,ACFT DISPATCHER HARRASSMENT OF PILOT. PILOT FO...,67935434,United States Army Combat Fitness Test,Physical fitness test for the United States Army
3,19780111000459A,ACFT DISPATCHER HARRASSMENT OF PILOT. PILOT FO...,2044212,PILOT,historic programming language
4,19780402008409I,TAXIING AIRCRAFT STRUCK PARKED AIRCRAFT. FOUND...,4698564,Aircraft,"railway station in Laverton, Melbourne, Victor..."
...,...,...,...,...,...
597,20070630826079I,(-23) AIRCRAFT DEPARTED RAY AIRPORT AND AFTER ...,4698564,Aircraft,"railway station in Laverton, Melbourne, Victor..."
598,20070630826079I,(-23) AIRCRAFT DEPARTED RAY AIRPORT AND AFTER ...,975490,Into,album
599,20070630826079I,(-23) AIRCRAFT DEPARTED RAY AIRPORT AND AFTER ...,4126730,The Climb,2002 film by John Schmidt
600,20070630826079I,(-23) AIRCRAFT DEPARTED RAY AIRPORT AND AFTER ...,743004,aircraft engine,engine designed for use in powered aircraft


## Gold standard NEL data

### Load the processed gold standard data

In [10]:
# path to the gold standard file
task_gold_standard_path = f"../../gold_standard/processed/{nlp_task}.csv"

# load the processed gold standard annotations for this task
gs = pd.read_csv(task_gold_standard_path)

### See the answer for a specific sample/record

In [11]:
# Record id inspected in the following cells.
# NOTE(review): the name `id` shadows Python's built-in id(); it is kept because later cells read it.
id = '19940226003029A'

In [12]:
# Sample text and gold standard entities of the selected record
cols = ['id','entity','qid']
print(gs.loc[gs.id == id, 'sample'].unique())
gs.loc[gs.id == id, cols]


['LOST POWER EN ROUTE. LANDED ON PRIVATE STRIP. NOSED OVER. LEFT FUEL CAP LEAKED. PREVENTED GRAVITY FUEL FLOW.       ']


Unnamed: 0,id,entity,qid
423,19940226003029A,POWER,25342
424,19940226003029A,STRIP,184590
425,19940226003029A,FUEL CAP,123205535
426,19940226003029A,FUEL,1875633
427,19940226003029A,Nosed Over,16661458


In [13]:
# Sample text and the entities linked by the tool for the selected record
cols = ['id','entity','qid', 'description']
print(data.loc[data.id == id, 'sample'].unique())
data.loc[data.id == id, cols]

['LOST POWER EN ROUTE. LANDED ON PRIVATE STRIP. NOSED OVER. LEFT FUEL CAP LEAKED. PREVENTED GRAVITY FUEL FLOW.       ']


Unnamed: 0,id,entity,qid,description
303,19940226003029A,En Route,5375006,album by Moebius & Plank
304,19940226003029A,Landed,1842401,album by Can
305,19940226003029A,Ontario,1904,province of Canada
306,19940226003029A,Strip,4050244,album by Adam Ant
307,19940226003029A,Over,20020605,episode of Breaking Bad (S2 E10)
308,19940226003029A,Common Agricultural Policy,220687,
309,19940226003029A,FUEL,5427266,


In [23]:
# Partial matches found for the selected record.
# NOTE(review): `matches_df` is only assigned by the find_matches cells further down, so on a
# fresh Restart & Run All this cell raises NameError; it has to run after those cells.
matches_df[matches_df.gs_id==id]

Unnamed: 0,gs_id,gs_entity,gs_qid,data_entity,data_qid,data_description
201,19940226003029A,STRIP,184590,Strip,4050244,album by Adam Ant
202,19940226003029A,FUEL,1875633,FUEL,5427266,


### Check the tool output against the gold standard data 

In [7]:
print("\nBefore filtering out rows where 'qid' is '-1'")
print(f"Size tool answer {len(data)}, size GS {len(gs)}")

# Keep only the gold standard rows whose 'qid' differs from the integer -1
# (presumably the marker for entities without a linked item - TODO confirm)
gs_f = gs[gs['qid'] != -1]

print("\nAfter filtering out rows where 'qid' is '-1'")
print(f"Size tool answer {len(data)}, size GS {len(gs_f)}")


Before filtering out rows where 'qid' is '-1'
Size tool answer 602, size GS 498

After filtering out rows where 'qid' is '-1'
Size tool answer 602, size GS 443


### Find partial matches 


Find partial matches between the `entity` values of the `gs` and `data` DataFrames: a gold standard (`gs`) entity matches a tool (`data`) entity with the same `id` when it is contained in it, ignoring case.


In [14]:
def find_matches(data, gs):
    """Pair gold standard entities with the tool entities that contain them.

    For every row of ``gs``, the rows of ``data`` with the same ``id`` are
    scanned, and a match is recorded whenever the gold standard entity is a
    case-insensitive substring of the tool entity.

    Parameters
    ----------
    data : pandas.DataFrame
        Tool output; the columns ``id``, ``entity``, ``qid`` and
        ``description`` are read.
    gs : pandas.DataFrame
        Gold standard; the columns ``id``, ``entity`` and ``qid`` are read.

    Returns
    -------
    pandas.DataFrame
        One row per match with the columns ``gs_id``, ``gs_entity``,
        ``gs_qid``, ``data_entity``, ``data_qid`` (as int) and
        ``data_description``. The columns are present even when nothing
        matches, so column lookups on the result keep working.

    Notes
    -----
    Tool rows without an entity string or without a qid are skipped, as are
    gold standard rows whose entity is missing or blank (a blank string is a
    substring of everything and would match every tool entity).
    """
    columns = ['gs_id', 'gs_entity', 'gs_qid', 'data_entity', 'data_qid', 'data_description']

    # Index the usable tool rows by id in one pass instead of re-filtering `data` for every gs row
    candidates_by_id = {}
    for _, data_row in data.iterrows():
        entity = data_row['entity']
        # Records without any linked entity carry missing values here; they cannot be matched
        if not isinstance(entity, str) or pd.isna(data_row['qid']):
            continue
        candidates_by_id.setdefault(data_row['id'], []).append((entity.lower(), data_row))

    matches = []  # one dictionary per (gs row, tool row) match
    for _, gs_row in gs.iterrows():
        gs_entity = gs_row['entity']
        if not isinstance(gs_entity, str) or not gs_entity.strip():
            continue
        needle = gs_entity.lower()

        # Check if the gs entity is a substring of any tool entity of the same record
        for haystack, data_row in candidates_by_id.get(gs_row['id'], []):
            if needle in haystack:
                matches.append({
                    'gs_id': gs_row['id'],
                    'gs_entity': gs_entity,
                    'gs_qid': gs_row['qid'],  # qid from gs
                    'data_entity': data_row['entity'],
                    'data_qid': int(data_row['qid']),  # qid from data
                    'data_description': data_row['description']
                })

    # Passing the columns explicitly keeps the schema stable for an empty result
    return pd.DataFrame(matches, columns=columns)

In [15]:
# Partial matches against the complete gold standard
matches_df = find_matches(data, gs)
# Build the path with os.path.join so the trailing slash of output_dir does not produce '//'
matches_df.to_csv(os.path.join(output_dir, f"{nlp_task}__{tool_name}_matches_all.csv"), index=False)

In [16]:
# Display the partial matches held in `matches_df`
matches_df

Unnamed: 0,gs_id,gs_entity,gs_qid,data_entity,data_qid,data_description
0,19800217031649I,TAKEOFF,854248,Takeoff,48719890,"American rapper, member of the hip hop group M..."
1,19800217031649I,ENGINE,44167,Engine,18353587,
2,19800217031649I,WING,161358,WING,7950776,"commercial AM radio station in Dayton, Ohio"
3,19790720021329A,HELICOPTER,34486,HElicopTEr,5629507,album by Download
4,19790720021329A,TREES,10884,Trees,7837713,poem by Joyce Kilmer
...,...,...,...,...,...,...
217,19960418007829A,DEPARTURE,21171241,Departure,1922930,album by Journey
218,19870523018729A,PILOT ERROR,3057459,pilot error,3057459,"decision, action or inaction by a pilot of an ..."
219,20030620012809I,PILOT,2095549,PILOT,2044212,historic programming language
220,20030620012809I,OIL,42962,Oil,7081283,episode of The Young Ones


In [20]:
# Recompute the partial matches against the filtered gold standard `gs_f`.
# NOTE(review): this rebinds `matches_df`, replacing the result computed from the unfiltered `gs`.
matches_df = find_matches(data, gs_f)
matches_df

Unnamed: 0,gs_id,gs_entity,gs_qid,data_entity,data_qid,data_description
0,19800217031649I,TAKEOFF,854248,Takeoff,48719890,"American rapper, member of the hip hop group M..."
1,19800217031649I,ENGINE,44167,Engine,18353587,
2,19800217031649I,WING,161358,WING,7950776,"commercial AM radio station in Dayton, Ohio"
3,19790720021329A,HELICOPTER,34486,HElicopTEr,5629507,album by Download
4,19790720021329A,TREES,10884,Trees,7837713,poem by Joyce Kilmer
...,...,...,...,...,...,...
217,19960418007829A,DEPARTURE,21171241,Departure,1922930,album by Journey
218,19870523018729A,PILOT ERROR,3057459,pilot error,3057459,"decision, action or inaction by a pilot of an ..."
219,20030620012809I,PILOT,2095549,PILOT,2044212,historic programming language
220,20030620012809I,OIL,42962,Oil,7081283,episode of The Young Ones


In [18]:
# A partial match is a true positive when the tool qid equals the gold standard qid
TP = int(matches_df['gs_qid'].eq(matches_df['data_qid']).sum())

# Report the number of true positives
print(f"True Positives: {TP}")

True Positives: 28


In [19]:
import os
import requests
import json
import pandas as pd

def call_semantic_similarity(input_file, url):
    """Upload a file to a similarity service and return its answer as a DataFrame.

    Parameters
    ----------
    input_file : str
        Path of the file to upload. It is sent as the multipart field
        ``file`` under its base name.
    url : str
        Endpoint that receives the POST request (with the query parameter
        ``similarity_types=all``).

    Returns
    -------
    pandas.DataFrame
        Frame built from the decoded response body.

    Raises
    ------
    requests.HTTPError
        If the service answers with an HTTP error status.
    """
    file_name = os.path.basename(input_file)
    # The context manager closes the file handle once the upload has finished
    with open(input_file, mode='rb') as upload:
        files = {
            'file': (file_name, upload, 'application/octet-stream')
        }
        resp = requests.post(url, files=files, params={'similarity_types': 'all'})
    # Surface HTTP errors instead of trying to decode an error page
    resp.raise_for_status()
    # Decoded twice: presumably the service returns a JSON string that itself contains JSON
    s = json.loads(resp.json())
    return pd.DataFrame(s)

def _normalize_qid(qid):
    """Return ``qid`` as 'Q<digits>', or None when it is not a usable item identifier."""
    # Integer columns with missing values are read back as floats such as 193538.0
    if isinstance(qid, float) and qid.is_integer():
        qid = int(qid)
    text = str(qid).strip().upper()
    digits = text[1:] if text.startswith('Q') else text
    if digits.isascii() and digits.isdigit():
        return f"Q{digits}"
    return None


def get_wikidata_description(qid, timeout=30):
    """Fetch the English description of a Wikidata item.

    Parameters
    ----------
    qid : str or int
        Item identifier, with or without the ``Q`` prefix (``'Q42'``,
        ``'42'`` and ``42`` are equivalent).
    timeout : float, optional
        Seconds to wait for the query service before giving up.

    Returns
    -------
    str
        The description text; ``"No description found."`` when the item has
        no English description or ``qid`` is not a valid identifier (no
        request is sent in that case); ``"Failed to fetch data."`` when the
        request fails or the service does not answer with status 200.
    """
    # Only a validated identifier is interpolated into the query below
    item_id = _normalize_qid(qid)
    if item_id is None:
        return "No description found."

    # Endpoint URL for the Wikidata Query Service
    endpoint_url = "https://query.wikidata.org/sparql"

    # SPARQL query to get the description of an item by its QID
    query = f"""
    SELECT ?itemDescription WHERE {{
        wd:{item_id} schema:description ?itemDescription.
        FILTER(LANG(?itemDescription) = "en")
    }}
    """

    # The headers to indicate that the response should be in JSON format
    headers = {
        "Accept": "application/sparql-results+json"
    }

    # Make the request to the Wikidata Query Service; connection problems and
    # timeouts are reported with the same sentinel as a non-200 answer
    try:
        response = requests.get(endpoint_url, headers=headers, params={'query': query}, timeout=timeout)
    except requests.RequestException:
        return "Failed to fetch data."

    if response.status_code != 200:
        return "Failed to fetch data."

    # Parse the JSON response
    payload = response.json()
    descriptions = payload.get("results", {}).get("bindings", [])
    if descriptions:
        # Return the description text
        return descriptions[0]["itemDescription"]["value"]
    return "No description found."

# Example usage
# Replace 'QID' with the actual QID you want to query, for example, 'Q42' for Douglas Adams
# print(get_wikidata_description('QID'))

In [19]:

# Endpoint of the similarity service
SIM_API = 'https://kgtk.isi.edu/similarity_api'

# Send test_file.csv to the service and collect its answer as a DataFrame
df = call_semantic_similarity('test_file.csv', SIM_API)

# Look up a description for every identifier in column 'q2'
df['q2_description'] = df['q2'].map(get_wikidata_description)

#df.to_csv('test_file_similarity.tsv', index=False, sep='\t')
df.loc[:, ['q1', 'q2', 'q1_label', 'q2_label', 'class',  'jc', 'q2_description']]

Unnamed: 0,q1,q2,q1_label,q2_label,class,jc,q2_description
0,Q1875633,Q1875633,aviation fuel,aviation fuel,1.0,1.0,propellents used to power aircraft or aviation...
1,Q1875633,Q42501,aviation fuel,combustible matter,0.684539,0.885428,any material that stores energy that can later...
2,Q1875633,Q15766923,aviation fuel,Fuel,0.029833,0.062413,scientific journal
3,Q1875633,Q5507117,aviation fuel,Fuel,0.0,0.0,short-lived Bay Area post-hardcore musical act
4,Q1875633,Q35120,aviation fuel,entity,0.003065,0.042554,"anything that can be considered, discussed, or..."
