## Tool - spaCy EntityLinker
---

In [1]:
import os, ast
import pandas as pd

# Task and tool identifiers; together they name the standardized output file.
nlp_task = "nel"
tool_name = "spacy_entity_linker"

# Inputs: raw tool output and the list of gold-standard samples.
tool_result_path = "../../data/results/spacy_entity_linker/FAA_DataModel_20240104104402.csv"
gs_samples_path = "../../gold_standard/processed/samples.csv"

# Directory that receives the standardized tool results.
output_dir = "../../tool_results/"

### Load and standardize the output data


In [2]:
# Load the raw output of the tool to be evaluated and map its generic column
# names onto the names used throughout this notebook.
raw = pd.read_csv(tool_result_path).rename(
    columns={"c5": "id", "c119": "sample", "c119_entity_linking": "entity_linking"}
)

# Keep only the samples whose 'id' appears in the gold-standard sample list.
gs_samples = pd.read_csv(gs_samples_path)
data = raw[raw['id'].isin(gs_samples['id'])].copy()

# Each cell holds the string representation of a list of dictionaries; parse
# it into real Python objects. A missing cell (NaN) is treated as "no entities"
# instead of making ast.literal_eval raise.
data['entity_linking'] = data['entity_linking'].apply(
    lambda cell: ast.literal_eval(cell) if isinstance(cell, str) else []
)

# One row per linked entity. Samples with an empty list keep a single row
# whose 'entity_linking' value is NaN.
data = data.explode('entity_linking').reset_index(drop=True)

# Expand the dictionaries into columns in one pass. Non-dict entries (the NaN
# rows produced by empty lists) become all-missing rows rather than creating a
# stray column named 0, which is what `.apply(pd.Series)` would do.
attributes_df = pd.DataFrame(
    [entry if isinstance(entry, dict) else {} for entry in data['entity_linking']],
    index=data.index,
)

# Store the identifier as a nullable integer so missing values do not turn the
# column into floats. The key is spelled 'indentifier' (sic) in the tool output.
attributes_df['indentifier'] = attributes_df['indentifier'].astype('Int64')

# Replace the dictionary column by its expanded attributes and switch to the
# gold-standard column names.
data = pd.concat(
    [data.drop(columns='entity_linking'), attributes_df], axis=1
).rename(columns={"label": 'entity', "indentifier": "qid"})

# Save the processed tool output; create the target directory if necessary so
# a fresh checkout does not fail on the write.
os.makedirs(output_dir, exist_ok=True)
output_path = os.path.join(output_dir, f"{nlp_task}__{tool_name}.csv")
data.to_csv(output_path, index=False)

data

Unnamed: 0,id,sample,qid,entity,description
0,19760606015529A,SUFFICIENT OPPORTUNITY EXISTED TO RELEASE WHEN...,193538,Opportunity,NASA Mars rover
1,19760606015529A,SUFFICIENT OPPORTUNITY EXISTED TO RELEASE WHEN...,3785514,High,song by Lighthouse Family
2,19780111000459A,ACFT DISPATCHER HARRASSMENT OF PILOT. PILOT FO...,67935434,United States Army Combat Fitness Test,Physical fitness test for the United States Army
3,19780111000459A,ACFT DISPATCHER HARRASSMENT OF PILOT. PILOT FO...,2044212,PILOT,historic programming language
4,19780402008409I,TAXIING AIRCRAFT STRUCK PARKED AIRCRAFT. FOUND...,4698564,Aircraft,"railway station in Laverton, Melbourne, Victor..."
...,...,...,...,...,...
597,20070630826079I,(-23) AIRCRAFT DEPARTED RAY AIRPORT AND AFTER ...,4698564,Aircraft,"railway station in Laverton, Melbourne, Victor..."
598,20070630826079I,(-23) AIRCRAFT DEPARTED RAY AIRPORT AND AFTER ...,975490,Into,album
599,20070630826079I,(-23) AIRCRAFT DEPARTED RAY AIRPORT AND AFTER ...,4126730,The Climb,2002 film by John Schmidt
600,20070630826079I,(-23) AIRCRAFT DEPARTED RAY AIRPORT AND AFTER ...,743004,aircraft engine,engine designed for use in powered aircraft


## Gold standard NEL data

### Load the processed gold standard data

In [3]:
# path to the processed gold standard file for the current NLP task
task_gold_standard_path = f"../../gold_standard/processed/{nlp_task}.csv"

# load the gold standard annotations that the tool output is compared against
gs = pd.read_csv(task_gold_standard_path)

In [4]:
# Spot-check one sample: print its text as stored in the gold standard and in
# the tool output.
# NOTE(review): `id` shadows the Python built-in; the name is kept because the
# following cells filter on it.
id = '19990213001379A'
for frame in (gs, data):
    print(frame.loc[frame['id'] == id, 'sample'].unique())



['ACFT WAS TAXIING FOR TAKE OFF WHEN IT LOST CONTROL, RAN INTO A DITCH, AND STRUCK A TREE. OTHER CIRCUMSTANCES AE UNK']
['ACFT WAS TAXIING FOR TAKE OFF WHEN IT LOST CONTROL, RAN INTO A DITCH, AND STRUCK A TREE. OTHER CIRCUMSTANCES AE UNK']


In [5]:
# Gold-standard entities and QIDs recorded for the inspected sample.
cols = ['id', 'entity', 'qid']
gs.loc[gs['id'] == id, cols]

Unnamed: 0,id,entity,qid
0,19990213001379A,ACFT,11436
1,19990213001379A,DITCH,2048319
2,19990213001379A,TREE,10884
3,19990213001379A,LOST CONTROL,29017603


In [6]:
# Entities, QIDs and descriptions the tool produced for the same sample.
cols = ['id', 'entity', 'qid', 'description']
data.loc[data['id'] == id, cols]

Unnamed: 0,id,entity,qid,description
473,19990213001379A,United States Army Combat Fitness Test,67935434,Physical fitness test for the United States Army
474,19990213001379A,The Take,3989575,television series
475,19990213001379A,OpenType Font,260180,file format
476,19990213001379A,Control,54935655,2019 action-adventure video game developed by ...
477,19990213001379A,Royal Australian Navy,741691,naval warfare branch of the Australian Defence...
478,19990213001379A,Charlotte Brontë,127332,English novelist and poet
479,19990213001379A,Adobe After Effects,83380,digital motion graphics and compositing software


### Find partial matches 


Find partial matches between the `entity` values of the `gs` and `data` DataFrames: a pair is a match when, for the same `id`, the `entity` from `gs` is contained (case-insensitively) in the `entity` from `data`.


In [7]:
# Keep only gold-standard rows whose 'qid' differs from the integer -1
# (presumably the marker for "no QID assigned" - confirm against the
# gold-standard preparation).
gs = gs.loc[gs['qid'].ne(-1)]
gs.shape

(396, 4)

In [8]:
# Collect every (gold-standard entity, tool entity) pair of the same sample in
# which the gold-standard entity text is contained, case-insensitively, in the
# entity label returned by the tool.

# Fixed column layout so that `matches_df` has the expected columns even when
# no pair matches (an empty list alone would give a column-less frame).
match_columns = ['gs_id', 'gs_entity', 'gs_qid',
                 'data_entity', 'data_qid', 'data_description']

# Tool rows without an entity label or QID cannot take part in a match; drop
# them up front so `.lower()` and `int()` never see missing values.
candidates = data.dropna(subset=['entity', 'qid'])

# Group the tool rows by sample id once instead of re-filtering `data` for
# every gold-standard row. Row order inside each group is preserved.
candidates_by_id = {
    sample_id: group[['entity', 'qid', 'description']]
    for sample_id, group in candidates.groupby('id', sort=False)
}

matches = []
for gs_id, gs_entity, gs_qid in gs[['id', 'entity', 'qid']].itertuples(index=False, name=None):
    group = candidates_by_id.get(gs_id)
    # Skip gold rows without usable text or without any tool output.
    if group is None or not isinstance(gs_entity, str):
        continue

    needle = gs_entity.lower()
    for data_entity, data_qid, data_description in group.itertuples(index=False, name=None):
        if isinstance(data_entity, str) and needle in data_entity.lower():
            # Record the pair together with the QIDs from both sources.
            matches.append({
                'gs_id': gs_id,
                'gs_entity': gs_entity,
                'gs_qid': gs_qid,  # qid from gs
                'data_entity': data_entity,
                'data_qid': int(data_qid),  # qid from data
                'data_description': data_description,
            })

# Convert the matches list to a DataFrame
matches_df = pd.DataFrame(matches, columns=match_columns)

matches_df

Unnamed: 0,gs_id,gs_entity,gs_qid,data_entity,data_qid,data_description
0,19800217031649I,TAKEOFF,854248,Takeoff,48719890,"American rapper, member of the hip hop group M..."
1,19800217031649I,ENGINE,44167,Engine,18353587,
2,19800217031649I,WING,161358,WING,7950776,"commercial AM radio station in Dayton, Ohio"
3,19790720021329A,HELICOPTER,34486,HElicopTEr,5629507,album by Download
4,19790720021329A,TREES,10884,Trees,7837713,poem by Joyce Kilmer
...,...,...,...,...,...,...
204,19960418007829A,DEPARTURE,21171241,Departure,1922930,album by Journey
205,19870523018729A,PILOT ERROR,3057459,pilot error,3057459,"decision, action or inaction by a pilot of an ..."
206,20030620012809I,PILOT,2095549,PILOT,2044212,historic programming language
207,20030620012809I,OIL,42962,Oil,7081283,episode of The Young Ones


In [9]:
# A true positive is a matched pair whose gold-standard QID equals the QID
# returned by the tool.
# NOTE(review): every row of `matches_df` is one (gs entity, tool entity)
# pair, so a gold entity paired with several tool rows carrying the same QID
# is counted once per pair - confirm this is the intended definition.
qid_agrees = matches_df['gs_qid'] == matches_df['data_qid']
TP = qid_agrees.sum()

# Report the number of true positives
print(f"True Positives: {TP}")

True Positives: 28
