## Tool - BLINK
---

In [1]:
import os, ast
import pandas as pd

nlp_task = 'nel'
tool_name = 'blink'
tool_result_path = '../../data/results/blink/blink_results.csv'
gs_samples_path = '../../gold_standard/processed/samples.csv'
output_dir = '../../tool_results/'

### Load and standardize the data


In [2]:
# load output from the tool to be evaluated
data = pd.read_csv(tool_result_path)
data

Unnamed: 0,doc_idx,sent_idx,original_sentence,input,mention,bi_pred_entity,bi_qid,bi_desc,cross_pred_entity,cross_qid,cross_desc
0,0,0,TAILWHEEL COCKED RIGHT PRIOR TO TKOF. ...,tailwheel cocked right prior to tkof.,tailwheel,Flywheel,Q183576,A flywheel is a mechanical device specificall...,Conventional landing gear,Q2874355,"Conventional landing gear, or tailwheel-type ..."
1,2,1,"2ND ILS APCH,ACFT'S G/S INOP.LOM TUNED TO WRON...",lom tuned to wrong freq.,lom,"Lom, Norway",Q488395,"Lom is a municipality in Oppland county, Norw...",Herbert Lom,Q165357,Herbert Lom (; 11 September 1917 – 27 Septemb...
2,7,0,MTNS OBSCURED.FLT TO CK VOR REC REPTD INOP PRI...,mtns obscured.,mtns,KCNS,Q6327147,"KCNS, virtual channel 38 (UHF digital channel...",MTN (TV station),Q6718480,MTN is a television station licensed to serve...
3,10,1,LEFT ENG OIL SUPPLY EXHAUSTED.GEAR-UP LDG IN M...,gear-up ldg in mesquite brush.,mesquite,Mesquite,Q3315767,Mesquite is a common name for several plants ...,Mesquite,Q3315767,Mesquite is a common name for several plants ...
4,22,0,APRX 1/2 CUPFULL FLUID UNDER R BRAKE PRIOR TO ...,aprx 1/2 cupfull fluid under r brake prior to ...,aprx,Audi Performance and Racing,Q4819705,APR LLC is an American automotive engineering...,Elementis,Q5358919,Elementis plc is one of the UK's largest spec...
...,...,...,...,...,...,...,...,...,...,...,...
1159,2742,0,"ON 7/22/08 AT 1249 MST, CESSNA T182T, N562GK, ...","on 7/22/08 at 1249 mst, cessna t182t, n562gk, ...",mst,South African Standard Time,Q770321,South African Standard Time (SAST) is the tim...,Marine Science Technician,Q16203277,Marine Science Technician (MST) is an enliste...
1160,2743,0,(-23) A/C RELOCATED TO NEW HANGAR TO CHECK SIZ...,(-23) a/c relocated to new hangar to check siz...,new hangar,American Airways Hangar and Administration Bui...,Q4742968,The American Airways Hangar and Administratio...,Goodyear Airdock,Q3110737,The Goodyear Airdock is a construction and st...
1161,2744,0,(-23) ON 2/23/08 @ APPROXIMATELY 2130 DURING T...,(-23) on 2/23/08 @ approximately 2130 during t...,airc,Air Force Reserve Command,Q407191,The Air Force Reserve Command (AFRC) is a Maj...,Air traffic control,Q221395,Air traffic control (ATC) is a service provid...
1162,2745,0,(-23) PILOT TOOK OFF FOR LEESBURG AIRPORT AND ...,(-23) pilot took off for leesburg airport and ...,leesburg,"Leesburg, Virginia",Q1012089,Leesburg is the county seat of Loudoun County...,"Leesburg, Florida",Q625833,"Leesburg is a city in Lake County, Florida, U..."


In [3]:

# rename columns
data.rename(columns={"c5": "id", "c119": "sample", "c119_entity_linking":"entity_linking"}, inplace=True)

# filter data where 'id' values are in gs_samples 'id' values
gs_samples = pd.read_csv(gs_samples_path)
mask = data['id'].isin(gs_samples['id'])
data = data[mask].copy()

# convert the string representation of a list of dictionaries to actual list of dictionaries
data['entity_linking'] = data['entity_linking'].apply(ast.literal_eval)

# explode the entity_linking column
data = data.explode('entity_linking').reset_index(drop=True)

# Convert the dictionary column to a DataFrame
attributes_df = data['entity_linking'].apply(pd.Series)

# change the column type to object
attributes_df['indentifier'] = attributes_df['indentifier'].astype('Int64')
attributes_df

#attributes_df.drop(columns=[0], inplace=True)

# Concatenate with the original DataFrame (minus the dictionary column)
data = pd.concat([data.drop('entity_linking', axis=1), attributes_df], axis=1)

# rename columns
data.rename(columns={"label": 'entity', "indentifier": "qid"}, inplace=True)

# save the processed tool output to a file
output_path = os.path.join(output_dir, f"{nlp_task}__{tool_name}.csv")
data.to_csv(output_path, index=False)

data

KeyError: 'id'

## Gold standard NEL data

### Load the processed gold standard data

In [3]:
# path to the gold standard file
task_gold_standard_path = f"../../gold_standard/processed/{nlp_task}.csv"

# load output from the tool to be evaluated
gs = pd.read_csv(task_gold_standard_path)

gs

Unnamed: 0,id,sample,entity,qid
0,19990213001379A,ACFT WAS TAXIING FOR TAKE OFF WHEN IT LOST CON...,ACFT,11436
1,19990213001379A,ACFT WAS TAXIING FOR TAKE OFF WHEN IT LOST CON...,DITCH,2048319
2,19990213001379A,ACFT WAS TAXIING FOR TAKE OFF WHEN IT LOST CON...,TREE,10884
3,19990213001379A,ACFT WAS TAXIING FOR TAKE OFF WHEN IT LOST CON...,LOST CONTROL,29017603
4,19800217031649I,"AFTER TAKEOFF, ENGINE QUIT. WING FUEL TANK SUM...",TAKEOFF,854248
...,...,...,...,...
496,19870523018729A,CANOPY CAME OPEN ON CLIMBOUT CAUSING AN UNCONT...,PILOT ERROR,3057459
497,20030620012809I,(-23) PILOT FAILED TO ASSURE THE OIL FILLER CA...,PILOT,2095549
498,20030620012809I,(-23) PILOT FAILED TO ASSURE THE OIL FILLER CA...,OIL FILLER CAP,-1
499,20030620012809I,(-23) PILOT FAILED TO ASSURE THE OIL FILLER CA...,OIL,42962


In [None]:
id = '19990213001379A'
print(gs['sample'][gs.id==id].unique())
print(data['sample'][data.id==id].unique())



['ACFT WAS TAXIING FOR TAKE OFF WHEN IT LOST CONTROL, RAN INTO A DITCH, AND STRUCK A TREE. OTHER CIRCUMSTANCES AE UNK']
['ACFT WAS TAXIING FOR TAKE OFF WHEN IT LOST CONTROL, RAN INTO A DITCH, AND STRUCK A TREE. OTHER CIRCUMSTANCES AE UNK']


In [None]:
cols = ['id','entity','qid']
gs[cols][gs.id==id]

Unnamed: 0,id,entity,qid
0,19990213001379A,ACFT,11436
1,19990213001379A,DITCH,2048319
2,19990213001379A,TREE,10884
3,19990213001379A,LOST CONTROL,29017603


In [None]:
cols = ['id','entity','qid', 'description']
data[cols][data.id==id]

Unnamed: 0,id,entity,qid,description
14221,19990213001379A,United States Army Combat Fitness Test,67935434,Physical fitness test for the United States Army
14222,19990213001379A,The Take,3989575,television series
14223,19990213001379A,OpenType Font,260180,file format
14224,19990213001379A,Control,54935655,2019 action-adventure video game developed by ...
14225,19990213001379A,Royal Australian Navy,741691,naval warfare branch of the Australian Defence...
14226,19990213001379A,Charlotte Brontë,127332,English novelist and poet
14227,19990213001379A,Adobe After Effects,83380,digital motion graphics and compositing software


### Find partial matches 


Find partial matches between the `entity` values in your `gs` and `data` DataFrames, where the `entity` from `gs` is contained within the `entity` in `data` for the same `id`.


In [None]:
# Filter out rows where 'qid' is '-1'
gs= gs[gs['qid'] != -1]
gs.shape

(396, 4)

In [None]:
# Initialize an empty list to store matching records
matches = []

# Iterate through each row in the gs dataframe
for _, gs_row in gs.iterrows():
    # Filter the data dataframe for rows with the same id
    data_filtered = data[data['id'] == gs_row['id']]
    
    # Check if the gs entity is a substring of any entity in the filtered data rows
    for _, data_row in data_filtered.iterrows():
        if gs_row['entity'].lower() in data_row['entity'].lower():
            # Add the match to the matches list with qid values from both gs and data
            match = {
                'gs_id': gs_row['id'],
                'gs_entity': gs_row['entity'],
                'gs_qid': gs_row['qid'],  # qid from gs
                'data_entity': data_row['entity'],
                'data_qid': int(data_row['qid']),  # qid from data
                'data_description': data_row['description']
            }
            matches.append(match)

# Convert the matches list to a DataFrame
matches_df = pd.DataFrame(matches)

matches_df

Unnamed: 0,gs_id,gs_entity,gs_qid,data_entity,data_qid,data_description
0,19800217031649I,TAKEOFF,854248,Takeoff,48719890,"American rapper, member of the hip hop group M..."
1,19800217031649I,ENGINE,44167,Engine,18353587,
2,19800217031649I,WING,161358,WING,7950776,"commercial AM radio station in Dayton, Ohio"
3,19790720021329A,HELICOPTER,34486,HElicopTEr,5629507,album by Download
4,19790720021329A,TREES,10884,Trees,7837713,poem by Joyce Kilmer
...,...,...,...,...,...,...
204,19960418007829A,DEPARTURE,21171241,Departure,1922930,album by Journey
205,19870523018729A,PILOT ERROR,3057459,pilot error,3057459,"decision, action or inaction by a pilot of an ..."
206,20030620012809I,PILOT,2095549,PILOT,2044212,historic programming language
207,20030620012809I,OIL,42962,Oil,7081283,episode of The Young Ones


In [None]:
# Count True Positives where gs_qid matches data_qid
TP = sum(matches_df['gs_qid'] == matches_df['data_qid'])

# Output the count of True Positives
print(f"True Positives: {TP}")

True Positives: 28
