### Reformat ReFinED

This notebook take the output from refined_faa.py and processes it into a form easily acceptable to evaluation scripts. The output is stored in data/results/refined

In [2]:
import pandas as pd
import re

In [3]:
result_df = pd.read_csv('FAA_DataModel_20240610160902.csv')
result_df.head()

Unnamed: 0,c5,c119,c119_entity_linking
0,19750315005389A,TAILWHEEL COCKED RIGHT PRIOR TO TKOF. ...,"[['TAILWHEEL COCKED', Entity not linked to a k..."
1,19750419011349A,TOW PLANE BECAME AIRBORNE THEN SETTLED.STUDENT...,"[['PLANE BECAME', Entity not linked to a knowl..."
2,19751029037799A,"2ND ILS APCH,ACFT'S G/S INOP.LOM TUNED TO WRON...","[['2ND', None, 'ORDINAL'], ['ILS APCH', Entity..."
3,19751209037899A,PLT NOTED SOFT R BRAKE PEDAL DRG TAXI TO TKOF....,"[['KE', Entity not linked to a knowledge base,..."
4,19750818025579A,TAXI OFF HARD SFC DUE TFC R MAIN GR BROKE THRO...,"[['TAXI', Entity(wikidata_entity_id=Q82650, wi..."


In [4]:
result_df['c119_entity_linking'].iat[0] # example

"[['TAILWHEEL COCKED', Entity not linked to a knowledge base, None], ['TKOF', Entity(wikidata_entity_id=Q7690028, wikipedia_entity_title=Taylor knock-out factor), None]]"

**Extract entities and links from c119_entity_linking**

In [6]:
out_dict = {'c5_id':[],'c119_input':[],'c119_entity_linking':[], 'mentions':[],'labels':[],'entities':[],'ids':[]}
values_p = re.compile("\[?\['([^']+)', (Entity not linked to a knowledge base|Entity\([^\)]+\)), (None|[A-Z]+)\],? ?(.*)") # returns groups ent, linked_ent, label, rest
id_title_p = re.compile('Entity\(wikidata_entity_id=(Q[0-9]+), wikipedia_entity_title=([^\)]+)\)') # returns Qid, Wikipedia title

for i in range(len(result_df)):
    
    text = result_df['c119_entity_linking'].iat[i]
    while text:
        
        mo = re.match(values_p, text)
    
        if mo:
            ent, linked_ent, label, text = mo.groups()
    
            # Put empty values where there is no data
            # Extract QID and title from linked_ent
            if linked_ent == "Entity not linked to a knowledge base":
                id = ""
                title = ""
            else:
                id, title = re.match(id_title_p, linked_ent).groups()
            if label == "None":
                label = ""
    
            out_dict['c5_id'].append(result_df['c5'].iat[i])
            out_dict['c119_input'].append(result_df['c119'].iat[i])
            out_dict['c119_entity_linking'].append(result_df['c119_entity_linking'].iat[i])
            out_dict['mentions'].append(ent)
            out_dict['labels'].append(label)
            out_dict['entities'].append(title)
            out_dict['ids'].append(id)
    
        else:
            text = None

In [8]:
out_df = pd.DataFrame(out_dict)
out_df.head()

Unnamed: 0,c5_id,c119_input,c119_entity_linking,mentions,labels,entities,ids
0,19750315005389A,TAILWHEEL COCKED RIGHT PRIOR TO TKOF. ...,"[['TAILWHEEL COCKED', Entity not linked to a k...",TAILWHEEL COCKED,,,
1,19750315005389A,TAILWHEEL COCKED RIGHT PRIOR TO TKOF. ...,"[['TAILWHEEL COCKED', Entity not linked to a k...",TKOF,,Taylor knock-out factor,Q7690028
2,19750419011349A,TOW PLANE BECAME AIRBORNE THEN SETTLED.STUDENT...,"[['PLANE BECAME', Entity not linked to a knowl...",PLANE BECAME,,,
3,19750419011349A,TOW PLANE BECAME AIRBORNE THEN SETTLED.STUDENT...,"[['PLANE BECAME', Entity not linked to a knowl...",ORNE,,,
4,19751209037899A,PLT NOTED SOFT R BRAKE PEDAL DRG TAXI TO TKOF....,"[['KE', Entity not linked to a knowledge base,...",KE,,,


**Save to output DataFrame**

In [33]:
out_df.to_csv('../../tool_results/refined/refined.csv')