# Prepare and standardize the gold standard data

From ``../gold_standard/raw/*.csv`` to ``../gold_standard/processed/*.csv``

## Gold standard NER data

Untyped NER gold standard (GS):

In [37]:
import pandas as pd
import ast

# Load the raw, untyped NER gold standard.
gold_standard_path = 'raw/ner.csv'
data = pd.read_csv(gold_standard_path)


# Map the raw column names onto the standardized schema.
columns = {"c5_unique_id": "id", "c119_text": "sample", "GS": "entities"}
data = data.rename(columns=columns)

# .copy() so the column assignment below writes to an independent frame
# rather than to a slice of the renamed one.
data = data[['id', 'sample', 'entities']].copy()

# Each 'entities' cell holds a stringified Python list. Parse it; any
# non-string cell (e.g. NaN from an empty CSV field) becomes an empty list.
# Then explode to one row per (sample, entity) pair.
data['entities'] = data['entities'].apply(
    lambda x: ast.literal_eval(x) if isinstance(x, str) else []
)
data = data.explode('entities').reset_index(drop=True)

# check for errors -- entities which don't match literal text mentions.
# Exploding an empty list yields NaN (a float), so only string entities are
# tested; `NaN not in sample` would otherwise raise a TypeError.
for sample, entity in zip(data['sample'], data['entities']):
    if isinstance(entity, str) and entity not in sample:
        print(sample, entity)

# Save the processed tool output to a file
data.to_csv('processed/ner.csv', index=False)
data

Unnamed: 0,id,sample,entities
0,19990213001379A,ACFT WAS TAXIING FOR TAKE OFF WHEN IT LOST CON...,ACFT
1,19800217031649I,"AFTER TAKEOFF, ENGINE QUIT. WING FUEL TANK SUM...",TAKEOFF
2,19800217031649I,"AFTER TAKEOFF, ENGINE QUIT. WING FUEL TANK SUM...",ENGINE
3,19800217031649I,"AFTER TAKEOFF, ENGINE QUIT. WING FUEL TANK SUM...",WING
4,19800217031649I,"AFTER TAKEOFF, ENGINE QUIT. WING FUEL TANK SUM...",FUEL TANK
...,...,...,...
504,20030620012809I,(-23) PILOT FAILED TO ASSURE THE OIL FILLER CA...,PILOT
505,20030620012809I,(-23) PILOT FAILED TO ASSURE THE OIL FILLER CA...,OIL FILLER CAP
506,20030620012809I,(-23) PILOT FAILED TO ASSURE THE OIL FILLER CA...,OIL
507,20030620012809I,(-23) PILOT FAILED TO ASSURE THE OIL FILLER CA...,OIL


### Benchmark-Annotated Gold Standard NER Data

In [38]:
import pandas as pd
import ast

gold_standard_path = 'raw/ner_benchmarks_gold.csv'
# NOTE(review): the first 8 lines of the raw file are skipped before the
# header row -- presumably annotation notes; confirm against the raw file.
original_data = pd.read_csv(gold_standard_path, skiprows=8, header=0)

# Produce one processed file per benchmark label scheme.
for bench in ['conll', 'ace', 'on']:

    # Select this benchmark's entity/label columns under standardized names.
    columns = {"c5_unique_id": "id", "c119_text": "sample", bench+"_ents": "entities", bench+"_labels": "labels"}
    data = original_data.rename(columns=columns)

    # .copy() so the column assignments below write to an independent frame.
    data = data[['id', 'sample', 'entities', 'labels']].copy()

    # Get lists of entities and labels from strings; non-string cells
    # (e.g. NaN) become empty lists.
    for col in ('entities', 'labels'):
        data[col] = data[col].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else [])

    # The two columns are exploded separately and re-joined by position, so a
    # row whose lists differ in length would silently shift every later label
    # onto the wrong entity. Fail loudly instead.
    n_entities = data['entities'].map(len)
    n_labels = data['labels'].map(len)
    mismatched_ids = data.loc[n_entities != n_labels, 'id'].tolist()
    if mismatched_ids:
        raise ValueError(f"{bench}: entity/label counts differ for ids {mismatched_ids}")

    # Explode to one row per (sample, entity, label). Series.explode takes no
    # column name; its only parameter is `ignore_index`.
    data = pd.concat(
        [
            data[['id', 'sample', 'entities']].explode('entities').reset_index(drop=True),
            data['labels'].explode().reset_index(drop=True),
        ],
        axis=1,
    )

    # check for errors -- entities which don't match literal text mentions
    for sample, entity in zip(data['sample'], data['entities']):
        if isinstance(entity, str) and entity not in sample:
            print(sample, entity)

    #save
    data.to_csv(f'processed/ner_{bench}.csv', index=False)

In [39]:
# Make ACE-Phase 1 GS with restricted set of labels used by NLTK

gold_standard_path = 'raw/ner_benchmarks_gold.csv'
# NOTE(review): the first 8 lines of the raw file are skipped before the
# header row -- presumably annotation notes; confirm against the raw file.
original_data = pd.read_csv(gold_standard_path, skiprows=8, header=0)

bench = 'ace'

# Select the ACE entity/label columns under standardized names.
columns = {"c5_unique_id": "id", "c119_text": "sample", bench+"_ents": "entities", bench+"_labels": "labels"}
data = original_data.rename(columns=columns)

# .copy() so the column assignments below write to an independent frame.
data = data[['id', 'sample', 'entities', 'labels']].copy()

# Get lists of entities and labels from strings; non-string cells
# (e.g. NaN) become empty lists.
for col in ('entities', 'labels'):
    data[col] = data[col].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else [])

# Entities and labels are paired by position, so their per-row counts must
# match; otherwise both the filter and the explode below would misalign them.
n_entities = data['entities'].map(len)
n_labels = data['labels'].map(len)
mismatched_ids = data.loc[n_entities != n_labels, 'id'].tolist()
if mismatched_ids:
    raise ValueError(f"{bench}: entity/label counts differ for ids {mismatched_ids}")

# Remove "VEHICLE" entities and labels.
# Built as whole new columns rather than element-wise writes through
# data[col].iat[i]: that chained assignment does not update `data` when pandas
# Copy-on-Write is active, which would leave the labels unchanged.
kept_pairs = [
    [(ent, lab) for ent, lab in zip(ents, labs) if lab != "VEHICLE"]
    for ents, labs in zip(data['entities'], data['labels'])
]
data['entities'] = pd.Series([[ent for ent, _ in pairs] for pairs in kept_pairs], index=data.index)
data['labels'] = pd.Series([[lab for _, lab in pairs] for pairs in kept_pairs], index=data.index)

# Explode to one row per (sample, entity, label). Series.explode takes no
# column name; its only parameter is `ignore_index`.
data = pd.concat(
    [
        data[['id', 'sample', 'entities']].explode('entities').reset_index(drop=True),
        data['labels'].explode().reset_index(drop=True),
    ],
    axis=1,
)

# check for errors -- entities which don't match literal text mentions
for sample, entity in zip(data['sample'], data['entities']):
    if isinstance(entity, str) and entity not in sample:
        print(sample, entity)

#save
data.to_csv('processed/ner_ace_nltk.csv', index=False)

## Gold Standard CR data

In [43]:
import pandas as pd

# Load the raw CR gold standard.
gold_standard_path = 'raw/cr.csv'
data = pd.read_csv(gold_standard_path)

# Standardize the column names (the 'coreferences' entry maps the column to itself).
data = data.rename(
    columns={"c5": "id", "c119_text": "sample", 'coreferences': 'coreferences'}
)

# Keep only the standardized columns, write them out, then display them.
cr_processed = data[['id', 'sample', 'coreferences']]
cr_processed.to_csv('../gold_standard/processed/cr.csv', index=False)
cr_processed

Unnamed: 0,id,sample,coreferences
0,19990213001379A,ACFT WAS TAXIING FOR TAKE OFF WHEN IT LOST CON...,"[[[0,0],[7,7]]]"
1,19800217031649I,"AFTER TAKEOFF, ENGINE QUIT. WING FUEL TANK SUM...","[[[6,9],[16,16]]]"
2,19790720021329A,HELICOPTER TOOK OFF WITH SLING LOAD ATTACHED. ...,"[[[4,5],[10]]]"
3,19841214074599I,WHILE TAXIING LOST NOSEWHEEL STEERING AND BRAK...,[]
4,19860128014289I,FORWARD CARGO DOOR OPENED AS AIRCRAFT TOOK OFF...,[]
...,...,...,...
95,19880527016939A,ENGINE QUIT ON INITIAL CLIMBOUT. CRASH LANDED ...,[]
96,19960418007829A,CRASHED AND BURNED. (.4)WITNESSES REPORTED THA...,[]
97,19970828026989A,LOUD POP ON TAKEOFF ROLL. LOST RUDDER CONTROL....,[]
98,19870523018729A,CANOPY CAME OPEN ON CLIMBOUT CAUSING AN UNCONT...,[]


## Gold standard NEL data

In [46]:
import math
import ast
import pandas as pd

# Load the raw NEL gold standard.
gold_standard_path = 'raw/nel.csv'
data = pd.read_csv(gold_standard_path)

# rename columns
data = data.rename(columns={"c5_unique_id": "id", "c119_text": "sample"})

# Column-name prefixes for the three candidate slots, in output order.
prefixes = ['primary_', 'secondary_', 'tertiary_']


def _collect_slots(frame, suffix):
    """Return, per row of `frame`, the list of `<prefix><suffix>` values.

    Values whose type is not exactly `str` (e.g. NaN from an empty CSV field)
    are replaced with None, so every row yields a list of len(prefixes) items.
    """
    slot_columns = [prefix + suffix for prefix in prefixes]
    rows = [
        [value if type(value) == str else None for value in row]
        for row in frame[slot_columns].itertuples(index=False, name=None)
    ]
    return pd.Series(rows, index=frame.index, dtype=object)


# compile primary, secondary, and tertiary entities and qids into parallel lists
data['entity'] = _collect_slots(data, 'ent')
data['qid'] = _collect_slots(data, 'qid')

# check for errors -- entities which don't match literal text mentions
for sample, candidates in zip(data['sample'], data['entity']):
    for ent in candidates:
        if ent is None:
            continue
        if ent not in sample:
            print(sample, ent)

# Save the processed tool output to a file, then display it.
nel_processed = data[['id', 'sample', 'entity', 'qid']]
nel_processed.to_csv('../gold_standard/processed/nel.csv', index=False)
nel_processed

Unnamed: 0,id,sample,entity,qid
0,19990213001379A,ACFT WAS TAXIING FOR TAKE OFF WHEN IT LOST CON...,"[ACFT, None, None]","[Q11436, None, None]"
1,19800217031649I,"AFTER TAKEOFF, ENGINE QUIT. WING FUEL TANK SUM...","[TAKEOFF, None, None]","[Q854248, None, None]"
2,19800217031649I,"AFTER TAKEOFF, ENGINE QUIT. WING FUEL TANK SUM...","[ENGINE, None, None]","[Q743004, None, None]"
3,19800217031649I,"AFTER TAKEOFF, ENGINE QUIT. WING FUEL TANK SUM...","[WING, None, None]","[Q161358, None, None]"
4,19800217031649I,"AFTER TAKEOFF, ENGINE QUIT. WING FUEL TANK SUM...","[FUEL TANK, TANK, None]","[Q1411232, Q1047832, None]"
...,...,...,...,...
505,20030620012809I,(-23) PILOT FAILED TO ASSURE THE OIL FILLER CA...,"[PILOT, None, None]","[Q2095549, None, None]"
506,20030620012809I,(-23) PILOT FAILED TO ASSURE THE OIL FILLER CA...,"[OIL FILLER CAP, CAP, None]","[None, Q2488579, None]"
507,20030620012809I,(-23) PILOT FAILED TO ASSURE THE OIL FILLER CA...,"[OIL, None, None]","[Q42962, None, None]"
508,20030620012809I,(-23) PILOT FAILED TO ASSURE THE OIL FILLER CA...,"[OIL, None, None]","[Q42962, None, None]"
