# Prepare and standardize the gold standard data

From ``../gold_standard/raw/*.csv`` to ``../gold_standard/processed/*.csv``

## Gold standard samples
---

In [1]:
import pandas as pd

# Location of the raw gold-standard samples
gold_standard_path = '../gold_standard/raw/samples.csv'

# Read the raw gold-standard samples and, in the same expression, map the
# source column names onto the standardized "id" / "sample" names
data = pd.read_csv(gold_standard_path).rename(
    columns={"c5_unique_id": "id", "c119_text": "sample"}
)

# Write the standardized samples (the row index is not written)
data.to_csv('../gold_standard/processed/samples.csv', index=False)

data

Unnamed: 0,id,sample
0,19990213001379A,ACFT WAS TAXIING FOR TAKE OFF WHEN IT LOST CON...
1,19800217031649I,"AFTER TAKEOFF, ENGINE QUIT. WING FUEL TANK SUM..."
2,19790720021329A,HELICOPTER TOOK OFF WITH SLING LOAD ATTACHED. ...
3,19841214074599I,WHILE TAXIING LOST NOSEWHEEL STEERING AND BRAK...
4,19860128014289I,FORWARD CARGO DOOR OPENED AS AIRCRAFT TOOK OFF...
...,...,...
95,19880527016939A,ENGINE QUIT ON INITIAL CLIMBOUT. CRASH LANDED ...
96,19960418007829A,CRASHED AND BURNED. (.4)WITNESSES REPORTED THA...
97,19970828026989A,LOUD POP ON TAKEOFF ROLL. LOST RUDDER CONTROL....
98,19870523018729A,CANOPY CAME OPEN ON CLIMBOUT CAUSING AN UNCONT...


## Gold standard NEL data

Code updated for Flexible GS

In [4]:
import math

gold_standard_path = '../gold_standard/raw/nel.csv'

# Load the raw gold-standard NEL annotations and standardize the column names
data = pd.read_csv(gold_standard_path).rename(
    columns={"c5_unique_id": "id", "c119_text": "sample"}
)

# NOTE: rows whose primary entity has no QID are kept (the former filter on
# 'primary_qids' is disabled); the missing QID is stored as None below.

# Column-name prefixes of the three annotation levels, in output order
prefixes = ['primary_', 'secondary_', 'tertiary_']


def _parallel_lists(frame, suffix):
    """Collect the primary/secondary/tertiary columns into one list per row.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame holding the columns ``prefix + suffix`` for every prefix.
    suffix : str
        Column suffix to collect, e.g. ``'ents'`` or ``'qids'``.

    Returns
    -------
    list of list
        One list per row, ordered like ``prefixes``; every non-string
        cell (e.g. NaN for a missing annotation) is replaced by None.
    """
    level_columns = [prefix + suffix for prefix in prefixes]
    return [
        [value if isinstance(value, str) else None for value in row]
        for row in frame[level_columns].itertuples(index=False, name=None)
    ]


# compile primary, secondary, and tertiary entities and qids into parallel lists
data['entity'] = _parallel_lists(data, 'ents')
data['qid'] = _parallel_lists(data, 'qids')

# Save the standardized gold standard to a file
data[['id','sample','entity','qid']].to_csv('../gold_standard/processed/nel.csv', index=False)
data[['id','sample','entity','qid']]

Unnamed: 0,id,sample,entity,qid
0,19990213001379A,ACFT WAS TAXIING FOR TAKE OFF WHEN IT LOST CON...,"[ACFT, None, None]","[Q11436, None, None]"
1,19990213001379A,ACFT WAS TAXIING FOR TAKE OFF WHEN IT LOST CON...,"[DITCH, None, None]","[Q2048319, None, None]"
2,19990213001379A,ACFT WAS TAXIING FOR TAKE OFF WHEN IT LOST CON...,"[TREE, None, None]","[Q10884, None, None]"
3,19990213001379A,ACFT WAS TAXIING FOR TAKE OFF WHEN IT LOST CON...,"[LOST CONTROL, None, None]","[None, None, None]"
4,19800217031649I,"AFTER TAKEOFF, ENGINE QUIT. WING FUEL TANK SUM...","[TAKEOFF, None, None]","[Q854248, None, None]"
...,...,...,...,...
494,19870523018729A,CANOPY CAME OPEN ON CLIMBOUT CAUSING AN UNCONT...,"[PILOT ERROR, ERROR, None]","[Q3057459, Q29485, None]"
495,20030620012809I,(-23) PILOT FAILED TO ASSURE THE OIL FILLER CA...,"[PILOT, None, None]","[Q2095549, None, None]"
496,20030620012809I,(-23) PILOT FAILED TO ASSURE THE OIL FILLER CA...,"[OIL FILLER CAP, CAP, None]","[None, Q6147804, None]"
497,20030620012809I,(-23) PILOT FAILED TO ASSURE THE OIL FILLER CA...,"[OIL, None, None]","[Q42962, None, None]"


## Gold standard RE data

DNE!

In [9]:
import pandas as pd
# Show full cell contents when the frame is displayed (notebook-wide setting)
pd.set_option('display.max_colwidth', None)

gold_standard_path = '../gold_standard/raw/re.csv'

# Load the raw gold-standard RE annotations
data = pd.read_csv(gold_standard_path)

# Keep the columns of interest and give them standardized names.
# The trailing space in the triple header is part of the raw column name.
# rename() returns a new frame, so nothing is mutated in place on a column subset.
data = data[['c5_unique_id', 'c119_text', 'subject(Label), relation_type(Label), object(Label) ']].rename(
    columns={
        "c5_unique_id": "id",
        "c119_text": "sample",
        'subject(Label), relation_type(Label), object(Label) ': "triple",
    }
)

# One triple per line in the raw cell: split on '\n' and explode to one row per triple
data = data.assign(triple=data['triple'].str.split('\n')).explode('triple')

# Each triple reads "subject (type), relation (type), object (type)".
# Raw string: '\(' is a regex escape, not a Python string escape.
triple_pattern = r'(.*) \((.*)\), (.*) \((.*)\), (.*) \((.*)\)'
split_columns = data['triple'].str.extract(triple_pattern)
data[['subject', 'subject_type', 'relation', 'relation_type', 'object', 'object_type']] = split_columns

# Keep only the id and the parsed triple components
data = data[['id', 'subject', 'subject_type', 'relation', 'relation_type', 'object', 'object_type']]

# Save the standardized gold standard to a file
data.to_csv('../gold_standard/processed/re.csv', index=False)

data

Unnamed: 0,id,subject,subject_type,relation,relation_type,object,object_type
0,19990213001379A,Aircraft,Object/Subject,was performing,Activity/Operation,Taxiing for Take Off,Activity/Operation
0,19990213001379A,Aircraft,Object/Subject,lost,Negative Outcome,Control,Incident Description
0,19990213001379A,Aircraft,Object/Subject,ran into,Collision,Ditch,Object/Obstacle
0,19990213001379A,Aircraft,Object/Subject,struck,Collision,Tree,Object/Obstacle
0,19990213001379A,Other Circumstances,Contextual Information,status,Condition,Unknown,Status/Detail Type
1,19800217031649I,Engine,Component,quit after,Temporal,Takeoff,Phase/Operation
1,19800217031649I,Wing Fuel Tank Sumps,Component,were not drained during,Condition/Action,Preflight,Phase/Operation
1,19800217031649I,Wing Fuel Tank Sumps,Component,condition,Condition,Frozen,Condition
1,19800217031649I,Drained,Action,prevented by,Causal,Frozen,Condition
2,19790720021329A,Helicopter,Aircraft type,took off with,Action,Sling Load,Cargo


## Gold standard NER data

Un-typed NER GS:

In [16]:
import pandas as pd

gold_standard_path = '../gold_standard/raw/ner.csv'

# Load the raw gold-standard NER annotations and standardize the column names
columns={"c5_unique_id": "id", "c119_text": "sample", "Combined":"entities"}
data = pd.read_csv(gold_standard_path).rename(columns=columns)

# Explicit copy: the column assignment below must not write into a slice of
# the loaded frame (avoids SettingWithCopyWarning)
data = data[['id', 'sample', 'entities']].copy()

# Drop the space that follows each comma (literal match, not a regex),
# split on commas, then explode to one row per entity
data['entities'] = data['entities'].str.replace(', ', ',', regex=False).str.split(',')
data = data.explode('entities').reset_index(drop=True)

# Save the standardized gold standard to a file
data.to_csv('../gold_standard/processed/ner.csv', index=False)
data


Unnamed: 0,id,sample,entities
0,19990213001379A,ACFT WAS TAXIING FOR TAKE OFF WHEN IT LOST CON...,ACFT
1,19990213001379A,ACFT WAS TAXIING FOR TAKE OFF WHEN IT LOST CON...,DITCH
2,19990213001379A,ACFT WAS TAXIING FOR TAKE OFF WHEN IT LOST CON...,TREE
3,19990213001379A,ACFT WAS TAXIING FOR TAKE OFF WHEN IT LOST CON...,LOST CONTROL
4,19800217031649I,"AFTER TAKEOFF, ENGINE QUIT. WING FUEL TANK SUM...",TAKEOFF
...,...,...,...
501,19870523018729A,CANOPY CAME OPEN ON CLIMBOUT CAUSING AN UNCONT...,PILOT ERROR
502,20030620012809I,(-23) PILOT FAILED TO ASSURE THE OIL FILLER CA...,PILOT
503,20030620012809I,(-23) PILOT FAILED TO ASSURE THE OIL FILLER CA...,OIL FILLER CAP
504,20030620012809I,(-23) PILOT FAILED TO ASSURE THE OIL FILLER CA...,OIL


### Benchmark Gold Standard NER Data

In [1]:
import pandas as pd
import ast

gold_standard_path = '../gold_standard/raw/ner_benchmarks_gold.csv'

# Load the raw benchmark gold standard: the first 8 lines of the file are
# skipped and the next line is used as the header
original_data = pd.read_csv(gold_standard_path, skiprows=8, header=0)

# Manual overrides of two raw annotation cells
# (presumably corrections of malformed raw entries -- TODO confirm)
original_data.loc[17, 'ace_ents'] = '["GROUND STAND","GROUND STAND BAGGAGE CART WITH INOPERATIVE BRAKES","AIRCRAFT","FUSELAGE"]'
original_data.loc[44, 'ace_labels'] = '["VEHICLE","PER","PER"]'


def _parse_list_cell(cell):
    """Parse a stringified Python list; any non-string cell (e.g. NaN) becomes []."""
    return ast.literal_eval(cell) if isinstance(cell, str) else []


for bench in ['conll','ace','on']:

    # Map this benchmark's columns onto the standardized names
    columns={"c5_unique_id": "id", "c119_text": "sample", bench+"_ents":"entities",bench+"_labels":"labels"}
    # Explicit copy so the column assignments below act on an independent frame
    data = original_data.rename(columns=columns)[['id', 'sample', 'entities','labels']].copy()

    # Get lists of entities and labels from strings
    data['entities'] = data['entities'].apply(_parse_list_cell)
    data['labels'] = data['labels'].apply(_parse_list_cell)

    # Entities and labels are exploded separately and re-joined by position,
    # so a single row with unequal list lengths would silently shift every
    # following label onto the wrong entity. Fail loudly instead.
    mismatched = data.index[data['entities'].map(len) != data['labels'].map(len)]
    if len(mismatched) > 0:
        raise ValueError(
            f"{bench}: number of entities and labels differs in rows {list(mismatched)}"
        )

    # Explode to one (entity, label) pair per row
    data = pd.concat([data[['id','sample','entities']].explode('entities').reset_index(drop=True), data['labels'].explode('labels').reset_index(drop=True)], axis=1)

    # Save one processed file per benchmark
    data.to_csv(f'../gold_standard/processed/ner_{bench}.csv', index=False)

In [2]:
# Make ACE-2005 GS with restricted set of labels used by NLTK

gold_standard_path = '../gold_standard/raw/ner_benchmarks_gold.csv'

# Load the raw benchmark gold standard: the first 8 lines of the file are
# skipped and the next line is used as the header
original_data = pd.read_csv(gold_standard_path, skiprows=8, header=0)

# Manual overrides of two raw annotation cells
# (presumably corrections of malformed raw entries -- TODO confirm)
original_data.loc[17, 'ace_ents'] = '["GROUND STAND","GROUND STAND BAGGAGE CART WITH INOPERATIVE BRAKES","AIRCRAFT","FUSELAGE"]'
original_data.loc[44, 'ace_labels'] = '["VEHICLE","PER","PER"]'

bench = 'ace'

# Map the ACE columns onto the standardized names
columns={"c5_unique_id": "id", "c119_text": "sample", bench+"_ents":"entities",bench+"_labels":"labels"}
# Explicit copy so the column assignments below act on an independent frame
data = original_data.rename(columns=columns)[['id', 'sample', 'entities','labels']].copy()


def _parse_list_cell(cell):
    """Parse a stringified Python list; any non-string cell (e.g. NaN) becomes []."""
    return ast.literal_eval(cell) if isinstance(cell, str) else []


# Get lists of entities and labels from strings
data['entities'] = data['entities'].apply(_parse_list_cell)
data['labels'] = data['labels'].apply(_parse_list_cell)

# Entities and labels are paired by position, both for the filtering and for
# the explode below; a row with unequal list lengths would silently misalign
# the output, so fail loudly instead.
mismatched = data.index[data['entities'].map(len) != data['labels'].map(len)]
if len(mismatched) > 0:
    raise ValueError(
        f"{bench}: number of entities and labels differs in rows {list(mismatched)}"
    )

# Remove "VEHICLE" entities and labels.
# Whole columns are rebuilt rather than patched cell by cell through chained
# indexing, which does not update the frame under pandas Copy-on-Write.
kept_pairs = [
    [(entity, label) for entity, label in zip(entities, labels) if label != "VEHICLE"]
    for entities, labels in zip(data['entities'], data['labels'])
]
data['entities'] = [[entity for entity, _ in pairs] for pairs in kept_pairs]
data['labels'] = [[label for _, label in pairs] for pairs in kept_pairs]

# Explode to one (entity, label) pair per row
data = pd.concat([data[['id','sample','entities']].explode('entities').reset_index(drop=True), data['labels'].explode('labels').reset_index(drop=True)], axis=1)

# Save the restricted-label ACE gold standard
data.to_csv('../gold_standard/processed/ner_ace_nltk.csv', index=False)

In [5]:
import ast
import csv


input_file_path = '../gold_standard/raw/ner_own_types.csv'
output_file_path = '../gold_standard/processed/ner_own_types.csv'

# Read the CSV file
with open(input_file_path, mode='r', newline='') as csvfile:
    reader = csv.DictReader(csvfile)
    data = [row for row in reader]

# Build every output record before the output file is opened, so that a
# malformed input row cannot leave a truncated output file behind
records = []
for row_number, row in enumerate(data, start=1):
    # ast.literal_eval only accepts Python literals; unlike eval() it cannot
    # execute arbitrary code embedded in the CSV
    gs_list = ast.literal_eval(row['entities'])
    gs_type_list = ast.literal_eval(row['types'])

    # zip() would silently drop the surplus items of the longer list
    if len(gs_list) != len(gs_type_list):
        raise ValueError(
            f"Data row {row_number} (id={row['id']}): "
            f"{len(gs_list)} entities but {len(gs_type_list)} types"
        )

    # One output record per (entity, type) pair
    for gs, gs_type in zip(gs_list, gs_type_list):
        records.append({
            'id': row['id'],
            'sample': row['sample'],
            'entities': gs,
            'types': gs_type
        })

# Write the exploded records to a new CSV file
with open(output_file_path, mode='w', newline='') as csvfile:
    fieldnames = ['id', 'sample', 'entities', 'types']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

    writer.writeheader()
    writer.writerows(records)

print(f"Processed data has been written to {output_file_path}")

Processed data has been written to ../gold_standard/processed/ner_own_types.csv
