# Prepare and standardize the gold standard data

From ``../gold_standard/raw/*.cvs`` to ``../gold_standard/procesed/*.csv``

## Gold standard samples
---

In [1]:
import pandas as pd

gold_standard_path = '../gold_standard/raw/samples.csv'

# load output from the tool to be evaluated
data = pd.read_csv(gold_standard_path)

# rename columns
data.rename(columns={"c5_unique_id": "id", "c119_text": "sample"}, inplace=True)

# save the processed tool output to a file
data.to_csv('../gold_standard/processed/samples.csv', index=False)

data

Unnamed: 0,id,sample
0,19990213001379A,ACFT WAS TAXIING FOR TAKE OFF WHEN IT LOST CON...
1,19800217031649I,"AFTER TAKEOFF, ENGINE QUIT. WING FUEL TANK SUM..."
2,19790720021329A,HELICOPTER TOOK OFF WITH SLING LOAD ATTACHED. ...
3,19841214074599I,WHILE TAXIING LOST NOSEWHEEL STEERING AND BRAK...
4,19860128014289I,FORWARD CARGO DOOR OPENED AS AIRCRAFT TOOK OFF...
...,...,...
95,19880527016939A,ENGINE QUIT ON INITIAL CLIMBOUT. CRASH LANDED ...
96,19960418007829A,CRASHED AND BURNED. (.4)WITNESSES REPORTED THA...
97,19970828026989A,LOUD POP ON TAKEOFF ROLL. LOST RUDDER CONTROL....
98,19870523018729A,CANOPY CAME OPEN ON CLIMBOUT CAUSING AN UNCONT...


## Gold standard NEL data
---

In [9]:
import pandas as pd

gold_standard_path = '../gold_standard/raw/nel.csv'

# load output from the tool to be evaluated
data = pd.read_csv(gold_standard_path)

# rename columns
data.rename(columns={"c5_unique_id": "id", "c119_text": "sample", "human_gold_ents":"entity",  "qids": "qid"}, inplace=True)

# Removing 'Q' from the 'qid' column
data['qid'] = data['qid'].str.replace('Q', '', regex=False)

data['qid'].fillna(-1, inplace=True)

# change the column type to object
data['qid'] = data['qid'].astype('int64')

# Save the processed tool output to a file
data[['id','sample','entity','qid']].to_csv('../gold_standard/processed/nel.csv', index=False)
data


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['qid'].fillna(-1, inplace=True)


Unnamed: 0,index,id,sample,entity,qid,links,confirmed,Notes
0,2318,19990213001379A,"ACFT WAS TAXIING FOR TAKE OFF WHEN IT LOST CONTROL, RAN INTO A DITCH, AND STRUCK A TREE. OTHER CIRCUMSTANCES AE UNK",ACFT,11436,https://www.wikidata.org/wiki/Q11436,True,ACFT actuallly stands for Army Combat Fitness Test with a QID of Q67935434
1,2318,19990213001379A,"ACFT WAS TAXIING FOR TAKE OFF WHEN IT LOST CONTROL, RAN INTO A DITCH, AND STRUCK A TREE. OTHER CIRCUMSTANCES AE UNK",DITCH,2048319,https://www.wikidata.org/wiki/Q2048319,True,
2,2318,19990213001379A,"ACFT WAS TAXIING FOR TAKE OFF WHEN IT LOST CONTROL, RAN INTO A DITCH, AND STRUCK A TREE. OTHER CIRCUMSTANCES AE UNK",TREE,10884,https://www.wikidata.org/wiki/Q10884,True,
3,2318,19990213001379A,"ACFT WAS TAXIING FOR TAKE OFF WHEN IT LOST CONTROL, RAN INTO A DITCH, AND STRUCK A TREE. OTHER CIRCUMSTANCES AE UNK",LOST CONTROL,29017603,https://www.wikidata.org/wiki/Q29017603,True,"no qid for ""lost control"" given qid is for control"
4,354,19800217031649I,"AFTER TAKEOFF, ENGINE QUIT. WING FUEL TANK SUMPS WERE NOT DRAINED DURING PREFLIGHT BECAUSE THEY WERE FROZEN.",TAKEOFF,854248,https://www.wikidata.org/wiki/Q854248,True,
...,...,...,...,...,...,...,...,...
496,1210,19870523018729A,"CANOPY CAME OPEN ON CLIMBOUT CAUSING AN UNCONTROLLED GROUND COLLISION, NO DEFECTS REPORTED. APPEARS PILOT ERROR.",PILOT ERROR,3057459,https://www.wikidata.org/wiki/Q3057459,True,
497,2488,20030620012809I,(-23) PILOT FAILED TO ASSURE THE OIL FILLER CAP WAS SECURE AND TOOK OFF. OIL WAS OBSERVED COMING OUT OF THE ENGINE,PILOT,2095549,https://www.wikidata.org/wiki/Q2095549,True,
498,2488,20030620012809I,(-23) PILOT FAILED TO ASSURE THE OIL FILLER CAP WAS SECURE AND TOOK OFF. OIL WAS OBSERVED COMING OUT OF THE ENGINE,OIL FILLER CAP,-1,,False,no qid found
499,2488,20030620012809I,(-23) PILOT FAILED TO ASSURE THE OIL FILLER CAP WAS SECURE AND TOOK OFF. OIL WAS OBSERVED COMING OUT OF THE ENGINE,OIL,42962,https://www.wikidata.org/wiki/Q42962,True,


## Gold standard RE data
---

In [3]:
import pandas as pd
pd.set_option('display.max_colwidth', None)

gold_standard_path = '../gold_standard/raw/re.csv'

# load output from the tool to be evaluated
data = pd.read_csv(gold_standard_path)

# get the columns of interest
data = data[['c5_unique_id', 'c119_text', 'subject(Label), relation_type(Label), object(Label) ']]

# rename the columns
data.rename(columns={"c5_unique_id": "id", "c119_text": "sample", 'subject(Label), relation_type(Label), object(Label) ':"triple"}, inplace=True)

# Explode the 'triple' column on '\n'
data = data.assign(triple=data['triple'].str.split('\n')).explode('triple')

# Splitting the 'triple' column into 'subject', 'relation', and 'object' and their respective types
split_columns = data['triple'].str.extract('(.*) \((.*)\), (.*) \((.*)\), (.*) \((.*)\)')
data[['subject', 'subject_type', 'relation', 'relation_type', 'object', 'object_type']] = split_columns

# Show the modified DataFrame
data = data[['id', 'subject', 'subject_type', 'relation', 'relation_type', 'object', 'object_type']]

# Save the processed tool output to a file
data.to_csv('../gold_standard/processed/re.csv', index=False)

data

Unnamed: 0,id,subject,subject_type,relation,relation_type,object,object_type
0,19990213001379A,Aircraft,Object/Subject,was performing,Activity/Operation,Taxiing for Take Off,Activity/Operation
0,19990213001379A,Aircraft,Object/Subject,lost,Negative Outcome,Control,Incident Description
0,19990213001379A,Aircraft,Object/Subject,ran into,Collision,Ditch,Object/Obstacle
0,19990213001379A,Aircraft,Object/Subject,struck,Collision,Tree,Object/Obstacle
0,19990213001379A,Other Circumstances,Contextual Information,status,Condition,Unknown,Status/Detail Type
1,19800217031649I,Engine,Component,quit after,Temporal,Takeoff,Phase/Operation
1,19800217031649I,Wing Fuel Tank Sumps,Component,were not drained during,Condition/Action,Preflight,Phase/Operation
1,19800217031649I,Wing Fuel Tank Sumps,Component,condition,Condition,Frozen,Condition
1,19800217031649I,Drained,Action,prevented by,Causal,Frozen,Condition
2,19790720021329A,Helicopter,Aircraft type,took off with,Action,Sling Load,Cargo
