# NER

---

In [1]:
nlp_task = 'ner'

## Load Processed NER Gold Standard Data

In [6]:
import pandas as pd

# path to the gold standard file
task_gold_standard_path = f"../../gold_standard/processed/{nlp_task}.csv"

# load output from the tool to be evaluated
gs = pd.read_csv(task_gold_standard_path)

gs

Unnamed: 0,id,sample,entities
0,19990213001379A,ACFT WAS TAXIING FOR TAKE OFF WHEN IT LOST CON...,ACFT
1,19990213001379A,ACFT WAS TAXIING FOR TAKE OFF WHEN IT LOST CON...,DITCH
2,19990213001379A,ACFT WAS TAXIING FOR TAKE OFF WHEN IT LOST CON...,TREE
3,19990213001379A,ACFT WAS TAXIING FOR TAKE OFF WHEN IT LOST CON...,LOST CONTROL
4,19800217031649I,"AFTER TAKEOFF, ENGINE QUIT. WING FUEL TANK SUM...",TAKEOFF
...,...,...,...
501,19870523018729A,CANOPY CAME OPEN ON CLIMBOUT CAUSING AN UNCONT...,PILOT ERROR
502,20030620012809I,(-23) PILOT FAILED TO ASSURE THE OIL FILLER CA...,PILOT
503,20030620012809I,(-23) PILOT FAILED TO ASSURE THE OIL FILLER CA...,OIL FILLER CAP
504,20030620012809I,(-23) PILOT FAILED TO ASSURE THE OIL FILLER CA...,OIL


## Load Processed NER for Tools to be  evaluated

In [3]:
def load_df(nlp_task, tool):
    # load 
    tool_path = f"../../tool_results/{nlp_task}/"
    df = pd.read_csv(tool_path+tool+'.csv')
    # standardize
    df = (df.reset_index(drop=True)).rename(columns={'c5_unique_id': 'id', 'c119_text': 'sample'}).drop(columns=['index'])
    return df


# load output from each tool to be evaluated
tools = ["flair", "nltk", "spacy", "stanza"]
df_tools = { tool: load_df(nlp_task, tool)   for tool in tools }

Visualize some samples for each tool

In [4]:
for tool in tools:
    print(tool)
    display(df_tools[tool].head(4))

flair


Unnamed: 0,id,sample,entities,labels
0,19990213001379A,ACFT WAS TAXIING FOR TAKE OFF WHEN IT LOST CON...,ACFT,"[{'value': 'ORG', 'confidence': 0.645385682582..."
1,20000215010329A,(-23) MR. TIMOTHY ALLEN WELLS WAS ACTING AS PI...,TIMOTHY ALLEN WELLS,"[{'value': 'PER', 'confidence': 0.903642654418..."
2,20070630826079I,(-23) AIRCRAFT DEPARTED RAY AIRPORT AND AFTER ...,RAY AIRPORT,"[{'value': 'LOC', 'confidence': 0.765102148056..."
3,19820725041999I,LOOSE COWLING ON TAKEOFF. COWLING CAME OFF ON ...,COWLING,"[{'value': 'PER', 'confidence': 0.999859929084..."


nltk


Unnamed: 0,id,sample,entities,POS tags,labels
0,19990213001379A,ACFT WAS TAXIING FOR TAKE OFF WHEN IT LOST CON...,ACFT,NNP,ORGANIZATION
1,19990213001379A,ACFT WAS TAXIING FOR TAKE OFF WHEN IT LOST CON...,WAS,NNP,ORGANIZATION
2,19990213001379A,ACFT WAS TAXIING FOR TAKE OFF WHEN IT LOST CON...,IT,NNP,ORGANIZATION
3,19990213001379A,ACFT WAS TAXIING FOR TAKE OFF WHEN IT LOST CON...,LOST,NNP,ORGANIZATION


spacy


Unnamed: 0,id,sample,entities,labels
0,19990213001379A,ACFT WAS TAXIING FOR TAKE OFF WHEN IT LOST CON...,CIRCUMSTANCES AE UNK,ORG
1,19841214074599I,WHILE TAXIING LOST NOSEWHEEL STEERING AND BRAK...,2,CARDINAL
2,19841214074599I,WHILE TAXIING LOST NOSEWHEEL STEERING AND BRAK...,CIRCUIT BREAKER,PERSON
3,19841214074599I,WHILE TAXIING LOST NOSEWHEEL STEERING AND BRAK...,PUMP,ORG


stanza


Unnamed: 0,id,sample,entities,labels
0,19841214074599I,WHILE TAXIING LOST NOSEWHEEL STEERING AND BRAK...,2,CARDINAL
1,20000215010329A,(-23) MR. TIMOTHY ALLEN WELLS WAS ACTING AS PI...,TIMOTHY ALLEN,PERSON
2,19790718019229I,GROUND STAND BAGGAGE CART WITH INOPERATIVE BRA...,9,CARDINAL
3,20000625032189I,(-23)NORTHWEST AIRLINES DC-10 BOEING PUSHED BA...,23,CARDINAL


## Evaluation

To calculate precision and recall where `gs` is considered the ground truth and `tool` is the answers provided by some tool or method, we first need to define these metrics in the context of your entity similarity task:

- **Precision**: Of all the entities identified by `tool`, how many were correctly identified as per `gs`? This is calculated as the number of true positives (TP) divided by the number of true positives and false positives (TP + FP).

- **Recall**: Of all the relevant entities present in `gs`, how many were identified by `tool`? This is calculated as the number of true positives (TP) divided by the number of true positives and false negatives (TP + FN).

Here's a step-by-step approach:

1. **True Positives (TP)**: Entities in `tool` that match entities in `gs` (considering your similarity criteria) for the same `id`.
2. **False Positives (FP)**: Entities in `tool` that do not match any entity in `gs` for the same `id`.
3. **False Negatives (FN)**: Entities in `gs` that do not match any entity in `tool` for the same `id`.

In [24]:
def is_similar_entity(gs_entity, tool_entities):
    """
    Check if the gs_entity is similar to any of the entities in the tool_entities list.
    An entity is considered similar if it's a substring of any entity in the list, or vice versa.
    
    Parameters:
    - gs_entity: The entity from the gs DataFrame.
    - tool_entities: A list of entities from the df_tool DataFrame for a given id.
    
    Returns:
    - True if similar entity is found, False otherwise.
    """
    for tool_entity in tool_entities:
        if gs_entity in tool_entity or tool_entity in gs_entity:
            return True
    return False

def check_entity_similarity(gs, df_tool):
    """
    For each row in the gs DataFrame, check if the entity is similar to any entity in the df_tool,
    considering only rows with the same id.
    
    Parameters:
    - gs: The ground truth DataFrame with columns ['id', 'sample', 'entities'].
    - df_tool: The tool DataFrame with columns ['id', 'sample', 'entities', 'POS tags', 'labels'].
    
    Returns:
    - The gs DataFrame with an additional 'Similarity' column indicating if a similar entity was found in df_tool.
    """
    # Initialize an empty list to store similarity results
    similarities = []
    
    # Iterate through each row in gs
    for index, row in gs.iterrows():
        # Extract the id and entity for the current row
        gs_id, gs_entity = row['id'], row['entities']
        
        # Find entities in df_tool with the same id
        tool_entities = df_tool.loc[df_tool['id'] == gs_id, 'entities'].tolist()
        
        # Check for similarity and append the result
        similarities.append(is_similar_entity(gs_entity, tool_entities))
        
    # Add the similarity results to the gs DataFrame
    gs['Similarity'] = similarities
    
    return gs


def calculate_precision_recall_f1(gs, df_tool):
    """
    Calculate precision and recall based on entities comparison between gs (ground truth) and df_tool (answers).
    
    Parameters:
    - gs: DataFrame with columns ['id', 'sample', 'entities'] representing the ground truth.
    - df_tool: DataFrame with columns ['id', 'sample', 'entities', 'POS tags', 'labels'] representing the tool's answers.
    
    Returns:
    - A tuple containing precision and recall.
    """
    TP = 0  # True Positives
    FP = 0  # False Positives
    FN = 0  # False Negatives
    
    # Check for True Positives and False Negatives by iterating over gs
    for index, gs_row in gs.iterrows():
        gs_id, gs_entity = gs_row['id'], gs_row['entities']
        tool_entities = df_tool.loc[df_tool['id'] == gs_id, 'entities'].tolist()
        
        if any(gs_entity in tool_entity or tool_entity in gs_entity for tool_entity in tool_entities):
            TP += 1
        else:
            FN += 1
    
    # Check for False Positives by iterating over df_tool
    for index, tool_row in df_tool.iterrows():
        tool_id, tool_entity = tool_row['id'], tool_row['entities']
        gs_entities = gs.loc[gs['id'] == tool_id, 'entities'].tolist()
        
        if not any(tool_entity in gs_entity or gs_entity in tool_entity for gs_entity in gs_entities):
            FP += 1
    
    # Calculate precision and recall
    precision = TP / (TP + FP) if (TP + FP) > 0 else 0
    recall = TP / (TP + FN) if (TP + FN) > 0 else 0
    
    # Calculating the F1 score
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    
    return precision, recall, f1_score


def evaluate_nlr(gs, df_tools, tools):
    """
    Evaluate NLR tools by calculating precision, recall, and F1 score, and return a sorted and rounded DataFrame.
    
    Parameters:
    - gs: The ground truth DataFrame with columns ['id', 'sample', 'entities'].
    - df_tools: A dictionary with DataFrames for each tool, where each DataFrame contains ['id', 'sample', 'entities', ...].
    - tools: A list of tool names corresponding to keys in df_tools.
    
    Returns:
    - A DataFrame with each tool's precision, recall, and F1 score, sorted by F1 score in descending order and rounded to 2 decimal places.
    """
    results = []
    for tool in tools:
        precision, recall, f1_score = calculate_precision_recall_f1(gs, df_tools[tool])  # Assume this function is defined
        results.append({
            'Tool': tool,
            'Precision': precision,
            'Recall': recall,
            'F1 Score': f1_score
        })

    results_df = pd.DataFrame(results)
    results_df_sorted = results_df.sort_values(by='F1 Score', ascending=False)
    
    return results_df_sorted.round(2)

In [25]:
results_df_sorted = evaluate_nlr(gs, df_tools, tools)
results_df_sorted


Unnamed: 0,Tool,Precision,Recall,F1 Score
1,nltk,0.52,0.46,0.49
2,spacy,0.75,0.2,0.32
0,flair,0.91,0.08,0.15
3,stanza,0.84,0.07,0.13


Show some matches 

In [8]:
check_entity_similarity(gs, df_tools['nltk'])

Unnamed: 0,id,sample,entities,Similarity
0,19990213001379A,ACFT WAS TAXIING FOR TAKE OFF WHEN IT LOST CON...,ACFT,True
1,19990213001379A,ACFT WAS TAXIING FOR TAKE OFF WHEN IT LOST CON...,DITCH,True
2,19990213001379A,ACFT WAS TAXIING FOR TAKE OFF WHEN IT LOST CON...,TREE,False
3,19990213001379A,ACFT WAS TAXIING FOR TAKE OFF WHEN IT LOST CON...,LOST CONTROL,True
4,19800217031649I,"AFTER TAKEOFF, ENGINE QUIT. WING FUEL TANK SUM...",TAKEOFF,False
...,...,...,...,...
501,19870523018729A,CANOPY CAME OPEN ON CLIMBOUT CAUSING AN UNCONT...,PILOT ERROR,False
502,20030620012809I,(-23) PILOT FAILED TO ASSURE THE OIL FILLER CA...,PILOT,True
503,20030620012809I,(-23) PILOT FAILED TO ASSURE THE OIL FILLER CA...,OIL FILLER CAP,True
504,20030620012809I,(-23) PILOT FAILED TO ASSURE THE OIL FILLER CA...,OIL,True
