# NER Experiments

In [None]:
!pip install stanza

In [None]:
!pip install ipywidgets

In [1]:
import os
import json
import stanza
import spacy
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support

## Configuration

In [2]:
# Stanza pipeline for English; only the tokenize and ner processors are requested
nlp_stanza = stanza.Pipeline(lang='en', processors='tokenize,ner')

# spaCy pipeline backed by the large trained English model
nlp_spacy = spacy.load('en_core_web_lg')

# Entity types that get a row in the confusion matrix
entity_types = [
    "PERSON",
    "ORG",
    "LOC",
    "DATE",
    "MONEY",
]

# Common prefixes (titles) that may precede a person's name
person_prefixes = [
    "Mr", "Mr.",
    "Mrs", "Mrs.",
    "Dr", "Dr.",
    "Ms", "Ms.",
    "Miss",
    "Prof.",
    "Doctor",
]

2021-10-27 23:53:18 INFO: Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | combined  |
| ner       | ontonotes |

2021-10-27 23:53:18 INFO: Use device: gpu
2021-10-27 23:53:18 INFO: Loading: tokenize
2021-10-27 23:53:20 INFO: Loading: ner
2021-10-27 23:53:21 INFO: Done loading processors!


## Utils

In [3]:
def discard_prefix(entities):
    """
    Strip a leading article ("The " / "the ") from the text of each entity.

    @entities: list of [text, start, end, type]
    @return: a new list of new [text, start, end, type] lists; when an article is
             removed, the start offset is advanced by the length of that article.
             The input entities are left unmodified.
    """
    prefixes = ("The ", "the ")
    results = []

    for entity in entities:
        text, start = entity[0], entity[1]
        for p in prefixes:
            if text.startswith(p):
                # Slice the prefix off instead of using str.replace(): replace()
                # would also delete every later occurrence inside the entity text
                # (e.g. "the Bank of the West" -> "Bank of West").
                text = text[len(p):]
                start += len(p)
        # Build a fresh list so the caller's entity lists are not mutated
        results.append([text, start, *entity[2:]])

    return results

In [4]:
def compare_results(y_true, y_preds, confusion_matrix):
    """
    Score the predicted named entities of one sentence against the expected ones
    and add the resulting TP / FP / FN counts to the confusion matrix.

    An entity only counts as a match when the whole [text, start, end, type]
    entry is equal on both sides. Entities whose type is not listed in
    entity_types are not counted.

    @y_true: list of  [text, start, end, type]
    @y_preds: list of  [text, start, end, type]
    @confusion_matrix: DataFrame indexed by entity type with "TP", "FP" and "FN" columns
    @return: the updated confusion matrix
    """
    # drop the leading articles on both sides before comparing
    expected = discard_prefix(y_true)
    predicted = discard_prefix(y_preds)

    # classify every entity as true positive, false positive or false negative
    outcomes = {
        "TP": [ent for ent in expected if ent in predicted],
        "FP": [ent for ent in predicted if ent not in expected],
        "FN": [ent for ent in expected if ent not in predicted],
    }

    # bump the matching counter of each tracked entity type
    for column, ents in outcomes.items():
        for ent in ents:
            label = ent[3]
            if label in entity_types:
                confusion_matrix.at[label, column] += 1

    return confusion_matrix

## Process A Sentence

In [5]:
def _normalize_entity_type(label):
    """Map the labels LOC, GPE and NORP to the single type "LOC"; return any other label unchanged."""
    return "LOC" if label in ("LOC", "GPE", "NORP") else label


def process_sentence(pipeline, sentence, results):
    """
    Run the selected nlp pipeline on one annotated sentence and add a row with
    the expected and predicted named entities to the given results.

    @pipeline: name of the pipeline to run, either "spaCy" or "stanza"
    @sentence: annotated sentence; sentence["text"] is the raw text and
               sentence["entities_char"] holds [start, end, type] entries
               with character-based positions
    @results: Dataframe(columns=["sentence", "expected", "predicted"])
    @return: a new DataFrame consisting of the given results plus one row
    @raises ValueError: if the pipeline name is not one of the supported ones
    """

    content = sentence["text"]

    # s_preds and s_true contain [text, start, end, type] for each named entity
    if pipeline == "spaCy":
        doc = nlp_spacy(content)
        s_preds = [
            [ent.text, ent.start_char, ent.end_char, _normalize_entity_type(ent.label_)]
            for ent in doc.ents
        ]
    elif pipeline == "stanza":
        doc = nlp_stanza(content)
        s_preds = [
            [ent.text, ent.start_char, ent.end_char, _normalize_entity_type(ent.type)]
            for ent in doc.ents
        ]
    else:
        # Fail loudly: continuing with an empty prediction list would record
        # every expected entity of this sentence as a false negative.
        raise ValueError(f"the pipeline {pipeline!r} is not defined; expected 'spaCy' or 'stanza'")

    # build the s_true list which contains the true named entities
    s_true = [
        [content[e[0]:e[1]], e[0], e[1], _normalize_entity_type(e[2])]
        for e in sentence["entities_char"]
    ]

    # DataFrame.append was removed in pandas 2.0; concatenate a one-row frame instead
    new_row = pd.DataFrame([{'sentence': content, 'expected': s_true, 'predicted': s_preds}])
    return pd.concat([results, new_row], ignore_index=True)

## Process Dataset

In [6]:
# load the annotations from file; the context manager closes the file handle
with open("./Annotations/annotations_public.json", mode="r", encoding="utf-8") as annotations_file:
    content = annotations_file.read()
annotations = json.loads(content)

### Run Stanza experiment

In [7]:
# Confusion matrix for the exact match using the character-based positions.
# One zero-initialised row is created per entry of entity_types, so the shape
# follows the configuration instead of a hard-coded number of rows.
# NOTE: this name shadows the confusion_matrix function imported from sklearn.metrics.
confusion_matrix = pd.DataFrame(
    [[0, 0, 0, 0, 0.0, 0.0, 0.0] for _ in entity_types],
    index=entity_types,
    columns=["Total", "TP", "FP", "FN", "Precision", "Recall", "F-Measure"],
)

# details of results such as sentence, expected, and predicted ones
results = pd.DataFrame(columns=["sentence", "expected", "predicted"])

In [8]:
# run Stanza on every annotated sentence; each call returns the results with one more row
for annotate in annotations:
    results = process_sentence("stanza", annotate, results)

# accumulate the TP / FP / FN counts sentence by sentence
for expected, predicted in zip(results["expected"], results["predicted"]):
    confusion_matrix = compare_results(expected, predicted, confusion_matrix)

true_pos = confusion_matrix["TP"]
false_pos = confusion_matrix["FP"]
false_neg = confusion_matrix["FN"]

# total number of expected entities per type
confusion_matrix["Total"] = true_pos + false_neg

# calculate performance measures
precision = true_pos / (true_pos + false_pos)
recall = true_pos / (true_pos + false_neg)
confusion_matrix["Precision"] = precision
confusion_matrix["Recall"] = recall
confusion_matrix["F-Measure"] = 2*precision*recall / (precision + recall)

In [9]:
# show the confusion matrix accumulated for the Stanza experiment
confusion_matrix

Unnamed: 0,Total,TP,FP,FN,Precision,Recall,F-Measure
PERSON,11956,9000,1729,2956,0.838848,0.75276,0.793476
ORG,10493,4141,3294,6352,0.55696,0.394644,0.461959
LOC,11913,9805,4798,2108,0.671437,0.82305,0.739553
DATE,0,0,5384,0,0.0,,
MONEY,0,0,623,0,0.0,,


### Run spaCy experiment

In [10]:
# Confusion matrix for the exact match using the character-based positions.
# One zero-initialised row is created per entry of entity_types, so the shape
# follows the configuration instead of a hard-coded number of rows.
# NOTE: this name shadows the confusion_matrix function imported from sklearn.metrics.
confusion_matrix = pd.DataFrame(
    [[0, 0, 0, 0, 0.0, 0.0, 0.0] for _ in entity_types],
    index=entity_types,
    columns=["Total", "TP", "FP", "FN", "Precision", "Recall", "F-Measure"],
)

# details of results such as sentence, expected, and predicted ones
results = pd.DataFrame(columns=["sentence", "expected", "predicted"])

In [11]:
# run spaCy on every annotated sentence; each call returns the results with one more row
for annotate in annotations:
    results = process_sentence("spaCy", annotate, results)

# accumulate the TP / FP / FN counts sentence by sentence
for expected, predicted in zip(results["expected"], results["predicted"]):
    confusion_matrix = compare_results(expected, predicted, confusion_matrix)

true_pos = confusion_matrix["TP"]
false_pos = confusion_matrix["FP"]
false_neg = confusion_matrix["FN"]

# total number of expected entities per type
confusion_matrix["Total"] = true_pos + false_neg

# calculate performance measures
precision = true_pos / (true_pos + false_pos)
recall = true_pos / (true_pos + false_neg)
confusion_matrix["Precision"] = precision
confusion_matrix["Recall"] = recall
confusion_matrix["F-Measure"] = 2*precision*recall / (precision + recall)

In [12]:
# show the confusion matrix accumulated for the spaCy experiment
confusion_matrix

Unnamed: 0,Total,TP,FP,FN,Precision,Recall,F-Measure
PERSON,11956,8191,2065,3765,0.798654,0.685095,0.737529
ORG,10493,4414,3948,6079,0.527864,0.420661,0.468205
LOC,11913,9895,5331,2018,0.649875,0.830605,0.729209
DATE,0,0,7272,0,0.0,,
MONEY,0,0,744,0,0.0,,
