# NER on the Wiki dataset using the Histo NER model (GloVe)

## Imports

In [None]:
!pip install tf2crf pandarallel

In [None]:
# Mount Google Drive in the Colab VM; the project directory used below lives under "My Drive".
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import sys
# Put the project folder on the import path — presumably where the local
# `model` and `text` modules imported below live (confirm against the Drive layout).
sys.path.append('/content/drive/My Drive/Colab Notebooks/mhc/')

In [None]:
# Work from the project directory so the relative "data/..." and "serialized/..." paths used below resolve.
%cd "/content/drive/My Drive/Colab Notebooks/mhc/"

In [1]:
import pickle
from pathlib import Path
import pandas as pd
import numpy as np

from model import BilstmCrf
from text import PreProcessing

import spacy
from spacy import displacy

from sklearn.metrics import classification_report

from tqdm import tqdm
from pandarallel import pandarallel
# Initialise pandarallel (needed before `.parallel_apply` is used below); progress bars are switched off.
pandarallel.initialize(progress_bar=False)

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


## Pre-processing

Apply NER to the Wiki sentences using spaCy's NER and our TensorFlow 2 model trained on the Histo dataset.

In [2]:
# spaCy pipeline used by parse_data for sentence splitting and as the reference NER tagger.
nlp = spacy.load("en_core_web_md")

In [None]:
# Read the pipe-separated Wiki dump, then clean every abstract in parallel
# with the project's PreProcessing.small_clean helper.
df = pd.read_csv("data/wiki/wiki.csv", sep="|")
df["Abstract"] = df["Abstract"].parallel_apply(
    lambda abstract: PreProcessing.small_clean(abstract)
)
# Renumber rows 0..n-1, discarding the previous index.
df = df.reset_index(drop=True)

In [None]:
def parse_data(data):
    """Split texts into sentences with spaCy and collect one tag per token.

    Relies on the module-level ``nlp`` pipeline.

    Args:
        data: iterable of raw text strings.

    Returns:
        ``(sentences, labels)``: two parallel lists with one entry per spaCy
        sentence. Each entry is a space-joined string: the token texts in
        ``sentences``; in ``labels`` the token's entity type or, when the
        token has none, its IOB code (``"O"`` if that is empty too).
    """
    sentences, labels = [], []

    for text in data:
        doc = nlp(text)

        for sent in doc.sents:
            # Fresh accumulators per sentence, so nothing can leak between sentences.
            words, tags = [], []

            for word in sent:
                # Whitespace-only tokens would disappear when the joined strings
                # are split on whitespace again, shifting words against tags.
                if not word.text.strip():
                    continue

                words.append(word.text)
                # Truthiness is a value test; the previous `is not ""` compared
                # object identity. An empty IOB code falls back to "O" so that
                # every word keeps exactly one (non-empty) tag.
                tags.append(word.ent_type_ or word.ent_iob_ or "O")

            sentences.append(" ".join(words))
            labels.append(" ".join(tags))

    return sentences, labels

In [None]:
# X: space-joined tokens per sentence, y: the matching space-joined spaCy tags;
# tqdm reports progress over the abstracts.
X, y = parse_data(tqdm(df.Abstract.values))

In [None]:
# Cache the parsed sentences and tags so the spaCy pass does not have to be repeated.
wiki_cache = Path("data/wiki/wiki.pkl")
with wiki_cache.open("wb") as cache_file:
    pickle.dump((X, y), cache_file)

In [3]:
# Restore the cached (sentences, tags) pair.
# pickle can execute arbitrary code on load: only read files this notebook wrote.
wiki_cache = Path("data/wiki/wiki.pkl")
with wiki_cache.open("rb") as cache_file:
    X, y = pickle.load(cache_file)

In [4]:
# Array stored under the "embeddings" key of the .npz archive.
embeddings = np.load("data/histo/gloveemb.npz")['embeddings']
model = BilstmCrf()

# restore_model is project code: presumably it rebuilds the network around
# `embeddings` and loads the trained weights from "serialized/glove/" — confirm in model.py.
model.restore_model(embeddings, "serialized/glove/")

In [5]:
# Tag every sentence with the Histo model. Judging by how `preds` is consumed
# below, the result holds one sequence of (word, label) pairs per sentence.
preds = model.predict(X, print=False)

Map both sets of predictions onto a shared label set (`O`, `EVENT`, `TIME`, `DATE`) and save them to disk.

In [6]:
# spaCy entity type -> shared label set.
# Every type listed in the tuple collapses to "O"; EVENT, TIME and DATE are kept as they are.
mapping_spacy = {
    **dict.fromkeys(
        (
            "PERSON", "NORP", "FAC", "ORG", "GPE", "LOC", "PRODUCT",
            "WORK_OF_ART", "LAW", "LANGUAGE", "PERCENT", "MONEY",
            "QUANTITY", "CARDINAL", "ORDINAL", "O",
        ),
        "O",
    ),
    "EVENT": "EVENT",
    "TIME": "TIME",
    "DATE": "DATE",
}

# Histo category -> shared label set.
# Every category listed in the tuple collapses to "O"; EXISTENCECAUSATION and
# HOSTILITY count as EVENT, TIME stays TIME. No category maps to DATE.
mapping_histo = {
    **dict.fromkeys(
        (
            "AUTHORITYLAW", "CLOTHES", "COMMUNICATION", "EDUCATION",
            "EMOTIONSEVALUATIONS", "ENTERTAINMENTART", "ENVIRONMENT", "FAITH",
            "FOODFARMING", "LIFEHEALTH", "MATTER", "MEASURE", "MENTAL", "O",
            "PHYSICALSENSATIONS", "POSSESSION", "SOCIAL", "SPACEMOVEMENT",
            "TRADEWORK",
        ),
        "O",
    ),
    "EXISTENCECAUSATION": "EVENT",
    "HOSTILITY": "EVENT",
    "TIME": "TIME",
}

In [7]:
# Project the spaCy tags onto the shared label set.
# preds_spacy: one list of (word, label) pairs per sentence.
# y_mapped:    the same labels as a single flat list over all sentences.
preds_spacy, y_mapped = [], []

for row in range(len(X)):
    tokens = X[row].strip().split()
    tags = y[row].strip().split()

    pairs = []
    for pos in range(len(tokens)):
        # Tags missing from the mapping are treated as "O".
        shared = mapping_spacy.get(tags[pos], "O")
        pairs.append((tokens[pos], shared))
        y_mapped.append(shared)

    preds_spacy.append(pairs)

In [8]:
# Project the Histo model's predictions onto the shared label set.
# preds_histo: one list of (word, label) pairs per sentence.
# preds_flat:  the same labels as a single flat list over all sentences.
preds_flat, preds_histo = [], []
for sent in preds:
    mapped_sent = []
    for word, lbl in sent:
        # An empty label counts as "outside". Tested by value: the previous
        # `lbl is ""` compared object identity, which is not guaranteed to hold
        # for equal strings and raises a SyntaxWarning on Python 3.8+.
        raw = lbl if lbl else "O"
        # Remove the IOB markers so "B-TIME" and "I-TIME" both look up as "TIME".
        category = raw.replace("B-", "").replace("I-", "")
        # Categories missing from the mapping are treated as "O".
        mapped = mapping_histo.get(category, "O")

        preds_flat.append(mapped)
        mapped_sent.append((word, mapped))

    preds_histo.append(mapped_sent)

In [16]:
# Write both mapped prediction sets to disk so the visualisation can reload them later.
for target, payload in (
    ("data/wiki/preds_histo.pkl", preds_histo),
    ("data/wiki/preds_spacy.pkl", preds_spacy),
):
    with Path(target).open("wb") as out_file:
        pickle.dump(payload, out_file)

In [17]:
# Read the cached prediction sets back.
# pickle can execute arbitrary code on load: only read files this notebook wrote.
spacy_cache = Path("data/wiki/preds_spacy.pkl")
with spacy_cache.open("rb") as in_file:
    preds_spacy = pickle.load(in_file)

histo_cache = Path("data/wiki/preds_histo.pkl")
with histo_cache.open("rb") as in_file:
    preds_histo = pickle.load(in_file)

In [18]:
# Token-level report with the mapped spaCy tags as reference and the mapped Histo predictions as output.
# NOTE: y_mapped / preds_flat are built by the two mapping loops above and are not part of the
# pickles, so those cells must have been run in the current session.
print(classification_report(y_mapped, preds_flat))

              precision    recall  f1-score   support

        DATE       0.00      0.00      0.00      9963
       EVENT       0.03      0.03      0.03      4507
           O       0.94      0.97      0.96    247158
        TIME       0.00      0.00      0.00       367

    accuracy                           0.92    261995
   macro avg       0.24      0.25      0.25    261995
weighted avg       0.89      0.92      0.90    261995



In [19]:
def convert_for_spacyviz(data):
    """Convert tagged sentences into the dicts displaCy renders with ``manual=True``.

    Args:
        data: iterable of sentences, each a sequence of ``(word, label)``
            pairs. The label ``"O"`` marks a word that is not an entity.

    Returns:
        A list with one dict per sentence: ``"text"`` is the words joined by
        single spaces, ``"ents"`` holds the character spans
        (``{"start", "end", "label"}``) of the non-``"O"`` words after
        ``merge_ents`` has fused adjacent same-label spans, and ``"title"``
        is always ``None``.
    """
    sp_preds = []
    for sent in data:
        ents = []
        words = []
        offset = 0  # character position at which the next word starts in the joined text

        for word, lbl in sent:
            # Compare by value. `lbl is not "O"` tested object identity, which
            # misclassifies equal-but-distinct strings (e.g. numpy.str_ labels)
            # and raises a SyntaxWarning on Python 3.8+.
            if lbl != "O":
                ents.append({"start": offset, "end": offset + len(word), "label": lbl})

            words.append(word)
            offset += len(word) + 1  # +1 for the space that separates words

        if ents:
            ents = merge_ents(ents)

        # Joining once avoids rebuilding the sentence string for every word.
        sp_preds.append({"text": " ".join(words).strip(), "ents": ents, "title": None})

    return sp_preds

def merge_ents(ents):
    """Fuse runs of adjacent, same-label entity spans into single spans.

    Two consecutive spans are fused when the second one starts exactly one
    character after the first one ends (a single separating character) and
    both carry the same label.

    Args:
        ents: list of dicts with ``"start"``, ``"end"`` and ``"label"`` keys,
            ordered by position in the text.

    Returns:
        A new list of new dicts. The input list and its dicts are left
        unmodified (previously the first dict of every run was extended in
        place).
    """
    merged = []
    previous = None
    for ent in ents:
        continues_run = (
            previous is not None
            and previous["end"] + 1 == ent["start"]
            and previous["label"] == ent["label"]
        )
        if continues_run:
            # Stretch the span that opened the current run.
            merged[-1]["end"] = ent["end"]
        else:
            # Copy so that later extension never touches the caller's dict.
            merged.append(dict(ent))
        previous = ent

    return merged

In [47]:
# Pick up to three distinct sentences to visualise. The generator is seeded so
# the renderings below are reproducible; change SAMPLE_SEED to look at other sentences.
SAMPLE_SEED = 42
r = np.random.default_rng(SAMPLE_SEED).choice(
    len(preds_histo), size=min(3, len(preds_histo)), replace=False
)

In [48]:
# Build displaCy inputs for the same sampled sentences from both taggers,
# so the two renderings below can be compared.
sp_preds_histo = convert_for_spacyviz([preds_histo[i] for i in r])
sp_preds_spacy = convert_for_spacyviz([preds_spacy[i] for i in r])

In [52]:
# Histo model predictions. manual=True: the input is pre-built {"text", "ents", "title"} dicts, not Doc objects.
displacy.render(sp_preds_histo, style="ent", jupyter=True, manual=True)

In [50]:
# spaCy predictions for the same sentences. manual=True: the input is pre-built dicts, not Doc objects.
displacy.render(sp_preds_spacy, style="ent", jupyter=True, manual=True)