In [1]:
import pandas as pd
import numpy as np
import json
import random
import spacy
from tqdm import tqdm
import pickle
import plotly.graph_objects as go
from plotly.offline import iplot

In [2]:
import math
from sklearn.model_selection import train_test_split

In [3]:
from spacy.tokens import Doc

In [4]:
from spacy.training import Example

In [5]:
import os

os.environ['KMP_DUPLICATE_LIB_OK']='True'

# Task description

- train NER model to extract geo, gpe, tim and nat entities
- experiment with different language model sizes
- experiment with DROPOUT and epochs to get the best test set results
- analize learning curves and performance per entity

News entities
- geo = Geographical Entity
- gpe = Geopolitical Entity
- tim = Time indicator
- nat = Natural Phenomenon

# Prepare NER data

In [6]:
with open('data/GMB_data_spacy_geo.pickle', 'rb') as f:
     spacy_data = pickle.load(f)

In [82]:
len(spacy_data)

35177

In [85]:
spacy_data[33000][1]

{'entities': [(41, 47, 'gpe')]}

In [86]:
spacy_data[35000][0]

"Lebanon 's top Shi'ite cleric is opposing British Prime Minister Tony Blair 's expected visit to Beirut Monday ."

## Test train split

In [None]:
# Divide spacy into train and test sets
spacy_data_train, spacy_data_test = train_test_split(spacy_data, test_size=0.1, random_state=42)

In [None]:
spacy_data_train[1]

## NER model setup

In [7]:
# Setup model - experiment with sm, md, lg
nlp = spacy.load('en_core_web_sm')
ner = nlp.create_pipe('ner')



In [None]:
def split_examples_to_batches(examples, batch_size):
    batches=[]
    for i in range(0, math.ceil(len(examples)/batch_size)):
        start=i*batch_size
        end = start+batch_size
        batches.append(examples[start:end])
    return batches

In [None]:
def prepare_examples(data):
    examples = []
    for i in range(0, len(data)):
        raw_text, entity_offsets = data[i]

        try:

            doc= nlp.make_doc(raw_text.lower())
            example_test = Example.from_dict(doc, entity_offsets)
            examples.append(example_test)
           
        except Exception as e:
            print(e)
            pass
    return examples

In [None]:
# prepare examples
examples_train = prepare_examples(spacy_data_train)
examples_test = prepare_examples(spacy_data_test)

In [None]:
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']

### Add labels

In [90]:
for _, annotations in spacy_data:
    for ent in annotations.get('entities'):
        ner.add_label(ent[2])

In [91]:
ner.labels

('geo', 'gpe', 'nat', 'tim')

# Train model

In [None]:
DROPOUT=0.1
epochs = 10

batch_size=128

In [None]:
def train_ner(nlp, examples_train, examples_test, epochs, batch_size, dropout):
    with nlp.disable_pipes(*other_pipes):
        optimizer = nlp.create_optimizer()
        scores = []

        for i in range(0, epochs):
            random.shuffle(examples_train)

            batches = split_examples_to_batches(examples_train, batch_size)
            for batch in  tqdm(batches):
                try:
                        nlp.update(batch, sgd=optimizer, drop =DROPOUT)
                except Exception as e:
                    print(e)
                    pass
            train_score = nlp.evaluate(examples_train)
            val_score = nlp.evaluate(examples_test)
            total_f = val_score['ents_f']

            scores.append({"iter":i, "val_score":val_score, "train_score":train_score})

            print(f"Iter:{i}, f_score:{round(total_f,2)}")

    return nlp, scores

In [None]:
nlp, scores = train_ner(nlp, examples_train, examples_test, epochs, batch_size, DROPOUT)

# Model save/load

In [94]:
def save_spacy_model(nlp, model_path):
    nlp.to_disk(f'{model_path}')
    print(f"Saved model to {model_path}")
    bytes_data = nlp.to_bytes()

    f = open(f'{model_path}/bytes_data.bin', 'wb')
    f.write(bytes_data)
    f.close()
    print(f"Saved bytes_data to f'{model_path}/bytes_data.bin'")

In [95]:
save_spacy_model(nlp, "that_one_model_which_is_diff")

Saved model to that_one_model_which_is_diff
Saved bytes_data to f'that_one_model_which_is_diff/bytes_data.bin'


In [8]:
def load_spacy_model(model_path, base_model = "en_core_web_sm"):

    nlp = spacy.load(base_model)
    file = open(f'{model_path}/bytes_data.bin',"rb")
    bytes_data = file.read()
    config = nlp.config
    lang_cls = spacy.util.get_lang_class("en")
    nlp = lang_cls.from_config(config)
    nlp = nlp.from_disk(f'{model_path}')
    
    return nlp

In [9]:
nlp2 = load_spacy_model("ner_sm_do005_ep10_bs128")

## Evaluate model

In [12]:
scores

NameError: name 'scores' is not defined

In [13]:
def get_metric_per_epoch(scores, score_metric):
    scores_list = []
    for item in scores:
        epoch_score ={"epoch":item["iter"],
                     "train_score":item["train_score"][f"ents_{score_metric}"],
                     "val_score":item["val_score"][f"ents_{score_metric}"]}

        scores_list.append(epoch_score)
    df_scores = pd.DataFrame(scores_list)
    return df_scores
    

In [14]:
def get_metric_per_entity(scores, score_metric):
    # Create a function to transform scores into dataframe
    
    scores_list = []
    for item in scores:
        epoch =item["iter"]
        
        for ent in item["val_score"]['ents_per_type'].keys():
            epoch_score ={
                "epoch":epoch,
                "ent":ent,
                "train_score":item["train_score"]['ents_per_type'][ent][score_metric],
                "val_score":item["val_score"]['ents_per_type'][ent][score_metric]}
            scores_list.append(epoch_score)
    df_scores = pd.DataFrame(scores_list)
    return df_scores       
                

In [None]:
df_scores = get_metric_per_entity(scores, "f")

In [None]:
df_scores

In [None]:
data=[]
for score in ["train_score", "val_score"]:
    
    trace=go.Scatter(
                x=df_scores.epoch,
                y=df_scores[score],
                mode='lines',
                marker=dict(
                size=5
                ),
            name=score,

            )
    data.append(trace)
    
figure=go.Figure(
    data=data,
    layout=go.Layout(
        title=f"<b>Performance by epoch",
    
    ))
iplot(figure)

# Make predictions

In [10]:
text = '''Donald Trump mistook E Jean Carroll, the writer who accuses him of rape, for his ex-wife Marla Maples during a deposition in the case last year, excerpts released in US district court on Wednesday showed.

“That’s Marla, yeah,” Trump said, when shown a photograph. “That’s my wife.”

The mistake was corrected by a lawyer for the 76-year-old former president. But observers said it could undermine Trump’s claim he could not have attacked Carroll because she is not his “type”.

It was not the first release of excerpts from Trump’s deposition, which happened in October. Last week, Trump was shown to have claimed Carroll “said it was very sexy to be raped”.

Carroll says Trump raped her in a department store changing room in the mid-1990s. Trump denies it.

Carroll sued Trump for defamation and under the Adult Survivors Act, a New York law which allows alleged victims of historical sexual assault to bring cases within a defined timeframe.

Trump was married to Maples, the mother of his daughter Tiffany, from 1993 to 1999, between marriages to Ivana Trump, his first wife, and Melania Trump, his third and current spouse.

The photograph he thought showed Maples shows Trump in Carroll’s company in the 1990s. In his deposition, Trump said it showed a “receiving line” at an event.'''

In [11]:
doc = nlp(text)    
for ent in doc.ents:
    print(f"{ent.label_} : {ent.text}")

PERSON : Donald Trump
PERSON : Jean Carroll
PERSON : Marla Maples
DATE : last year
GPE : US
DATE : Wednesday
DATE : 76-year-old
ORG : Trump
PERSON : Carroll
ORDINAL : first
ORG : Trump
DATE : October
DATE : Last week
ORG : Trump
PERSON : Carroll
ORG : Trump
DATE : the mid-1990s
LAW : the Adult Survivors Act
GPE : New York
NORP : Maples
PERSON : Tiffany
DATE : 1993
DATE : 1999
ORG : Ivana Trump
ORDINAL : first
PERSON : Melania Trump
ORDINAL : third
PRODUCT : Maples
ORG : Trump
ORG : Carroll
DATE : the 1990s
ORG : Trump


In [81]:
listent

array(['PERSON', 'DATE', 'GPE', 'ORG', 'ORDINAL', 'LAW', 'NORP',
       'PRODUCT'], dtype=object)

In [62]:
rang = len(obj[1]['entities'])
listent = []
for i in range(rang):
    listent.append(obj[1]['entities'][i][2])

listent = pd.unique(listent)

len(listent)

coldi = {'PURPLE' : '\033[95m\033[1m',
   'CYAN' : '\033[96m\033[1m',
   'DARKCYAN' : '\033[36m\033[1m',
   'BLUE' : '\033[94m\033[1m',
   'GREEN' : '\033[92m\033[1m',
   'YELLOW' : '\033[93m\033[1m',
   'RED' : '\033[91m\033[1m',
   'GRAY' : '\033[90m\033[1m',
}
colors = list(coldi.values())


colmap = dict(zip(listent, colors))
colmap


{'PERSON': '\x1b[95m\x1b[1m',
 'DATE': '\x1b[96m\x1b[1m',
 'GPE': '\x1b[36m\x1b[1m',
 'ORG': '\x1b[94m\x1b[1m',
 'ORDINAL': '\x1b[92m\x1b[1m',
 'LAW': '\x1b[93m\x1b[1m',
 'NORP': '\x1b[91m\x1b[1m',
 'PRODUCT': '\x1b[90m\x1b[1m'}

In [78]:
colmap = {'PERSON': '\x1b[95m\x1b[1m',
 'DATE': '\x1b[96m\x1b[1m',
 'GPE': '\x1b[36m\x1b[1m',
 'ORG': '\x1b[94m\x1b[1m',
 'ORDINAL': '\x1b[92m\x1b[1m',
 'LAW': '\x1b[93m\x1b[1m',
 'NORP': '\x1b[91m\x1b[1m',
 'PRODUCT': '\x1b[90m\x1b[1m'}

def highlight_entities(text, ents):
    #blue_bold_char = '\033[94m\033[1m'
    back_to_normal = '\033[0m'

    previous_end=0
    text_h =""
    for ent in ents["entities"]:

        start=ent[0]
        end=ent[1]
        ent_val = text[start:end]
        text_h = text_h + text[previous_end:start]+colmap[ent[2]] +ent_val+back_to_normal 
        previous_end = end
    text_h = text_h+text[previous_end:]
    print(text_h)


In [79]:
def text_to_entities(text, nlp):
    doc = nlp(text)   
    ents = doc.ents
    entites = []
    for ent in ents:
        entites.append((ent.start_char, ent.end_char, ent.label_))
        
    output = (text, {"entities":entites})
    highlight_entities(output[0], output[1])
    return output

In [92]:
obj = text_to_entities(text, nlp)

[95m[1mDonald Trump[0m mistook E [95m[1mJean Carroll[0m, the writer who accuses him of rape, for his ex-wife [95m[1mMarla Maples[0m during a deposition in the case [96m[1mlast year[0m, excerpts released in [36m[1mUS[0m district court on [96m[1mWednesday[0m showed.

“That’s Marla, yeah,” Trump said, when shown a photograph. “That’s my wife.”

The mistake was corrected by a lawyer for the [96m[1m76-year-old[0m former president. But observers said it could undermine [94m[1mTrump[0m’s claim he could not have attacked [95m[1mCarroll[0m because she is not his “type”.

It was not the [92m[1mfirst[0m release of excerpts from [94m[1mTrump[0m’s deposition, which happened in [96m[1mOctober[0m. [96m[1mLast week[0m, [94m[1mTrump[0m was shown to have claimed [95m[1mCarroll[0m “said it was very sexy to be raped”.

Carroll says [94m[1mTrump[0m raped her in a department store changing room in [96m[1mthe mid-1990s[0m. Trump denies it.

Carroll sued Trum

In [None]:
def save_spacy_model(nlp, model_path):
    nlp.to_disk(f'{model_path}')
    print(f"Saved model to {model_path}")
    bytes_data = nlp.to_bytes()

    f = open(f'{model_path}/bytes_data.bin', 'wb')
    f.write(bytes_data)
    f.close()
    print(f"Saved bytes_data to f'{model_path}/bytes_data.bin'")

In [None]:
save_spacy_model(nlp, "ner_per_and_org_3_do_05_e_10_bs_32_md")