In [1]:
import pandas as pd
import numpy as np
import json
import random
import spacy
from tqdm import tqdm
import pickle
import plotly.graph_objects as go
from plotly.offline import iplot

In [2]:
import math
from sklearn.model_selection import train_test_split

In [3]:
from spacy.tokens import Doc

In [4]:
from spacy.training import Example

In [5]:
import os

os.environ['KMP_DUPLICATE_LIB_OK']='True'

# Task description

- train NER model to extract geo, gpe, tim and nat entities
- experiment with different language model sizes
- experiment with DROPOUT and epochs to get the best test set results
- analize learning curves and performance per entity

News entities
- geo = Geographical Entity
- gpe = Geopolitical Entity
- tim = Time indicator
- nat = Natural Phenomenon

# Prepare NER data

In [6]:
with open('../data/GMB_data_spacy_geo.pickle', 'rb') as f:
     spacy_data = pickle.load(f)

In [7]:
len(spacy_data)

35177

In [8]:
spacy_data[36][1]

{'entities': []}

In [9]:
spacy_data[35000][0]

"Lebanon 's top Shi'ite cleric is opposing British Prime Minister Tony Blair 's expected visit to Beirut Monday ."

## Test train split

In [10]:
# Divide spacy into train and test sets
spacy_data_train, spacy_data_test = train_test_split(spacy_data, test_size=0.1, random_state=42)

In [11]:
spacy_data_train[1]

('U.S. health officials said Friday that trials found only a small number of cases where adverse reactions might be attributed to Nevirapine .',
 {'entities': [(0, 4, 'geo'), (27, 33, 'tim')]})

## NER model setup

In [12]:
# Setup model - experiment with sm, md, lg
nlp = spacy.load('en_core_web_sm')
ner = nlp.create_pipe('ner')



In [13]:
def split_examples_to_batches(examples, batch_size):
    batches=[]
    for i in range(0, math.ceil(len(examples)/batch_size)):
        start=i*batch_size
        end = start+batch_size
        batches.append(examples[start:end])
    return batches

In [14]:
def prepare_examples(data):
    examples = []
    for i in range(0, len(data)):
        raw_text, entity_offsets = data[i]

        try:

            doc= nlp.make_doc(raw_text.lower())
            example_test = Example.from_dict(doc, entity_offsets)
            examples.append(example_test)
           
        except Exception as e:
            print(e)
            pass
    return examples

In [15]:
# prepare examples
examples_train = prepare_examples(spacy_data_train)
examples_test = prepare_examples(spacy_data_test)

In [16]:
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']

### Add labels

In [17]:
for _, annotations in spacy_data:
    for ent in annotations.get('entities'):
        ner.add_label(ent[2])

In [18]:
ner.labels

('geo', 'gpe', 'nat', 'tim')

In [19]:
spacy_data[12050][1]

{'entities': [(17, 21, 'geo'), (24, 30, 'geo')]}

# Train model

In [20]:
DROPOUT=0.2
epochs = 10

batch_size=64

In [21]:
def train_ner(nlp, examples_train, examples_test, epochs, batch_size, dropout):
    with nlp.disable_pipes(*other_pipes):
        optimizer = nlp.create_optimizer()
        scores = []

        for i in range(0, epochs):
            random.shuffle(examples_train)

            batches = split_examples_to_batches(examples_train, batch_size)
            for batch in  tqdm(batches):
                try:
                        nlp.update(batch, sgd=optimizer, drop =DROPOUT)
                except Exception as e:
                    print(e)
                    pass
            train_score = nlp.evaluate(examples_train)
            val_score = nlp.evaluate(examples_test)
            total_f = val_score['ents_f']

            scores.append({"iter":i, "val_score":val_score, "train_score":train_score})

            print(f"Iter:{i}, f_score:{round(total_f,2)}")

    return nlp, scores

In [22]:
nlp, scores = train_ner(nlp, examples_train, examples_test, epochs, batch_size, DROPOUT)

100%|██████████| 495/495 [03:13<00:00,  2.55it/s]


Iter:0, f_score:0.87


100%|██████████| 495/495 [03:08<00:00,  2.63it/s]


Iter:1, f_score:0.88


100%|██████████| 495/495 [02:58<00:00,  2.77it/s]


Iter:2, f_score:0.88


100%|██████████| 495/495 [02:38<00:00,  3.12it/s]


Iter:3, f_score:0.89


100%|██████████| 495/495 [04:24<00:00,  1.87it/s]


Iter:4, f_score:0.89


100%|██████████| 495/495 [02:30<00:00,  3.28it/s]


Iter:5, f_score:0.89


100%|██████████| 495/495 [02:57<00:00,  2.79it/s]


Iter:6, f_score:0.88


100%|██████████| 495/495 [02:29<00:00,  3.32it/s]


Iter:7, f_score:0.89


100%|██████████| 495/495 [03:03<00:00,  2.70it/s]


Iter:8, f_score:0.89


100%|██████████| 495/495 [02:41<00:00,  3.07it/s]


Iter:9, f_score:0.89


# Model save/load

In [23]:
def save_spacy_model(nlp, model_path):
    nlp.to_disk(f'{model_path}')
    print(f"Saved model to {model_path}")
    bytes_data = nlp.to_bytes()

    f = open(f'{model_path}/bytes_data.bin', 'wb')
    f.write(bytes_data)
    f.close()
    print(f"Saved bytes_data to f'{model_path}/bytes_data.bin'")

In [24]:
save_spacy_model(nlp, "ner_sm_do02_ep10_bs64")

Saved model to ner_sm_do02_ep10_bs64
Saved bytes_data to f'ner_sm_do02_ep10_bs64/bytes_data.bin'


In [29]:
def load_spacy_model(model_path, base_model = "en_core_web_sm"):

    nlp = spacy.load(base_model)
    file = open(f'{model_path}/bytes_data.bin',"rb")
    bytes_data = file.read()
    config = nlp.config
    lang_cls = spacy.util.get_lang_class("en")
    nlp = lang_cls.from_config(config)
    nlp = nlp.from_disk(f'{model_path}')
    
    return nlp

In [57]:
nlp2 = load_spacy_model("ner_sm_do005_ep10_bs128")

## Evaluate model

In [25]:
scores

[{'iter': 0,
  'val_score': {'token_acc': 1.0,
   'token_p': 1.0,
   'token_r': 1.0,
   'token_f': 1.0,
   'ents_p': 0.8778372393430522,
   'ents_r': 0.8630261248185777,
   'ents_f': 0.8703686762418809,
   'ents_per_type': {'geo': {'p': 0.8487665355738291,
     'r': 0.8499820981023989,
     'f': 0.8493738819320213},
    'gpe': {'p': 0.9381974248927039,
     'r': 0.9231418918918919,
     'f': 0.9306087696892295},
    'tim': {'p': 0.8853809196980096,
     'r': 0.8486842105263158,
     'f': 0.86664427275781},
    'nat': {'p': 0.0, 'r': 0.0, 'f': 0.0}},
   'speed': 19579.90688241909},
  'train_score': {'token_acc': 1.0,
   'token_p': 1.0,
   'token_r': 1.0,
   'token_f': 1.0,
   'ents_p': 0.8830795926377326,
   'ents_r': 0.8739803324269622,
   'ents_f': 0.8785064014303036,
   'ents_per_type': {'geo': {'p': 0.8534991232265264,
     'r': 0.8640361494391996,
     'f': 0.8587353141665666},
    'tim': {'p': 0.8888888888888888,
     'r': 0.8615569221538923,
     'f': 0.8750095209079137},
    'gp

In [26]:
def get_metric_per_epoch(scores, score_metric):
    scores_list = []
    for item in scores:
        epoch_score ={"epoch":item["iter"],
                     "train_score":item["train_score"][f"ents_{score_metric}"],
                     "val_score":item["val_score"][f"ents_{score_metric}"]}

        scores_list.append(epoch_score)
    df_scores = pd.DataFrame(scores_list)
    return df_scores
    

In [27]:
def get_metric_per_entity(scores, score_metric):
    # Create a function to transform scores into dataframe
    
    scores_list = []
    for item in scores:
        epoch =item["iter"]
        
        for ent in item["val_score"]['ents_per_type'].keys():
            epoch_score ={
                "epoch":epoch,
                "ent":ent,
                "train_score":item["train_score"]['ents_per_type'][ent][score_metric],
                "val_score":item["val_score"]['ents_per_type'][ent][score_metric]}
            scores_list.append(epoch_score)
    df_scores = pd.DataFrame(scores_list)
    return df_scores       
                

In [28]:
df_scores = get_metric_per_entity(scores, "f")

In [29]:
df_scores

Unnamed: 0,epoch,ent,train_score,val_score
0,0,geo,0.858735,0.849374
1,0,gpe,0.935703,0.930609
2,0,tim,0.87501,0.866644
3,0,nat,0.0,0.0
4,1,geo,0.874038,0.862216
5,1,gpe,0.949091,0.94248
6,1,tim,0.888872,0.868748
7,1,nat,0.376344,0.333333
8,2,geo,0.888577,0.863668
9,2,gpe,0.953407,0.946072


In [30]:
data=[]
for score in ["train_score", "val_score"]:
    
    trace=go.Scatter(
                x=df_scores.epoch,
                y=df_scores[score],
                mode='lines',
                marker=dict(
                size=5
                ),
            name=score,

            )
    data.append(trace)
    
figure=go.Figure(
    data=data,
    layout=go.Layout(
        title=f"<b>Performance by epoch",
    
    ))
iplot(figure)

In [31]:
data=[]
for ent in df_scores.ent.unique():
    df_plot = df_scores.loc[df_scores.ent == ent]
    
    trace=go.Scatter(
                x=df_plot.epoch,
                y=df_plot.val_score,
                mode='lines',
                marker=dict(
                size=5
                ),
            name=ent,

            )
    data.append(trace)
    
figure=go.Figure(
    data=data,
    layout=go.Layout(
        title=f"<b>Performance by epoch and entity type",
    
    ))
iplot(figure)

# Make predictions

In [1]:
text = '''Donald Trump mistook E Jean Carroll, the writer who accuses him of rape, for his ex-wife Marla Maples during a deposition in the case last year, excerpts released in US district court on Wednesday showed.

“That’s Marla, yeah,” Trump said, when shown a photograph. “That’s my wife.”

The mistake was corrected by a lawyer for the 76-year-old former president. But observers said it could undermine Trump’s claim he could not have attacked Carroll because she is not his “type”.

It was not the first release of excerpts from Trump’s deposition, which happened in October. Last week, Trump was shown to have claimed Carroll “said it was very sexy to be raped”.

Carroll says Trump raped her in a department store changing room in the mid-1990s. Trump denies it.

Carroll sued Trump for defamation and under the Adult Survivors Act, a New York law which allows alleged victims of historical sexual assault to bring cases within a defined timeframe.

Trump was married to Maples, the mother of his daughter Tiffany, from 1993 to 1999, between marriages to Ivana Trump, his first wife, and Melania Trump, his third and current spouse.

The photograph he thought showed Maples shows Trump in Carroll’s company in the 1990s. In his deposition, Trump said it showed a “receiving line” at an event.'''

In [33]:
doc = nlp(text)    
for ent in doc.ents:
    print(f"{ent.label_} : {ent.text}")

geo : US
tim : Wednesday
tim : October
geo : New York
tim : from 1993 to 1999
tim : 1990s


In [34]:
colmap = {'geo': '\033[93m\033[1m',
 'tim': '\033[95m\033[1m',
 'gpe': '\033[92m\033[1m',
 'nat': '\033[96m\033[1m',
 'normal': '\033[0m'}

def highlight_entities(text, ents):
    blue_bold_char = '\033[94m\033[1m'
    back_to_normal = '\033[0m'

    previous_end=0
    text_h =""
    for ent in ents["entities"]:

        start=ent[0]
        end=ent[1]
        ent_val = text[start:end]
        try:
            text_h = text_h + text[previous_end:start]+colmap[ent[2]] +ent_val+back_to_normal
        except:
            text_h = text_h + text[previous_end:start]+blue_bold_char +ent_val+back_to_normal
        previous_end = end
    text_h = text_h+text[previous_end:]
    print(text_h)



In [35]:
def text_to_entities(text, nlp):
    print(colmap['geo'] + 'GEOGRAPHICAL ENTITIY' + colmap['normal'], '||',
        colmap['tim'] + 'TIME INDICATOR' + colmap['normal'],'||',
        colmap['gpe'] + 'GEOPOLITICAL ENTITIY' + colmap['normal'],'||',
        colmap['nat'] + 'NATURAL PHENOMENON' + colmap['normal'])
    print('')
    doc = nlp(text)   
    ents = doc.ents
    entites = []
    for ent in ents:
        entites.append((ent.start_char, ent.end_char, ent.label_))
        
    output = (text, {"entities":entites})
    highlight_entities(output[0], output[1])
    return output

In [37]:
text_to_entities(text, nlp)[0][0]

[93m[1mGEOGRAPHICAL ENTITIY[0m || [95m[1mTIME INDICATOR[0m || [92m[1mGEOPOLITICAL ENTITIY[0m || [96m[1mNATURAL PHENOMENON[0m

Donald Trump mistook E Jean Carroll, the writer who accuses him of rape, for his ex-wife Marla Maples during a deposition in the case last year, excerpts released in [93m[1mUS[0m district court on [95m[1mWednesday[0m showed.

“That’s Marla, yeah,” Trump said, when shown a photograph. “That’s my wife.”

The mistake was corrected by a lawyer for the 76-year-old former president. But observers said it could undermine Trump’s claim he could not have attacked Carroll because she is not his “type”.

It was not the first release of excerpts from Trump’s deposition, which happened in [95m[1mOctober[0m. Last week, Trump was shown to have claimed Carroll “said it was very sexy to be raped”.

Carroll says Trump raped her in a department store changing room in the mid-1990s. Trump denies it.

Carroll sued Trump for defamation and under the Adult Survivo

'D'

In [38]:
text_2 = '''Germany is facing a backlash from allies over its reluctance to supply Leopard 2 tanks to bolster Ukraine’s fighting capacity in 
the nearly year-long war with Russia.On Friday, 50 countries agreed to provide Kyiv with billions of dollars’ worth of military hardware, 
including armoured vehicles and munitions needed to push back Russian forces.But the German defence minister, Boris Pistorius, told reporters 
t the US Ramstein airbase in Germany that despite heightened expectations, “we still cannot say when a decision will be taken, and what the 
decision will be, when it comes to the Leopard tank”.Ukraine on Saturday denounced the “global indecision” of its allies in providing 
heavy-duty modern tanks, saying “today’s indecision is killing more of our people”.“Every day of delay is the death of Ukrainians. 
Think faster,” tweeted presidential adviser Mykhailo Podolyak.Several allies echoed the Ukrainian president, Volodymyr Zelenskiy, in saying
 the tanks were essential to Ukraine’s fight with its much larger neighbour.In a joint statement – and a rare public criticism of Europe’s 
 top power – the foreign ministers of the three Baltic states of Latvia, Estonia and Lithuania said they “call on Germany to provide Leopard
   tanks to Ukraine now”.“This is needed to stop Russian aggression, help Ukraine and restore peace in Europe quickly. Germany as the leading 
   European power has special responsibility in this regard,” said the statement, tweeted by the Latvian foreign minister, Edgars Rinkēvičs.
   Berlin has been hesitant to send the Leopards or allow other countries to transfer them to Kyiv, with reports earlier in the week saying 
   it would agree to do so only if the US provided its tanks as well. Washington has said providing its Abrams tanks to Ukraine is not 
   feasible, citing difficulties in training and maintenance.But expectations had grown ahead of Friday’s Ukraine contact group meeting of 
   about 50 US-led countries at Ramstein airbase that Germany would at least agree to let other countries operating Leopards transfer them 
   to Kyiv’s army.The US senator Lindsey Graham, a Republican from South Carolina who is visiting Kyiv, called on both sides to supply the 
   machines.“To the Germans: send tanks to Ukraine because they need them. It is in your own national interest that Putin loses in Ukraine.
   “To the Biden administration: send American tanks so that others will follow our lead,” he tweeted.The pleas came as the Russian army said
     its troops had launched an offensive in Ukraine’s Zaporizhzhia region, where fighting intensified this week after several months of an 
     almost frozen front.In its daily report on Saturday, Moscow’s forces said they had carried out “offensive operations” in the region and
       claimed to have “taken more advantageous lines and positions”.'''

In [39]:
text_to_entities(text_2, nlp)[0][0]

[93m[1mGEOGRAPHICAL ENTITIY[0m || [95m[1mTIME INDICATOR[0m || [92m[1mGEOPOLITICAL ENTITIY[0m || [96m[1mNATURAL PHENOMENON[0m

[93m[1mGermany[0m is facing a backlash from allies over its reluctance to supply Leopard 2 tanks to bolster Ukraine’s fighting capacity in 
the nearly [95m[1myear-long[0m war with [93m[1mRussia[0m.On [95m[1mFriday[0m, 50 countries agreed to provide Kyiv with billions of dollars’ worth of military hardware, 
including armoured vehicles and munitions needed to push back [92m[1mRussian[0m forces.But the [92m[1mGerman[0m defence minister, Boris Pistorius, told reporters 
t the [93m[1mUS[0m Ramstein airbase in [93m[1mGermany[0m that despite heightened expectations, “we still cannot say when a decision will be taken, and what the 
decision will be, when it comes to the Leopard tank”.[93m[1mUkraine[0m on [95m[1mSaturday[0m denounced the “global indecision” of its allies in providing 
heavy-duty modern tanks, saying “today’s ind

'G'

In [None]:
def save_spacy_model(nlp, model_path):
    nlp.to_disk(f'{model_path}')
    print(f"Saved model to {model_path}")
    bytes_data = nlp.to_bytes()

    f = open(f'{model_path}/bytes_data.bin', 'wb')
    f.write(bytes_data)
    f.close()
    print(f"Saved bytes_data to f'{model_path}/bytes_data.bin'")

In [None]:
#save_spacy_model(nlp, "ner_per_and_org_3_do_05_e_10_bs_32_md")

</br>
</br>
</br>
</br>
</br>
</br>
</br>
</br>
</br>
</br>
</br>
</br>
</br>
</br>
</br>
</br>
</br>
</br>
</br>
</br>
</br>


---
# Appendix

In [None]:
obj = text_to_entities(text, nlp)

In [None]:
rang = len(obj[1]['entities'])
listent = []
for i in range(rang):
    listent.append(obj[1]['entities'][i][2])

listent = pd.unique(listent)

len(listent)

# colmap = {'PERSON': '\x1b[95m\x1b[1m',
#  'DATE': '\x1b[96m\x1b[1m',
#  'GPE': '\x1b[36m\x1b[1m',
#  'ORG': '\x1b[94m\x1b[1m',
#  'ORDINAL': '\x1b[92m\x1b[1m',
#  'LAW': '\x1b[93m\x1b[1m',
#  'NORP': '\x1b[91m\x1b[1m',
#  'PRODUCT': '\x1b[90m\x1b[1m'}

# coldi = {'PURPLE' : '\033[95m\033[1m',
#    'CYAN' : '\033[96m\033[1m',
#    'DARKCYAN' : '\033[36m\033[1m',
#    'BLUE' : '\033[94m\033[1m',
#    'GREEN' : '\033[92m\033[1m',
#    'YELLOW' : '\033[93m\033[1m',
#    'RED' : '\033[91m\033[1m',
#    'GRAY' : '\033[90m\033[1m',
# }
# colors = list(coldi.values())


# colmap = dict(zip(listent, colors))
# colmap

In [66]:
colmap = {'geo': '\x1b[95m\x1b[1m',
 'tim': '\x1b[96m\x1b[1m',
 'gpe': '\x1b[36m\x1b[1m',
 'nat': '\x1b[94m\x1b[1m'
 }

def highlight_entities(text, ents):
    blue_bold_char = '\033[94m\033[1m'
    back_to_normal = '\033[0m'

    previous_end=0
    text_h =""
    for ent in ents["entities"]:

        start=ent[0]
        end=ent[1]
        ent_val = text[start:end]
        try:
            text_h = text_h + text[previous_end:start]+colmap[ent[2]] +ent_val+back_to_normal
        except:
            text_h = text_h + text[previous_end:start]+blue_bold_char +ent_val+back_to_normal
        previous_end = end
    text_h = text_h+text[previous_end:]
    print(text_h)

In [67]:
def text_to_entities(text, nlp):
    print()
    doc = nlp(text)   
    ents = doc.ents
    entites = []
    for ent in ents:
        entites.append((ent.start_char, ent.end_char, ent.label_))
        
    output = (text, {"entities":entites})
    highlight_entities(output[0], output[1])
    return output