In [1]:
import pandas as pd
import numpy as np
import pickle
import re
import spacy
import random
from tqdm import tqdm

In [2]:
import math

In [3]:
import plotly.graph_objects as go
from plotly.offline import iplot

In [4]:
from spacy.training import Example

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
with open('../data/GMB_data_spacy.pickle', 'rb') as f:
     spacy_data = pickle.load(f)

In [7]:
spacy_data[0]

('Thousands of demonstrators have marched through London to protest the war in Iraq and demand the withdrawal of British troops from that country .',
 {'entities': [(48, 54, 'geo'), (77, 81, 'geo'), (111, 118, 'gpe')]})

In [8]:
text = spacy_data[0][0]

In [9]:
ents = spacy_data[0][1]

In [10]:
ents 

{'entities': [(48, 54, 'geo'), (77, 81, 'geo'), (111, 118, 'gpe')]}

In [11]:
def highlight_entities(text, ents):
    blue_bold_char = '\033[94m\033[1m'
    back_to_normal = '\033[0m'

    previous_end=0
    text_h =""
    for ent in ents["entities"]:

        start=ent[0]
        end=ent[1]
        ent_val = text[start:end]
        text_h = text_h + text[previous_end:start]+blue_bold_char +ent_val+back_to_normal 
        previous_end = end
    text_h = text_h+text[previous_end:]
    print(text_h)


    

In [12]:
text

'Thousands of demonstrators have marched through London to protest the war in Iraq and demand the withdrawal of British troops from that country .'

In [14]:
highlight_entities(spacy_data[0][0], spacy_data[0][0])

TypeError: string indices must be integers

# Spacy intro

In [15]:
import spacy

# Load English tokenizer, tagger, parser and NER
nlp = spacy.load("en_core_web_sm")

# Process whole documents
text = ("When Sebastian Thrun started working on self-driving cars at "
        "Google in 2007, few people outside of the company took him "
        "seriously. “I can tell you very senior CEOs of major American "
        "car companies would shake my hand and turn away because I wasn’t "
        "worth talking to,” said Thrun, in an interview with Recode earlier "
        "this week.")

In [16]:
doc = nlp(text)

In [17]:
# Analyze syntax
print("Noun phrases:", [chunk.text for chunk in doc.noun_chunks])
print("Verbs:", [token.lemma_ for token in doc if token.pos_ == "VERB"])

Noun phrases: ['Sebastian Thrun', 'self-driving cars', 'Google', 'few people', 'the company', 'him', 'I', 'you', 'very senior CEOs', 'major American car companies', 'my hand', 'I', 'Thrun', 'an interview', 'Recode']
Verbs: ['start', 'work', 'drive', 'take', 'tell', 'shake', 'turn', 'talk', 'say']


In [18]:
# Find named entities, phrases and concepts
for entity in doc.ents:
    print(entity.text, entity.label_)

Sebastian Thrun PERSON
Google ORG
2007 DATE
American NORP
Thrun PERSON
Recode ORG
earlier this week DATE


# Preparing Spacy format

In [19]:
with open('../data/GMB_data_sagemaker.pickle', 'rb') as f:
     sagemaker_data = pickle.load(f)

In [20]:
## Current format
sagemaker_data[0]

{'workerId': 'some-random-worker-no-123',
 'dataObject': {'content': 'Mr. Egeland said the latest figures show 1.8 million people are in need of food assistance - with the need greatest in Indonesia , Sri Lanka , the Maldives and India .'},
 'annotationData': {'content': {'entities': [{'endOffset': 11,
     'label': 'per',
     'startOffset': 0},
    {'endOffset': 128, 'label': 'tim', 'startOffset': 119},
    {'endOffset': 134, 'label': 'per', 'startOffset': 131},
    {'endOffset': 140, 'label': 'gpe', 'startOffset': 135},
    {'endOffset': 155, 'label': 'geo', 'startOffset': 147},
    {'endOffset': 165, 'label': 'geo', 'startOffset': 160}]}}}

In [21]:
## Desired format
('Mr. Egeland said the latest figures show 1.8 million people are in need of food assistance - with the need greatest in Indonesia , Sri Lanka , the Maldives and India .',
 {'entities': [(0, 11, 'per'),
   (119, 128, 'tim'),
   (131, 134, 'per'),
   (135, 140, 'gpe'),
   (147, 155, 'geo'),
   (160, 165, 'geo')]})

('Mr. Egeland said the latest figures show 1.8 million people are in need of food assistance - with the need greatest in Indonesia , Sri Lanka , the Maldives and India .',
 {'entities': [(0, 11, 'per'),
   (119, 128, 'tim'),
   (131, 134, 'per'),
   (135, 140, 'gpe'),
   (147, 155, 'geo'),
   (160, 165, 'geo')]})

In [22]:
def sagemaker_to_spacy(item):
    ## Create function, to convert data from sagemaker format to desired spacy format
    
    text = item['dataObject']['content']
    sm_ents = item['annotationData']["content"]["entities"]
    spacy_ents = []
    for ent in sm_ents:
        ent_start = ent['startOffset']
        ent_end = ent['endOffset']
        ent_label = ent["label"]
        ent_tuple = (ent_start, ent_end, ent_label)
        
        spacy_ents.append(ent_tuple)
        
    spacy_item = (text, {"entities":spacy_ents})
    
    return spacy_item
    
    

In [23]:
spacy_item = sagemaker_to_spacy(sagemaker_data[0])

In [24]:
# Validate if transformed data works correctly with highlight_entities
highlight_entities(spacy_item[0], spacy_item[1])

[94m[1mMr. Egeland[0m said the latest figures show 1.8 million people are in need of food assistance - with the need greatest in [94m[1mIndonesia[0m , [94m[1mSri[0m [94m[1mLanka[0m , the [94m[1mMaldives[0m and [94m[1mIndia[0m .


## Usefull functions when evaluating NER annotations

In [25]:
def find_word_positions(word, text):
    word_occurences = [m.start() for m in re.finditer(word, text)]

In [26]:
text = spacy_item[0]
word = "the"

In [27]:
word_occurences = [m.start() for m in re.finditer("the", spacy_item[0])]

In [28]:
word_occurences

[17, 98, 143]

In [29]:
for word_occurence in  word_occurences:
    print(text[word_occurence:word_occurence+len(word)])

the
the
the


In [30]:
def replace_multiple(text, replace_tuples):
    # mixed numbers and currency inside a word don't work well with tokenizer
    for r in (replace_tuples):
        text = text.replace(*r)
        
    return text.strip()

In [34]:
replace_tuples = (("EURO","euro"), ("eur ","euro "), ("€","euro"))

In [35]:
text_euro = "In written text currency can be written as euro, EURO, eur or € etc."



In [36]:
## How to fix euroo issue?
replace_multiple(text_euro, replace_tuples)

'In written text currency can be written as euro, euro, euro or euro etc.'

# NER Spacy data preparation

In [37]:
spacy_data

[('Thousands of demonstrators have marched through London to protest the war in Iraq and demand the withdrawal of British troops from that country .',
  {'entities': [(48, 54, 'geo'), (77, 81, 'geo'), (111, 118, 'gpe')]}),
 ('Families of soldiers killed in the conflict joined the protesters who carried banners with such slogans as " Bush Number One Terrorist " and " Stop the Bombings . "',
  {'entities': [(109, 113, 'per')]}),
 ('They marched from the Houses of Parliament to a rally in Hyde Park .',
  {'entities': [(57, 66, 'geo')]}),
 ('Police put the number of marchers at 10,000 while organizers claimed it was 1,00,000 .',
  {'entities': []}),
 ("The protest comes on the eve of the annual conference of Britain 's ruling Labor Party in the southern English seaside resort of Brighton .",
  {'entities': [(57, 64, 'geo'),
    (75, 86, 'org'),
    (103, 110, 'gpe'),
    (129, 137, 'geo')]}),
 ("The party is divided over Britain 's participation in the Iraq conflict and the continued deplo

In [38]:
def filter_ents(data, valid_ents):
    data_mod = []
    for item in data:
        text = item[0]
        ents = item[1]["entities"]
        ents_mod = []
        for ent in ents:
            label = ent[2]
            if label in valid_ents:
                ents_mod.append(ent)
        
        item_mod = (text, {"entities":ents_mod})
        data_mod.append(item_mod)
        
    return data_mod
        
data_per = filter_ents(spacy_data, ["per", "org"])
data_geo = filter_ents(spacy_data, ["geo", "gpe", "nat", "tim"])


In [39]:
with open('../data/GMB_data_spacy_per.pickle', 'rb') as f:
     spacy_data = pickle.load(f)

In [40]:
spacy_data = spacy_data[:5000]

## Load language model

In [41]:
nlp = spacy.load("en_core_web_md")
ner = nlp.create_pipe('ner')

In [42]:
# check available pipelines
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [43]:
## disable pipes other than ner
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']

## Train test split

In [44]:
spacy_data_train, spacy_data_test = train_test_split(spacy_data, test_size=0.2, random_state=42)

## Convert data to Examples

In [45]:
def prepare_examples(data):
    # convert data from list tuples to list of spacy Examples
    examples = []
    for i in range(0, len(data)):
        text, entity_offsets = data[i]

        try:
            doc= nlp.make_doc(text.lower())
            example_test = Example.from_dict(doc, entity_offsets)
            examples.append(example_test)
           
        except Exception as e:
            print(e)
            pass
    return examples

In [46]:
def split_examples_to_batches(examples, batch_size):
    batches=[]
    for i in range(0, math.ceil(len(examples)/batch_size)):
        start=i*batch_size
        end = start+batch_size
        batches.append(examples[start:end])
    return batches

In [47]:
examples_train = prepare_examples(spacy_data_train)
examples_test = prepare_examples(spacy_data_test)

In [48]:
for _, annotations in spacy_data:
    for ent in annotations.get('entities'):
        ner.add_label(ent[2])

# Train NER model

In [49]:
DROPOUT=0.2
epochs = 10

batch_size=20

In [50]:
def train_ner(nlp, examples_train, examples_test, epochs, batch_size, dropout):
    with nlp.disable_pipes(*other_pipes):
        optimizer = nlp.create_optimizer()
        scores = []

        for i in range(0, epochs):
            random.shuffle(examples_train)

            batches = split_examples_to_batches(examples_train, batch_size)
            for batch in  tqdm(batches):
                try:
                        nlp.update(batch, sgd=optimizer, drop =DROPOUT)
                except Exception as e:
                    print(e)
                    pass
            train_score = nlp.evaluate(examples_train)
            val_score = nlp.evaluate(examples_test)
            total_f = val_score['ents_f']

            scores.append({"iter":i, "val_score":val_score, "train_score":train_score})

            print(f"Iter:{i}, f_score:{round(total_f,2)}")

    return nlp, scores

In [51]:
nlp, scores = train_ner(nlp, examples_train, examples_test, epochs, batch_size, DROPOUT)

100%|██████████| 200/200 [00:42<00:00,  4.72it/s]


Iter:0, f_score:0.6


100%|██████████| 200/200 [00:36<00:00,  5.55it/s]


Iter:1, f_score:0.64


100%|██████████| 200/200 [00:40<00:00,  4.95it/s]


Iter:2, f_score:0.66


100%|██████████| 200/200 [00:40<00:00,  4.91it/s]


Iter:3, f_score:0.63


100%|██████████| 200/200 [00:39<00:00,  5.01it/s]


Iter:4, f_score:0.68


100%|██████████| 200/200 [00:36<00:00,  5.48it/s]


Iter:5, f_score:0.66


100%|██████████| 200/200 [00:42<00:00,  4.73it/s]


Iter:6, f_score:0.67


100%|██████████| 200/200 [00:38<00:00,  5.18it/s]


Iter:7, f_score:0.66


100%|██████████| 200/200 [00:42<00:00,  4.75it/s]


Iter:8, f_score:0.67


100%|██████████| 200/200 [00:38<00:00,  5.15it/s]


Iter:9, f_score:0.66


## Evaluate training curve

In [52]:
scores

[{'iter': 0,
  'val_score': {'token_acc': 1.0,
   'token_p': 1.0,
   'token_r': 1.0,
   'token_f': 1.0,
   'ents_p': 0.6753670473083198,
   'ents_r': 0.5390625,
   'ents_f': 0.5995655322230269,
   'ents_per_type': {'per': {'p': 0.6943699731903485,
     'r': 0.7174515235457064,
     'f': 0.7057220708446867},
    'org': {'p': 0.6458333333333334,
     'r': 0.3808353808353808,
     'f': 0.47913446676970634}},
   'speed': 16566.233783105537},
  'train_score': {'token_acc': 1.0,
   'token_p': 1.0,
   'token_r': 1.0,
   'token_f': 1.0,
   'ents_p': 0.7372319688109161,
   'ents_r': 0.6037675606641124,
   'ents_f': 0.6638581709671757,
   'ents_per_type': {'org': {'p': 0.7144230769230769,
     'r': 0.44331742243436756,
     'f': 0.5471281296023565},
    'per': {'p': 0.7527868852459016,
     'r': 0.7884615384615384,
     'f': 0.7702113384770211}},
   'speed': 17157.35087380741}},
 {'iter': 1,
  'val_score': {'token_acc': 1.0,
   'token_p': 1.0,
   'token_r': 1.0,
   'token_f': 1.0,
   'ents_p': 0

In [53]:
def get_metric_per_epoch(scores, score_metric):
    # Create a function to transform scores into dataframe
    scores_list = []
    for item in scores:
        epoch_score ={"epoch":item["iter"],
                     "train_score":item["train_score"][f"ents_{score_metric}"],
                     "val_score":item["val_score"][f"ents_{score_metric}"]}

        scores_list.append(epoch_score)
    df_scores = pd.DataFrame(scores_list)
    return df_scores
    

In [54]:
def get_metric_per_entity(scores, score_metric):
    
    
    scores_list = []
    for item in scores:
        epoch =item["iter"]
        
        for ent in item["val_score"]['ents_per_type'].keys():
            epoch_score ={
                "epoch":epoch,
                "ent":ent,
                "train_score":item["train_score"]['ents_per_type'][ent][score_metric],
                "val_score":item["val_score"]['ents_per_type'][ent][score_metric]}
            scores_list.append(epoch_score)
    df_scores = pd.DataFrame(scores_list)
    return df_scores       
                
                   

In [55]:
df_scores = get_metric_per_entity(scores, "f")

## Plot training curves

In [56]:
df_scores = get_metric_per_epoch(scores, "f")

In [58]:
data=[]
for score in ["train_score", "val_score"]:
    
    trace=go.Scatter(
                x=df_scores.epoch,
                y=df_scores[score],
                mode='lines',
                marker=dict(
                size=5
                ),
            name=score,

            )
    data.append(trace)
    
figure=go.Figure(
    data=data,
    layout=go.Layout(
        title=f"<b>Performance by epoch",
    
    ))
iplot(figure)

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [59]:
df_scores_per_ent = get_metric_per_entity(scores, "r")

In [61]:
data=[]
for ent in df_scores_per_ent.ent.unique():
    df_plot = df_scores_per_ent.loc[df_scores_per_ent.ent == ent]
    
    trace=go.Scatter(
                x=df_plot.epoch,
                y=df_plot.val_score,
                mode='lines',
                marker=dict(
                size=5
                ),
            name=ent,

            )
    data.append(trace)
    
figure=go.Figure(
    data=data,
    layout=go.Layout(
        title=f"<b>Performance by epoch and entity type",
    
    ))
iplot(figure)

# Make predictions

In [62]:
doc = nlp(text)    
for ent in doc.ents:
    print(f"{ent.label_} : {ent.text}")
  

per : Mr. Egeland


In [63]:
def text_to_entities(text, nlp):
    ## Create function to convert text to entities (spacy format similar to data we used so far)
    doc = ...   
    ents = ...
    entites = []
    for ent in ents:
        entites.append((..., ...., ...))
        
    output = (text, {"entities":entites})
    highlight_entities(output[0], output[1])
    return output
        

## Experiment yourself with a few news headlines

In [64]:
text = "Brazil riots: Arrests ordered for top officials after capital stormed with Tony Blair"

In [65]:
text_to_entities(text, nlp)

[94m[1mBrazil[0m riots: Arrests ordered for top officials after capital stormed with [94m[1mTony Blair[0m


('Brazil riots: Arrests ordered for top officials after capital stormed with Tony Blair',
 {'entities': [(0, 6, 'org'), (75, 85, 'per')]})

## Save NER model

In [59]:
def save_spacy_model(nlp, model_path):
    nlp.to_disk(f'{model_path}')
    print(f"Saved model to {model_path}")
    bytes_data = nlp.to_bytes()

    f = open(f'{model_path}/bytes_data.bin', 'wb')
    f.write(bytes_data)
    f.close()
    print(f"Saved bytes_data to f'{model_path}/bytes_data.bin'")

In [60]:
#save_spacy_model(nlp, "ner_per_and_org")

Saved model to ner_per_and_org
Saved bytes_data to f'ner_per_and_org/bytes_data.bin'


## Load saved model

In [61]:
def load_spacy_model(model_path, base_model = "en_core_web_md"):

    nlp = spacy.load(base_model)
    file = open(f'{model_path}/bytes_data.bin',"rb")
    bytes_data = file.read()
    config = nlp.config
    lang_cls = spacy.util.get_lang_class("en")
    nlp = lang_cls.from_config(config)
    nlp = nlp.from_disk(f'{model_path}')
    
    return nlp

In [62]:
nlp2 = load_spacy_model("ner_per_and_org")

In [70]:
text_to_entities(text, nlp2)

[94m[1mBrazil[0m riots: Arrests ordered for top officials after capital stormed with [94m[1mTony Blair[0m


('Brazil riots: Arrests ordered for top officials after capital stormed with Tony Blair',
 {'entities': [(0, 6, 'org'), (75, 85, 'per')]})