## Train a Named Entity Recognition model with spaCy
This project shows how to extract information from text documents using transfer learning with a pretrained model from the spaCy library.


In [90]:
from sympy import false
! pip install spacy
! pip install mlflow
! pip install scikit-learn




In [91]:
# import libraries
import json
import subprocess
import sys

import mlflow
import mlflow.spacy
import spacy
from sklearn.model_selection import train_test_split
from spacy.tokens import DocBin
from spacy.util import filter_spans
from tqdm import tqdm

# Re-running this cell while a run is still open would make start_run() raise,
# so close any leftover run first. On a fresh kernel this branch is skipped.
if mlflow.active_run() is not None:
    mlflow.end_run()

# Point MLflow at the local tracking server and open the run that the
# following cells log the config, metrics and model to.
mlflow.set_tracking_uri('http://localhost:5000')
mlflow.start_run()

<ActiveRun: >

In [92]:
# Read the annotated tweets. The encoding is passed explicitly because the
# texts contain emoji and other non-ASCII characters, and open() otherwise
# falls back to the platform's locale encoding.
with open('data/bitcoin_tweets_annotated.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Show the first record to check the structure of the export.
print(data[0])

{'id': 12887, 'text': 'Blue Ridge Bank shares halted by NYSE after #bitcoin ATM announcement https://t.co/xaaZmaJKiV @MyBlueRidgeBank… https://t.co/sgBxMkP1SI', 'label': [[0, 15, 'ORG'], [44, 52, 'CRYPTO']], 'Comments': []}


### Prepare training data

In [93]:
# Convert the exported annotations into one {'text', 'entities'} dict per
# example, with every entity stored as a (start, end, label) tuple.
annotations = []
for example in data:
    entities = [
        (annotation[0], annotation[1], annotation[2])
        for annotation in example['label']
    ]
    annotations.append({'text': example['text'], 'entities': entities})

training_data = {
    # Derived from the annotations so the class list always matches the labels
    # that actually occur in the data (a hand-written list can drift from them).
    'classes': sorted({label for row in annotations for _, _, label in row['entities']}),
    'annotations': annotations,
}

# Show one converted example as a sanity check.
print(training_data['annotations'][1])

{'text': '😎 Today, that\'s this #Thursday, we will do a "🎬 Take 2" with our friend @LeoWandersleb, #Btc #wallet #security expe… https://t.co/go6aDgRml5', 'entities': [(90, 94, 'CRYPTO')]}


In [94]:
# Blank English pipeline; createDocBin uses it to tokenize the raw texts.
nlp = spacy.blank("en")


def createDocBin(data: list)->DocBin:
    """Pack annotated rows into a DocBin.

    Args:
        data: rows shaped like {'text': str, 'entities': [(start, end, label), ...]},
            where start/end are character offsets into the text.

    Returns:
        A DocBin holding one Doc per row. Offsets for which ``doc.char_span``
        returns None are skipped, and the remaining spans are passed through
        ``filter_spans`` before being assigned to ``doc.ents``.
    """
    packed = DocBin()
    for row in tqdm(data):
        raw_text, annotated = row['text'], row['entities']
        doc = nlp.make_doc(raw_text)
        candidates = (
            doc.char_span(begin, stop, label=tag, alignment_mode="contract")
            for begin, stop, tag in annotated
        )
        usable = [span for span in candidates if span is not None]
        doc.ents = filter_spans(usable)
        packed.add(doc)
    return packed

# Hold out 20% of the examples for evaluation. The seed is fixed so the split,
# and therefore the scores reported during training, is the same on every run.
train, test = train_test_split(training_data['annotations'], test_size=0.2, random_state=42)

# Serialize both splits to the .spacy files passed to `spacy train`.
doc_bin_train = createDocBin(train)
doc_bin_test = createDocBin(test)
doc_bin_train.to_disk("train_data.spacy")
doc_bin_test.to_disk("test_data.spacy")

100%|██████████| 40/40 [00:00<00:00, 2387.81it/s]
100%|██████████| 10/10 [00:00<00:00, 3015.75it/s]


### Run commands for training model

In [95]:
!python -m spacy init fill-config base_config.cfg config.cfg

# Log the configuration file
mlflow.log_artifact("config.cfg")

# Train the model
train_command = "python -m spacy train config.cfg --output ./ --paths.train ./train_data.spacy --paths.dev ./test_data.spacy"
process = subprocess.Popen(train_command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

# Capture training output for metrics
stdout, stderr = process.communicate()

if process.returncode != 0:
    print("Training failed:", stderr.decode())
else:
    print("Training completed successfully.")


  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy
Training completed successfully.


In [96]:
# Log metrics from training (example: training loss)
# capture metrics from output like below
# E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE
# 0       0          0.00     48.50    0.00    0.00    0.00    0.00
# 202    1200         77.08     22.51   70.37   73.08   67.86    0.70
# 269    1400         23.71      6.65   78.57   78.57   78.57    0.79
# 344    1600         23.57      3.46   84.62   91.67   78.57    0.85
afterLoss = False
lineNo = 0
for line in stdout.decode().split('\n'):
    # Ignore everything before the table header (the first line mentioning LOSS).
    if "LOSS" not in line and not afterLoss:
        continue
    afterLoss = True
    lineNo += 1
    # The header line and the line right after it carry no metric values.
    if lineNo <= 2:
        continue
    # The table ends where spaCy reports saving the pipeline.
    if "Saved pipeline" in line:
        break
    values = line.split()
    try:
        loss_tok2vec = float(values[2])
        loss_ner = float(values[3])
        score = float(values[7])
    except (IndexError, ValueError):
        # Not a metrics row (blank line, warning, wrapped text): skip it
        # instead of aborting the whole cell.
        continue
    print(line)
    # Step is the position of the row within the table, starting at 1.
    step = lineNo - 2
    mlflow.log_metric("LOSS_TOK2VEC", loss_tok2vec, step=step)
    mlflow.log_metric("LOSS_NER", loss_ner, step=step)
    mlflow.log_metric("SCORE", score, step=step)

  0       0          0.00     38.71    0.00    0.00    0.00    0.00
 18     200        146.04   1714.63   68.75   75.86   62.86    0.69
 42     400          2.24      1.72   68.66   71.88   65.71    0.69
 72     600         12.23      2.25   70.77   76.67   65.71    0.71
108     800          4.77      1.87   71.88   79.31   65.71    0.72
152    1000          0.00      0.00   68.75   75.86   62.86    0.69
204    1200         67.36     23.81   72.73   77.42   68.57    0.73
270    1400        220.38     52.86   67.69   73.33   62.86    0.68
347    1600         47.54      8.74   69.57   70.59   68.57    0.70
447    1800          0.68      0.13   68.57   68.57   68.57    0.69
547    2000         13.24      2.59   66.67   67.65   65.71    0.67
682    2200          0.09      0.02   68.66   71.88   65.71    0.69
882    2400        101.94     18.04   68.66   71.88   65.71    0.69
1082    2600        298.31     16.43   70.77   76.67   65.71    0.71
1282    2800          2.29      1.30   72.73   

In [97]:
# Log the final trained model.
# `nlp` is the blank pipeline used only for tokenization; the trained pipeline
# is the one the spaCy CLI wrote to ./model-best, so load and log that one.
trained_nlp = spacy.load("model-best")
mlflow.spacy.log_model(spacy_model=trained_nlp, artifact_path="spacy_model")
mlflow.end_run()



🏃 View run likeable-finch-997 at: http://localhost:5000/#/experiments/0/runs/cfe61a14c46d4196a5f08918304104ed
🧪 View experiment at: http://localhost:5000/#/experiments/0


### Test model

In [98]:
# Load the best pipeline saved by training and run it on an unseen tweet.
nlp_ner = spacy.load("model-best")

sample_text = (
    "#BTC still trading at Price down: 37082.1 € this morning. "
    "@Tesla no longer invest in #Bitcoin  https://t.co/1XNq01CaMn"
)
doc = nlp_ner(sample_text)

# One highlight colour per entity label for the displaCy rendering.
colors = {
    "PRICE": "#F67DE3",
    "CRYPTO": "#7DF6D9",
    "ORG": "#7156F6",
}
options = dict(colors=colors)

spacy.displacy.render(doc, style="ent", options=options, jupyter=True)