## Train a Named Entity Recognition model with spaCy
This project shows how to extract information from text documents using transfer learning with a pretrained model from the spaCy library.


In [2]:
# import libraries
from spacy.util import filter_spans
import json
import spacy
from spacy.tokens import DocBin
from tqdm import tqdm

In [3]:
# Read the annotated tweets. The encoding is passed explicitly because the
# tweets contain non-ASCII characters (emoji, non-breaking spaces); relying on
# the platform default encoding can fail or shift the character offsets that
# the `label` entries refer to.
with open('data/bitcoin_tweets_annotated.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Show the first record to check its structure (id, text, Comments, label).
print(data[0])

{'id': 4304, 'text': 'I just bought 9k/534k Coins in Harmony $ONE and $CELR Network 😊📈 #ToTheMoon #BNB\xa0 #Doge #Ada #KuCoin #XRP #BTC\xa0… https://t.co/urfWe62tav', 'Comments': [], 'label': [[84, 89, 'CRYPTO_NAME'], [90, 94, 'CRYPTO_NAME'], [94, 102, 'CRYPTO_NAME'], [103, 107, 'CRYPTO_NAME'], [108, 112, 'CRYPTO_NAME']]}


### Prepare training data

In [4]:
# Collect the label set and one {'text', 'entities'} record per annotated tweet.
training_data = {
    'classes': ['CRYPTO_NAME', "CRYPTO_PRICE", "ORGANIZATION"],
    'annotations': [
        {
            'text': tweet['text'],
            # Each raw label is [start, end, label]; keep it as a tuple.
            'entities': [(span[0], span[1], span[2]) for span in tweet['label']],
        }
        for tweet in data
    ],
}

# Inspect one converted record.
print(training_data['annotations'][1])

{'text': 'Blue Ridge Bank shares halted by NYSE after #bitcoin ATM announcement https://t.co/xaaZmaJKiV @MyBlueRidgeBank… https://t.co/sgBxMkP1SI', 'entities': [(0, 15, 'ORGANIZATION'), (33, 37, 'ORGANIZATION'), (44, 52, 'CRYPTO_NAME')]}


In [5]:
# Container that will collect the annotated Doc objects.
doc_bin = DocBin()

# Blank English pipeline; used below to turn raw text into Doc objects.
nlp = spacy.blank("en")

In [6]:
# Start from an empty DocBin so that re-running this cell does not append the
# same documents a second time to a container filled by a previous run.
doc_bin = DocBin()

# Number of annotations that could not be mapped onto token boundaries.
skipped_spans = 0

for training_row in tqdm(training_data['annotations']):
    text = training_row['text']
    labels = training_row['entities']
    doc = nlp.make_doc(text)
    ents = []
    for start, end, label in labels:
        # char_span returns None when the character offsets cannot be aligned
        # to tokens; count those instead of dropping them silently.
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            skipped_spans += 1
            continue
        ents.append(span)
    # doc.ents cannot hold overlapping spans, so filter them first.
    doc.ents = filter_spans(ents)
    doc_bin.add(doc)

doc_bin.to_disk("training.spacy")

if skipped_spans:
    print(f"Skipped {skipped_spans} annotation(s) that did not align with token boundaries")

100%|██████████| 18/18 [00:00<00:00, 2159.35it/s]


### Run the commands to train the model

In [15]:
# Complete base_config.cfg with the remaining settings and save it as config.cfg.
!python -m spacy init fill-config base_config.cfg config.cfg

# Train with config.cfg and write the output to the current directory.
# NOTE(review): --paths.train and --paths.dev both point to ./training.spacy,
# so the reported scores are presumably measured on the training documents
# themselves — confirm, and consider a separate held-out dev file.
!python -m spacy train config.cfg --output ./ --paths.train ./training.spacy --paths.dev ./training.spacy

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy
[38;5;4mℹ Saving to output directory: .[0m
[38;5;4mℹ Using CPU[0m
[1m
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     41.50    0.00    0.00    0.00    0.00
 41     200         80.97   1656.99   97.35   96.49   98.21    0.97
 91     400         84.12    182.16   97.35   96.49   98.21    0.97
154     600         80.21    174.16   98.18  100.00   96.

### Test model

In [24]:
# Load the trained pipeline from the "model-best" directory
# (presumably written by the training run above with --output ./ — confirm).
nlp_ner = spacy.load("model-best")

# Sample tweet for a quick visual check. The URL is written with plain "/":
# "\/" is a JSON-style escape, not a valid Python escape sequence (it triggers
# a SyntaxWarning on Python 3.12+) and it puts literal backslashes into the
# text, unlike the JSON-decoded tweets used for training.
doc = nlp_ner("#BTC still trading at Price: - 37082.1 € this morning. #Bitcoin  https://t.co/1XNq01CaMn")

# One highlight colour per entity label.
colors = {"CRYPTO_PRICE": "#F67DE3", "CRYPTO_NAME": "#7DF6D9", "ORGANIZATION": "#FFFFFF"}
options = {"colors": colors}

# Render the recognised entities inline in the notebook.
spacy.displacy.render(doc, style="ent", options=options, jupyter=True)