In [2]:
import spacy
import random
import time
import numpy as np
import sys
from spacy import displacy # I used spacy v3 which introduce the transformers
from itertools import chain
import matplotlib.pyplot as plt 
from matplotlib.ticker import MaxNLocator

In [3]:
def convert_to_spacy_format(file_pth):
    """
    Convert a tab-separated token/label file into spaCy-style training examples.

    From:
        word<TAB>label
        word<TAB>label
        (a line without a tab, e.g. a blank line, ends the sentence)

    To:
        [[sentence, {'entities': [(start, end, label), ...]}], ...other sentences]

    Words of a sentence are joined with single spaces; ``start``/``end`` are
    character offsets into that joined string (``end`` is exclusive). Every
    label other than 'O' gets the suffix "_Disease"; only 'B_Disease' and
    'I_Disease' tokens are recorded as entities, one span per token.
    Sentences that contain no entity are dropped.

    Parameters
    ----------
    file_pth : str or path-like
        Path of the TSV file to read.

    Returns
    -------
    tuple
        ``(tr_data, unique_labels)`` where ``tr_data`` is the list described
        above and ``unique_labels`` lists every suffixed non-'O' label in
        order of first appearance.
    """
    tr_data, unique_labels = [], []
    sentence, entities = [], []

    # Character offset at which the next word starts in the joined sentence.
    next_start = 0

    # The context manager closes the file even if parsing raises.
    with open(file_pth, 'r') as file:
        for line in file:
            # Split on the tab delimiter after removing the end-of-line character.
            fields = line.strip("\n").split("\t")

            if len(fields) > 1:
                # Token line: first field is the word, second is its label.
                word, label = fields[0], fields[1]

                # Every label except the "outside" tag gets the entity suffix.
                if label != 'O':
                    label = label + "_Disease"

                sentence.append(word)
                start = next_start
                end = start + len(word)
                next_start = end + 1  # +1 for the space that joins the words

                if label in ('B_Disease', 'I_Disease'):
                    entities.append((start, end, label))

                if label != 'O' and label not in unique_labels:
                    unique_labels.append(label)
            else:
                # Separator line: emit the sentence if it has entities, then reset.
                if entities:
                    tr_data.append([" ".join(sentence), {'entities': entities}])
                sentence, entities = [], []
                next_start = 0

    # A file that does not end with a separator line still holds a pending
    # sentence here; without this flush it would be silently lost.
    if entities:
        tr_data.append([" ".join(sentence), {'entities': entities}])

    return tr_data, unique_labels

In [11]:
convert_to_spacy_format('./Data/train.tsv')[0][:2] # first two converted examples: [sentence, {'entities': [...]}]

[["Selegiline - induced postural hypotension in Parkinson ' s disease : a longitudinal study on the effects of drug withdrawal .",
  {'entities': [(21, 29, 'B_Disease'),
    (30, 41, 'I_Disease'),
    (45, 54, 'B_Disease'),
    (55, 56, 'I_Disease'),
    (57, 58, 'I_Disease'),
    (59, 66, 'I_Disease')]}],
 ["OBJECTIVES : The United Kingdom Parkinson ' s Disease Research Group ( UKPDRG ) trial found an increased mortality in patients with Parkinson ' s disease ( PD ) randomized to receive 10 mg selegiline per day and L - dopa compared with those taking L - dopa alone .",
  {'entities': [(32, 41, 'B_Disease'),
    (42, 43, 'I_Disease'),
    (44, 45, 'I_Disease'),
    (46, 53, 'I_Disease'),
    (132, 141, 'B_Disease'),
    (142, 143, 'I_Disease'),
    (144, 145, 'I_Disease'),
    (146, 153, 'I_Disease'),
    (156, 158, 'B_Disease')]}]]

In [165]:
# Parse each split into [sentence, {'entities': [...]}] examples.
# `labels` keeps the entity labels seen in the training file; the label lists
# of the other two splits are discarded.
train_data, labels = convert_to_spacy_format("./Data/train.tsv")
test_data, _ = convert_to_spacy_format("./Data/test.tsv")
# NOTE(review): the validation set is read from "train_dev.tsv". If that file
# is the concatenation of the train and dev splits (TODO confirm), the
# validation examples overlap the training examples and the scores reported
# during training will be optimistic.
VALID_DATA, _ = convert_to_spacy_format("./Data/train_dev.tsv")

In [167]:
# From : https://stackoverflow.com/questions/67407433/using-spacy-3-0-to-convert-data-from-old-spacy-v2-format-to-the-brand-new-spacy
import pandas as pd
from tqdm import tqdm
import spacy
from spacy.tokens import DocBin
nlp = spacy.blank("en") # blank English pipeline; only used here to tokenize the text


def _examples_to_docbin(examples, nlp):
    """
    Pack ``[text, {"entities": [(start, end, label), ...]}]`` examples into a DocBin.

    Each text is tokenized with ``nlp.make_doc``. Every character-offset
    annotation is mapped onto tokens with ``Doc.char_span`` using
    ``alignment_mode="contract"``; annotations for which ``char_span`` returns
    ``None`` are reported and left out.

    Parameters
    ----------
    examples : iterable
        Examples in the format returned by ``convert_to_spacy_format``.
    nlp : spacy.language.Language
        Pipeline whose tokenizer is used to create the docs.

    Returns
    -------
    spacy.tokens.DocBin
        Container holding one annotated ``Doc`` per example.
    """
    doc_bin = DocBin()
    for text, annot in tqdm(examples): # data in previous format
        doc = nlp.make_doc(text) # create doc object from text
        ents = []
        for start, end, label in annot["entities"]: # character indexes
            span = doc.char_span(start, end, label=label, alignment_mode="contract")
            if span is None:
                print("Skipping entity")
            else:
                ents.append(span)
        doc.ents = ents # label the text with the ents
        doc_bin.add(doc)
    return doc_bin


# Training set
_examples_to_docbin(train_data, nlp).to_disk("./train.spacy")

# Validation set (`db` is kept as a cell-level name, as before)
db = _examples_to_docbin(VALID_DATA, nlp)
db.to_disk("./valid.spacy")

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2658/2658 [00:03<00:00, 874.27it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5385/5385 [00:02<00:00, 2614.59it/s]


In [169]:
# Expand the partial base_config.cfg into a complete config.cfg for training.
# NOTE(review): base_config.cfg is not created anywhere in this notebook; it
# must already exist in the working directory.
!python -m spacy init fill-config base_config.cfg config.cfg

[+] Auto-filled config with all values
[+] Saved config
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [170]:
# Train the pipeline described by config.cfg on train.spacy, evaluating on
# valid.spacy (both written by the DocBin conversion cell). The trained
# pipelines are saved under ./bank_statement/training/, from which the
# evaluation cell loads "model-best".
!python -m spacy train config.cfg --verbose --output ./bank_statement/training/ --paths.train train.spacy --paths.dev valid.spacy

[+] Created output directory: bank_statement\training
[i] Saving to output directory: bank_statement\training
[i] Using CPU
[1m
[+] Initialized pipeline
[1m
[i] Pipeline: ['tok2vec', 'ner']
[i] Initial learn rate: 0.001
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     39.60    0.00    0.00    0.00    0.00
  0     200         80.48   3126.09   61.02   67.70   55.55    0.61
  0     400        107.43   2113.01   70.86   77.51   65.27    0.71
  1     600        127.23   2180.66   76.82   84.14   70.67    0.77
  1     800        126.76   1548.40   79.83   81.41   78.31    0.80
  2    1000        168.57   1888.75   82.18   88.03   77.07    0.82
  2    1200        187.94   1404.91   84.35   87.47   81.45    0.84
  3    1400        276.97   1149.96   85.46   87.80   83.23    0.85
  5    1600        253.21    964.59   85.24   85.95   84.53    0.85
  6    1800        335.91    8

[2022-03-08 23:01:39,302] [DEBUG] Config overrides from CLI: ['paths.train', 'paths.dev']
[2022-03-08 23:01:42,712] [INFO] Set up nlp object from config
[2022-03-08 23:01:42,728] [DEBUG] Loading corpus from path: valid.spacy
[2022-03-08 23:01:42,728] [DEBUG] Loading corpus from path: train.spacy
[2022-03-08 23:01:42,728] [INFO] Pipeline: ['tok2vec', 'ner']
[2022-03-08 23:01:42,728] [INFO] Created vocabulary
[2022-03-08 23:01:42,728] [INFO] Finished initializing nlp object
[2022-03-08 23:01:45,351] [INFO] Initialized pipeline components: ['tok2vec', 'ner']
[2022-03-08 23:01:45,362] [DEBUG] Loading corpus from path: valid.spacy
[2022-03-08 23:01:45,363] [DEBUG] Loading corpus from path: train.spacy


[+] Saved pipeline to output directory
bank_statement\training\model-last


In [198]:
ner = spacy.load(R"bank_statement/training/model-best") #load the best model

# A small slice of test sentences to inspect visually.
test_sentences = [x[0] for x in test_data[-10: -6]]
for sentence in test_sentences:
    doc = ner(sentence)
    if doc.ents:
        # Render each sentence exactly once. Rendering inside a loop over
        # `doc.ents` repeated the same visualisation once per predicted entity.
        displacy.render(doc, jupyter=True, style = "ent")
    else:
        # displaCy warns when a doc has no entities, so report it as text instead.
        print("No entities predicted for:", sentence)
