__Training with spaCy__

In [None]:
import spacy
from spacy.tokens import DocBin
import json
from tqdm import tqdm
from sklearn.model_selection import train_test_split

__Creating training data__

In [None]:
def get_spacy_doc(file, data):
    """Convert annotated examples into a spaCy ``DocBin``.

    Args:
        file: Writable text stream used as an error log. It receives one line
            for every entity that could not be turned into a span and for
            every document that could not be stored.
        data: Iterable of ``(text, annotations)`` pairs, where
            ``annotations['entities']`` is a sequence of
            ``(start, end, label)`` character offsets into ``text``.

    Returns:
        A ``DocBin`` holding one ``Doc`` per example that was stored
        successfully.
    """
    nlp = spacy.blank('en')
    docbin = DocBin()

    for text, annot in tqdm(data):
        doc = nlp.make_doc(text)
        ents = []
        # Character offsets already claimed by an earlier entity of this text.
        claimed = set()

        for start, end, label in annot['entities']:
            char_range = range(start, end)
            # Skip any entity that overlaps one seen earlier; the first wins.
            if not claimed.isdisjoint(char_range):
                continue
            claimed.update(char_range)

            try:
                span = doc.char_span(
                    start,
                    end,
                    label=label,
                    alignment_mode='strict',
                )
            except Exception as exc:
                # Best effort: record the failure and move on to the next
                # entity instead of silently discarding it.
                file.write(f'{[start, end]}    {text}    {exc!r}\n')
                continue

            if span is None:
                # Offsets do not line up with token boundaries.
                err_data = str([start, end]) + '    ' + str(text) + '\n'
                file.write(err_data)
            else:
                ents.append(span)

        try:
            doc.ents = ents
            docbin.add(doc)
        except Exception as exc:
            # Best effort: drop this document but leave a trace in the log.
            file.write(f'skipped document    {text}    {exc!r}\n')
    return docbin

In [None]:
json_path = './data/training/dataset.json'
# Use a context manager so the file handle is closed once parsing is done.
with open(json_path, 'r', encoding='utf-8') as json_file:
    dataset = json.load(json_file)
print(len(dataset))

In [None]:
# Hold out 10% of the examples for evaluation. No random_state is passed, so
# the split is not reproducible between runs.
train, test = train_test_split(dataset, test_size=0.1)
print(f'Train set: {len(train)}')
print(f'Test set: {len(test)}')

In [None]:
# get_spacy_doc writes problem entities to this stream, so it has to be opened
# for writing; the context manager closes it once both splits are exported.
with open('./data/training/train_file.txt', 'w', encoding='utf-8') as file_path:
    docbin = get_spacy_doc(file_path, train)
    docbin.to_disk('./data/training/train_data.spacy')

    docbin = get_spacy_doc(file_path, test)
    docbin.to_disk('./data/training/test_data.spacy')