In [1]:
import argparse
import spacy
import random
import warnings
from spacy.util import minibatch, compounding

In [3]:
# Line-aligned input files consumed by get_training_data():
#   tr_file - one training sentence per line
#   et_file - one "start,end,label" entity record per line
tr_file, et_file = "rent_training.txt", "rent_entities.txt"

In [2]:
def get_training_data(training_file,entity_file):
    """Build NER training examples from two line-aligned text files.

    Line ``i`` of ``training_file`` is the example text and line ``i`` of
    ``entity_file`` is a ``start,end,label`` record describing the single
    entity in that text.

    Args:
        training_file: Path to the file holding one training sentence per line.
        entity_file: Path to the file holding one ``start,end,label`` record
            per line.

    Returns:
        A list of ``(text, {"entities": [(start, end, label)]})`` tuples in
        file order. If the two files differ in length, pairing stops at the
        end of the shorter one.

    Raises:
        ValueError: If ``start`` or ``end`` cannot be parsed as an integer.
        IndexError: If an entity record has fewer than three comma-separated
            fields.

    Note:
        Offsets are passed through as-is; nothing here checks that
        ``start < end`` or that the span lies inside the text.
    """
    train_data = []
    # Context managers guarantee both handles are closed, even when a
    # malformed record makes the parsing below raise part-way through.
    with open(training_file,"r",errors="ignore") as text_lines, \
            open(entity_file,"r",errors="ignore") as entity_lines:
        for raw_text, raw_entity in zip(text_lines, entity_lines):
            text = raw_text.strip()
            fields = raw_entity.strip().split(",")
            ent_tuple = (int(fields[0]), int(fields[1]), fields[2])
            train_data.append((text, {"entities": [ent_tuple]}))
    return train_data

In [4]:
# Parse the sentence/entity files into (text, annotations) training pairs.
data = get_training_data(training_file=tr_file, entity_file=et_file)

In [5]:
# Show the parsed (text, {"entities": [...]}) pairs for a visual sanity check.
data

[('Rs 15000.00 per month (Rupees Twelve Thousand only) towards Rent.',
  {'entities': [(0, 11, 'RENT')]}),
 ('The agreed rent of Rs.2500/- has to be paid on or before 5th of every month.',
  {'entities': [(19, 28, 'RENT')]}),
 ('The Lessee shall pay the Lessor a monthly rent of Rs.11500/- (Eleven Thousand Five Hundred Only) regularly and punctually without any delay or default on or before 5th day of every month in the English Calendar and the receipt of the same to be obtained.',
  {'entities': [(50, 60, 'RENT')]}),
 ('MONTHLY RENT: The TENANT has agreed to pay a RENT of Rs. 9000/- (Rupees Nine Thousand only) per month to be paid on or before 5th day of the succeeding English calendar month.',
  {'entities': [(8, 63, 'RENT')]}),
 ('The Tenant agrees to pay a monthly rent of Rs. 4200/- (Rupees Four Thousand',
  {'entities': [(43, 53, 'RENT')]}),
 ('The rentals will be charged @ Rs.1200/- per unit per month payable monthly in advance.',
  {'entities': [(30, 39, 'RENT')]}),
 ('The lessor

In [6]:
# Exploratory check: start from a blank English pipeline and make sure it
# ends with an NER component, printing the pipe names before and after.
nlp = spacy.blank("en")
print(nlp.pipe_names)
if "ner" not in nlp.pipe_names:
    # create the NER component and append it at the end of the pipeline
    ner = nlp.create_pipe("ner")
    nlp.add_pipe(ner, last=True)
print(nlp.pipe_names)

[]
['ner']


In [8]:
# Print only the annotation dicts so the entity offsets can be eyeballed.
for _, annotations in data:
    print(annotations)

{'entities': [(0, 11, 'RENT')]}
{'entities': [(19, 28, 'RENT')]}
{'entities': [(50, 60, 'RENT')]}
{'entities': [(8, 63, 'RENT')]}
{'entities': [(43, 53, 'RENT')]}
{'entities': [(30, 39, 'RENT')]}
{'entities': [(76, 85, 'RENT')]}
{'entities': [(36, 34, 'RENT')]}


In [None]:
def train_ner(training_file,entity_file):
    """Train a blank English NER model on the given files and save it to disk.

    The examples are loaded with ``get_training_data``, every entity label
    found in them is registered on the NER component, and the model is
    trained for 501 shuffled passes over the data. Afterwards the model is
    run over the training texts (printing the predicted entities) and written
    to a directory named ``"model_" + <prefix>``, where ``<prefix>`` is the
    part of ``training_file`` before its first ``"."`` and first ``"_"``
    (``"rent_training.txt"`` -> ``"model_rent"``).

    Args:
        training_file: Path to the file with one training sentence per line.
        entity_file: Path to the file with one ``start,end,label`` record per
            line, aligned with ``training_file``.

    Returns:
        None. Results are printed and the model is saved via ``nlp.to_disk``.
    """
    TRAIN_DATA = get_training_data(training_file,entity_file)

    nlp = spacy.blank("en")
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)
    else:
        # keep `ner` bound even if the pipeline already has an NER component
        ner = nlp.get_pipe("ner")

    # add labels
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
    # only train NER
    # NOTE: the two context managers must be comma-separated. `A and B`
    # evaluates to a single object, so only one of them would be entered and
    # the other's cleanup (restoring pipes / restoring warning filters) would
    # never run.
    with nlp.disable_pipes(*other_pipes), warnings.catch_warnings():
        # show warnings for misaligned entity spans once
        warnings.filterwarnings("once", category=UserWarning, module='spacy')

        # reset and initialize the weights randomly – but only if we're
        # training a new model
        nlp.begin_training()
        for itn in range(501):
            # TRAIN_DATA is a local list, so shuffling in place is safe
            random.shuffle(TRAIN_DATA)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    drop=0.5,  # dropout - make it harder to memorise data
                    losses=losses,
                )
            # report the accumulated losses every 100th pass
            if itn % 100 == 0:
                print("Losses", losses)

    # test the trained model (on the training texts themselves)
    for text, _ in TRAIN_DATA:
        doc = nlp(text)
        print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
        # print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])
    nlp.to_disk("model_"+training_file.split(".")[0].split("_")[0])