In [0]:
# Import Libraries
from __future__ import unicode_literals, print_function
import numpy as np
import pandas as pd
import random
from pathlib import Path
import spacy
import re
import time
import io

In [0]:
# Trim leading and trailing whitespaces from entity spans
def trim_entity_spans(data: list) -> list:
    """Strip leading and trailing whitespace from every annotated entity span.

    Args:
        data: Sequence of ``(text, annotations)`` pairs, where
            ``annotations['entities']`` is a sequence of
            ``(start, end, label)`` character offsets into ``text``.

    Returns:
        A new list of ``[text, {'entities': [[start, end, label], ...]}]``
        items. Each span is shrunk until it neither starts nor ends on a
        whitespace character. Spans that contain nothing but whitespace
        (or are empty) are dropped, because they cannot be expressed as a
        valid ``start < end`` span.
    """
    invalid_span_tokens = re.compile(r'\s')

    cleaned_data = []
    for text, annotations in data:
        valid_entities = []
        for start, end, label in annotations['entities']:
            valid_start = start
            # Clamp so an end offset past the text cannot raise IndexError.
            valid_end = min(end, len(text))
            # Each loop is bounded by the other edge so the two edges can
            # never cross and yield an inverted span.
            while valid_start < valid_end and invalid_span_tokens.match(text[valid_start]):
                valid_start += 1
            while valid_end > valid_start and invalid_span_tokens.match(text[valid_end - 1]):
                valid_end -= 1
            if valid_start < valid_end:
                valid_entities.append([valid_start, valid_end, label])
        cleaned_data.append([text, {'entities': valid_entities}])

    return cleaned_data

In [0]:
# training data
# Read the line-delimited JSON export and discard the bookkeeping columns.
dataset = pd.read_json('3together.json1', lines=True).drop(
    columns=['id', 'meta', 'annotation_approver']
)

# Each itertuples() row is (index, <1st column>, <2nd column>, ...).
# NOTE(review): positions 1 and 2 are presumably the message text and its
# entity annotations -- confirm against the export's column order.
TRAINING_DATA = [
    (row[1], {"entities": row[2]})
    for row in dataset.itertuples()
]

# Normalise the spans so none of them starts or ends on whitespace.
TRAIN_DATA = trim_entity_spans(TRAINING_DATA)

In [0]:
# Sample message; used as the default `test_text` of train_test_savemodel,
# which runs the freshly saved model on it and prints the entities found.
text = "Dear Customer,  Rs.130.00 has been debited from account **5233 to VPA hassan.aarzoo@okaxis on 28-09-18. Your UPI transaction reference number is 827102627403.  Please call on 18002586161 to report if this transaction was not authorized by you.  Warm Regards, HDFC Bank      © 2017 HDFC Bank "

In [0]:
def train_test_savemodel(data=TRAIN_DATA, test_text=text, model=None, new_model_name='mymodel', output_dir='.', n_iter=60):
    """Train a spaCy NER component, save the pipeline, and smoke-test it.

    Args:
        data: Sequence of ``(text, {'entities': [(start, end, label), ...]})``
            training examples. The sequence itself is not modified.
        test_text: Text passed through the reloaded model after saving; the
            entities it finds are printed.
        model: Name or path of an existing spaCy model to continue training.
            When ``None`` a blank English pipeline is created instead.
        new_model_name: Value written to ``nlp.meta['name']`` before saving.
        output_dir: Directory the trained pipeline is written to. When
            ``None`` the model is neither saved nor tested.
        n_iter: Number of passes over the training data.

    Returns:
        None. Progress, losses and the test entities are printed.

    Note:
        The defaults for ``data`` and ``test_text`` are captured when this
        function is defined, so re-run the defining cell after changing
        ``TRAIN_DATA`` or ``text``.
    """
    # Set up the pipeline and entity recognizer, and train the new entity
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # create blank Language class
        print("Created blank 'en' model")

    # Add entity recognizer to model if it's not in the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner)
    # otherwise, get it, so we can add labels to it
    else:
        ner = nlp.get_pipe('ner')

    # Shuffle a private copy: shuffling `data` in place would reorder the
    # caller's list (by default the module-level TRAIN_DATA).
    train_examples = list(data)

    # add labels from the train data
    for _, annotations in train_examples:
        for ent in annotations.get('entities'):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        if model is None:
            optimizer = nlp.begin_training()
        else:
            # begin_training() re-initialises the weights, which would throw
            # away what the loaded model already learned.
            # NOTE(review): resume_training() requires spaCy >= 2.1.
            optimizer = nlp.resume_training()
        for itn in range(n_iter):
            random.shuffle(train_examples)
            losses = {}
            for example_text, annotations in train_examples:
                nlp.update([example_text], [annotations], sgd=optimizer, drop=0.35,
                           losses=losses)
            print(itn, losses)

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        # parents/exist_ok: nested output paths work and there is no
        # check-then-create race.
        output_dir.mkdir(parents=True, exist_ok=True)
        nlp.meta['name'] = new_model_name  # rename model
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print('Text for test :', test_text)
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        doc = nlp2(test_text)
        for ent in doc.ents:
            print(ent.label_, ":", ent.text)

In [10]:

# Wall-clock timing of one full train / save / reload run with the defaults.
start_time = time.time()
train_test_savemodel()
end_time = time.time()

elapsed_seconds = end_time - start_time
print('Training time:', elapsed_seconds)

Created blank 'en' model
0 {'ner': 19736.490652956243}
1 {'ner': 7913.531225327345}
2 {'ner': 6640.626577654645}
3 {'ner': 4469.113045807577}
4 {'ner': 3049.5371261772866}
5 {'ner': 3551.341497160411}
6 {'ner': 3000.5605724880243}
7 {'ner': 2854.5736278188983}
8 {'ner': 3224.5998145490557}
9 {'ner': 2187.8308026706627}
10 {'ner': 2492.675312568222}
11 {'ner': 2313.9559766032403}
12 {'ner': 1892.1169623544731}
13 {'ner': 2036.6095838390513}
14 {'ner': 2468.936559394968}
15 {'ner': 1778.7630220115816}
16 {'ner': 1692.15819556735}
17 {'ner': 1973.0176759750925}
18 {'ner': 1749.280993386296}
19 {'ner': 1853.9024959481503}
20 {'ner': 1766.4012652171032}
21 {'ner': 1847.4176506071303}
22 {'ner': 1666.3959526908684}
23 {'ner': 1568.6298757475258}
24 {'ner': 1507.0252487297273}
25 {'ner': 1625.7712751491588}
26 {'ner': 1565.352261851297}
27 {'ner': 1754.8312343396155}
28 {'ner': 1354.9510339709084}
29 {'ner': 1582.17293423423}
30 {'ner': 1528.8834936134833}
31 {'ner': 1363.7669519291449}
32 {'

In [11]:

# Run the saved model over every line of the input file, collecting each
# line plus the transaction amounts and dates the model recognises.
mail_list = []
tr_amount = []
tr_date = []
file_name = r'updated_text.txt'
output_dir = '.'

# Load the model once: reloading it from disk for every input line is
# loop-invariant work and dominates the runtime.
print("Loading from", output_dir)
nlp2 = spacy.load(output_dir)

with io.open(file_name, 'r', encoding='windows-1252') as file:
    for line in file:
        print('Text for test :', line)
        doc = nlp2(line)
        for ent in doc.ents:
            print(ent.label_, ":", ent.text)
            # An entity carries exactly one label, so the checks are exclusive.
            if ent.label_ == 'Transaction amount':
                tr_amount.append(ent.text)
            elif ent.label_ == 'Transaction date':
                tr_date.append(ent.text)
        mail_list.append(line)

Text for test : Total: Rs 99  Fri, Nov 30, 2018    Thanks for ordering, marco    Here's your receipt for Taste of Kolkata.    Total Rs 99    1  Kolkata Chicken Biryani   Rs 89    Subtotal Rs 89    Tax Rs 0   Delivery Fee Rs 10    Amount Charged      Paid in cash   |  Switch  Rs 99   Visit the trip page for more information, including invoices (where available)    Download PDF  Download link expires 12/30/18    xide8f41479-ad63-5624-be30-a7ecd28203f3  pGvlI2ANUbXFfyEOgxta1RMV082993    You ordered from Taste of Kolkata    Picked up from  15th Cross Rd, Gollahalli, Mahadeshwara Nagar, Stage 2, BTM 2nd Stage, Bengaluru, Karnataka 560076, India    Delivered to  13/1, 3rd Main Rd, Madiwala, 1st Stage, BTM Layout, Bengaluru, Karnataka 560068, India    Delivered by Rajesh   contact support a  contact support a  My Orders a    FAQ Forgot password   Uber B.V. Mr. Treublaan 7 1097 DP Amsterdam  Privacy Terms   Fares are inclusive of GST. Please download the tax invoice from the trip detail page f