In [1]:
from __future__ import unicode_literals, print_function

import plac
import random
from pathlib import Path
import spacy
from spacy.util import minibatch, compounding

from tqdm import tqdm
import pandas as pd
import numpy as np

import pickle

pd.set_option('display.max_columns', None)  
pd.set_option('display.expand_frame_repr', False)
pd.set_option('max_colwidth', -1)
pd.set_option('display.max_rows', 9999)

In [2]:
input_csv = '../../ner_dataset.csv'
nrows = None
input_df = pd.read_csv(input_csv, nrows = nrows,encoding = "ISO-8859-1",low_memory=False)

In [3]:
# dataset_df.head(100)

In [4]:
# dataset_df['Tag'].unique()

In [5]:
dataset_df = input_df.drop(columns=['Sentence #', 'POS'])

In [6]:
dataset_df.to_csv('ner_corpus.tsv', sep='\t', index = False)

In [7]:
import json
import logging
import sys
def tsv_to_json_format(input_path,output_path,unknown_label):
    try:
        f=open(input_path,'r') # input file
        fp=open(output_path, 'w') # output file
        data_dict={}
        annotations =[]
        label_dict={}
        s=''
        start=0
        for line in f:
            if line[0:len(line)-1]!='.\tO':
                word,entity=line.split('\t')
                s+=word+" "
                entity=entity[:len(entity)-1]
                if entity!=unknown_label:
                    if len(entity) != 1:
                        d={}
                        d['text']=word
                        d['start']=start
                        d['end']=start+len(word)-1  
                        try:
                            label_dict[entity].append(d)
                        except:
                            label_dict[entity]=[]
                            label_dict[entity].append(d) 
                start+=len(word)+1
            else:
                data_dict['content']=s
                s=''
                label_list=[]
                for ents in list(label_dict.keys()):
                    for i in range(len(label_dict[ents])):
                        if(label_dict[ents][i]['text']!=''):
                            l=[ents,label_dict[ents][i]]
                            for j in range(i+1,len(label_dict[ents])): 
                                if(label_dict[ents][i]['text']==label_dict[ents][j]['text']):  
                                    di={}
                                    di['start']=label_dict[ents][j]['start']
                                    di['end']=label_dict[ents][j]['end']
                                    di['text']=label_dict[ents][i]['text']
                                    l.append(di)
                                    label_dict[ents][j]['text']=''
                            label_list.append(l)                          
                            
                for entities in label_list:
                    label={}
                    label['label']=[entities[0]]
                    label['points']=entities[1:]
                    annotations.append(label)
                data_dict['annotation']=annotations
                annotations=[]
                json.dump(data_dict, fp)
                fp.write('\n')
                data_dict={}
                start=0
                label_dict={}
    except Exception as e:
        logging.exception("Unable to process file" + "\n" + "error = " + str(e))
        return None

tsv_to_json_format("ner_corpus.tsv",'ner_corpus.json','abc')

In [10]:
input_file = "ner_corpus.json"
output_file = "processed_dataset"

In [11]:
training_data = []
lines=[]
with open(input_file, 'r') as f:
    lines = f.readlines()

for line in lines:
    data = json.loads(line)
    text = data['content']
    entities = []
    for annotation in data['annotation']:
        point = annotation['points'][0]
        labels = annotation['label']
        if not isinstance(labels, list):
            labels = [labels]

        for label in labels:
            entities.append((point['start'], point['end'] + 1 ,label))


    training_data.append((text, {"entities" : entities}))

# print(training_data)

with open(output_file, 'wb') as fp:
    pickle.dump(training_data, fp)

In [12]:
training_data[:5]

[('Word Thousands of demonstrators have marched through London to protest the war in Iraq and demand the withdrawal of British troops from that country ',
  {'entities': [(0, 4, 'Tag'),
    (53, 59, 'B-geo'),
    (82, 86, 'B-geo'),
    (116, 123, 'B-gpe')]}),
 ('Families of soldiers killed in the conflict joined the protesters who carried banners with such slogans as """" Bush Number One Terrorist """" and """" Stop the Bombings ',
  {'entities': [(112, 116, 'B-per')]}),
 ('"""" They marched from the Houses of Parliament to a rally in Hyde Park ',
  {'entities': [(62, 66, 'B-geo'), (67, 71, 'I-geo')]}),
 ('Police put the number of marchers at 10,000 while organizers claimed it was 1,00,000 ',
  {'entities': []}),
 ("The protest comes on the eve of the annual conference of Britain 's ruling Labor Party in the southern English seaside resort of Brighton ",
  {'entities': [(57, 64, 'B-geo'),
    (129, 137, 'B-geo'),
    (75, 80, 'B-org'),
    (81, 86, 'I-org'),
    (103, 110, 'B-gpe')]})]

In [13]:
import spacy
nlp = spacy.load("en_core_web_sm")


In [14]:
doc = nlp(u"My name is Naitik")
print([(w.text, w.pos_) for w in doc])

[('My', 'ADJ'), ('name', 'NOUN'), ('is', 'VERB'), ('Naitik', 'ADJ')]


In [17]:
# doc[0].ner_

In [18]:
# nlp = spacy.blank('en')
# optimizer = nlp.begin_training()
# for i in tqdm(range(20)):
#     random.shuffle(training_data)
#     for text, annotations in training_data:
#         nlp.update([text], [annotations], sgd=optimizer)
# # nlp.to_disk("/model")

In [19]:
doc = nlp(u"London is in England")
print([(w.text, w.pos_) for w in doc])

[('London', ''), ('is', ''), ('in', ''), ('England', '')]


In [20]:
nlp= spacy.load("en_core_web_sm")

In [21]:
nlp.pipe_names

['tagger', 'parser', 'ner']

In [22]:
ner = nlp.get_pipe('ner')

In [23]:
# add labels
for _, annotations in training_data:
    for ent in annotations.get("entities"):
        ner.add_label(ent[2])

In [24]:
# get names of other pipes to disable them during training
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]

In [25]:
n_iter = 20

In [26]:
with nlp.disable_pipes(*other_pipes):
    for itn in tqdm(range(n_iter)):
        random.shuffle(training_data)
        losses = {}
        # batch up the examples using spaCy's minibatch
        batches = minibatch(training_data, size=compounding(4.0, 32.0, 1.001))
        for batch in batches:
            texts, annotations = zip(*batch)
            nlp.update(
                texts,  # batch of texts
                annotations,  # batch of annotations
                drop=0.5,  # dropout - make it harder to memorise data
                losses=losses,
            )
        print("Losses", losses)


  0%|          | 0/20 [00:00<?, ?it/s][A
  5%|▌         | 1/20 [09:34<3:02:00, 574.75s/it][A

Losses {'ner': 2268.6041980779205}



 10%|█         | 2/20 [20:31<2:59:46, 599.28s/it][A

Losses {'ner': 1988.5744319899613}



 15%|█▌        | 3/20 [31:18<2:53:53, 613.72s/it][A

Losses {'ner': 1982.4172289561357}



 20%|██        | 4/20 [41:47<2:44:52, 618.27s/it][A

Losses {'ner': 1957.1333465483972}



 25%|██▌       | 5/20 [52:28<2:36:17, 625.19s/it][A

Losses {'ner': 1950.7174705879002}



 30%|███       | 6/20 [1:04:09<2:31:07, 647.71s/it][A

Losses {'ner': 1942.8002620689704}



 35%|███▌      | 7/20 [1:18:44<2:35:09, 716.11s/it][A

Losses {'ner': 1949.5285265756247}



 40%|████      | 8/20 [1:34:10<2:35:46, 778.85s/it][A

Losses {'ner': 1931.42635524481}



 45%|████▌     | 9/20 [1:48:17<2:26:34, 799.53s/it][A

Losses {'ner': 1943.5496148778425}



 50%|█████     | 10/20 [2:02:38<2:16:19, 817.96s/it][A

Losses {'ner': 1943.8555093970583}



 55%|█████▌    | 11/20 [2:21:43<2:17:23, 915.96s/it][A

Losses {'ner': 1942.6862990617317}



 60%|██████    | 12/20 [2:36:49<2:01:44, 913.00s/it][A

Losses {'ner': 1920.1488884366577}



 65%|██████▌   | 13/20 [2:50:15<1:42:46, 881.00s/it][A

Losses {'ner': 1929.5339886334423}



 70%|███████   | 14/20 [3:06:35<1:31:03, 910.51s/it][A

Losses {'ner': 1929.9855643339454}



 75%|███████▌  | 15/20 [3:26:28<1:22:57, 995.47s/it][A

Losses {'ner': 1906.8587042942052}



 80%|████████  | 16/20 [3:42:25<1:05:35, 983.78s/it][A

Losses {'ner': 1932.152672894767}



 85%|████████▌ | 17/20 [3:55:00<45:45, 915.13s/it]  [A

Losses {'ner': 1915.788621468646}



 90%|█████████ | 18/20 [4:07:22<28:46, 863.30s/it][A

Losses {'ner': 1936.8314535313098}



 95%|█████████▌| 19/20 [4:19:50<13:48, 828.73s/it][A

Losses {'ner': 1930.882588409388}



100%|██████████| 20/20 [4:32:55<00:00, 818.76s/it]

Losses {'ner': 1917.192321644872}





In [27]:
# for text, _ in TRAIN_DATA:
#         doc = nlp(text)
#         print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
#         print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])

In [28]:
nlp.to_disk('model')

In [29]:
nlp_test = spacy.load('model')

In [30]:
doc = nlp_test("A mansion in Gurgaon is on sale")
print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])

Entities [('Gurgaon', 'B-geo')]
Tokens [('A', '', 2), ('mansion', '', 2), ('in', '', 2), ('Gurgaon', 'B-geo', 3), ('is', '', 2), ('on', '', 2), ('sale', '', 2)]
