In [162]:
from __future__ import unicode_literals
import spacy
from spacy import displacy
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
import tensorflow_hub as hub
from keras import backend as K
from keras.preprocessing.sequence import pad_sequences
import copy
import re

In [4]:
data = pd.read_csv("NER_keras\\Named-Entity-Recognition_DeepLearning-keras-master\\ner_dataset.csv", encoding="latin1")
data = data.drop(['POS'], axis =1)
data = data.fillna(method="ffill")

In [69]:
words = set(list(data['Word'].values))
words.add('PADword')
n_words = len(words)
tags = list(set(data["Tag"].values))
n_tags = len(tags)

In [73]:
# Changing data format for consumption by spacy
class SentenceGetter(object):
    
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, t) for w, t in zip(s["Word"].values.tolist(),s["Tag"].values.tolist())]
        self.grouped = self.data.groupby("Sentence #").apply(agg_func)
        self.sentences = [s for s in self.grouped]
    
    def get_next(self):
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
            self.n_sent += 1
            return s
        except:
            return None

getter = SentenceGetter(data)
sentences = getter.sentences

In [83]:
words2index = {w:i for i,w in enumerate(words)}
tags2index = {t:i for i,t in enumerate(tags)}
tags2index

{'B-gpe': 0,
 'I-geo': 1,
 'B-eve': 2,
 'B-per': 3,
 'I-org': 4,
 'B-tim': 5,
 'I-eve': 6,
 'B-nat': 7,
 'I-tim': 8,
 'I-nat': 9,
 'I-gpe': 10,
 'B-geo': 11,
 'B-org': 12,
 'B-art': 13,
 'O': 14,
 'I-art': 15,
 'I-per': 16}

In [75]:
max_len = 50
X = [[w[0]for w in s] for s in sentences]
new_X = []
for seq in X:
    new_seq = []
    for i in range(max_len):
        try:
            new_seq.append(seq[i])
        except:v
            new_seq.append("PADword")
    new_X.append(new_seq)

In [78]:
y = [[tags2index[w[1]] for w in s] for s in sentences]
y = pad_sequences(maxlen=max_len, sequences=y, padding="post", value=tags2index["O"])

In [79]:
from sklearn.model_selection import train_test_split
X_tr, X_te, y_tr, y_te = train_test_split(new_X, y, test_size=0.1, random_state=2018)
batch_size = 32
X_te = X_te[:149*batch_size]
y_te = y_te[:149*batch_size]

In [107]:
index2tags = {val:key for key,val in tags2index.items()}
print (index2tags)
with open('kaggle_test.txt','w',encoding='utf-8') as f:
    with open('kaggle_test_truelabels.txt','w',encoding='utf-8') as g:
        for i in range(len(X_te)):
            for j in range(len(X_te[0])):
                #print (X_te[i][j],y_te[i][j])
                if X_te[i][j]!='PADword':
                    f.write(X_te[i][j]+' ')
                    g.write(index2tags[y_te[i][j]]+' ')
            f.write('\n')
            g.write('\n')
            

{0: 'B-gpe', 1: 'I-geo', 2: 'B-eve', 3: 'B-per', 4: 'I-org', 5: 'B-tim', 6: 'I-eve', 7: 'B-nat', 8: 'I-tim', 9: 'I-nat', 10: 'I-gpe', 11: 'B-geo', 12: 'B-org', 13: 'B-art', 14: 'O', 15: 'I-art', 16: 'I-per'}


In [None]:
model = 'en_core_web_lg'
nlp = spacy.load('D:\Softwares\Anaconda2\envs\\ner\Lib\site-packages\\'+model+'\\'+model+'-2.0.0')
with open('kaggle_test_text.txt','r') as g:
    lines = g.readlines()
    
with open('kaggle_test_truelabels.txt','r') as g:
    true_labels = g.readlines()

In [202]:
# precision = detected positive and true/all predicted positives
# recall = detected positive and true/all true positives
error = 0
pr_num,pr_den,re_num,re_den = 0.0,0.0,0.0,0.0 
for i in range(len(lines)): 
    text = re.sub(' +,+',' ',lines[i]).strip('\n').strip()
    doc = nlp(text)    
    #print text
    dic = defaultdict(int)
    
    li_index = [0]
    for k in range(len(text)):
        if text[k]==' ':
            li_index.append(k+1)
    pred_labels = copy.copy(li_index)
   
    for ent in doc.ents:
        #print(ent.text,ent.label_,ent.start_char,ent.end_char)
        
        if ent.label_=='ORG':
            flag = 1
            try:
                j = li_index.index(ent.start_char)          
                while j <len(li_index):
                    if li_index[j]>=ent.end_char:
                        break
                    #print(type(li_index[j]))
                    if flag == 1:
                        pred_labels[j] = 'B-org'
                        flag = 0
                        j += 1
                    #print (li_index[j],type(li_index[j]))
                    else:
                        pred_labels[j] = 'I-org'
                        j += 1
            except:
                pass

    for k in range(len(pred_labels)):
        if str(pred_labels[k]).isdigit():
            pred_labels[k] = 'O'

    for m in range(len(pred_labels)):
        if 'org' in true_labels[i].split()[m]:
            re_den += 1
        if 'org' in pred_labels[m]:
            pr_den += 1
            if 'org' in true_labels[i].split()[m]:
                pr_num += 1
                re_num += 1
                
print('Results for spacy model on the test set')
print ('Precision = ',pr_num/pr_den)
print ('Recall = ',re_num/re_den)
print ('F1 score =', 2 * (pr_num/pr_den) * (re_num/re_den)/((pr_num/pr_den) + (re_num/re_den)))

Results for spacy model on the test set
Precision =  0.686838124054463
Recall =  0.5922254109052961
F1 score = 0.636032502101429
