In [1]:
import pandas as pd
import numpy as np
import nltk
import keras
import sklearn

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


## Named Entity Recognition

In [2]:
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense
from keras.layers import TimeDistributed, Dropout, Bidirectional
import unicodedata


# --- Hyper-parameters --------------------------------------------------------
MAXLEN = 180  # number of tokens every sentence is padded to
LSTM_N = 150  # units of the LSTM layer (per direction once wrapped in Bidirectional)
BS = 48       # mini-batch size used for training

In [3]:
# Load the annotated dataset (one token per row); the file is decoded as latin-1.
data=pd.read_csv("ner_dataset.csv", encoding="latin-1")


In [4]:
# Preview the first rows to see the column layout (sentence marker, word, POS, tag).
data.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O


In [5]:
# Dataset overview: number of distinct values per column.
print("Number of uniques docs, sentences and words in Training set:\n",data.nunique())

# Build the vocabulary.
# Every word is folded to an ASCII byte string: NFKD decomposition splits
# accented characters into base letter + combining mark, and encoding with
# errors='ignore' drops whatever is not ASCII, e.g. 'naïve café' -> b'naive cafe'.
# Folding can map several distinct words onto the same key, so the folded
# forms are collected in a set (no duplicate entries, n_words is the true
# vocabulary size) and then sorted so the word -> index mapping is
# reproducible instead of depending on set iteration order.
PAD_TOKEN = b"ENDPAD"
folded_words = {unicodedata.normalize('NFKD', str(w)).encode('ascii', 'ignore')
                for w in data["Word"].values}
folded_words.discard(PAD_TOKEN)  # the padding token is added exactly once, below
words = sorted(folded_words)
# The padding token goes last, so its index is always n_words - 1.
words.append(PAD_TOKEN)
n_words = len(words)

print("\nLength of vocabulary = ",n_words)

# Tag inventory, sorted for a reproducible tag -> index mapping
# (key=str keeps the sort well-defined even if a tag value is not a string).
tags = sorted(set(data["Tag"].values), key=str)
n_tags = len(tags)
print("\nnumber of tags = ",n_tags)

# Lookup tables used to turn sentences into integer sequences.
word2idx = {w: i for i, w in enumerate(words)}
tag2idx = {t: i for i, t in enumerate(tags)}

Number of uniques docs, sentences and words in Training set:
 Sentence #    47959
Word          35178
POS              42
Tag              17
dtype: int64

Length of vocabulary =  35179

number of tags =  17


In [6]:
# Sanity check: is the 'Sentence #' entry of the second row a float?
# (Rows without a sentence label are read by pandas as NaN, which is a float.)
type(data['Sentence #'][1])==float

True

In [7]:
# Give every token row the running number of the sentence it belongs to.
# Only the first token of a sentence carries a 'Sentence #' label; all other
# rows are NaN.  A cumulative count of the labelled rows therefore yields a
# 1-based sentence id for every row, without a Python-level loop or scratch
# variables.
data['Sent_ID'] = data['Sentence #'].notna().cumsum()

In [8]:
# Split the corpus by sentence: the leading TRAIN_FRACTION of the sentences is
# used for training, the remainder is held out for testing.
# The sentence count is derived from the data instead of being hard-coded, so
# the split stays correct if the dataset changes.
TRAIN_FRACTION = 0.80
n_sentences = data['Sentence #'].nunique()
first_test_label = "Sentence: " + str(round(n_sentences * TRAIN_FRACTION))

# Positional index of the row that starts the first held-out sentence.
# A position (not an index label) is used because it feeds `iloc` below;
# an IndexError is raised if the label does not occur, as before.
split_tresh = np.flatnonzero((data['Sentence #'] == first_test_label).values)[0]
test_data = data.iloc[split_tresh:, :]
train_data = data.iloc[:split_tresh, :]

In [9]:
def get_tagged_sentences(data):
    """Group token rows into sentences of ``(word, tag)`` pairs.

    Rows are grouped by the ``Sent_ID`` column; inside each group the
    ``Word`` and ``Tag`` columns are paired up in row order.

    Returns a list with one entry per sentence, each entry being a list of
    ``(word, tag)`` tuples.
    """
    def pair_words_with_tags(group):
        word_column = group["Word"].values.tolist()
        tag_column = group["Tag"].values.tolist()
        return list(zip(word_column, tag_column))

    per_sentence = data.groupby("Sent_ID").apply(pair_words_with_tags)
    return list(per_sentence)
 
def get_test_sentences(data):
    """Group token rows into sentences of bare words (no tags).

    Rows are grouped by sentence id and the ``Word`` column of each group is
    collected in row order.  The id column is ``Sent`` when the frame has
    one (the column this function originally required); otherwise the
    ``Sent_ID`` column is used, so frames that only carry ``Sent_ID`` no
    longer fail with ``KeyError``.

    Returns a list with one entry per sentence, each entry being a list of
    words.
    """
    sentence_col = "Sent" if "Sent" in data.columns else "Sent_ID"
    grouped = data.groupby(sentence_col).apply(lambda s: s["Word"].values.tolist())
    return list(grouped)
# Build the (word, tag) sentence lists for the training split and the
# held-out split, then show the first two training sentences.
sentences = get_tagged_sentences(train_data)
test_sentences = get_tagged_sentences(test_data)
print("First 2 sentences in a word list format:\n", sentences[:2])

First 2 sentences in a word list format:
 [[('Thousands', 'O'), ('of', 'O'), ('demonstrators', 'O'), ('have', 'O'), ('marched', 'O'), ('through', 'O'), ('London', 'B-geo'), ('to', 'O'), ('protest', 'O'), ('the', 'O'), ('war', 'O'), ('in', 'O'), ('Iraq', 'B-geo'), ('and', 'O'), ('demand', 'O'), ('the', 'O'), ('withdrawal', 'O'), ('of', 'O'), ('British', 'B-gpe'), ('troops', 'O'), ('from', 'O'), ('that', 'O'), ('country', 'O'), ('.', 'O')], [('Families', 'O'), ('of', 'O'), ('soldiers', 'O'), ('killed', 'O'), ('in', 'O'), ('the', 'O'), ('conflict', 'O'), ('joined', 'O'), ('the', 'O'), ('protesters', 'O'), ('who', 'O'), ('carried', 'O'), ('banners', 'O'), ('with', 'O'), ('such', 'O'), ('slogans', 'O'), ('as', 'O'), ('"', 'O'), ('Bush', 'B-per'), ('Number', 'O'), ('One', 'O'), ('Terrorist', 'O'), ('"', 'O'), ('and', 'O'), ('"', 'O'), ('Stop', 'O'), ('the', 'O'), ('Bombings', 'O'), ('.', 'O'), ('"', 'O')]]


In [10]:
def _ascii_key(word):
    """Fold a word to the ASCII byte string used as key in ``word2idx``
    (accented characters lose their marks, e.g. 'naïve café' -> b'naive cafe')."""
    return unicodedata.normalize('NFKD', str(word)).encode('ascii', 'ignore')


def _encode_words(tagged_sentences):
    """Replace every (word, tag) pair by the vocabulary index of its word."""
    return [[word2idx[_ascii_key(pair[0])] for pair in sent] for sent in tagged_sentences]


# Features: word indices for the training and the test sentences, padded at
# the end up to MAXLEN with the last vocabulary index (n_words - 1).
pad_index = n_words - 1
X = pad_sequences(maxlen=MAXLEN, sequences=_encode_words(sentences), padding="post", value=pad_index)
X_test = pad_sequences(maxlen=MAXLEN, sequences=_encode_words(test_sentences), padding="post", value=pad_index)

# Labels for the training sentences: tag indices, padded at the end up to
# MAXLEN with the index of the "O" tag.
tag_ids = [[tag2idx[pair[1]] for pair in sent] for sent in sentences]
y = pad_sequences(maxlen=MAXLEN, sequences=tag_ids, padding="post", value=tag2idx["O"])

# One-hot encode the labels (one vector of length n_tags per token position).
y = [to_categorical(row, num_classes=n_tags) for row in y]

In [11]:
# Input: one padded sentence, i.e. MAXLEN word indices.
# (Named `word_ids` rather than `input` so the builtin `input()` is not shadowed.)
word_ids = Input(shape=(MAXLEN,))

# Embedding: each word index becomes a dense vector.  `output_dim` reuses
# MAXLEN as the vector size, so every word maps to a MAXLEN-dimensional vector.
embedded = Embedding(input_dim=n_words, output_dim=MAXLEN, input_length=MAXLEN)(word_ids)

# Dropout on the embeddings.
embedded = Dropout(0.2)(embedded)

# Bidirectional LSTM so each position sees both left and right context;
# return_sequences=True keeps one output per token position.
contextual = Bidirectional(LSTM(units=LSTM_N, return_sequences=True, recurrent_dropout=0.1))(embedded)

# The same softmax Dense layer is applied at every position, giving a
# distribution over the n_tags tags per token.
out = TimeDistributed(Dense(n_tags, activation="softmax"))(contextual)

# `model` is bound once, to the finished Keras Model (intermediate tensors
# have their own names above).
model = Model(word_ids, out)

model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=['accuracy'])
history = model.fit(X, np.array(y), batch_size=BS, epochs=2, validation_split=0.05, verbose=1)

Train on 36447 samples, validate on 1919 samples
Epoch 1/2
Epoch 2/2


In [12]:
# Run the trained model on the padded test sentences; the output layer is a
# softmax over the tags applied at every token position.
t=model.predict(X_test)

In [13]:
# Index of the highest-scoring tag at every position (argmax over the last axis).
pred_index = np.argmax(t, axis=-1)

In [14]:
# Flatten the padded test inputs and the predicted tag indices so that
# element i of both lists refers to the same token slot.
ids, tagids = X_test.flatten().tolist(), pred_index.flatten().tolist()

# Map word indices back to text (vocabulary entries are ASCII byte strings).
words_test = [words[ind].decode('utf-8') for ind in ids]
# Map each predicted tag index back to its tag name.
tags_test = [tags[ind] for ind in tagids]

# NOTE: the reference tags are intentionally not derived from `y` here: `y`
# holds the labels of the *training* sentences, which neither match the test
# predictions above in content nor in length.  The reference tags for the
# test sentences are built from `test_sentences` in the following cell.


In [15]:
# Reference labels for the test sentences: tag indices, padded at the end up
# to MAXLEN with the index of the "O" tag.
y_test = [[tag2idx[pair[1]] for pair in sent] for sent in test_sentences]
y_test = pad_sequences(maxlen=MAXLEN, sequences=y_test, padding="post", value=tag2idx["O"])

# One-hot encoded form, matching the shape of the model output.
y_test = [to_categorical(row, num_classes=n_tags) for row in y_test]

# Collapse the one-hot vectors back to tag indices and then to tag names,
# flattened in the same order as the predictions.
true_index = np.argmax(y_test, axis=-1)
pred = true_index.flatten().tolist()  # despite its name, this holds the reference (true) indices
true_tags = []
for tag_index in pred:
    true_tags.append(tags[tag_index])

In [16]:
# One row per token slot: the word, its reference tag and the predicted tag.
df = pd.DataFrame(
    {
        'word': words_test,
        "True_tag": true_tags,
        "pred_tags": tags_test,
    }
)

In [17]:
# Inspect the first 20 token slots to compare reference and predicted tags.
df.head(20)

Unnamed: 0,word,True_tag,pred_tags
0,Mr.,B-per,B-per
1,Nour,I-per,I-per
2,was,O,O
3,arrested,O,O
4,in,O,O
5,January,B-tim,B-tim
6,and,O,I-tim
7,spent,O,O
8,six,B-tim,B-tim
9,weeks,O,O


In [18]:
# Print the layer-by-layer architecture with output shapes and parameter counts.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 180)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 180, 180)          6332220   
_________________________________________________________________
dropout_1 (Dropout)          (None, 180, 180)          0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 180, 300)          397200    
_________________________________________________________________
time_distributed_1 (TimeDist (None, 180, 17)           5117      
Total params: 6,734,537
Trainable params: 6,734,537
Non-trainable params: 0
_________________________________________________________________
