https://www.kaggle.com/jatinmittal0001/ner-bi-lstm-dealing-with-oov-words/notebook#Data-Pre-processing

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import random 
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
from sklearn.metrics import classification_report
import os
from sklearn.model_selection import train_test_split
import tensorflow as tf

In [2]:
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
  try:
    # Currently, memory growth needs to be the same across GPUs
    for gpu in gpus:
      tf.config.experimental.set_memory_growth(gpu, True)
    logical_gpus = tf.config.experimental.list_logical_devices('GPU')
    print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
  except RuntimeError as e:
    # Memory growth must be set before GPUs have been initialized
    print(e)

1 Physical GPUs, 1 Logical GPUs


In [3]:
data=pd.read_csv("../../../Dataset/NER-Dataset/ner_dataset.csv",encoding="latin1")
data.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O


In [4]:
data['Sentence #']=data['Sentence #'].ffill(axis = 0) 
data.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,Sentence: 1,of,IN,O
2,Sentence: 1,demonstrators,NNS,O
3,Sentence: 1,have,VBP,O
4,Sentence: 1,marched,VBN,O


In [5]:
agg_func = lambda s: [(w,p, t) for w,p, t in zip(s["Word"].values.tolist(),
                                                       s['POS'].values.tolist(),
                                                        s["Tag"].values.tolist())]

In [6]:
agg_data=data.groupby(['Sentence #']).apply(agg_func).reset_index().rename(columns={0:'Sentence_POS_Tag_Pair'})
agg_data.head()

Unnamed: 0,Sentence #,Sentence_POS_Tag_Pair
0,Sentence: 1,"[(Thousands, NNS, O), (of, IN, O), (demonstrat..."
1,Sentence: 10,"[(Iranian, JJ, B-gpe), (officials, NNS, O), (s..."
2,Sentence: 100,"[(Helicopter, NN, O), (gunships, NNS, O), (Sat..."
3,Sentence: 1000,"[(They, PRP, O), (left, VBD, O), (after, IN, O..."
4,Sentence: 10000,"[(U.N., NNP, B-geo), (relief, NN, O), (coordin..."


In [7]:
agg_data['Sentence']=agg_data['Sentence_POS_Tag_Pair'].apply(lambda sentence:" ".join([s[0] for s in sentence]))
agg_data['POS']=agg_data['Sentence_POS_Tag_Pair'].apply(lambda sentence:" ".join([s[1] for s in sentence]))
agg_data['Tag']=agg_data['Sentence_POS_Tag_Pair'].apply(lambda sentence:" ".join([s[2] for s in sentence]))

In [8]:
agg_data.shape

(47959, 5)

In [9]:
agg_data.head()

Unnamed: 0,Sentence #,Sentence_POS_Tag_Pair,Sentence,POS,Tag
0,Sentence: 1,"[(Thousands, NNS, O), (of, IN, O), (demonstrat...",Thousands of demonstrators have marched throug...,NNS IN NNS VBP VBN IN NNP TO VB DT NN IN NNP C...,O O O O O O B-geo O O O O O B-geo O O O O O B-...
1,Sentence: 10,"[(Iranian, JJ, B-gpe), (officials, NNS, O), (s...",Iranian officials say they expect to get acces...,JJ NNS VBP PRP VBP TO VB NN TO JJ JJ NNS IN DT...,B-gpe O O O O O O O O O O O O O O B-tim O O O ...
2,Sentence: 100,"[(Helicopter, NN, O), (gunships, NNS, O), (Sat...",Helicopter gunships Saturday pounded militant ...,"NN NNS NNP VBD JJ NNS IN DT NNP JJ NN , WRB JJ...",O O B-tim O O O O O B-geo O O O O O B-org O O ...
3,Sentence: 1000,"[(They, PRP, O), (left, VBD, O), (after, IN, O...",They left after a tense hour-long standoff wit...,PRP VBD IN DT NN JJ NN IN NN NNS .,O O O O O O O O O O O
4,Sentence: 10000,"[(U.N., NNP, B-geo), (relief, NN, O), (coordin...",U.N. relief coordinator Jan Egeland said Sunda...,"NNP NN NN NNP NNP VBD NNP , NNP , JJ CC JJ JJ ...",B-geo O O B-per I-per O B-tim O B-geo O B-gpe ...


In [10]:
agg_data['tokenised_sentences']=agg_data['Sentence'].apply(lambda x:x.split())
agg_data['tag_list']=agg_data['Tag'].apply(lambda x:x.split())
agg_data.head()

Unnamed: 0,Sentence #,Sentence_POS_Tag_Pair,Sentence,POS,Tag,tokenised_sentences,tag_list
0,Sentence: 1,"[(Thousands, NNS, O), (of, IN, O), (demonstrat...",Thousands of demonstrators have marched throug...,NNS IN NNS VBP VBN IN NNP TO VB DT NN IN NNP C...,O O O O O O B-geo O O O O O B-geo O O O O O B-...,"[Thousands, of, demonstrators, have, marched, ...","[O, O, O, O, O, O, B-geo, O, O, O, O, O, B-geo..."
1,Sentence: 10,"[(Iranian, JJ, B-gpe), (officials, NNS, O), (s...",Iranian officials say they expect to get acces...,JJ NNS VBP PRP VBP TO VB NN TO JJ JJ NNS IN DT...,B-gpe O O O O O O O O O O O O O O B-tim O O O ...,"[Iranian, officials, say, they, expect, to, ge...","[B-gpe, O, O, O, O, O, O, O, O, O, O, O, O, O,..."
2,Sentence: 100,"[(Helicopter, NN, O), (gunships, NNS, O), (Sat...",Helicopter gunships Saturday pounded militant ...,"NN NNS NNP VBD JJ NNS IN DT NNP JJ NN , WRB JJ...",O O B-tim O O O O O B-geo O O O O O B-org O O ...,"[Helicopter, gunships, Saturday, pounded, mili...","[O, O, B-tim, O, O, O, O, O, B-geo, O, O, O, O..."
3,Sentence: 1000,"[(They, PRP, O), (left, VBD, O), (after, IN, O...",They left after a tense hour-long standoff wit...,PRP VBD IN DT NN JJ NN IN NN NNS .,O O O O O O O O O O O,"[They, left, after, a, tense, hour-long, stand...","[O, O, O, O, O, O, O, O, O, O, O]"
4,Sentence: 10000,"[(U.N., NNP, B-geo), (relief, NN, O), (coordin...",U.N. relief coordinator Jan Egeland said Sunda...,"NNP NN NN NNP NNP VBD NNP , NNP , JJ CC JJ JJ ...",B-geo O O B-per I-per O B-tim O B-geo O B-gpe ...,"[U.N., relief, coordinator, Jan, Egeland, said...","[B-geo, O, O, B-per, I-per, O, B-tim, O, B-geo..."


In [11]:
agg_data['len_sentence']=agg_data['tokenised_sentences'].apply(lambda x:len(x))
agg_data['len_tag']=agg_data['tag_list'].apply(lambda x:len(x))
agg_data['is_equal']=agg_data.apply(lambda row:1 if row['len_sentence']==row['len_tag'] else 0,axis=1)
agg_data['is_equal'].value_counts()

1    47955
0        4
Name: is_equal, dtype: int64

In [12]:
agg_data.head()

Unnamed: 0,Sentence #,Sentence_POS_Tag_Pair,Sentence,POS,Tag,tokenised_sentences,tag_list,len_sentence,len_tag,is_equal
0,Sentence: 1,"[(Thousands, NNS, O), (of, IN, O), (demonstrat...",Thousands of demonstrators have marched throug...,NNS IN NNS VBP VBN IN NNP TO VB DT NN IN NNP C...,O O O O O O B-geo O O O O O B-geo O O O O O B-...,"[Thousands, of, demonstrators, have, marched, ...","[O, O, O, O, O, O, B-geo, O, O, O, O, O, B-geo...",24,24,1
1,Sentence: 10,"[(Iranian, JJ, B-gpe), (officials, NNS, O), (s...",Iranian officials say they expect to get acces...,JJ NNS VBP PRP VBP TO VB NN TO JJ JJ NNS IN DT...,B-gpe O O O O O O O O O O O O O O B-tim O O O ...,"[Iranian, officials, say, they, expect, to, ge...","[B-gpe, O, O, O, O, O, O, O, O, O, O, O, O, O,...",25,25,1
2,Sentence: 100,"[(Helicopter, NN, O), (gunships, NNS, O), (Sat...",Helicopter gunships Saturday pounded militant ...,"NN NNS NNP VBD JJ NNS IN DT NNP JJ NN , WRB JJ...",O O B-tim O O O O O B-geo O O O O O B-org O O ...,"[Helicopter, gunships, Saturday, pounded, mili...","[O, O, B-tim, O, O, O, O, O, B-geo, O, O, O, O...",32,32,1
3,Sentence: 1000,"[(They, PRP, O), (left, VBD, O), (after, IN, O...",They left after a tense hour-long standoff wit...,PRP VBD IN DT NN JJ NN IN NN NNS .,O O O O O O O O O O O,"[They, left, after, a, tense, hour-long, stand...","[O, O, O, O, O, O, O, O, O, O, O]",11,11,1
4,Sentence: 10000,"[(U.N., NNP, B-geo), (relief, NN, O), (coordin...",U.N. relief coordinator Jan Egeland said Sunda...,"NNP NN NN NNP NNP VBD NNP , NNP , JJ CC JJ JJ ...",B-geo O O B-per I-per O B-tim O B-geo O B-gpe ...,"[U.N., relief, coordinator, Jan, Egeland, said...","[B-geo, O, O, B-per, I-per, O, B-tim, O, B-geo...",35,35,1


In [13]:
sentences_list=agg_data['Sentence'].tolist()
tags_list=agg_data['tag_list'].tolist()

print("Number of Sentences in the Data ",len(sentences_list))
print("Are number of Sentences and Tag list equal ",len(sentences_list)==len(tags_list))

Number of Sentences in the Data  47959
Are number of Sentences and Tag list equal  True


In [14]:
tags_list[0]

['O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-geo',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-geo',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-gpe',
 'O',
 'O',
 'O',
 'O',
 'O']

In [15]:
tokeniser= tf.keras.preprocessing.text.Tokenizer(lower=False,filters='')

tokeniser.fit_on_texts(sentences_list)

In [16]:
print("Vocab size of Tokeniser ",len(tokeniser.word_index)+1) 

Vocab size of Tokeniser  35179


In [17]:
tokeniser.index_word[326]

'national'

In [18]:
encoded_sentence=tokeniser.texts_to_sequences(sentences_list)
print("First Original Sentence ",sentences_list[0])
print("First Encoded Sentence ",encoded_sentence[0])
print("Is Length of Original Sentence Same as Encoded Sentence ",len(sentences_list[0].split())==len(encoded_sentence[0]))
print("Length of First Sentence ",len(encoded_sentence[0]))

First Original Sentence  Thousands of demonstrators have marched through London to protest the war in Iraq and demand the withdrawal of British troops from that country .
First Encoded Sentence  [1114, 4, 1161, 16, 1852, 229, 478, 6, 533, 1, 155, 5, 58, 8, 582, 1, 843, 4, 179, 87, 21, 15, 52, 2]
Is Length of Original Sentence Same as Encoded Sentence  True
Length of First Sentence  24


In [19]:
tags=list(set(data['Tag'].values))
print(tags)
num_tags=len(tags)
print("Number of Tags ",num_tags)

tags_map={tag:i for i,tag in enumerate(tags)}
print("Tags Map ",tags_map)

['I-art', 'B-org', 'B-gpe', 'O', 'B-per', 'I-tim', 'I-per', 'I-gpe', 'I-eve', 'I-org', 'B-tim', 'B-nat', 'B-geo', 'B-eve', 'B-art', 'I-nat', 'I-geo']
Number of Tags  17
Tags Map  {'I-art': 0, 'B-org': 1, 'B-gpe': 2, 'O': 3, 'B-per': 4, 'I-tim': 5, 'I-per': 6, 'I-gpe': 7, 'I-eve': 8, 'I-org': 9, 'B-tim': 10, 'B-nat': 11, 'B-geo': 12, 'B-eve': 13, 'B-art': 14, 'I-nat': 15, 'I-geo': 16}


In [20]:
reverse_tag_map={v: k for k, v in tags_map.items()}

In [21]:
encoded_tags=[[tags_map[w] for w in tag] for tag in tags_list]
print("First Sentence ",sentences_list[0])
print('First Sentence Original Tags ',tags_list[0])
print("First Sentence Encoded Tags ",encoded_tags[0])
print("Is length of Original Tags and Encoded Tags same ",len(tags_list[0])==len(encoded_tags[0]))
print("Length of Tags for First Sentence ",len(encoded_tags[0]))

First Sentence  Thousands of demonstrators have marched through London to protest the war in Iraq and demand the withdrawal of British troops from that country .
First Sentence Original Tags  ['O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-gpe', 'O', 'O', 'O', 'O', 'O']
First Sentence Encoded Tags  [3, 3, 3, 3, 3, 3, 12, 3, 3, 3, 3, 3, 12, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 3]
Is length of Original Tags and Encoded Tags same  True
Length of Tags for First Sentence  24


In [22]:
max_sentence_length=max([len(s.split()) for s in sentences_list])
print(max_sentence_length)

104


In [23]:
max_len=128
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

padded_encoded_sentences=pad_sequences(maxlen=max_len,sequences=encoded_sentence,padding="post",value=0)
padded_encoded_tags=pad_sequences(maxlen=max_len,sequences=encoded_tags,padding="post",value=tags_map['O'])

print("Shape of Encoded Sentence ",padded_encoded_sentences.shape)
print("Shape of Encoded Labels ",padded_encoded_tags.shape)

print("First Encoded Sentence Without Padding ",encoded_sentence[0])
print("First Encoded Sentence with padding ",padded_encoded_sentences[0])
print("First Sentence Encoded Label without Padding ",encoded_tags[0])
print("First Sentence Encoded Label with Padding ",padded_encoded_tags[0])

Shape of Encoded Sentence  (47959, 128)
Shape of Encoded Labels  (47959, 128)
First Encoded Sentence Without Padding  [1114, 4, 1161, 16, 1852, 229, 478, 6, 533, 1, 155, 5, 58, 8, 582, 1, 843, 4, 179, 87, 21, 15, 52, 2]
First Encoded Sentence with padding  [1114    4 1161   16 1852  229  478    6  533    1  155    5   58    8
  582    1  843    4  179   87   21   15   52    2    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0]
First Sentence Encoded Label without Padding  [3, 3, 3, 3, 3, 3, 12, 3, 3, 3, 3, 3, 12, 3, 3

In [24]:
target= [to_categorical(i,num_classes = num_tags) for i in  padded_encoded_tags]
print("Shape of Labels  after converting to Categorical for first sentence ",target[0].shape)

Shape of Labels  after converting to Categorical for first sentence  (128, 17)


In [25]:
from sklearn.model_selection import train_test_split
X_train,X_val_test,y_train,y_val_test = train_test_split(padded_encoded_sentences,target,test_size = 0.3,random_state=42)
X_val,X_test,y_val,y_test = train_test_split(X_val_test,y_val_test,test_size = 0.2,random_state=42)
print("Input Train Data Shape ",X_train.shape)
print("Train Labels Length ",len(y_train))
print("Input Test Data Shape ",X_test.shape)
print("Test Labels Length ",len(y_test))

print("Input Validation Data Shape ",X_val.shape)
print("Validation Labels Length ",len(y_val))

Input Train Data Shape  (33571, 128)
Train Labels Length  33571
Input Test Data Shape  (2878, 128)
Test Labels Length  2878
Input Validation Data Shape  (11510, 128)
Validation Labels Length  11510


In [26]:
print("Shape of First Sentence -Train",X_train[0].shape)
print("Shape of First Sentence Label  -Train",y_train[0].shape)

Shape of First Sentence -Train (128,)
Shape of First Sentence Label  -Train (128, 17)


In [27]:
from tensorflow.keras import Model,Input
from tensorflow.keras.layers import LSTM,Embedding,Dense
from tensorflow.keras.layers import TimeDistributed, SpatialDropout1D,Bidirectional

# Model 1 LSTM

In [28]:
embedding_dim=128
vocab_size=len(tokeniser.word_index)+1
lstm_units=128
max_len=128

input_word = Input(shape = (max_len,))
model = Embedding(input_dim = vocab_size+1,output_dim = embedding_dim,input_length = max_len)(input_word)

model = LSTM(units=embedding_dim,return_sequences=True)(model)
out = TimeDistributed(Dense(num_tags,activation = 'softmax'))(model)
model = Model(input_word,out)
model.summary()

Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 128)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 128, 128)          4503040   
_________________________________________________________________
lstm (LSTM)                  (None, 128, 128)          131584    
_________________________________________________________________
time_distributed (TimeDistri (None, 128, 17)           2193      
Total params: 4,636,817
Trainable params: 4,636,817
Non-trainable params: 0
_________________________________________________________________


In [29]:
model.compile(optimizer = 'adam',loss = 'categorical_crossentropy',metrics = ['accuracy'])

In [30]:
history = model.fit(X_train,np.array(y_train),validation_data=(X_val,np.array(y_val)),batch_size = 32,epochs = 3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [31]:
preds=model.predict(X_test) ## Predict using model on Test Data


# Model 2 Bi-LSTM

In [32]:
input_word2 = Input(shape = (max_len,))
model2 = Embedding(input_dim = vocab_size+1,output_dim = embedding_dim,input_length = max_len)(input_word2)

model2 = Bidirectional(LSTM(units=embedding_dim,return_sequences=True))(model2)
out2 = TimeDistributed(Dense(num_tags,activation = 'softmax'))(model2)
model2 = Model(input_word2,out2)
model2.summary()

Model: "functional_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 128)]             0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 128, 128)          4503040   
_________________________________________________________________
bidirectional (Bidirectional (None, 128, 256)          263168    
_________________________________________________________________
time_distributed_1 (TimeDist (None, 128, 17)           4369      
Total params: 4,770,577
Trainable params: 4,770,577
Non-trainable params: 0
_________________________________________________________________


In [33]:
model2.compile(optimizer = 'adam',loss = 'categorical_crossentropy',metrics = ['accuracy'])

In [34]:
history2 = model2.fit(X_train,np.array(y_train),validation_data=(X_val,np.array(y_val)),batch_size = 32,epochs = 6)

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


In [35]:
preds2=model2.predict(X_test) ## Predict using model on Test Data

# Evaluation

In [36]:
def evaluatePredictions(test_data,preds,actual_preds):
    print("Shape of Test Data Array",test_data.shape)
    y_actual=np.argmax(np.array(actual_preds),axis=2)
    y_pred=np.argmax(preds,axis=2)
    num_test_data=test_data.shape[0]
    print("Number of Test Data Points ",num_test_data)
    data=pd.DataFrame()
    df_list=[]
    for i in range(num_test_data):
        test_str=list(test_data[i])
        df=pd.DataFrame()
        df['test_tokens']=test_str
        df['tokens']=df['test_tokens'].apply(lambda x:tokeniser.index_word[x] if x!=0 else '<PAD>')
        df['actual_target_index']=list(y_actual[i])
        df['pred_target_index']=list(y_pred[i])
        df['actual_target_tag']=df['actual_target_index'].apply(lambda x:reverse_tag_map[x])
        df['pred_target_tag']=df['pred_target_index'].apply(lambda x:reverse_tag_map[x])
        df['id']=i+1
        df_list.append(df)
    data=pd.concat(df_list)
    pred_data=data[data['tokens']!='<PAD>']
    accuracy=pred_data[pred_data['actual_target_tag']==pred_data['pred_target_tag']].shape[0]/pred_data.shape[0]
    
    
    return pred_data,accuracy
        
# pred_data[pred_data['actual_target_tag']=="B-art"]

In [37]:
pred_data,accuracy=evaluatePredictions(X_test,preds,y_test)
pred_data2,accuracy2=evaluatePredictions(X_test,preds2,y_test)

Shape of Test Data Array (2878, 128)
Number of Test Data Points  2878
Shape of Test Data Array (2878, 128)
Number of Test Data Points  2878


In [38]:
y_pred=pred_data['pred_target_tag'].tolist()
y_actual=pred_data['actual_target_tag'].tolist()

y_pred2=pred_data2['pred_target_tag'].tolist()
y_actual2=pred_data2['actual_target_tag'].tolist()

In [39]:
print(classification_report(y_actual,y_pred))

              precision    recall  f1-score   support

       B-art       0.00      0.00      0.00        27
       B-eve       0.75      0.18      0.29        17
       B-geo       0.78      0.90      0.83      2151
       B-gpe       0.95      0.92      0.94       919
       B-nat       0.00      0.00      0.00         5
       B-org       0.78      0.52      0.63      1305
       B-per       0.85      0.78      0.81      1062
       B-tim       0.88      0.81      0.84      1197
       I-art       0.00      0.00      0.00        30
       I-eve       0.00      0.00      0.00        15
       I-geo       0.71      0.79      0.75       413
       I-gpe       0.00      0.00      0.00        11
       I-org       0.81      0.72      0.76      1053
       I-per       0.86      0.84      0.85      1066
       I-tim       0.85      0.53      0.66       401
           O       0.98      0.99      0.99     52763

    accuracy                           0.96     62435
   macro avg       0.57   

  _warn_prf(average, modifier, msg_start, len(result))


In [40]:
print(classification_report(y_actual2,y_pred2))

              precision    recall  f1-score   support

       B-art       0.25      0.07      0.11        27
       B-eve       0.42      0.29      0.34        17
       B-geo       0.83      0.89      0.86      2151
       B-gpe       0.96      0.94      0.95       919
       B-nat       0.14      0.20      0.17         5
       B-org       0.82      0.64      0.72      1305
       B-per       0.79      0.82      0.80      1062
       B-tim       0.89      0.89      0.89      1197
       I-art       0.18      0.07      0.10        30
       I-eve       0.31      0.27      0.29        15
       I-geo       0.76      0.78      0.77       413
       I-gpe       0.86      0.55      0.67        11
       I-nat       0.00      0.00      0.00         0
       I-org       0.85      0.71      0.77      1053
       I-per       0.85      0.87      0.86      1066
       I-tim       0.76      0.79      0.77       401
           O       0.99      0.99      0.99     52763

    accuracy              

  _warn_prf(average, modifier, msg_start, len(result))
