In [3]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
print(tf.__version__)

2.3.0


In [4]:
from google.colab import drive

drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Data from Kaggle

https://www.kaggle.com/namanj27/ner-dataset


In [5]:
raw_data = pd.read_csv('/content/drive/My Drive/dat390_data/ner_datasetreference.csv', encoding = "ISO-8859-1")
raw_data.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O


In [8]:
data = raw_data
from itertools import chain
tok2idx = {}
idx2tok = {}
vocab = list(set(data['Word'].to_list()))
idx2tok = {idx:tok for  idx, tok in enumerate(vocab)}
tok2idx = {tok:idx for  idx, tok in enumerate(vocab)}
data['Word_idx'] = data['Word'].map(tok2idx)


tok2idx = {}
idx2tok = {}
tags = list(set(data['Tag'].to_list()))
idx2tag = {idx:tok for  idx, tok in enumerate(tags)}
tag2idx = {tok:idx for  idx, tok in enumerate(tags)}
data['Tag_idx'] = data['Tag'].map(tag2idx)

In [9]:
data.head()

Unnamed: 0,Sentence #,Word,POS,Tag,Word_idx,Tag_idx
0,Sentence: 1,Thousands,NNS,O,29370,2
1,,of,IN,O,16591,2
2,,demonstrators,NNS,O,29799,2
3,,have,VBP,O,2817,2
4,,marched,VBN,O,11835,2


In [10]:
n_tags = len(tags)
n_vocab = len(vocab)
n_tags, n_vocab

(17, 35178)

In [11]:
print(tags)

['I-geo', 'I-per', 'O', 'B-org', 'I-org', 'B-gpe', 'I-nat', 'B-per', 'B-art', 'I-gpe', 'I-art', 'B-eve', 'I-eve', 'B-geo', 'B-nat', 'I-tim', 'B-tim']


In [12]:
data_fillna = data.fillna(method='ffill', axis=0)
data_sentences = data_fillna.groupby(['Sentence #'], as_index=False
                                )['Word', 'POS', 'Tag', 'Word_idx', 
                                  'Tag_idx'].agg(lambda x: list(x))

  after removing the cwd from sys.path.


In [13]:
' '.join(data_sentences.iloc[0]['Word'])

'Thousands of demonstrators have marched through London to protest the war in Iraq and demand the withdrawal of British troops from that country .'

In [14]:
inputs = np.array(data_sentences['Word_idx'])
targets = np.array(data_sentences['Tag_idx'])
inputs = tf.keras.preprocessing.sequence.pad_sequences(inputs, padding='post', truncating='post')
targets = tf.keras.preprocessing.sequence.pad_sequences(targets, padding='post', truncating='post', value= tag2idx["O"])

In [15]:
input_len = len(inputs[0])

In [16]:
from sklearn.model_selection import train_test_split
train_inputs, test_inputs, train_targets, test_targets = train_test_split(inputs, targets, test_size=0.1, random_state=42)

In [17]:
train_inputs = np.array(train_inputs).reshape(len(train_inputs), 104, 1)
train_targets = np.array(train_targets)
test_inputs = np.array(test_inputs).reshape(len(test_inputs), 104, 1)
test_targets = np.array(test_targets)

## Building LSTM network

In [19]:
from tensorflow.keras import Sequential, Model, Input
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from tensorflow.keras.utils import plot_model
tf.random.set_seed(2)

In [37]:
model = Sequential()
model.add(Embedding(input_dim=n_vocab+1, output_dim=8))
model.add(Bidirectional(LSTM(units=8, return_sequences=True, dropout=0.2, recurrent_dropout=0.2), merge_mode = 'concat'))
model.add(LSTM(units=8, return_sequences=True, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(n_tags, activation="relu"))
model.compile(loss='SparseCategoricalCrossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, None, 8)           281432    
_________________________________________________________________
bidirectional_8 (Bidirection (None, None, 16)          1088      
_________________________________________________________________
lstm_17 (LSTM)               (None, None, 8)           800       
_________________________________________________________________
dense_7 (Dense)              (None, None, 17)          153       
Total params: 283,473
Trainable params: 283,473
Non-trainable params: 0
_________________________________________________________________


In [38]:
history = model.fit(train_inputs, train_targets, batch_size=1000, epochs=3, validation_split=0.2)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [39]:
model.evaluate(test_inputs, test_targets)



[0.31910961866378784, 0.967915952205658]

## Refereences:

https://github.com/snehalnair/Named-Entity-Recognition/

https://www.kaggle.com/namanj27/ner-dataset
