In [42]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, Embedding, Bidirectional, LSTM, TimeDistributed, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.models import Model

In [43]:
# Load data from CSV file -> got this dataset from kaggle
data = pd.read_csv('../data/ner_dataset.csv', encoding="latin1")
# Take a quick look at the data -> notice that we have NaN for every sentence # except the first word in the sentence  
data.head(25)

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O
5,,through,IN,O
6,,London,NNP,B-geo
7,,to,TO,O
8,,protest,VB,O
9,,the,DT,O


In [44]:
data.dtypes
curr_sentence = 1
# Iterate through the "Sentence #" column using .iteritems()
for index, sentence_num in data['Sentence #'].iteritems():
    if (pd.isna(sentence_num)):
        data.loc[index, 'Sentence #'] = curr_sentence # Replace NaN with correct sentence number
    else:
        curr_sentence = int(sentence_num.split(': ')[1])
        data.loc[index, 'Sentence #'] = curr_sentence # Replace Sentence: X with just X 
data.iloc[22:28]

  for index, sentence_num in data['Sentence #'].iteritems():


Unnamed: 0,Sentence #,Word,POS,Tag
22,1,country,NN,O
23,1,.,.,O
24,2,Families,NNS,O
25,2,of,IN,O
26,2,soldiers,NNS,O
27,2,killed,VBN,O


In [45]:
data['Sentence #'] = data['Sentence #'].astype('int64') # We want to make the sentence# column integers
data.dtypes

Sentence #     int64
Word          object
POS           object
Tag           object
dtype: object

In [46]:
tag_encoder = LabelEncoder()
data['Tag_index'] = tag_encoder.fit_transform(data['Tag'])

# Group data by sentence and create sequences of words and tags
grouped_data = data.groupby('Sentence #')[['Word', 'Tag_index']].apply(lambda x: x.values.tolist()).to_dict()
grouped_data[1] # We get a dictionary mapping each sentence to a list of pairs containing a given word + its tag

[['Thousands', 16],
 ['of', 16],
 ['demonstrators', 16],
 ['have', 16],
 ['marched', 16],
 ['through', 16],
 ['London', 2],
 ['to', 16],
 ['protest', 16],
 ['the', 16],
 ['war', 16],
 ['in', 16],
 ['Iraq', 2],
 ['and', 16],
 ['demand', 16],
 ['the', 16],
 ['withdrawal', 16],
 ['of', 16],
 ['British', 3],
 ['troops', 16],
 ['from', 16],
 ['that', 16],
 ['country', 16],
 ['.', 16]]

In [47]:
# Convert grouped data into sequences of sentences and labels
sentences = []
labels = []
for sentence, group in grouped_data.items():
    words = [item[0] for item in group]
    tags = [item[1] for item in group]
    sentences.append(words)
    labels.append(tags)
# Here, we produce a list of words for each sentence & a list of tags for each sentence
print (sentences[0])
print ("----------------------------------------------------------------------------------------------------------")
print (labels[0])

['Thousands', 'of', 'demonstrators', 'have', 'marched', 'through', 'London', 'to', 'protest', 'the', 'war', 'in', 'Iraq', 'and', 'demand', 'the', 'withdrawal', 'of', 'British', 'troops', 'from', 'that', 'country', '.']
----------------------------------------------------------------------------------------------------------
[16, 16, 16, 16, 16, 16, 2, 16, 16, 16, 16, 16, 2, 16, 16, 16, 16, 16, 3, 16, 16, 16, 16, 16]


In [48]:
# Create an index for each unique word. Then create an index for unknown words
word_to_index = {}
for sentence in sentences:
    for word in sentence:
        if word not in word_to_index:
            word_to_index[word] = len(word_to_index)
word_to_index['UNK'] = len(word_to_index)

In [49]:
index_to_tag = {idx: tag for idx, tag in enumerate(tag_encoder.classes_)}
print (index_to_tag) # map between tags and their indices

{0: 'B-art', 1: 'B-eve', 2: 'B-geo', 3: 'B-gpe', 4: 'B-nat', 5: 'B-org', 6: 'B-per', 7: 'B-tim', 8: 'I-art', 9: 'I-eve', 10: 'I-geo', 11: 'I-gpe', 12: 'I-nat', 13: 'I-org', 14: 'I-per', 15: 'I-tim', 16: 'O'}


In [50]:
sentences_indices = []

for sentence in sentences:
    sentence_indices = [word_to_index.get(word, word_to_index['UNK']) for word in sentence]
    sentences_indices.append(sentence_indices)
print (sentences_indices[0]) # Same as sentences, but we use index for each word

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 9, 15, 1, 16, 17, 18, 19, 20, 21]


In [51]:
num_classes = len(index_to_tag)

# Here, we want to pad the x (word indices) & y (tag indices) data so that each is the same length
# We'll cap out our sentences at whatever the word length of the longest sentence is
# Find the maximum sequence length for both X and y
max_sequence_length = max(max(len(sequence) for sequence in sentences_indices), max(len(sequence) for sequence in labels))
print ("MAX SEQUENCE LENGTH = " + str(max_sequence_length))

# Pad the word index sequences (x)
x = pad_sequences(sentences_indices, maxlen=max_sequence_length, padding='post', value=0)  # Assuming '0' is the padding value

# Pad the tag index sequences (y)
y = pad_sequences(labels, maxlen=max_sequence_length, padding='post', value=0)  # Assuming '0' is the padding value
y = np.array(y)

MAX SEQUENCE LENGTH = 104


In [52]:
# Sequences have extra zeroes at the end to normalize length
print(x[0])
print(y[0])

[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14  9 15  1 16 17 18 19 20 21
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0]
[16 16 16 16 16 16  2 16 16 16 16 16  2 16 16 16 16 16  3 16 16 16 16 16
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0]


In [53]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Define params
embedding_dim = 100
lstm_units = 64     
num_epochs = 5 # This ended up being well enough    
batch_size = 32
num_words = len(word_to_index)
num_tags = 17     

# Build bidirectional LSTM model
model = Sequential()
model.add(Embedding(input_dim=num_words, output_dim=embedding_dim, input_length=max_sequence_length))
model.add(Bidirectional(LSTM(units=lstm_units, return_sequences=True)))
model.add(TimeDistributed(Dense(units=num_tags, activation='softmax'))) # softmax for multiple categories

# We didn't use to_categorical() on the y data so we can use sparse instead of normal categorical crossentropy
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train model
model.fit(x_train, y_train, epochs=num_epochs, batch_size=batch_size, validation_data=(x_test, y_test))


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f8d58f988e0>

In [54]:
print("Evaluate on test data")
results = model.evaluate(x_test, y_test, batch_size=128)
print("test loss: " + str(results[0]))
print("test accuracy: " + str(results[1]))

Evaluate on test data
test loss: 0.02473883517086506
test accuracy: 0.9931333065032959


In [55]:
# Test with example sentence
example_sentence = "Millions of people gathered in my home state of Massachusetts because Jim was there."

# Tokenize sentence, convert to word indices, & pad
words = example_sentence.split()
print("Split sentence:")
print(words)
sentence_length = len(words)
word_indices = [word_to_index.get(word, word_to_index['UNK']) for word in words]
padded_sequence = pad_sequences([word_indices], maxlen=max_sequence_length, padding='post', value=0)

# Make predictions
example_predictions = model.predict(padded_sequence)

# Print predictions
ex_tags = []
print("Predictions:")
for tag_probabilities in example_predictions[0]:
    ex_tags.append(index_to_tag[np.argmax(tag_probabilities)])
print (ex_tags[:sentence_length])


Split sentence:
['Millions', 'of', 'people', 'gathered', 'in', 'my', 'home', 'state', 'of', 'Massachusetts', 'because', 'Jim', 'was', 'there.']
Predictions:
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'B-per', 'O', 'O']


In [56]:
# Export the model
model.save('ner_model.h5')

In [58]:
dict_str = repr(word_to_index)

# Save the dictionary to a separate script
with open("../app/ner_variable_storage.py", "w") as file:
    file.write("max_sequence_length = " + str(max_sequence_length) + "\n")
    file.write("word_to_index = " + dict_str)