# Step 1: The dataset we will use for training the model can be found here: https://www.kaggle.com/abhinavwalia95/entity-annotated-corpus#ner_dataset.csv. Create a new environment or use an existing environment to create a new Jupyter Notebook. In the Notebook, read in as many sentences of the ner_dataset as your computer can handle. I chose 6000, so I read in 131419 rows of the dataset.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.style.use("ggplot")

# Number of CSV rows to keep. The notebook text states that 131419 rows
# correspond to the first 6000 sentences of the dataset.
sample_size = 131419

# Read only the first `sample_size` rows instead of loading the whole file
# and discarding the remainder afterwards.
data = pd.read_csv("ner_dataset.csv", encoding="latin1", nrows=sample_size)

# Forward-fill missing cells with the last non-null value above them.
# `DataFrame.ffill()` replaces the deprecated `fillna(method="ffill")` form.
# NOTE(review): presumably this propagates the "Sentence #" label to every
# token row of a sentence - confirm against the CSV layout.
data = data.ffill()
data.tail(10)

In [None]:
# Vocabulary: every distinct value of the "Word" column followed by a
# trailing "ENDPAD" entry.
words = [*set(data["Word"].values), "ENDPAD"]
n_words = len(words)
n_words

In [None]:
# Distinct labels found in the "Tag" column. The list is sorted because the
# iteration order of a set of strings can change between interpreter runs,
# and the position of each tag in this list is what defines its integer id
# in `tag2idx`. Sorting makes that mapping reproducible.
tags = sorted(set(data["Tag"].values))
n_tags = len(tags)
n_tags

# Step 2: Use the SentenceGetter class to retrieve sentences with their labels.

In [None]:
class SentenceGetter(object):
    """Group a token-per-row DataFrame into sentences.

    The frame must provide the columns "Sentence #", "Word", "POS" and
    "Tag". Rows sharing the same "Sentence #" value form one sentence, which
    is represented as a list of ``(word, pos, tag)`` tuples.
    """

    def __init__(self, data):
        """Build the per-sentence grouping for ``data`` (a pandas DataFrame)."""
        # 1-based number of the sentence the next get_next() call looks up.
        self.n_sent = 1
        self.data = data
        # Not read or updated anywhere in this class; kept so existing
        # callers that touch the attribute keep working.
        self.empty = False

        def agg_func(s):
            # One (word, pos, tag) tuple per row of the group.
            return list(zip(s["Word"].tolist(),
                            s["POS"].tolist(),
                            s["Tag"].tolist()))

        # Series indexed by the "Sentence #" value; each element is the list
        # of tuples for that sentence.
        self.grouped = self.data.groupby("Sentence #").apply(agg_func)
        # NOTE: groupby sorts by group key by default, so this list follows
        # the order of the "Sentence #" keys, not necessarily file order.
        self.sentences = list(self.grouped)

    def get_next(self):
        """Return the next sentence, or ``None`` when there is none left.

        Sentences are looked up by the key ``"Sentence: <n>"`` with ``n``
        starting at 1. The counter only advances after a successful lookup.

        Returns:
            The list of ``(word, pos, tag)`` tuples for the sentence, or
            ``None`` if no sentence with the current number exists.
        """
        key = "Sentence: {}".format(self.n_sent)
        try:
            s = self.grouped[key]
        except KeyError:
            # Only a missing sentence means "exhausted"; any other error is
            # a genuine problem and is allowed to propagate.
            return None
        self.n_sent += 1
        return s

In [None]:
# Build the sentence grouping once for the sampled DataFrame.
getter = SentenceGetter(data)

In [None]:
# Fetch one sentence for inspection; the getter's counter starts at
# "Sentence: 1".
sent = getter.get_next()

### This is how a sentence looks now.

In [None]:
# Display the fetched sentence: a list of (word, POS, tag) tuples.
print(sent)

# Step 3: Get all sentences

In [None]:
# All sentences, each a list of (word, POS, tag) tuples.
sentences = getter.sentences

# Step 4: Set max sentence length to 50. Create dictionary of tags to map labels to numbers.

In [None]:
# Number of tokens every sentence is brought to before it reaches the model.
max_len = 50
# Map each tag string to its position in `tags`.
tag2idx = dict(zip(tags, range(len(tags))))

# Step 5: Split the ‘Word’ portion of the sampled dataset into a list of sentences each containing a list of words.

In [None]:
# Keep only the word from each (word, POS, tag) triple, sentence by sentence.
X = [[word for word, _pos, _tag in sentence] for sentence in sentences]

# Step 6: Bring every word list to exactly max_len (50) tokens: sentences shorter than 50 words are padded with "__PAD__", and longer ones are cut off after the first 50 words.

In [None]:
# Force every sentence to exactly max_len tokens: keep at most the first
# max_len words and fill any remaining slots with the "__PAD__" token.
# (A negative repeat count yields an empty list, so long sentences are only
# truncated.) This replaces an index loop that relied on a bare `except` to
# detect the end of each sentence.
X = [seq[:max_len] + ["__PAD__"] * (max_len - len(seq)) for seq in X]

# Step 7: Also split the tags into lists within lists.

In [None]:
# Integer tag id for every token, taken from the third field of each
# (word, POS, tag) triple.
y = [[tag2idx[tag] for _word, _pos, tag in sentence] for sentence in sentences]

# Step 8: Pad the tags as well

In [None]:
from keras.preprocessing.sequence import pad_sequences
# Bring every tag sequence to max_len entries, filling with the id of the
# "O" tag at the end. `truncating="post"` is required: pad_sequences drops
# entries from the *front* of over-long sequences by default, whereas the
# word lists keep their first max_len tokens, so without it the labels of
# long sentences would no longer line up with their words.
y = pad_sequences(maxlen=max_len, sequences=y, padding="post",
                  truncating="post", value=tag2idx["O"])

# Step 9: Set batch size.

In [None]:
# Number of sentences per mini-batch passed to model.fit.
batch_size = 50

# Step 10: Initialize the ELMo embedding from tensorflow hub.

In [None]:
import tensorflow as tf
import tensorflow_hub as hub
from keras import backend as K

# Step 11: Initialize the tensorflow session.

In [None]:
# Create a TensorFlow session and register it as the session used by the
# Keras backend. NOTE(review): tf.Session is TensorFlow 1.x API.
sess = tf.Session()
K.set_session(sess)

# Step 12: Download the pre-trained ELMo module from TensorFlow Hub. ELMo is built on a bi-directional LSTM language model; its word embeddings capture both characteristics of the word itself (e.g., syntax and semantics) and the linguistic context in which the word is used.

In [None]:
# Load version 2 of the ELMo module from TensorFlow Hub, requesting that it
# be created as trainable.
elmo_model = hub.Module("https://tfhub.dev/google/elmo/2", trainable=True)
# Run the global-variable and table initializers in the session created
# earlier.
sess.run(tf.global_variables_initializer())
sess.run(tf.tables_initializer())

### If you prefer to download the model from here and load it from your computer (in case the website is down): save “saved_model.pb” and “tfhub_module.pb” in a folder called “9bb74bc86f9caffc8c47dd7b33ec4bb354d9602d”, and create two new, empty folders called “assets” and “variables” inside that folder. Then replace “https://tfhub.dev/google/elmo/2” in the code with the path to that folder.

# Step 13: Create a function that vectorizes a sequence of strings with the ELMo embedding.

In [None]:
def ElmoEmbedding(x):
    """Embed a batch of pre-tokenised sentences with the ELMo module.

    Args:
        x: Tensor of tokens with ``max_len`` tokens per sentence (the model
            feeds it from ``Input(shape=(max_len,), dtype=tf.string)``).

    Returns:
        The "elmo" output of the module's "tokens" signature for the batch.
    """
    # Reshape to (batch, max_len) rather than calling tf.squeeze with no
    # axis: squeeze would also drop the batch axis when a batch holds a
    # single sentence.
    tokens = tf.reshape(tf.cast(x, tf.string), [-1, max_len])
    # One length entry per sentence actually present in this batch. Deriving
    # the count from the tensor (instead of the global batch_size) keeps the
    # shapes consistent when the last batch of an epoch is smaller.
    sequence_len = tf.fill([tf.shape(tokens)[0]], max_len)
    return elmo_model(
        inputs={"tokens": tokens, "sequence_len": sequence_len},
        signature="tokens",
        as_dict=True,
    )["elmo"]

# Step 14: Create a residual LSTM network with an ELMo embedding layer.

In [None]:
from keras.models import Model, Input
from keras.layers.merge import add
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional, Lambda

In [None]:
# Input: max_len string tokens per sentence.
input_text = Input(shape=(max_len,), dtype=tf.string)
# ELMo embedding wrapped in a Lambda layer; the declared per-sentence output
# shape is (max_len, 1024).
embedding = Lambda(ElmoEmbedding, output_shape=(max_len, 1024))(input_text)
# First bidirectional LSTM, returning one output per token.
x = Bidirectional(LSTM(units=512, return_sequences=True,
                       recurrent_dropout=0.2, dropout=0.2))(embedding)
# Second bidirectional LSTM stacked on the first, same configuration.
x_rnn = Bidirectional(LSTM(units=512, return_sequences=True,
                           recurrent_dropout=0.2, dropout=0.2))(x)
x = add([x, x_rnn])  # residual connection to the first biLSTM
# Per-token softmax over the n_tags labels.
out = TimeDistributed(Dense(n_tags, activation="softmax"))(x)

In [None]:
# Wrap the graph from input_text to out in a Keras Model.
model = Model(input_text, out)

In [None]:
# Adam optimizer with sparse categorical cross-entropy (labels are integer
# tag ids rather than one-hot vectors); accuracy is tracked as a metric.
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])

# Step 15: In this step I set the training and test data. 

### 80% of the 6,000 total sentences (contained in the variable X) of the sampled dataset will be used to train the model. The remaining 20% will be used to validate the model.

### If you prefer to use a different number of sentences (other than 6,000) for training and validation, make sure you choose a sample_size that contains your preferred number of sentences.

### Also make sure that the number of sentences you select, train_size and val_size are each divisible by the batch_size. Feel free to alter any of these parameters, the batch_size or train/test split proportion to achieve this.

In [None]:
num_sentences = len(X)

# 80/20 split: the leading sentences are used for training, the trailing
# ones for validation.
train_size = int(num_sentences * .8)
val_size = num_sentences - train_size

X_tr, y_tr = X[:train_size], y[:train_size]
X_val, y_val = X[train_size:], y[train_size:]

# Give the label arrays a trailing axis of size 1:
# (n_sentences, max_len) -> (n_sentences, max_len, 1).
y_tr = y_tr[:, :, np.newaxis]
y_val = y_val[:, :, np.newaxis]

# Step 16: Train the model.

### At the end of the first epoch, the following error may be thrown: “InvalidArgumentError (see above for traceback): assertion failed: [Expected shape for Tensor sequence_length:0 is ] [?] [ but saw shape: ] [50]”

### If you see this error, change the batch_size to the number represented by ?. Then, train the model again.

In [None]:
# Train for 5 epochs on the training split, evaluating on the validation
# split; the token lists are converted to NumPy arrays for Keras.
history = model.fit(np.array(X_tr), y_tr, validation_data=(np.array(X_val), y_val),
                    batch_size=batch_size, epochs=5, verbose=1)

# Step 17: Plot

In [None]:
# Put the metrics recorded during training into a DataFrame (one column per
# metric) for plotting.
hist = pd.DataFrame(history.history)

In [None]:
# Keras records the accuracy metric as "acc" in older releases and as
# "accuracy" in newer ones; use whichever column this run produced.
acc_key = "acc" if "acc" in hist else "accuracy"

plt.figure(figsize=(12,12))
# Label both curves so that plt.legend() has entries to show.
plt.plot(hist[acc_key], label="Training accuracy")
plt.plot(hist["val_" + acc_key], label="Validation accuracy")
plt.title("Learning curves")
plt.xlabel("Epoch")
plt.ylabel("Accuracy")
plt.legend()
plt.show()

# Step 18: Save the model. This way you will not have to keep retraining the model each time you run Jupyter Notebook.

In [None]:
# Write the trained weights to disk. Only weights are saved, so the model
# must be rebuilt with the same layers before they can be loaded back.
model.save_weights('./testmodel_weights')

# Notebook output: show the class of the model object.
type(model)

# Step 19: Be sure to pickle the tags (the list of tag strings). The order of this list determines which number each tag is mapped to, so if you need to reload the saved trained model, you will also need to reload the tags with which the model was trained.

In [None]:
import pickle

# Persist the tag list; its order is what tag2idx was built from, so it is
# needed to interpret predictions made with the saved weights.
with open('tags.pickle', 'wb') as f:
    pickle.dump(tags, f)