# LSTM for Named Entities

In [1]:
# libraries

import numpy as np
import pandas as pd
from itertools import chain
import tensorflow

# model
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import Sequential, Model, Input, optimizers
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from tensorflow.keras.utils import plot_model
from gensim.models import KeyedVectors

In [2]:
# Setting path to data and embeddings

# conll data
path_train ='./datas/conll2003.train.conll' #adapt
path_eval = './datas/conll2003.dev.conll' # adapt
path_test = './datas/conll2003.test.conll'
paths = [path_train, path_test]

# change to test if you are evaluating on test:
eval_split = 'test'
# model output path
output_path = './datas/test_lsmt-out.csv' # adapt

# embedding model
path_emb = './models/GoogleNews-vectors-negative300.bin.gz'

# Data Preparation

In [3]:
def convert_data(paths):
    
    data = []
    sent_id = 1
    for path in paths:
        split = path.split('.')[-2]
        with open(path) as infile:
            lines = infile.read().split('\n')
        for n, line in enumerate(lines):
            ll = line.split('\t')
            if len(ll) > 2:
                d = dict()
                d['Sentence #'] = f'Sentence: {sent_id}'
                d['Word'] = ll[0]
                d['POS'] = ll[1]
                d['Tag'] = ll[-1]
                d['Split'] = split
                data.append(d)

            else:
                sent_id += 1
    data = pd.DataFrame(data)
    return data

In [4]:
data = convert_data(paths)
data.head()

Unnamed: 0,Sentence #,Word,POS,Tag,Split
0,Sentence: 1,EU,NNP,B-ORG,train
1,Sentence: 1,rejects,VBZ,O,train
2,Sentence: 1,German,JJ,B-MISC,train
3,Sentence: 1,call,NN,O,train
4,Sentence: 1,to,TO,O,train


In [5]:
data.tail()

Unnamed: 0,Sentence #,Word,POS,Tag,Split
250051,Sentence: 17495,younger,JJR,O,test
250052,Sentence: 17495,brother,NN,O,test
250053,Sentence: 17495,",",",",O,test
250054,Sentence: 17495,Bobby,NNP,B-PER,test
250055,Sentence: 17495,.,.,O,test


In [6]:
# mapping tokens and labels to indices

def get_dict_map(data, token_or_tag, embedding_model=None):
    tok2idx = {}
    idx2tok = {}
    
    if token_or_tag == 'token':
        vocab = list(set(data['Word'].to_list()))
    else:
        vocab = list(set(data['Tag'].to_list()))
    
    idx2tok = {idx:tok for  idx, tok in enumerate(vocab)}
    tok2idx = {tok:idx for  idx, tok in enumerate(vocab)}   
    
    return tok2idx, idx2tok

In [7]:
token2idx, idx2token = get_dict_map(data, 'token')
tag2idx, idx2tag = get_dict_map(data, 'tag')
print(len(token2idx))
print(len(tag2idx))

27316
9


# Integrating Embeddings

In [8]:
# Load embedding model
w2v_model = KeyedVectors.load_word2vec_format(path_emb, binary=True)

In [9]:
# Create embedding matrix with zero vectors for oov words
emb_dim = 300
embedding_matrix = np.zeros((len(token2idx) + 1, emb_dim))
print(embedding_matrix.shape)
for word, i in token2idx.items():
    # You may have to change the following line to:
    # if word in w2v_model:
    if word in w2v_model.key_to_index:
        embedding_vector = w2v_model[word]
    else:
        embedding_vector = None
        # If you want to check OOV words:
        #print('couldnt find:', word, i)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

(27317, 300)


In [10]:
# Check dimensions, store number of vector dimensions in variable
print(embedding_matrix.shape)
emb_dim = embedding_matrix.shape[1]
print(emb_dim)

(27317, 300)
300


In [11]:
# Add index info to dataframe
data['Word_idx'] = data['Word'].map(token2idx)
data['Tag_idx'] = data['Tag'].map(tag2idx)
data.head()

Unnamed: 0,Sentence #,Word,POS,Tag,Split,Word_idx,Tag_idx
0,Sentence: 1,EU,NNP,B-ORG,train,11407,6
1,Sentence: 1,rejects,VBZ,O,train,26017,4
2,Sentence: 1,German,JJ,B-MISC,train,23685,5
3,Sentence: 1,call,NN,O,train,26777,4
4,Sentence: 1,to,TO,O,train,16931,4


In [12]:
# Group data by sentences
# Fill na
data_fillna = data.fillna(method='ffill', axis=0)
# Groupby and collect columns
data_group = data_fillna.groupby(
['Sentence #'],as_index=False)['Word', 'POS', 'Tag', 'Word_idx', 'Tag_idx', 'Split'].agg(lambda x: list(x))
# Visualise data
data_group.head()

  data_group = data_fillna.groupby(


Unnamed: 0,Sentence #,Word,POS,Tag,Word_idx,Tag_idx,Split
0,Sentence: 1,"[EU, rejects, German, call, to, boycott, Briti...","[NNP, VBZ, JJ, NN, TO, VB, JJ, NN, .]","[B-ORG, O, B-MISC, O, O, O, B-MISC, O, O]","[11407, 26017, 23685, 26777, 16931, 947, 20015...","[6, 4, 5, 4, 4, 4, 5, 4, 4]","[train, train, train, train, train, train, tra..."
1,Sentence: 10,"[But, Fischler, agreed, to, review, his, propo...","[CC, NNP, VBD, TO, VB, PRP$, NN, IN, DT, NNP, ...","[O, B-PER, O, O, O, O, O, O, O, B-ORG, O, O, O...","[9533, 2224, 26846, 16931, 18963, 12657, 7142,...","[4, 2, 4, 4, 4, 4, 4, 4, 4, 6, 4, 4, 4, 4, 4, ...","[train, train, train, train, train, train, tra..."
2,Sentence: 100,"[The, Syrians, are, confused, ,, they, are, de...","[DT, NNPS, VBP, VBN, ,, PRP, VBP, RB, JJ, ,, C...","[O, B-MISC, O, O, O, O, O, O, O, O, O, O, O, O...","[10971, 21394, 26300, 7802, 4596, 25203, 26300...","[4, 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, ...","[train, train, train, train, train, train, tra..."
3,Sentence: 1000,"[The, youth, side, replied, with, 246, for, se...","[DT, NN, NN, VBD, IN, CD, IN, CD, .]","[O, O, O, O, O, O, O, O, O]","[10971, 3595, 22552, 7232, 8089, 24535, 8119, ...","[4, 4, 4, 4, 4, 4, 4, 4, 4]","[train, train, train, train, train, train, tra..."
4,Sentence: 10000,"[Men, 's, 3,000, metres, :]","[NN, POS, CD, NNS, :]","[O, O, O, O, O]","[18222, 20359, 24054, 14820, 2431]","[4, 4, 4, 4, 4]","[train, train, train, train, train]"


In [13]:
# Change eval_split from 'dev' to test to run on test data
def get_pad_train_test_val(data_group, data, eval_split='dev'):

    #get max token and tag length
    n_token = len(list(set(data['Word'].to_list())))
    n_tag = len(list(set(data['Tag'].to_list())))
    print(n_token)

    #Pad tokens (X var)    
    tokens = data_group['Word_idx'].tolist()
    maxlen = max([len(s) for s in tokens])
    pad_tokens = pad_sequences(tokens, maxlen=maxlen, dtype='int64', padding='post', value= 26883)
    print('padding', len(pad_tokens[0]))
    # I used the code below to check the if the padded vectors are set to 0:
#     for token in pad_tokens:
#         print(token[-1])
# #         print(embedding_matrix[token[-1]])
#         break

    #Pad Tags (y var) and convert it into one hot encoding
    tags = data_group['Tag_idx'].tolist()
    pad_tags = pad_sequences(tags, maxlen=maxlen, dtype='int64', padding='post', value= tag2idx["O"])
    n_tags = len(tag2idx)
    pad_tags = [to_categorical(i, num_classes=n_tags) for i in pad_tags]
    
    train_tokens = []
    dev_tokens = []
    train_tags = []
    dev_tags = []
    for i, row in data_group.iterrows():
        if 'train' in row['Split']:
            train_tokens.append(pad_tokens[i])
            train_tags.append(pad_tags[i])
        elif eval_split in row['Split']:
            #dev_idx.append(i)
            dev_tokens.append(pad_tokens[i])
            dev_tags.append(pad_tags[i])

    print(
        'train_tokens length:', len(train_tokens),
        '\ntrain_tokens length:', len(train_tokens),
        #'\ntest_tokens length:', len(test_tokens),
        #'\ntest_tags:', len(test_tags),
        '\nval_tokens:', len(dev_tokens),
        '\nval_tags:', len(dev_tags))
 
    return np.array(train_tokens), np.array(dev_tokens),  np.array(train_tags), np.array(dev_tags)

In [14]:
train_tokens, dev_tokens,  train_tags, dev_tags = get_pad_train_test_val(data_group, data, eval_split= eval_split)

27316
padding 124
train_tokens length: 14041 
train_tokens length: 14041 
val_tokens: 3453 
val_tags: 3453


# Build the Model

In [15]:
input_dim = len(list(set(data['Word'].to_list()))) +1
output_dim = emb_dim # number of dimensions
input_length = max([len(s) for s in data_group['Word_idx'].tolist()])
n_tags = len(tag2idx)
print('input_dim: ', 
      input_dim, '\noutput_dim: ', 
      output_dim, '\ninput_length: ', 
      input_length, '\nn_tags: ', n_tags)
print('emb dim', emb_dim)

input_dim:  27317 
output_dim:  300 
input_length:  124 
n_tags:  9
emb dim 300


In [16]:
def get_bilstm_lstm_model(embedding_matrix, embedding_dim):
    
    model = Sequential()
    #token2idx
    # Add Embedding layer original, trainable
    #model.add(Embedding(input_dim=input_dim, output_dim=output_dim, input_length=input_length))
    print(len(token2idx))
    embedding_layer = Embedding(len(token2idx)+1 ,
                            embedding_dim,
                            weights=[embedding_matrix],
                            # make max sent length a variable
                            input_length=input_length,
                            trainable=False)
    model.add(embedding_layer)

    # Add bidirectional LSTM
    model.add(Bidirectional(LSTM(units=output_dim, return_sequences=True, dropout=0.2, recurrent_dropout=0.2), merge_mode = 'concat'))

    # Add LSTM
    # Pia decided to remove this
#     model.add(LSTM(units=output_dim, return_sequences=True, dropout=0.5, recurrent_dropout=0.5))

    # Add timeDistributed Layer
    # Pia: replaced relu with sigmoid 
    model.add(TimeDistributed(Dense(n_tags, activation="sigmoid")))

 
    # Compile model
    model.compile(loss='categorical_crossentropy', optimizer='RMSprop', metrics=['accuracy'])
    model.summary()
    
    return model

In [17]:
def train_model(X, y, model):
    loss = list()
    # set epochs to 3 (from 25) (you can change this)
    for i in range(3):
        # fit model for one epoch on this sequence
        hist = model.fit(X, y, batch_size=200, verbose=1, epochs=1, validation_split=0.2)
        loss.append(hist.history['loss'][0])
    return loss

# Training the Model

In [18]:
results = pd.DataFrame()
embedding_dim = 300 # dimensions of the word2vec vectors
model_bilstm_lstm = get_bilstm_lstm_model(embedding_matrix, embedding_dim)
plot_model(model_bilstm_lstm)
# change to val_tokens to try out training on val set
results['with_add_lstm'] = train_model(train_tokens, train_tags, model_bilstm_lstm)
#results['with_add_lstm'] = train_model(dev_tokens, dev_tags, model_bilstm_lstm)

27316
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 124, 300)          8195100   
                                                                 
 bidirectional (Bidirectiona  (None, 124, 600)         1442400   
 l)                                                              
                                                                 
 time_distributed (TimeDistr  (None, 124, 9)           5409      
 ibuted)                                                         
                                                                 
Total params: 9,642,909
Trainable params: 1,447,809
Non-trainable params: 8,195,100
_________________________________________________________________
You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model/model_to_dot to work.


# Evaluation

In [19]:
# Evaluate the model on the test data using `evaluate`
# Shows accuracy:
# Careful: Really high even if the model 
# only predicts the majority class
print("Evaluate on test data")
# test: test_tokens, test_tags

#print(train_tags[:5])
results = model_bilstm_lstm.evaluate(dev_tokens, np.array(dev_tags), batch_size=1)
print("test loss, test acc:", results)

Evaluate on test data
test loss, test acc: [0.025998517870903015, 0.992666482925415]


# Model Predictions

In [20]:
# Get predictions on development set
y_pred = model_bilstm_lstm.predict(dev_tokens)

# get dimension index with highest prob (--> label)
y_pred = np.argmax(y_pred, axis=-1)
y_dev =  np.argmax(dev_tags, axis=-1)
print(y_pred)

[[4 4 5 ... 4 4 4]
 [2 8 4 ... 4 4 4]
 [4 4 6 ... 4 4 4]
 ...
 [6 4 7 ... 4 4 4]
 [4 4 7 ... 4 4 4]
 [4 4 4 ... 4 4 4]]


In [21]:
# Check if model predicts more than the majority class
pred_labels = []
for tag in y_pred:
    for i in tag:
        label = idx2tag[i]
        #continue
        if label != 'O':
            print(label)
            break

B-MISC
B-PER
B-ORG
B-LOC
B-LOC
B-LOC
B-PER
B-MISC
B-MISC
B-LOC
B-PER
B-LOC
B-PER
B-PER
B-LOC
B-PER
B-LOC
B-MISC
B-LOC
B-LOC
B-MISC
B-LOC
B-LOC
B-LOC
B-PER
B-PER
B-PER
B-PER
B-MISC
B-PER
B-MISC
B-ORG
B-MISC
B-PER
B-MISC
B-PER
B-LOC
B-PER
B-LOC
B-PER
B-PER
B-LOC
B-LOC
B-LOC
B-LOC
B-MISC
B-PER
B-PER
B-PER
B-PER
B-LOC
B-PER
B-PER
B-PER
B-PER
B-PER
B-PER
B-PER
B-PER
B-PER
B-PER
B-PER
B-PER
B-PER
B-LOC
B-PER
B-MISC
B-ORG
B-MISC
B-LOC
B-LOC
B-LOC
B-LOC
B-PER
B-LOC
B-LOC
B-LOC
B-LOC
B-LOC
B-MISC
B-LOC
B-LOC
B-LOC
B-PER
B-PER
B-PER
B-PER
B-PER
B-PER
B-PER
B-PER
B-PER
B-PER
B-PER
I-PER
B-LOC
B-PER
B-LOC
B-PER
B-PER
B-PER
B-PER
B-PER
B-PER
B-PER
B-PER
B-PER
B-PER
B-PER
B-PER
B-PER
B-PER
B-PER
B-LOC
B-LOC
B-MISC
B-LOC
B-MISC
B-ORG
B-LOC
B-MISC
B-MISC
B-MISC
B-MISC
B-MISC
B-ORG
B-LOC
B-ORG
B-PER
B-LOC
B-ORG
B-PER
B-LOC
B-MISC
B-ORG
B-MISC
B-ORG
B-ORG
B-ORG
B-ORG
B-ORG
B-ORG
B-ORG
B-ORG
B-ORG
B-ORG
B-ORG
B-LOC
B-MISC
B-ORG
B-LOC
B-ORG
B-ORG
B-LOC
B-ORG
B-PER
B-ORG
B-ORG
B-MISC
B-PER
B-LOC
B-PER
B-MI

In [22]:
# Get predictions per token:
# map labels back to tokens

def output_to_file(dev_tokens, y_pred, output_path):
    
    with open(output_path, 'w') as outfile:
        outfile.write('token\tNER\n')
        for token,  preds in zip(dev_tokens, y_pred):
            for tok, pred in zip(token, preds):
                # igonre padding:
                if tok in idx2token:
                    tok_str = idx2token[tok]
                    outfile.write(f'{tok_str}\t{idx2tag[pred]}\n')
    
output_to_file(dev_tokens, y_pred, output_path)   