# LSTM for Named Entities

In [1]:
# libraries

import numpy as np
import pandas as pd
from itertools import chain
import tensorflow

# model
from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import Sequential, Model, Input, optimizers
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from tensorflow.keras.utils import plot_model
from gensim.models import KeyedVectors

In [2]:
# Paths to the data, the model output and the pretrained embeddings.
# Adapt the values marked "adapt" to your local directory layout.

# CoNLL-formatted input files. The split name ('train', 'dev', ...) is taken
# from the second-to-last dot-separated part of the path by convert_data().
path_train ='./datas/conll2003.train.conll'  # adapt
path_eval = './datas/conll2003.dev.conll'  # adapt
paths = [path_train, path_eval]

# Name of the split used for evaluation; change to 'test' when evaluating on
# the test set (presumably path_eval must then point to the test file too).
eval_split = 'dev'
# Where the model output is written.
# NOTE(review): the file name reads 'lsmt' - possibly a typo for 'lstm'; left as is.
output_path = './datas/lsmt-out.csv'  # adapt

# Pretrained word embedding model (gzipped binary word2vec file)
path_emb = './models/GoogleNews-vectors-negative300.bin.gz'

# Data Preparation

In [10]:
def convert_data(paths):
    """Read tab-separated CoNLL files into a single token-level DataFrame.

    Every line with more than two tab-separated fields is treated as a token:
    the first field is the word, the second the POS tag and the last the
    label. Any other line (typically a blank line) is a sentence separator.

    Parameters
    ----------
    paths : iterable of str
        Paths to the input files. The split name is the second-to-last
        dot-separated part of the path, e.g. ``'train'`` for
        ``'conll2003.train.conll'``.

    Returns
    -------
    pandas.DataFrame
        One row per token with the columns ``'Sentence #'``, ``'Word'``,
        ``'POS'``, ``'Tag'`` and ``'Split'``. The columns are present even
        when no token was read.

    Raises
    ------
    IndexError
        If a path contains no ``'.'`` (no split name can be derived).
    OSError
        If a file cannot be opened.

    Notes
    -----
    Sentence IDs are unique per sentence but not necessarily consecutive:
    the counter advances on every separator line, so repeated blank lines
    leave gaps.
    """
    columns = ['Sentence #', 'Word', 'POS', 'Tag', 'Split']
    data = []
    sent_id = 1
    for path in paths:
        split = path.split('.')[-2]
        # Explicit encoding: the platform default is not UTF-8 everywhere.
        with open(path, encoding='utf-8') as infile:
            lines = infile.read().split('\n')

        sentence_open = False
        for line in lines:
            ll = line.split('\t')
            if len(ll) > 2:
                data.append({
                    'Sentence #': f'Sentence: {sent_id}',
                    'Word': ll[0],
                    'POS': ll[1],
                    'Tag': ll[-1],
                    'Split': split,
                })
                sentence_open = True
            else:
                sent_id += 1
                sentence_open = False

        # A file that ends directly after a token line (no trailing newline or
        # blank line) would otherwise share its last sentence ID with the first
        # sentence of the next file.
        if sentence_open:
            sent_id += 1

    return pd.DataFrame(data, columns=columns)

In [13]:
# Parse all files listed in `paths` into one token-level DataFrame
data = convert_data(paths)
# Preview the first rows
data.head()

Unnamed: 0,Sentence #,Word,POS,Tag,Split
0,Sentence: 1,EU,NNP,B-ORG,train
1,Sentence: 1,rejects,VBZ,O,train
2,Sentence: 1,German,JJ,B-MISC,train
3,Sentence: 1,call,NN,O,train
4,Sentence: 1,to,TO,O,train


In [14]:
# Preview the last rows (they come from the file listed last in `paths`)
data.tail()

Unnamed: 0,Sentence #,Word,POS,Tag,Split
254978,Sentence: 17291,.,.,O,dev
254979,Sentence: 17292,--,:,O,dev
254980,Sentence: 17292,Dhaka,NNP,B-ORG,dev
254981,Sentence: 17292,Newsroom,NNP,I-ORG,dev
254982,Sentence: 17292,880-2-506363,CD,O,dev


In [15]:
# mapping tokens and labels to indices

def get_dict_map(data, token_or_tag, embedding_model=None):
    """Build index mappings for the tokens or the tags found in ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``'Word'`` and a ``'Tag'`` column.
    token_or_tag : str
        ``'token'`` maps the values of the ``'Word'`` column; any other value
        maps the values of the ``'Tag'`` column.
    embedding_model : optional
        Unused; kept so existing calls that pass it keep working.

    Returns
    -------
    tuple of (dict, dict)
        ``(tok2idx, idx2tok)``: item -> index and index -> item, with indices
        running from 0 to ``len(vocab) - 1``.
    """
    column = 'Word' if token_or_tag == 'token' else 'Tag'

    # Sort the unique values: plain set iteration order for strings changes
    # between interpreter runs (hash randomization), which would assign
    # different indices on every kernel restart. `key=str` keeps the sort from
    # failing should the column contain non-string values.
    vocab = sorted(set(data[column].to_list()), key=str)

    idx2tok = dict(enumerate(vocab))
    tok2idx = {tok: idx for idx, tok in idx2tok.items()}

    return tok2idx, idx2tok

In [16]:
# Build the index mappings (both directions) for tokens and for tags
token2idx, idx2token = get_dict_map(data, 'token')
tag2idx, idx2tag = get_dict_map(data, 'tag')
# Number of distinct tokens, then number of distinct tags
print(len(token2idx))
print(len(tag2idx))

26883
9


# Integrating Embeddings

In [17]:
# Load the pretrained word2vec vectors from the binary file at `path_emb`
w2v_model = KeyedVectors.load_word2vec_format(path_emb, binary=True)

In [18]:
# Build the embedding matrix: one row per token index, filled with the
# pretrained vector where available. Rows of out-of-vocabulary words stay zero.
# The matrix has one row more than there are tokens; that last row is never
# written here (presumably reserved for padding - TODO confirm).
emb_dim = 300
embedding_matrix = np.zeros((len(token2idx) + 1, emb_dim))
print(embedding_matrix.shape)
for word, i in token2idx.items():
    # Depending on the gensim version the membership test may have to be
    # written as `word in w2v_model` instead.
    embedding_vector = w2v_model[word] if word in w2v_model.key_to_index else None
    if embedding_vector is None:
        # To inspect OOV words, uncomment:
        #print('couldnt find:', word, i)
        continue
    embedding_matrix[i] = embedding_vector

(26884, 300)
