In [11]:
import pandas as pd
import numpy as np
from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from keras_contrib.layers import CRF
import os


In [31]:
word2TagDict = {}

dictTags = {}
data = []

with open('train.data', 'r') as train_data:
    tagLine = train_data.readline()
    while tagLine:
        if len(tagLine.strip()) == 0:
            listTup = [(k, v) for k, v in dictTags.items()]
            data.append(listTup)
            dictTags = {}
        else:
            tokens = tagLine.split()
            dictTags[tokens[0]] = tokens[3]
            word2TagDict[tokens[0]] = tokens[3]
        
        tagLine = train_data.readline()

In [32]:
len(data)


14985

In [36]:
num_words = len(set(word2TagDict.keys()))
num_tags = len(set(word2TagDict.values()))

In [42]:
words = list(set(word2TagDict.keys()))
tags = list(set(word2TagDict.values()))
words.sort()
tags.sort()

In [50]:
tags

['B-LOC', 'B-MISC', 'B-ORG', 'I-LOC', 'I-MISC', 'I-ORG', 'I-PER', 'O']

In [43]:
max_len = 75
word2idx = {w: i + 1 for i, w in enumerate(words)}
tag2idx = {t: i for i, t in enumerate(tags)}


In [47]:
from keras.preprocessing.sequence import pad_sequences
X = [[word2idx[w[0]] for w in s] for s in data]

In [49]:
X = pad_sequences(maxlen=max_len, sequences=X, padding="post", value=num_words-1)

In [52]:
y = [[tag2idx[w[1]] for w in s] for s in data]
y = pad_sequences(maxlen=max_len, sequences=y, padding="post", value=tag2idx["O"])

In [53]:
from keras.utils import to_categorical
y = [to_categorical(i, num_classes=num_tags) for i in y]

In [54]:
from sklearn.model_selection import train_test_split
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.1)

In [55]:
from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from keras_contrib.layers import CRF
os.environ["CUDA_VISIBLE_DEVICES"]="1"

In [71]:
input = Input(shape=(max_len,))
model = Embedding(input_dim=num_words + 1, output_dim=50,
                  input_length=max_len, mask_zero=True)(input)  # 20-dim embedding
model = Bidirectional(LSTM(units=50, return_sequences=True,
                           recurrent_dropout=0.1))(model)  # variational biLSTM
model = TimeDistributed(Dense(50, activation="relu"))(model)  # a dense layer as suggested by neuralNer
crf = CRF(num_tags)  # CRF layer
out = crf(model)  # output

In [None]:
input = Input(shape=(max_len,))
model = Embedding(input_dim=n_words + 1, output_dim=50,
                  input_length=max_len, mask_zero=True)(input)  # 20-dim embedding
model = Bidirectional(LSTM(units=50, return_sequences=True,
                           recurrent_dropout=0.1))(model)  # variational biLSTM
model = TimeDistributed(Dense(50, activation="relu"))(model)  # a dense layer as suggested by neuralNer
crf = CRF(n_tags)  # CRF layer
out = crf(model)  # output

In [72]:
model = Model(input, out)

In [73]:
model.compile(optimizer="rmsprop", loss="categorical_crossentropy", metrics=["accuracy"])

In [74]:
model.compile(optimizer="rmsprop", loss=crf.loss_function, metrics=[crf.accuracy])

In [67]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 75)                0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 75, 20)            495900    
_________________________________________________________________
bidirectional_2 (Bidirection (None, 75, 100)           28400     
_________________________________________________________________
time_distributed_2 (TimeDist (None, 75, 50)            5050      
_________________________________________________________________
crf_2 (CRF)                  (None, 75, 8)             488       
Total params: 529,838
Trainable params: 529,838
Non-trainable params: 0
_________________________________________________________________


In [78]:
history = model.fit(X_tr, np.array(y_tr), batch_size=32, epochs=50, validation_split=0.1, verbose=1)

Train on 12137 samples, validate on 1349 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50


Epoch 48/50
Epoch 49/50
Epoch 50/50


In [79]:
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report
test_pred = model.predict(X_te, verbose=1)

idx2tag = {i: w for w, i in tag2idx.items()}

def pred2label(pred):
    out = []
    for pred_i in pred:
        out_i = []
        for p in pred_i:
            p_i = np.argmax(p)
            out_i.append(idx2tag[p_i].replace("PAD", "O"))
        out.append(out_i)
    return out
    
pred_labels = pred2label(test_pred)
test_labels = pred2label(y_te)



In [80]:
print(classification_report(test_labels, pred_labels))

             precision    recall  f1-score   support

        ORG       0.72      0.76      0.74       632
        LOC       0.84      0.87      0.85       727
        PER       0.75      0.80      0.78       640
       MISC       0.50      0.71      0.58       340

avg / total       0.73      0.80      0.76      2339

