In [2]:
import pandas as pd
import numpy as np
from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from keras_contrib.layers import CRF
import os

Using TensorFlow backend.


In [3]:
word2TagDict = {}
dictTags = {}
data = []

with open('data.tsv', 'r') as train_data:
    tagLine = train_data.readline()
    while tagLine:
        tokens = tagLine.split()
        if len(tokens) == 6:
            if tokens[0] in dictTags:
                dictTags[tokens[0]][tokens[4]] = tokens[5]
            else:
                dictTags[tokens[0]] = {}
                dictTags[tokens[0]][tokens[4]] = tokens[5]

            word2TagDict[tokens[4]] = tokens[5]
        tagLine = train_data.readline()

In [4]:
len(dictTags)

36910

In [5]:
data = []

for tweet in dictTags:
    sentence = dictTags[tweet]
    listTup = [(k, v) for k, v in sentence.items()]
    data.append(listTup)

In [6]:
len(data)

36910

In [7]:
print(data[0])

[('So', 'O'), ('happy', 'O'), ('I', 'O'), ('get', 'O'), ('to', 'O'), ('go', 'O'), ('home', 'O'), ('in', 'O'), ('two', 'O'), ('days', 'O'), ('☀️🌴', 'O')]


In [9]:
res = ' '.join(token[0] for token in data[0])
res

'So happy I get to go home in two days ☀️🌴'

In [12]:
import emoji
print(emoji.demojize(res, '[emoji]'))

So happy I get to go home in two days [sune[palm_treee


In [7]:
words = list(set(word2TagDict.keys()))

In [8]:
num_words = len(words)

In [9]:
num_words

62515

In [10]:
words = list(set(word2TagDict.keys()))
tags = list(set(word2TagDict.values()))
words.sort()
tags.sort()

In [11]:
num_tags = len(tags)

In [12]:
tags

['B-EVENT',
 'B-GROUP',
 'B-LOC',
 'B-ORG',
 'B-OTHER',
 'B-PER',
 'B-PROD',
 'B-TIME',
 'B-TITLE',
 'I-EVENT',
 'I-GROUP',
 'I-LOC',
 'I-ORG',
 'I-OTHER',
 'I-PER',
 'I-PROD',
 'I-TIME',
 'I-TITLE',
 'O']

In [13]:
max_len = 75
word2idx = {w: i + 1 for i, w in enumerate(words)}
tag2idx = {t: i for i, t in enumerate(tags)}



In [14]:
from keras.preprocessing.sequence import pad_sequences
X = [[word2idx[w[0]] for w in s] for s in data]

In [15]:
X = pad_sequences(maxlen=max_len, sequences=X, padding="post", value=num_words-1)

In [16]:
y = [[tag2idx[w[1]] for w in s] for s in data]
y = pad_sequences(maxlen=max_len, sequences=y, padding="post", value=tag2idx["O"])

In [17]:
from keras.utils import to_categorical
y = [to_categorical(i, num_classes=num_tags) for i in y]

In [18]:
from sklearn.model_selection import train_test_split
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.1)

In [19]:
from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from keras_contrib.layers import CRF
os.environ["CUDA_VISIBLE_DEVICES"]="0"

In [20]:
from keras.callbacks import EarlyStopping, ModelCheckpoint
callbacks = [EarlyStopping(monitor='val_loss', patience=5),
             ModelCheckpoint(filepath='best_cs_ner_model.h5', monitor='val_loss', save_best_only=True)]

In [21]:
input = Input(shape=(max_len,))
model = Embedding(input_dim=num_words, output_dim=50, input_length=max_len)(input)
model = Dropout(0.1)(model)
model = Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.1))(model)
out = TimeDistributed(Dense(num_tags, activation="softmax"))(model)  # softmax output layer

In [22]:
model = Model(input, out)

In [23]:
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

In [24]:
history = model.fit(X_tr, np.array(y_tr), batch_size=16, epochs=15, callbacks=callbacks, validation_split=0.1, verbose=1)

Train on 29897 samples, validate on 3322 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15


In [25]:
from keras.models import load_model
model = load_model('best_cs_ner_model.h5')

In [26]:
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report
test_pred = model.predict(X_te, verbose=1)

idx2tag = {i: w for w, i in tag2idx.items()}

def pred2label(pred):
    out = []
    for pred_i in pred:
        out_i = []
        for p in pred_i:
            p_i = np.argmax(p)
            out_i.append(idx2tag[p_i].replace("PAD", "O"))
        out.append(out_i)
    return out
    
pred_labels = pred2label(test_pred)
test_labels = pred2label(y_te)



In [27]:
print(classification_report(test_labels, pred_labels))

             precision    recall  f1-score   support

        PER       0.45      0.46      0.45       342
        LOC       0.45      0.48      0.46       181
       PROD       0.73      0.17      0.27        96
      TITLE       0.00      0.00      0.00        68
      GROUP       0.00      0.00      0.00        65
       TIME       0.00      0.00      0.00        47
        ORG       0.00      0.00      0.00        65
      OTHER       0.00      0.00      0.00        33
      EVENT       0.00      0.00      0.00        15

avg / total       0.33      0.29      0.29       912

