## Models

- BiLSTM
- BiLSTM CRF


In [None]:
import numpy as np
import pandas as pd
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from keras.models import Model, Input, Sequential
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional, SimpleRNN, Flatten,\
Activation, RepeatVector, Permute, merge, Lambda
from keras_contrib.layers import CRF
import keras.optimizers as ko
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report
import keras
import subprocess
from sklearn.utils import class_weight

In [2]:
data = pd.read_csv('Task1.csv')
data = data.rename(columns={'id':'Sentence #'})
data = data.drop('Unnamed: 0',axis=1)
data = data.fillna(method="ffill")

In [3]:
data.groupby('labels').count()

Unnamed: 0_level_0,Sentence #,words
labels,Unnamed: 1_level_1,Unnamed: 2_level_1
B_EXC,1176,1176
B_INC,1223,1223
EXC,5713,5713
INC,5455,5455
O,29976,29976


In [4]:
class SentenceGetter(object):
    
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, t) for w, t in zip(s["words"].values.tolist(),
                                                           s["labels"].values.tolist())]
        self.grouped = self.data.groupby("Sentence #").apply(agg_func)
        self.sentences = [s for s in self.grouped]
    
    def get_next(self):
        try:
            s = self.grouped[self.n_sent]
            self.n_sent += 1
            return s
        except:
            return None

        
words = list(set(data["words"].values))
tags = ['O','B_INC','INC','B_EXC','EXC']
# tags = list(set(data["labels"].values))
n_words = len(words)
n_tags = len(tags)

getter = SentenceGetter(data)
sentences = getter.sentences

word2idx = {w: i + 1 for i, w in enumerate(words)}
tag2idx = {t: i for i, t in enumerate(tags)}
max_len = 170
X = [[word2idx[w[0]] for w in s] for s in sentences]    
X = pad_sequences(maxlen=max_len, sequences=X, padding="post", value=0)
y = [np.array([tag2idx[w[1]] for w in s]) for s in sentences]
y = pad_sequences(maxlen=max_len, sequences=y, padding="post", value=tag2idx["O"])
y = np.array([to_categorical(i, num_classes=n_tags) for i in y])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [6]:
len(sentences)

2154

In [7]:
# Create embedding weight matrix

def loadGloveModel(File):
    print("Loading Glove Model")
    f = open(File,'r')
    gloveModel = {}
    for line in f:
        splitLines = line.split()
        word = splitLines[0]
        wordEmbedding = np.array([float(value) for value in splitLines[1:]])
        gloveModel[word] = wordEmbedding
    print(len(gloveModel)," words loaded!")
    return gloveModel

vec_model = loadGloveModel('glove/glove.6B.200d.txt')

Loading Glove Model
400000  words loaded!


In [8]:
emb_dim = len(vec_model['the'])
embedding_matrix = np.zeros((len(word2idx) + 1, emb_dim))

for word, i in word2idx.items():
    if word not in vec_model:
        continue
    embedding_vector = vec_model[word]
    embedding_matrix[i] = embedding_vector

print(embedding_matrix.shape)

(4916, 200)


In [9]:
vec_model.clear()

In [10]:
def write_results(result,file,ign):
    idx2tag = {i: w for w, i in tag2idx.items()}
    with open(file,'w+') as f:
        for i,lis in enumerate(result):
            line = ""
            for el in lis:
                tag = idx2tag[el]
                if tag in ['O',ign,'B_'+ign]:
                    line += "O "
                elif tag[0] == 'B':
                    line += 'B '
                else:
                    line += 'I '
            f.write(line+'\n')
            
def get_sampleWeights(class_weights):
    sample_weights=np.random.rand(X_train.shape[0], X_train.shape[1])
    for i in range(X_train.shape[0]):
        for j in range(X_train.shape[1]):
            sample_weights[i][j]=class_weights[np.argmax(y_train[i][j])]
    return sample_weights

def get_softMetrics(pred,labels,ign):

    write_results(pred,'pred.txt',ign)
    write_results(labels,'labels.txt',ign)
    out = subprocess.check_output(['./a.out']).decode('utf-8').split('\n')
    rows = []
    for item in  out:
        item = item.split(" ")
        if len(item) != 2:continue 
        rows.append({'Proportional':item[0],'Binary':item[1]})
    
    return (pd.DataFrame(rows,index=['precision','recall','F1']))

def return_report(model,epochs):
    y_flat = list(np.argmax(y_test,2).flatten('F'))
#     class_weights = class_weight.compute_class_weight('balanced',y_flat)
    class_weights = [1,20,20,20,20]
    model.fit(X_train,y_train,epochs=epochs,verbose=1)
    out = model.predict(X_test)
    pred = np.argmax(out,2)
    labels = np.argmax(y_test,2)
    inc = get_softMetrics(pred,labels,'INC')
    exc = get_softMetrics(pred,labels,'EXC')
    display(inc)
    display(exc)
    report = classification_report(np.argmax(y_test,2).flatten('F'),pred.flatten('F'),output_dict=True)
    df = pd.DataFrame(report).transpose()
#     display(df)
    return pred
#     return report

In [11]:
def loss(y_true, y_pred):
    X = crf.input
    mask = crf.input_mask
    nloglik = crf.get_negative_log_likelihood(y_true, X, mask)
    return keras.activations.relu(nloglik)

def lstm_crf():
    input = Input(shape=(max_len,))
    model = Embedding(input_dim=n_words + 1, output_dim=emb_dim,
                      input_length=max_len, weights=[embedding_matrix],trainable=True)(input)  # 20-dim embedding
    model = Bidirectional(LSTM(units=emb_dim, return_sequences=True,
                               recurrent_dropout=0.1))(model)  # variational biLSTM
    
    model = TimeDistributed(Dense(2*emb_dim, activation="relu"))(model)  # a dense layer as suggested by neuralNer
    crf = CRF(n_tags,sparse_target=False)  # CRF layer
    out = crf(model)  # output

    adam = ko.Adam(lr=0.0008)
    sgd = ko.SGD(lr=0.05,momentum=0.7)
    rmsprop = ko.RMSprop(lr=0.001)
    model = Model(input, out)
    model.compile(optimizer='rmsprop', loss=crf.loss_function, metrics=[crf.accuracy])
    model.summary()
    return model

def get_bilstm_lstm_model():
    
    input = Input(shape=(max_len,))

    # Add Embedding layer
    model = Embedding(input_dim=n_words + 1, output_dim=emb_dim,
                  input_length=max_len, weights=[embedding_matrix],trainable=True)(input)

    # Add bidirectional LSTM
    model = Bidirectional(LSTM(units=emb_dim, return_sequences=True, dropout=0.2, recurrent_dropout=0.1))(model)

    # Add LSTM
    model = Bidirectional(LSTM(units=emb_dim, return_sequences=True, dropout=0.5, recurrent_dropout=0.1))(model)
    
    # Add timeDistributed Layer
    out = TimeDistributed(Dense(n_tags, activation="softmax"))(model)

    #Optimiser 
    adam = ko.Adam(lr=0.0009)

    # Compile model
    model = Model(input, out)
    model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])
    model.summary()
    
    return model

In [None]:
bilstm = get_bilstm_lstm_model()
labels = return_report(bilstm,1)

In [37]:
crf = lstm_crf()
pred = return_report(crf,25)



Model: "model_11"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_13 (InputLayer)        (None, 170)               0         
_________________________________________________________________
embedding_13 (Embedding)     (None, 170, 300)          1474800   
_________________________________________________________________
bidirectional_14 (Bidirectio (None, 170, 600)          1442400   
_________________________________________________________________
time_distributed_12 (TimeDis (None, 170, 600)          360600    
_________________________________________________________________
crf_12 (CRF)                 (None, 170, 5)            3040      
Total params: 3,280,840
Trainable params: 3,280,840
Non-trainable params: 0
_________________________________________________________________
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epo

Unnamed: 0,Proportional,Binary
precision,0.473679,0.60793
recall,0.606245,0.690476
F1,0.531826,0.646579


Unnamed: 0,Proportional,Binary
precision,0.318234,0.401575
recall,0.664627,0.774436
F1,0.43039,0.528896
