In [None]:
%tensorflow_version 1.x #use tensorflow magic to use version 1.x in colab

TensorFlow 1.x selected.


In [None]:
#imports
# numpy is used directly (np.asarray, np.random, np.argmax, ...); import it explicitly
# instead of relying on it leaking in through the star imports below.
import numpy as np
import pandas as pd
from keras.preprocessing import text as keras_text, sequence as keras_seq
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, CuDNNLSTM, Embedding, Dropout, Activation, CuDNNGRU, Conv1D
from keras.layers import Bidirectional, GlobalMaxPool1D, GlobalMaxPooling1D, GlobalAveragePooling1D
from keras.layers import Input, Embedding, Dense, Conv2D, MaxPool2D, concatenate
from keras.layers import Reshape, Flatten, Concatenate, Dropout, SpatialDropout1D
from keras.optimizers import Adam
from keras.models import Model
from keras.engine.topology import Layer
from keras import initializers, regularizers, constraints, optimizers, layers
from keras.utils import np_utils

from keras.layers import *
from keras.models import *
from keras import initializers, regularizers, constraints, optimizers, layers
from keras.initializers import *
from keras.optimizers import *
import keras.backend as K
from keras.callbacks import *
import tensorflow as tf

Using TensorFlow backend.


In [None]:
# Load the two CSV splits; the paths assume Google Drive is mounted at /content/drive.
train=pd.read_csv("/content/drive/My Drive/minor/english_dataset/eng1train.csv") #training data
test=pd.read_csv("/content/drive/My Drive/minor/english_dataset/eng1test.csv")  #testing data

In [None]:
# Raw text column of each split; both are tokenized and padded in later cells.
train_X=train['text']  #training text
test_X=test['text']   #testing text

In [None]:
embed_size = 200 # dimensionality of each word vector (the GloVe file loaded later is the 200d one)
maxlen = 70 # number of tokens per example after pad_sequences

In [None]:

## Tokenize the sentences
# The vocabulary is fit on train and test text together, so test words also get an id.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(list(train_X)+list(test_X))
word_index = tokenizer.word_index
# Keras word ids start at 1, so the largest id equals len(word_index). The Embedding
# layers below are built with input_dim == len(word_index) and load_glove skips ids
# >= that value, which leaves the largest id out of range (an error on CPU, a silent
# zero vector on GPU). texts_to_sequences only emits ids < num_words, so capping it
# here drops that single least-frequent word and keeps every emitted id valid.
tokenizer.num_words = len(word_index)
train_X = tokenizer.texts_to_sequences(train_X)
test_X = tokenizer.texts_to_sequences(test_X)

In [None]:
max_features = len(word_index) # vocabulary size; used as the Embedding input_dim and as the row cap in load_glove

In [None]:
# Bring every token-id sequence of both splits to exactly `maxlen` entries.
train_X, test_X = (
    pad_sequences(sequences, maxlen=maxlen) for sequences in (train_X, test_X)
)
# One-hot encode the 'task_1' label column of each split.
train_Y, test_Y = (
    np_utils.to_categorical(frame['task_1']) for frame in (train, test)
)

In [None]:
def load_glove(word_index, embedding_file=None, max_words=None):
    """Build an embedding matrix for ``word_index`` from a GloVe text file.

    Every line of the file is expected to look like ``token v1 v2 ... vd`` with
    single spaces as separators. Row ``i`` of the result holds the vector of the
    word whose id is ``i``. A word is looked up as-is first and then capitalized;
    words found neither way keep a random vector drawn from
    ``N(emb_mean, emb_std)``. Row 0 is never assigned and stays random.

    Args:
        word_index: mapping ``word -> integer id``.
        embedding_file: path of the GloVe text file. Defaults to the 200d Twitter
            GloVe file on Google Drive.
        max_words: upper bound on the number of rows. Defaults to the notebook
            global ``max_features``.

    Returns:
        ``numpy.ndarray`` of shape ``(min(max_words, len(word_index)), dim)``,
        where ``dim`` is the vector width found in the file.

    Raises:
        ValueError: if the file contains no vectors.
    """
    if embedding_file is None:
        embedding_file = '/content/drive/My Drive/minor/glove/glove.twitter.27B.200d.txt'   #glove file
    if max_words is None:
        max_words = max_features

    # Read the file inside a context manager so the handle is closed, and decode
    # it explicitly as UTF-8 rather than with the platform default encoding.
    embeddings_index = {}
    with open(embedding_file, encoding='utf8') as glove_file:
        for line in glove_file:
            token, *coefs = line.rstrip('\n').split(' ')
            if not coefs:
                continue  # blank or malformed line without any coefficients
            embeddings_index[token] = np.asarray(coefs, dtype='float32')
    if not embeddings_index:
        raise ValueError('no embedding vectors found in %s' % embedding_file)

    # Only the vector width is needed; stacking every vector into one big array
    # just to read .shape[1] costs a lot of memory and np.stack rejects dict views
    # on recent NumPy versions.
    embed_size = next(iter(embeddings_index.values())).shape[0]

    # Mean / std used to initialise rows of words that have no pretrained vector.
    # NOTE(review): hard-coded constants, not computed from this file - confirm
    # they are appropriate for the embedding set in use.
    emb_mean, emb_std = -0.005838499, 0.48782197

    nb_words = min(max_words, len(word_index))
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
    for word, i in word_index.items():
        # Guard on the real row count so an id can never index past the matrix,
        # even when max_words is larger than the vocabulary.
        if i >= nb_words:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is None:
            # fall back to the capitalized spelling of the word
            embedding_vector = embeddings_index.get(word.capitalize())
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    return embedding_matrix

In [None]:
# Build the Embedding-layer weight matrix from the GloVe vectors (row i belongs to word id i).
embedding_matrix = load_glove(word_index)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [None]:
def model_cnn(embedding_matrix):
    """Build and compile a multi-width CNN text classifier on pretrained embeddings.

    One Conv2D branch per filter height in ``filter_sizes`` slides over the token
    axis while spanning the full embedding width. Each branch is max-pooled over
    all positions, the pooled features are concatenated, and a 2-way softmax
    head produces the class probabilities.

    Args:
        embedding_matrix: 2-D array ``(vocab_rows, embedding_dim)`` used as the
            initial weights of the Embedding layer; its shape also sets the
            layer's ``input_dim`` / ``output_dim``.

    Returns:
        The compiled Keras ``Model``. The input length is taken from the global
        ``maxlen``. The layer summary is printed as a side effect.
    """
    filter_sizes = [1, 2, 3, 5]  # heights (in tokens) of the parallel conv filters
    num_filters = 36
    # Take the dimensions from the weights themselves so the layer always matches them.
    vocab_rows, embed_dim = embedding_matrix.shape

    inp = Input(shape=(maxlen,))  # input layer
    x = Embedding(vocab_rows, embed_dim, weights=[embedding_matrix])(inp)
    x = Reshape((maxlen, embed_dim, 1))(x)  # add a channel axis so Conv2D can be applied

    maxpool_pool = []  # one pooled feature map per filter size
    for filter_size in filter_sizes:
        conv = Conv2D(num_filters, kernel_size=(filter_size, embed_dim),
                      kernel_initializer='he_normal', activation='relu')(x)
        # pool over every position the filter could be placed at
        maxpool_pool.append(MaxPool2D(pool_size=(maxlen - filter_size + 1, 1))(conv))

    z = Concatenate(axis=1)(maxpool_pool)  # concat results of all filter sizes
    z = Flatten()(z)
    z = Dropout(0.1)(z)

    # The targets are one-hot and the loss is categorical cross-entropy, so the two
    # outputs must form a probability distribution: softmax, not independent sigmoids.
    outp = Dense(2, activation="softmax")(z)

    model = Model(inputs=inp, outputs=outp)
    # Metric name kept as 'acc': the training cell's checkpoint monitors 'val_acc'.
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])
    model.summary()  # summary() prints by itself and returns None

    return model

In [None]:
def model_lstm_du(embedding_matrix):
    """Build and compile a bidirectional LSTM classifier on pretrained embeddings.

    The embedded sequence goes through a bidirectional CuDNNLSTM that returns
    the output of every time step. Global average and global max pooling over
    time are concatenated, passed through a 64-unit dense layer with dropout,
    and classified by a 2-way softmax head.

    Args:
        embedding_matrix: 2-D array ``(vocab_rows, embedding_dim)`` used as the
            initial weights of the Embedding layer; its shape also sets the
            layer's ``input_dim`` / ``output_dim``.

    Returns:
        The compiled Keras ``Model``. The input length is taken from the global
        ``maxlen``. The layer summary is printed as a side effect.
    """
    # Take the dimensions from the weights themselves so the layer always matches them.
    vocab_rows, embed_dim = embedding_matrix.shape

    inp = Input(shape=(maxlen,))  # input layer
    x = Embedding(vocab_rows, embed_dim, weights=[embedding_matrix])(inp)
    # 128 is the hidden-state size per direction. With return_sequences=True the
    # layer emits one vector per time step, and the two directions are
    # concatenated, giving (batch, maxlen, 2 * 128).
    # CuDNNLSTM is the fast LSTM implementation in Keras which only runs on GPU.
    x = Bidirectional(CuDNNLSTM(128, return_sequences=True))(x)
    avg_pool = GlobalAveragePooling1D()(x)
    max_pool = GlobalMaxPooling1D()(x)
    conc = concatenate([avg_pool, max_pool])  # concat the two poolings
    conc = Dense(64, activation="relu")(conc)
    conc = Dropout(0.1)(conc)
    # The targets are one-hot and the loss is categorical cross-entropy, so the two
    # outputs must form a probability distribution: softmax, not independent sigmoids.
    outp = Dense(2, activation="softmax")(conc)
    model = Model(inputs=inp, outputs=outp)
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()  # summary() prints by itself and returns None
    return model

In [None]:
def model_gru_du(embedding_matrix):
    """Build and compile a bidirectional GRU classifier on pretrained embeddings.

    The embedded sequence goes through a bidirectional CuDNNGRU that returns the
    output of every time step. Global average and global max pooling over time
    are concatenated, passed through a 64-unit dense layer with dropout, and
    classified by a 2-way softmax head.

    Args:
        embedding_matrix: 2-D array ``(vocab_rows, embedding_dim)`` used as the
            initial weights of the Embedding layer; its shape also sets the
            layer's ``input_dim`` / ``output_dim``.

    Returns:
        The compiled Keras ``Model``. The input length is taken from the global
        ``maxlen``. The layer summary is printed as a side effect.
    """
    # Take the dimensions from the weights themselves so the layer always matches them.
    vocab_rows, embed_dim = embedding_matrix.shape

    inp = Input(shape=(maxlen,))  # input layer
    x = Embedding(vocab_rows, embed_dim, weights=[embedding_matrix])(inp)
    # 128 is the hidden-state size per direction. With return_sequences=True the
    # layer emits one vector per time step, and the two directions are
    # concatenated, giving (batch, maxlen, 2 * 128).
    # CuDNNGRU is the cuDNN-backed GRU layer; like CuDNNLSTM it needs a GPU.
    x = Bidirectional(CuDNNGRU(128, return_sequences=True))(x)
    avg_pool = GlobalAveragePooling1D()(x)
    max_pool = GlobalMaxPooling1D()(x)
    conc = concatenate([avg_pool, max_pool])  # concat the output of the two pooling layers
    conc = Dense(64, activation="relu")(conc)
    conc = Dropout(0.1)(conc)
    # The targets are one-hot and the loss is categorical cross-entropy, so the two
    # outputs must form a probability distribution: softmax, not independent sigmoids.
    outp = Dense(2, activation="softmax")(conc)
    model = Model(inputs=inp, outputs=outp)
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()  # summary() prints by itself and returns None
    return model

In [None]:
# Build and compile the CNN classifier; the call also prints its layer summary.
model1=model_cnn(embedding_matrix) #create cnn model

Instructions for updating:
If using Keras pass *_constraint arguments to layers.

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 70)           0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 70, 200)      4919800     input_1[0][0]                    
__________________________________________________________________________________________________
reshape_1 (Reshape)             (None, 70, 200, 1)   0           embedding_1[0][0]                
__________________________________________________________________________________________________
conv2d_1 (Conv2D)               (None, 70, 1, 36)    7236        reshape_1[0][0]                  
__________

In [None]:
# Build and compile the bidirectional LSTM classifier; the call also prints its layer summary.
model2=model_lstm_du(embedding_matrix) #create bilstm model

In [None]:
# Build and compile the bidirectional GRU classifier; the call also prints its layer summary.
model3=model_gru_du(embedding_matrix) #create bigru model

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 15% of the training data for validation (fixed seed for a repeatable split).
# Note: x_test / y_test are this validation split, not the separate test CSV.
x_train, x_test, y_train, y_test = train_test_split(
    train_X,
    train_Y,
    test_size=0.15,
    random_state=42,
)

In [None]:
# Train the CNN for 7 epochs; the checkpoint writes model_cnn.hdf5 whenever val_acc improves.
cp1 = ModelCheckpoint(
    'model_cnn.hdf5',
    monitor='val_acc',
    verbose=1,
    save_best_only=True,
)
history1 = model1.fit(
    x_train,
    y_train,
    validation_data=(x_test, y_test),
    epochs=7,
    batch_size=32,
    callbacks=[cp1],
)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where

Train on 4974 samples, validate on 878 samples
Epoch 1/7

Epoch 00001: val_acc improved from -inf to 0.63554, saving model to model_cnn.hdf5
Epoch 2/7

Epoch 00002: val_acc improved from 0.63554 to 0.65831, saving model to model_cnn.hdf5
Epoch 3/7

Epoch 00003: val_acc improved from 0.65831 to 0.66515, saving model to model_cnn.hdf5
Epoch 4/7

Epoch 00004: val_acc did not improve from 0.66515
Epoch 5/7

Epoch 00005: val_acc improved from 0.66515 to 0.66629, saving model to model_cnn.hdf5
Epoch 6/7

Epoch 00006: val_acc improved from 0.66629 to 0.67882, saving model to model_cnn.hdf5
Epoch 7/7

Epoch 00007: val_acc did not improve from 0.67882


In [None]:
# Train the BiLSTM for 7 epochs; the checkpoint writes model_lstm.hdf5 whenever val_accuracy improves.
cp2 = ModelCheckpoint(
    'model_lstm.hdf5',
    monitor='val_accuracy',
    verbose=1,
    save_best_only=True,
)
history2 = model2.fit(
    x_train,
    y_train,
    validation_data=(x_test, y_test),
    epochs=7,
    batch_size=32,
    callbacks=[cp2],
)

Train on 4974 samples, validate on 878 samples
Epoch 1/7

Epoch 00001: val_accuracy improved from -inf to 0.69134, saving model to model_lstm.hdf5
Epoch 2/7

Epoch 00002: val_accuracy improved from 0.69134 to 0.69704, saving model to model_lstm.hdf5
Epoch 3/7

Epoch 00003: val_accuracy did not improve from 0.69704
Epoch 4/7

Epoch 00004: val_accuracy did not improve from 0.69704
Epoch 5/7

Epoch 00005: val_accuracy did not improve from 0.69704
Epoch 6/7

Epoch 00006: val_accuracy did not improve from 0.69704
Epoch 7/7

Epoch 00007: val_accuracy did not improve from 0.69704


In [None]:
# Train the BiGRU for 7 epochs; the checkpoint writes model_gru.hdf5 whenever val_accuracy improves.
cp3 = ModelCheckpoint(
    'model_gru.hdf5',
    monitor='val_accuracy',
    verbose=1,
    save_best_only=True,
)
history3 = model3.fit(
    x_train,
    y_train,
    validation_data=(x_test, y_test),
    epochs=7,
    batch_size=32,
    callbacks=[cp3],
)

Train on 4974 samples, validate on 878 samples
Epoch 1/7

Epoch 00001: val_accuracy improved from -inf to 0.65945, saving model to model_gru.hdf5
Epoch 2/7

Epoch 00002: val_accuracy improved from 0.65945 to 0.69248, saving model to model_gru.hdf5
Epoch 3/7

Epoch 00003: val_accuracy did not improve from 0.69248
Epoch 4/7

Epoch 00004: val_accuracy did not improve from 0.69248
Epoch 5/7

Epoch 00005: val_accuracy did not improve from 0.69248
Epoch 6/7

Epoch 00006: val_accuracy did not improve from 0.69248
Epoch 7/7

Epoch 00007: val_accuracy did not improve from 0.69248


In [None]:
# Per-class scores on the test set from the CNN, BiLSTM and BiGRU (in that order).
# NOTE(review): the models are used with the weights they hold after the last epoch;
# the best-epoch checkpoints saved during training are not reloaded in this cell -
# confirm that is intended.
pred_y1, pred_y2, pred_y3 = (
    model.predict(test_X) for model in (model1, model2, model3)
)

# Ensemble 1: average the three score matrices, then take the highest-scoring class.
pred_y4 = np.argmax((pred_y1 + pred_y2 + pred_y3) / 3, axis=1)

# Turn each model's scores into hard class labels.
pred_y1, pred_y2, pred_y3 = (
    np.argmax(scores, axis=1) for scores in (pred_y1, pred_y2, pred_y3)
)

# Ensemble 2: majority vote - the mean of three 0/1 labels rounds to the majority label.
pred_y5 = np.round((pred_y1 + pred_y2 + pred_y3) / 3)


In [None]:
from sklearn.metrics import classification_report,confusion_matrix

# Print one classification report per predictor, always against the test labels.
print("METRICS FOR TESTING DATA")
for title, predicted in (
    ("CNN", pred_y1),
    ("BiLSTM", pred_y2),
    ("BiGRU", pred_y3),
    ("HYBRID By Probablity", pred_y4),
    ("HYBRID By Vote", pred_y5),
):
    print(title)
    print(classification_report(test['task_1'], predicted))

METRICS FOR TESTING DATA
CNN
              precision    recall  f1-score   support

           0       0.44      0.77      0.56       288
           1       0.90      0.67      0.77       865

    accuracy                           0.69      1153
   macro avg       0.67      0.72      0.66      1153
weighted avg       0.78      0.69      0.71      1153

BiLSTM
              precision    recall  f1-score   support

           0       0.44      0.64      0.52       288
           1       0.86      0.74      0.79       865

    accuracy                           0.71      1153
   macro avg       0.65      0.69      0.66      1153
weighted avg       0.75      0.71      0.72      1153

BiGRU
              precision    recall  f1-score   support

           0       0.42      0.80      0.55       288
           1       0.90      0.63      0.75       865

    accuracy                           0.68      1153
   macro avg       0.66      0.72      0.65      1153
weighted avg       0.78      0.6