# import and read csv

In [None]:
%matplotlib inline
import numpy as np
import matplotlib
import pandas as pd

In [None]:
from numpy.random import seed 
seed(7)

import tensorflow as tf 
tf.random.set_seed(7)
from tensorflow.keras.layers import *
from tensorflow.keras.models import Model
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
from google.colab import drive
# Mount Google Drive under /content/gdrive; every data/model path below lives there.
# force_remount=True requests a fresh mount even if one already exists.
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [None]:
# Maps each token in the GloVe file to its embedding vector (float32 array).
embeddings_dict = {}

# Name the encoding explicitly so decoding does not depend on the platform default.
with open("/content/gdrive/MyDrive/QuoraQuestions/glove.6B.50d.txt", 'r', encoding="utf-8") as f:
    for line in f:
        # Each line is whitespace-separated: the token first, then its vector components.
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], "float32")
        embeddings_dict[word] = vector
# No explicit close(): leaving the `with` block already closes the file.

In [None]:
# CSV of question pairs plus engineered similarity features, stored on the mounted Drive.
path = "/content/gdrive/MyDrive/QuoraQuestions/cleaned_features.csv"
# Load the full table into memory as a DataFrame.
train_df = pd.read_csv(path)

In [None]:
train_df

Unnamed: 0,is_duplicate,qid1,qid2,question1_cleaned,question2_cleaned,cosine_similarity,q1_word_count,q2_word_count,q1char_count,q2char_count,freq_qid1,freq_qid2,common_words_count,total_unique_num_words,tot_words,words_ratio,Simple_Ratio,Partial_Ratio,Token_Sort_Ratio,Token_Set_Ratio
0,0,1,2,what is the step by step guide to invest in sh...,what is the step by step guide to invest in sh...,0.891528,14,12,65,56,1,1,11,12,23,0.478261,93,100,93,100
1,0,3,4,what is the story of kohinoor koh i noor diamond,what would happen if the indian government sto...,0.667396,10,15,48,85,4,1,7,17,24,0.291667,65,73,63,86
2,0,5,6,how can i increase the speed of my internet co...,how can internet speed be increased by hacking...,0.499441,14,10,72,58,1,1,4,20,24,0.166667,54,53,66,66
3,0,7,8,why am i mentally very lonely how can i solve it,find the remainder when math 23 24 math is div...,0.165055,11,13,48,59,1,1,0,20,20,0.000000,36,40,36,36
4,0,9,10,which one dissolve in water quikly sugar salt ...,which fish would survive in salt water,0.211917,13,7,73,38,3,1,4,16,20,0.200000,45,55,47,67
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
509787,1,235427,267265,should i join hcl tss leap program it is worth...,hcl tss best or not,0.472403,15,5,70,19,6,1,2,18,20,0.100000,36,63,29,54
509788,1,537762,132589,what is your favorite vodka drink and why,what is your favourite vodka,0.941730,8,5,41,28,2,3,4,9,13,0.307692,78,96,78,78
509789,1,537762,132589,what is your favorite vodka drink and why,what is your favourite vodka,0.941730,8,5,41,28,2,3,4,9,13,0.307692,78,96,78,78
509790,1,537894,187745,among bollywood stars which actor or actress d...,who are the over actors of bollywood,0.785160,12,7,70,36,3,2,2,17,19,0.105263,42,53,58,58


# Model A

In [None]:
from keras import backend as K

def recall_m(y_true, y_pred):
    """Recall over the given tensors: rounded true positives / rounded actual positives.

    K.epsilon() is added to the denominator so that an input with no
    positive labels does not divide by zero.
    """
    hit_mask = K.round(K.clip(y_true * y_pred, 0, 1))
    positive_mask = K.round(K.clip(y_true, 0, 1))
    return K.sum(hit_mask) / (K.sum(positive_mask) + K.epsilon())

def precision_m(y_true, y_pred):
    """Precision over the given tensors: rounded true positives / rounded predicted positives.

    K.epsilon() is added to the denominator so that an input with no
    positive predictions does not divide by zero.
    """
    hit_mask = K.round(K.clip(y_true * y_pred, 0, 1))
    predicted_mask = K.round(K.clip(y_pred, 0, 1))
    return K.sum(hit_mask) / (K.sum(predicted_mask) + K.epsilon())

def f1_m(y_true, y_pred):
    """F1 score: harmonic mean of precision_m and recall_m.

    K.epsilon() in the denominator keeps the result finite when both
    precision and recall are zero.
    """
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    product = p * r
    total = p + r + K.epsilon()
    return 2 * (product / total)

In [None]:
def modelA(input_dim, output_dim, embedding_matrix, input_length):
  """Build and compile model A: shared LSTM encoder, element-wise product, dense head.

  Both question inputs go through the same encoder instance (so its weights
  are shared). The two 128-unit encodings are multiplied element-wise and fed
  to two Dense/BatchNormalization/Dropout blocks and a final sigmoid unit.

  Args:
    input_dim: number of rows of the embedding table.
    output_dim: width of each embedding vector.
    embedding_matrix: initial embedding weights; the layer is frozen (trainable=False).
    input_length: number of token ids per question.

  Returns:
    A Keras Model over [seq1, seq2], compiled with Adam, binary cross-entropy
    and the metrics acc, f1_m, precision_m, recall_m.
  """
  left_in = Input(shape=(input_length,))
  right_in = Input(shape=(input_length,))

  # Shared encoder: frozen embedding followed by three stacked LSTMs.
  # Only the last LSTM drops return_sequences, so the encoder emits one vector.
  encoder = tf.keras.Sequential([
      Embedding(input_dim = input_dim,
                output_dim = output_dim,
                weights = [embedding_matrix],
                input_length = input_length,
                trainable=False),
      LSTM(128, activation = 'tanh', return_sequences = True),
      Dropout(0.2),
      LSTM(128, activation = 'tanh', return_sequences = True),
      Dropout(0.2),
      LSTM(128),
  ])

  left_vec = encoder(left_in)
  right_vec = encoder(right_in)

  # Combine the two encodings by element-wise multiplication.
  head = Multiply()([left_vec, right_vec])
  head = Flatten()(head)
  # Two identical Dense -> BatchNorm -> Dropout blocks.
  for _ in range(2):
    head = Dense(128, activation = 'relu')(head)
    head = BatchNormalization()(head)
    head = Dropout(0.2)(head)
  head = Dense(1, activation = 'sigmoid')(head)

  classifier = Model([left_in, right_in], head)
  classifier.compile(optimizer = 'adam',
                     loss = 'binary_crossentropy',
                     metrics = ['acc', f1_m, precision_m, recall_m])
  return classifier

#model A
#sentence A -> common lstm -> output A
#sentence B -> common lstm -> output B
#multiply output A and output B and then put through some dense layers and finally a sigmoid function.


In [None]:
def modelB(input_dim, output_dim, embedding_matrix, input_length):
  """Build and compile model B: shared LSTM encoder, concatenation, dense head.

  Both question inputs go through the same encoder instance (so its weights
  are shared). The two 128-unit encodings are concatenated and fed to two
  Dense/BatchNormalization/Dropout blocks and a final sigmoid unit.

  Args:
    input_dim: number of rows of the embedding table.
    output_dim: width of each embedding vector.
    embedding_matrix: initial embedding weights; the layer is frozen (trainable=False).
    input_length: number of token ids per question.

  Returns:
    A Keras Model over [seq1, seq2], compiled with Adam, binary cross-entropy
    and the metrics acc, f1_m, precision_m, recall_m.
  """
  left_in = Input(shape=(input_length,))
  right_in = Input(shape=(input_length,))

  # Shared encoder: frozen embedding followed by three stacked LSTMs.
  encoder = tf.keras.Sequential([
      Embedding(input_dim = input_dim,
                output_dim = output_dim,
                weights = [embedding_matrix],
                input_length = input_length,
                trainable=False),
      LSTM(128, activation = 'tanh', return_sequences = True),
      Dropout(0.2),
      LSTM(128, activation = 'tanh', return_sequences = True),
      Dropout(0.2),
      LSTM(128),
  ])

  left_vec = encoder(left_in)
  right_vec = encoder(right_in)

  # Combine the two encodings side by side instead of multiplying them.
  head = concatenate([left_vec, right_vec])
  head = Flatten()(head)
  # Two identical Dense -> BatchNorm -> Dropout blocks.
  for _ in range(2):
    head = Dense(128, activation = 'relu')(head)
    head = BatchNormalization()(head)
    head = Dropout(0.2)(head)
  head = Dense(1, activation = 'sigmoid')(head)

  classifier = Model([left_in, right_in], head)
  classifier.compile(optimizer = 'adam',
                     loss = 'binary_crossentropy',
                     metrics = ['acc', f1_m, precision_m, recall_m])
  return classifier


#model B:
#sentence A -> common lstm -> output A
#sentence B -> common lstm -> output B
#concatenate outputA and outputB and then put through some dense layers and finally a sigmoid function.


In [None]:
def modelC(input_dim, output_dim, embedding_matrix, input_length):
  """Build and compile model C: two-LSTM shared encoder, product, one dense block.

  Both question inputs go through the same encoder instance (so its weights
  are shared). The two 128-unit encodings are multiplied element-wise and fed
  to a single Dense/BatchNormalization/Dropout block and a final sigmoid unit.

  Args:
    input_dim: number of rows of the embedding table.
    output_dim: width of each embedding vector.
    embedding_matrix: initial embedding weights; the layer is frozen (trainable=False).
    input_length: number of token ids per question.

  Returns:
    A Keras Model over [seq1, seq2], compiled with Adam, binary cross-entropy
    and the metrics acc, f1_m, precision_m, recall_m.
  """
  left_in = Input(shape=(input_length,))
  right_in = Input(shape=(input_length,))

  # Shared encoder: frozen embedding followed by two stacked LSTMs.
  encoder = tf.keras.Sequential([
      Embedding(input_dim = input_dim,
                output_dim = output_dim,
                weights = [embedding_matrix],
                input_length = input_length,
                trainable=False),
      LSTM(128, activation = 'tanh', return_sequences = True),
      Dropout(0.2),
      LSTM(128),
  ])

  left_vec = encoder(left_in)
  right_vec = encoder(right_in)

  # Combine the two encodings by element-wise multiplication.
  head = Multiply()([left_vec, right_vec])
  head = Flatten()(head)
  head = Dense(128, activation = 'relu')(head)
  head = BatchNormalization()(head)
  head = Dropout(0.2)(head)
  head = Dense(1, activation = 'sigmoid')(head)

  classifier = Model([left_in, right_in], head)
  classifier.compile(optimizer = 'adam',
                     loss = 'binary_crossentropy',
                     metrics = ['acc', f1_m, precision_m, recall_m])
  return classifier

#model C
#sentence A -> common lstm (two LSTM layers) -> output A
#sentence B -> common lstm (two LSTM layers) -> output B
#multiply output A and output B and then put through a single dense layer and finally a sigmoid function.


In [None]:
def modelD(input_dim, output_dim, embedding_matrix, input_length):
  """Build and compile model D: shared LSTM encoder scored by exp(-L1 distance).

  Both question inputs go through the same encoder instance (so its weights
  are shared). The model output is exp(-sum(|left - right|)) over the two
  128-unit encodings, i.e. a value in (0, 1]; there is no dense head.

  Args:
    input_dim: number of rows of the embedding table.
    output_dim: width of each embedding vector.
    embedding_matrix: initial embedding weights; the layer is frozen (trainable=False).
    input_length: number of token ids per question.

  Returns:
    A Keras Model over [seq1, seq2], compiled with Adam, mean squared error
    and the metrics acc, f1_m, precision_m, recall_m.
  """
  left_in = Input(shape=(input_length,))
  right_in = Input(shape=(input_length,))

  # Shared encoder: frozen embedding followed by two stacked LSTMs.
  encoder = tf.keras.Sequential([
      Embedding(input_dim = input_dim,
                output_dim = output_dim,
                weights = [embedding_matrix],
                input_length = input_length,
                trainable=False),
      LSTM(128, activation = 'tanh', return_sequences = True),
      Dropout(0.2),
      LSTM(128),
  ])

  left_vec = encoder(left_in)
  right_vec = encoder(right_in)

  def exponent_neg_manhattan_distance(left, right):
    # Sum of absolute differences along axis 1, kept as a column (keepdims=True).
    l1_distance = K.sum(K.abs(left-right), axis=1, keepdims=True)
    return K.exp(-l1_distance)

  # Similarity score as defined by the MaLSTM model.
  similarity_layer = Lambda(
      function=lambda x: exponent_neg_manhattan_distance(x[0], x[1]),
      output_shape=lambda x: (x[0][0], 1))
  malstm_distance = similarity_layer([left_vec, right_vec])

  # Wrap inputs and similarity output into a trainable model.
  malstm = Model([left_in, right_in], [malstm_distance])
  malstm.compile(optimizer='adam',
                 loss='mean_squared_error',
                 metrics=['acc', f1_m, precision_m, recall_m])
  return malstm

ideas:
change model structure -> add or remove LSTM layers, add or remove dense layers

change word embedding used -> 100 dimension glove embeddings


change the loss to binary_crossentropy and the metric to accuracy




In [None]:
# Number of token ids per question; used as maxlen in pad_sequences and as the model input shape.
input_length = 36
# Embedding width; matches the 50-dimensional GloVe file (glove.6B.50d.txt) read into embeddings_dict.
output_dim = 50
# Vocabulary cap handed to the Keras Tokenizer as num_words.
MAX_NB_WORDS = 200000

In [None]:
def preprocessing(question1_train_list, question2_train_list, Y_train_list, question1_test_list, question2_test_list, Y_test_list, tokenizer=None):
  """Tokenize and pad both question columns and build the embedding matrix.

  Args:
    question1_train_list, question2_train_list: training question texts (lists of str).
    Y_train_list: training labels.
    question1_test_list, question2_test_list: held-out question texts.
    Y_test_list: held-out labels.
    tokenizer: optional, already-fitted Keras Tokenizer to reuse (for example
      one restored from disk). When None (the default, which is the original
      behaviour) a new Tokenizer capped at MAX_NB_WORDS is fitted on the
      training questions.

  Returns:
    Tuple (input_dim, embedding_matrix, X_train_q1, X_train_q2, Y_train,
    X_test_q1, X_test_q2, Y_test, tokenizer).
  """
  if tokenizer is None:
    tokenizer = Tokenizer(num_words = MAX_NB_WORDS)
    # Vocabulary comes from the training questions only; held-out text is never fitted on.
    tokenizer.fit_on_texts(question1_train_list + question2_train_list)

  def _encode(texts):
    # Texts -> integer id sequences -> arrays of exactly input_length ids, padded at the end.
    return pad_sequences(tokenizer.texts_to_sequences(texts), maxlen = input_length, padding='post')

  X_train_q1 = _encode(question1_train_list)
  X_train_q2 = _encode(question2_train_list)

  word_index = tokenizer.word_index
  # One extra row for id 0, which no word owns and which pad_sequences uses as filler.
  input_dim = len(word_index)+1
  # Every row starts random; words present in embeddings_dict are overwritten
  # with their pre-trained vector, the remaining rows keep the random values.
  embedding_matrix = np.random.random((input_dim, output_dim))
  for word, i in word_index.items():
      embedding_vector = embeddings_dict.get(word)
      if embedding_vector is not None:
          embedding_matrix[i] = embedding_vector

  Y_train = np.asarray(Y_train_list)
  Y_test = np.asarray(Y_test_list)

  X_test_q1 = _encode(question1_test_list)
  X_test_q2 = _encode(question2_test_list)

  return input_dim, embedding_matrix, X_train_q1, X_train_q2, Y_train, X_test_q1, X_test_q2, Y_test, tokenizer

In [None]:
def train_model(modeltype, batch_size, epochs, input_dim, embedding_matrix, X_train_q1, X_train_q2, Y_train, X_test_q1, X_test_q2, Y_test):
  """Build, train and evaluate one of the four architectures.

  Args:
    modeltype: 'A', 'B', 'C' or 'D'; selects modelA..modelD.
    batch_size, epochs: passed straight to model.fit.
    input_dim, embedding_matrix: embedding table size and initial weights.
    X_train_q1, X_train_q2, Y_train: padded training questions and labels.
    X_test_q1, X_test_q2, Y_test: held-out data, used both as the validation
      set during fit (it drives checkpointing) and for the final evaluate.

  Returns:
    (model, history, results): the trained model in its final-epoch state,
    the History object from fit, and the output of model.evaluate.

  Raises:
    ValueError: if modeltype is not one of 'A', 'B', 'C', 'D'.
  """
  # Validate first: previously an unknown type fell through every branch and
  # failed later with an UnboundLocalError on new_model.
  valid_types = ('A', 'B', 'C', 'D')
  if modeltype not in valid_types:
    raise ValueError(f"Unknown modeltype {modeltype!r}; expected one of {valid_types}")

  builders = {'A': modelA, 'B': modelB, 'C': modelC, 'D': modelD}
  new_model = builders[modeltype](input_dim, output_dim, embedding_matrix, input_length)

  # Checkpoint name template; the {epoch}/{acc}/{val_acc} fields are filled in by ModelCheckpoint.
  checkpoint_dir = '/content/gdrive/MyDrive/QuoraQuestions/model' + modeltype
  model_string = checkpoint_dir + '/model' + modeltype + '-{epoch:03d}-{acc:03f}-{val_acc:03f}.h5'

  # summary() prints by itself and returns None, so it is not wrapped in print().
  new_model.summary()

  # Write a checkpoint only when validation accuracy improves.
  checkpoint_save = ModelCheckpoint(model_string, verbose=0, monitor='val_acc',save_best_only=True, mode='max')

  history = new_model.fit([X_train_q1, X_train_q2], Y_train,
                          validation_data=([X_test_q1, X_test_q2], Y_test),
                          callbacks=[checkpoint_save],
                          batch_size = batch_size, epochs = epochs, shuffle=True)

  results = new_model.evaluate([X_test_q1, X_test_q2], Y_test, verbose=0)

  return new_model, history, results

In [None]:
def fullpipeline(question1_train_list, question2_train_list, Y_train_list, question1_test_list, question2_test_list, Y_test_list, modeltype, batch_size, epochs):
  """Run preprocessing and train_model back to back on raw question lists.

  Returns the (model, history, results) triple produced by train_model; the
  tokenizer fitted during preprocessing is discarded.
  """
  # preprocessing returns the train_model data arguments in order, followed by the tokenizer.
  *training_args, _fitted_tokenizer = preprocessing(
      question1_train_list, question2_train_list, Y_train_list,
      question1_test_list, question2_test_list, Y_test_list)
  trained_model, fit_history, eval_scores = train_model(modeltype, batch_size, epochs, *training_args)
  return trained_model, fit_history, eval_scores

# Load data

In [None]:
from sklearn.model_selection import train_test_split
# Mini-batch size handed to model.fit by train_model.
batch_size = 2048
# Number of training epochs per model.
epochs = 35

In [None]:
# Pull the cleaned question texts (forced to str) and the integer labels out as plain Python lists.
X_train_q1 = train_df['question1_cleaned'].astype(str).tolist()
X_train_q2 = train_df['question2_cleaned'].astype(str).tolist()
Y_train = train_df['is_duplicate'].astype(int).tolist()

# Hold out 10% of the pairs for validation; random_state=7 makes the split repeatable.
q1_train, q1_val, q2_train, q2_val, y_train, y_val = train_test_split(X_train_q1, X_train_q2, Y_train, test_size = 0.10, random_state=7)

In [None]:
# Class balance of the training split.
print("number of training samples: ", len(q1_train))
print("number of non-duplicate samples: ", y_train.count(0))
print("number of duplicate samples: ", y_train.count(1))


# Class balance of the validation split (previously mislabelled as "training samples").
print("number of validation samples: ", len(q1_val))
print("number of non-duplicate samples: ", y_val.count(0))
print("number of duplicate samples: ", y_val.count(1))


number of training samples:  458812
number of non-duplicate samples:  229318
number of duplicate samples:  229494
number of training samples:  50980
number of non-duplicate samples:  25578
number of duplicate samples:  25402


# Preprocess data

In [None]:
# Fit the tokenizer on the training split and encode both splits.
# Note: X_train_q1, X_train_q2 and Y_train are rebound here from raw lists to padded/label arrays.
input_dim, embedding_matrix, X_train_q1, X_train_q2, Y_train, X_test_q1, X_test_q2, Y_test, my_tokenizer = preprocessing(q1_train, q2_train, y_train, q1_val, q2_val, y_val)


In [None]:
import pickle
# Save the fitted tokenizer so the same word -> id mapping can be restored later
# without re-fitting.
with open('/content/gdrive/MyDrive/QuoraQuestions/mytokenizer.pickle', 'wb') as handle:
    pickle.dump(my_tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

# training

In [None]:
# Train architecture A; the held-out split serves as the validation set that drives checkpointing.
mymodelA, train_historyA, eval_resultsA = train_model('A', batch_size, epochs, input_dim, embedding_matrix, X_train_q1, X_train_q2, Y_train, X_test_q1, X_test_q2, Y_test)

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 36)]         0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 36)]         0                                            
__________________________________________________________________________________________________
sequential (Sequential)         (None, 128)          4468616     input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
multiply (Multiply)             (None, 128)          0           sequential[0][0]             



Epoch 3/35
Epoch 4/35
Epoch 5/35
Epoch 6/35
Epoch 7/35
Epoch 8/35
Epoch 9/35
Epoch 10/35
Epoch 11/35
Epoch 12/35
Epoch 13/35
Epoch 14/35
Epoch 15/35
Epoch 16/35
Epoch 17/35
Epoch 18/35
Epoch 19/35
Epoch 20/35
Epoch 21/35
Epoch 22/35
Epoch 23/35
Epoch 24/35
Epoch 25/35
Epoch 26/35
Epoch 27/35
Epoch 28/35
Epoch 29/35
Epoch 30/35
Epoch 31/35
Epoch 32/35
Epoch 33/35
Epoch 34/35
Epoch 35/35


In [None]:
# Save the final-epoch model A (separate from the best-val_acc checkpoints written during training).
mymodelA.save('/content/gdrive/MyDrive/QuoraQuestions/modelA_last.h5')



In [None]:
# Train architecture B (concatenation variant) on the same encoded data as model A.
mymodelB, train_historyB, eval_resultsB = train_model('B', batch_size, epochs, input_dim, embedding_matrix, X_train_q1, X_train_q2, Y_train, X_test_q1, X_test_q2, Y_test)

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            [(None, 36)]         0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            [(None, 36)]         0                                            
__________________________________________________________________________________________________
sequential_1 (Sequential)       (None, 128)          4468616     input_3[0][0]                    
                                                                 input_4[0][0]                    
__________________________________________________________________________________________________
concatenate (Concatenate)       (None, 256)          0           sequential_1[0][0]         

In [None]:
# Save the final-epoch model B (separate from the best-val_acc checkpoints written during training).
mymodelB.save('/content/gdrive/MyDrive/QuoraQuestions/modelB_last.h5')

In [None]:
# Train architecture C (shallower multiply variant) on the same encoded data.
mymodelC, train_historyC, eval_resultsC = train_model('C', batch_size, epochs, input_dim, embedding_matrix, X_train_q1, X_train_q2, Y_train, X_test_q1, X_test_q2, Y_test)

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_5 (InputLayer)            [(None, 36)]         0                                            
__________________________________________________________________________________________________
input_6 (InputLayer)            [(None, 36)]         0                                            
__________________________________________________________________________________________________
sequential_2 (Sequential)       (None, 128)          4337032     input_5[0][0]                    
                                                                 input_6[0][0]                    
__________________________________________________________________________________________________
multiply_1 (Multiply)           (None, 128)          0           sequential_2[0][0]         



Epoch 2/35
Epoch 3/35
Epoch 4/35
Epoch 5/35
Epoch 6/35
Epoch 7/35
Epoch 8/35
Epoch 9/35
Epoch 10/35
Epoch 11/35
Epoch 12/35
Epoch 13/35
Epoch 14/35
Epoch 15/35
Epoch 16/35
Epoch 17/35
Epoch 18/35
Epoch 19/35
Epoch 20/35
Epoch 21/35
Epoch 22/35
Epoch 23/35
Epoch 24/35
Epoch 25/35
Epoch 26/35
Epoch 27/35
Epoch 28/35
Epoch 29/35
Epoch 30/35
Epoch 31/35
Epoch 32/35
Epoch 33/35
Epoch 34/35
Epoch 35/35


In [None]:
# Save the final-epoch model C (separate from the best-val_acc checkpoints written during training).
mymodelC.save('/content/gdrive/MyDrive/QuoraQuestions/modelC_last.h5')



In [None]:
# Train architecture D (exp(-L1 distance) output, mean-squared-error loss) on the same encoded data.
mymodelD, train_historyD, eval_resultsD = train_model('D', batch_size, epochs, input_dim, embedding_matrix, X_train_q1, X_train_q2, Y_train, X_test_q1, X_test_q2, Y_test)

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 36)]         0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 36)]         0                                            
__________________________________________________________________________________________________
sequential (Sequential)         (None, 128)          4337032     input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
lambda (Lambda)                 (None, 1)            0           sequential[0][0]             

In [None]:
# Save the final-epoch model D. The evaluation section below uses the in-memory
# mymodelD rather than reloading this file.
mymodelD.save('/content/gdrive/MyDrive/QuoraQuestions/modelD_last.h5')

# load data

In [None]:
import pickle
# Restore the tokenizer pickled earlier in this notebook.
# NOTE(review): pickle.load can execute arbitrary code; only load files you wrote yourself.
with open('/content/gdrive/MyDrive/QuoraQuestions/mytokenizer.pickle', 'rb') as handle:
    loadedtokenizer = pickle.load(handle)

In [None]:
# Sanity-check the restored tokenizer on one question. texts_to_sequences expects
# a list of texts; a bare string would be iterated character by character.
loadedtokenizer.texts_to_sequences([q1_train[0]])

In [None]:
def preprocessing_with_loaded_pickle(question1_train_list, question2_train_list, Y_train_list, question1_test_list, question2_test_list, Y_test_list, tokenizer):
  """Encode both splits with an already-fitted tokenizer and build the embedding matrix.

  Unlike preprocessing, nothing is fitted here: the given tokenizer is used
  as is for the text -> id mapping and for the vocabulary size.

  Returns:
    Tuple (input_dim, embedding_matrix, X_train_q1, X_train_q2, Y_train,
    X_test_q1, X_test_q2, Y_test, tokenizer).
  """
  def to_padded_ids(texts):
    # Texts -> integer id sequences -> arrays of exactly input_length ids, padded at the end.
    id_sequences = tokenizer.texts_to_sequences(texts)
    return pad_sequences(id_sequences, maxlen = input_length, padding='post')

  train_q1 = to_padded_ids(question1_train_list)
  train_q2 = to_padded_ids(question2_train_list)

  vocabulary = tokenizer.word_index
  # One extra row for id 0, which no word owns.
  vocab_rows = len(vocabulary)+1
  # Rows start random; words found in embeddings_dict get their pre-trained vector.
  weights = np.random.random((vocab_rows, output_dim))
  for token, row in vocabulary.items():
      pretrained = embeddings_dict.get(token)
      if pretrained is None:
          continue
      weights[row] = pretrained

  train_labels = np.asarray(Y_train_list)
  test_labels = np.asarray(Y_test_list)

  test_q1 = to_padded_ids(question1_test_list)
  test_q2 = to_padded_ids(question2_test_list)

  return vocab_rows, weights, train_q1, train_q2, train_labels, test_q1, test_q2, test_labels, tokenizer

In [None]:
# Re-encode both splits with the tokenizer restored from disk; this rebinds the
# same global names (X_train_q1, X_test_q1, ...) that the evaluation cells read.
input_dim, embedding_matrix, X_train_q1, X_train_q2, Y_train, X_test_q1, X_test_q2, Y_test, my_loaded_tokenizer = preprocessing_with_loaded_pickle(q1_train, q2_train, y_train, q1_val, q2_val, y_val, loadedtokenizer)


# testing model

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc, roc_auc_score, log_loss

In [None]:
# Reload the final-epoch model A; the custom metric functions are passed in
# custom_objects so the saved compile configuration can be resolved.
modelA_loaded = tf.keras.models.load_model('/content/gdrive/MyDrive/QuoraQuestions/modelA_last.h5', 
                                           custom_objects={'f1_m':f1_m, 'precision_m':precision_m, "recall_m":recall_m})

In [None]:
# compile the model
# evaluate() returns the loss followed by the metrics in the order listed here.
modelA_loaded.compile(optimizer='adam', loss = 'binary_crossentropy', metrics=['acc',f1_m,precision_m, recall_m])

# evaluate the model
# The F1 value is bound to `f1` rather than `f1_score`, so the
# sklearn.metrics.f1_score function imported above is not replaced by a float.
loss, accuracy, f1, precision, recall = modelA_loaded.evaluate([X_test_q1, X_test_q2], Y_test)

# AUC is computed from the raw sigmoid outputs, not from thresholded labels.
preds = modelA_loaded.predict([X_test_q1, X_test_q2])
roc_auc = roc_auc_score(Y_test, preds)

print("modelA")
print('Accuracy: %f' % accuracy)
print('Precision: %f' % precision)
print('Recall: %f' % recall)
print('F1 Score: %f' % f1)
print('AUC-ROC: %f' % roc_auc)
print('Log Loss: %f' % loss)

modelA
Accuracy: 0.854217
Precision: 0.821111
Recall: 0.904430
F1 Score: 0.857011
AUC-ROC: 0.937873
Log Loss: 0.364571


In [None]:
# Reload the final-epoch model B; the custom metric functions are passed in
# custom_objects so the saved compile configuration can be resolved.
modelB_loaded = tf.keras.models.load_model('/content/gdrive/MyDrive/QuoraQuestions/modelB_last.h5', 
                                           custom_objects={'f1_m':f1_m, 'precision_m':precision_m, "recall_m":recall_m})

In [None]:
# compile the model
# evaluate() returns the loss followed by the metrics in the order listed here.
modelB_loaded.compile(optimizer='adam', loss = 'binary_crossentropy', metrics=['acc',f1_m,precision_m, recall_m])

# evaluate the model
# The F1 value is bound to `f1` rather than `f1_score`, so the
# sklearn.metrics.f1_score function imported above is not replaced by a float.
loss, accuracy, f1, precision, recall = modelB_loaded.evaluate([X_test_q1, X_test_q2], Y_test)

# AUC is computed from the raw sigmoid outputs, not from thresholded labels.
preds_B = modelB_loaded.predict([X_test_q1, X_test_q2])

roc_auc = roc_auc_score(Y_test, preds_B)

print("modelB")
print('Accuracy: %f' % accuracy)
print('Precision: %f' % precision)
print('Recall: %f' % recall)
print('F1 Score: %f' % f1)
print('AUC-ROC: %f' % roc_auc)
print('Log Loss: %f' % loss)

modelB
Accuracy: 0.847156
Precision: 0.830691
Recall: 0.870764
F1 Score: 0.846148
AUC-ROC: 0.929831
Log Loss: 0.380886


In [None]:
# Reload the final-epoch model C; the custom metric functions are passed in
# custom_objects so the saved compile configuration can be resolved.
modelC_loaded = tf.keras.models.load_model('/content/gdrive/MyDrive/QuoraQuestions/modelC_last.h5',
                                           custom_objects={'f1_m':f1_m, 'precision_m':precision_m, "recall_m":recall_m})
# compile the model
# evaluate() returns the loss followed by the metrics in the order listed here.
modelC_loaded.compile(optimizer='adam', loss = 'binary_crossentropy', metrics=['acc',f1_m,precision_m, recall_m])

# evaluate the model
# The F1 value is bound to `f1` rather than `f1_score`, so the
# sklearn.metrics.f1_score function imported above is not replaced by a float.
loss, accuracy, f1, precision, recall = modelC_loaded.evaluate([X_test_q1, X_test_q2], Y_test)

# AUC is computed from the raw sigmoid outputs, not from thresholded labels.
preds_C = modelC_loaded.predict([X_test_q1, X_test_q2])

roc_auc = roc_auc_score(Y_test, preds_C)

print("modelC")
print('Accuracy: %f' % accuracy)
print('Precision: %f' % precision)
print('Recall: %f' % recall)
print('F1 Score: %f' % f1)
print('AUC-ROC: %f' % roc_auc)
print('Log Loss: %f' % loss)

modelC
Accuracy: 0.852942
Precision: 0.834753
Recall: 0.879597
F1 Score: 0.852591
AUC-ROC: 0.935025
Log Loss: 0.356903


In [None]:
# Model D is evaluated from the in-memory trained object instead of being reloaded from disk.
# NOTE(review): presumably because its Lambda layer complicates reloading from .h5 — confirm.
modelD_loaded = mymodelD

In [None]:
# evaluate the model
# The F1 value is bound to `f1` rather than `f1_score`, so the
# sklearn.metrics.f1_score function imported above is not replaced by a float.
loss, accuracy, f1, precision, recall = modelD_loaded.evaluate([X_test_q1, X_test_q2], Y_test)

# AUC is computed from the raw similarity scores, not from thresholded labels.
preds_D = modelD_loaded.predict([X_test_q1, X_test_q2])

roc_auc = roc_auc_score(Y_test, preds_D)

print("modelD")
print('Accuracy: %f' % accuracy)
print('Precision: %f' % precision)
print('Recall: %f' % recall)
print('F1 Score: %f' % f1)
print('AUC-ROC: %f' % roc_auc)
# Model D is compiled with mean_squared_error, so this value is an MSE and is
# not comparable to the log loss printed for models A-C.
print('MSE Loss: %f' % loss)

modelD
Accuracy: 0.849098
Precision: 0.873697
Recall: 0.814733
F1 Score: 0.838711
AUC-ROC: 0.925007
Log Loss: 0.108170


# Cross Validation

In [None]:
from sklearn.model_selection import KFold

def get_list(ori_list, index_list):
  """Return the elements of ori_list found at the positions in index_list, in that order."""
  selected = []
  for position in index_list:
    selected.append(ori_list[position])
  return selected

In [None]:
# Start again from the raw texts: X_train_q1 / X_train_q2 / Y_train were rebound
# to encoded arrays by the preprocessing cells above.
X_train_q1 = train_df['question1_cleaned'].astype(str).tolist()
X_train_q2 = train_df['question2_cleaned'].astype(str).tolist()
Y_train = train_df['is_duplicate'].astype(int).tolist()
# Seed the shuffle so the fold assignment is reproducible across runs.
kfold = KFold(n_splits=5, shuffle=True, random_state=7)

fold_no = 0
acc_per_fold = []
loss_per_fold = []
# KFold only needs the number of samples to generate indices, so one list is
# enough (the extra positional arguments were taken as y/groups and ignored).
for train, test in kfold.split(X_train_q1):

  q1_train = get_list(X_train_q1,train)
  q2_train = get_list(X_train_q2,train)
  y_train = get_list(Y_train,train)

  q1_val = get_list(X_train_q1,test)
  q2_val = get_list(X_train_q2, test)
  y_val = get_list(Y_train, test)

  print('------------------------------------------------------------------------')
  print(f'Training for fold {fold_no} ...')

  # fullpipeline (not train_model) accepts raw text lists: it tokenizes on this
  # fold's training split and then trains. Its trailing arguments are
  # (modeltype, batch_size, epochs), in that order.
  mymodel, train_history, eval_results = fullpipeline(q1_train, q2_train, y_train, q1_val, q2_val, y_val, 'A', batch_size, epochs)

  # Generate generalization metrics
  print(f'Score for fold {fold_no}: {mymodel.metrics_names[0]} of {eval_results[0]}; {mymodel.metrics_names[1]} of {eval_results[1]*100}%')
  acc_per_fold.append(eval_results[1] * 100)
  loss_per_fold.append(eval_results[0])

  # Increase fold number
  fold_no = fold_no + 1


['b' 'c' 'd' 'e']
['a']
['a' 'b' 'c' 'e']
['d']
['a' 'b' 'd' 'e']
['c']
['a' 'b' 'c' 'd']
['e']
['a' 'c' 'd' 'e']
['b']
