In [None]:
%matplotlib inline
import numpy as np
import matplotlib
import pandas as pd
from sklearn.model_selection import train_test_split
import pickle

from numpy.random import seed 
seed(7)

import tensorflow as tf 
tf.random.set_seed(7)
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import backend as K


In [None]:
# Constants - do not change. Presumably they must match the values the saved
# tokenizer/model were built with (TODO confirm against the training notebook).
input_length = 36  # passed as maxlen to every pad_sequences call in this notebook
output_dim = 50  # presumably the embedding size (the GloVe file loaded below is the 50d one) - TODO confirm
MAX_NB_WORDS = 200000  # not referenced in the cells visible here; presumably the tokenizer vocabulary cap - TODO confirm

def recall_m(y_true, y_pred):
    """Recall over the tensors passed in: true positives / (actual positives + epsilon).

    Both the element-wise product y_true * y_pred and y_true are clipped to
    [0, 1] and rounded before being summed. K.epsilon() in the denominator
    avoids division by zero when there are no positive labels.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    actual_positives = K.round(K.clip(y_true, 0, 1))
    return K.sum(hits) / (K.sum(actual_positives) + K.epsilon())

def precision_m(y_true, y_pred):
    """Precision over the tensors passed in: true positives / (predicted positives + epsilon).

    Both the element-wise product y_true * y_pred and y_pred are clipped to
    [0, 1] and rounded before being summed. K.epsilon() in the denominator
    avoids division by zero when nothing is predicted positive.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    flagged_positive = K.round(K.clip(y_pred, 0, 1))
    return K.sum(hits) / (K.sum(flagged_positive) + K.epsilon())

def f1_m(y_true, y_pred):
    """F1 score built from precision_m and recall_m.

    Computed as 2 * (p * r) / (p + r + epsilon); the epsilon keeps the
    result finite when precision and recall are both zero.
    """
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    ratio = (p * r) / (p + r + K.epsilon())
    return 2 * ratio

In [None]:
# Colab-only: mount Google Drive at /content/gdrive. Every data, tokenizer and
# model path used in this notebook lives under /content/gdrive/MyDrive/QuoraQuestions.
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [None]:
# Build a {word: float32 vector} lookup from the GloVe text file
# (50-dimensional, going by the file name).
embeddings_dict = {}

# The encoding is named explicitly so the load does not depend on the platform
# default; the published GloVe text files are UTF-8.
with open("/content/gdrive/MyDrive/QuoraQuestions/glove.6B.50d.txt", 'r', encoding='utf-8') as f:
    for line in f:
        # Each line holds a token followed by its whitespace-separated coefficients.
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        word = values[0]
        vector = np.asarray(values[1:], "float32")
        embeddings_dict[word] = vector
# No explicit f.close(): the with-block closes the file when it exits.

# Restore the pickled tokenizer; it is used below through texts_to_sequences
# (presumably a Keras Tokenizer fitted at training time - TODO confirm).
# NOTE(review): pickle.load can execute arbitrary code, so only load this file
# from a trusted Drive location.
with open('/content/gdrive/MyDrive/QuoraQuestions/mytokenizer.pickle', 'rb') as handle:
    loadedtokenizer = pickle.load(handle)

# Load the cleaned question pairs.
path = "/content/gdrive/MyDrive/QuoraQuestions/cleaned_features.csv"
train_df = pd.read_csv(path)

# Raw question strings and integer labels for the FULL dataset (despite the "train" in the names).
# NOTE(review): astype(str) presumably turns missing questions into the literal string 'nan' -
# confirm the cleaned CSV has no missing values.
X_train_q1 = train_df['question1_cleaned'].astype(str).tolist()
X_train_q2 = train_df['question2_cleaned'].astype(str).tolist()
Y_list = train_df['is_duplicate'].astype(int).tolist()

# 90% / 10% split with a fixed random_state so the same rows land in the held-out part on every run.
q1_train, q1_val, q2_train, q2_val, y_train, y_val = train_test_split(X_train_q1, X_train_q2, Y_list, test_size = 0.10, random_state=7)


In [None]:
def preprocessing_loaded_tokenizer_testtrain(question1_train_list, question2_train_list, Y_train_list, question1_test_list, question2_test_list, Y_test_list, tokenizer):
  """Turn train/test question lists and labels into model-ready arrays.

  Every question list is converted with tokenizer.texts_to_sequences and then
  padded to input_length (padding='post'); the label lists are converted with
  np.asarray.

  Returns:
    (X_train_q1, X_train_q2, Y_train, X_test_q1, X_test_q2, Y_test)
  """
  def to_padded(texts):
    # Integer-encode the texts, then pad them to the fixed model input length.
    return pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=input_length, padding='post')

  return (
      to_padded(question1_train_list),
      to_padded(question2_train_list),
      np.asarray(Y_train_list),
      to_padded(question1_test_list),
      to_padded(question2_test_list),
      np.asarray(Y_test_list),
  )

def preprocessing_with_loaded_tokenizer(question1_list, question2_list, Y_list, tokenizer):
  """Turn one set of question pairs and labels into model-ready arrays.

  Both question lists are converted with tokenizer.texts_to_sequences and
  padded to input_length (padding='post'); Y_list is converted with np.asarray.

  Returns:
    (X_q1, X_q2, Y) in that order.
  """
  padded_pair = [
      pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=input_length, padding='post')
      for texts in (question1_list, question2_list)
  ]
  return padded_pair[0], padded_pair[1], np.asarray(Y_list)


In [None]:
# Tokenize and pad the 90% / 10% split produced by train_test_split.
# NOTE: this rebinds X_train_q1 / X_train_q2 - until now the raw question strings
# of the full dataset - to padded integer arrays for the training part only.
X_train_q1, X_train_q2, Y_train, X_test_q1, X_test_q2, Y_test = preprocessing_loaded_tokenizer_testtrain(q1_train, q2_train, y_train, q1_val, q2_val, y_val, loadedtokenizer)


In [None]:
# Sanity check: show the first encoded question 2 of the held-out split (length input_length, padded at the end).
X_test_q2[0]

array([   2, 1202,   10,   67,   19,   42,   30,  186,  260,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0], dtype=int32)

In [None]:
# X_train_q1 / X_train_q2 were rebound to padded training arrays by the split
# preprocessing cell, so rebuild the raw question lists for the full dataset.
# train_df is still the frame read from `path` earlier in the notebook, so it is
# reused here instead of parsing the CSV a second time.
X_train_q1 = train_df['question1_cleaned'].astype(str).tolist()
X_train_q2 = train_df['question2_cleaned'].astype(str).tolist()
Y_list = train_df['is_duplicate'].astype(int).tolist()

# Tokenized and padded inputs plus labels for every row of the dataset.
X1, X2, Y = preprocessing_with_loaded_tokenizer(X_train_q1, X_train_q2, Y_list, loadedtokenizer)

In [None]:
# Load the saved model. custom_objects maps the custom metric names to the
# functions defined earlier in this notebook (presumably the names the model
# was saved with - TODO confirm).
modelA_loaded = tf.keras.models.load_model('/content/gdrive/MyDrive/QuoraQuestions/modelA_last.h5', 
                                           custom_objects={'f1_m':f1_m, 'precision_m':precision_m, "recall_m":recall_m})

# Compile with loss + four metrics; the evaluate() calls below unpack five
# values in this order: loss, acc, f1_m, precision_m, recall_m.
modelA_loaded.compile(optimizer='adam', loss = 'binary_crossentropy', metrics=['acc',f1_m,precision_m, recall_m])

In [None]:
# Evaluate the model on the held-out 10% split.
loss, accuracy, f1_score, precision, recall = modelA_loaded.evaluate([X_test_q1, X_test_q2], Y_test)

# Raw model predictions for the held-out split.
# NOTE: `preds` and the metric names above are overwritten by the next cell.
preds = modelA_loaded.predict([X_test_q1, X_test_q2])



In [None]:
# Evaluate the model on the FULL dataset (X1, X2, Y cover every row of the CSV).
# NOTE: this overwrites loss / accuracy / f1_score / precision / recall from the
# held-out evaluation in the previous cell.
# NOTE(review): presumably these rows include the model's training data, so this
# is not a held-out estimate - TODO confirm.
loss, accuracy, f1_score, precision, recall = modelA_loaded.evaluate([X1, X2], Y)

# Raw model predictions for the full dataset (replaces the held-out `preds`).
preds = modelA_loaded.predict([X1, X2])



# LIME explanations

In [None]:
!pip install lime

Collecting lime
  Downloading lime-0.2.0.1.tar.gz (275 kB)
[?25l[K     |█▏                              | 10 kB 26.5 MB/s eta 0:00:01[K     |██▍                             | 20 kB 10.0 MB/s eta 0:00:01[K     |███▋                            | 30 kB 8.1 MB/s eta 0:00:01[K     |████▊                           | 40 kB 8.1 MB/s eta 0:00:01[K     |██████                          | 51 kB 5.4 MB/s eta 0:00:01[K     |███████▏                        | 61 kB 5.9 MB/s eta 0:00:01[K     |████████▎                       | 71 kB 5.4 MB/s eta 0:00:01[K     |█████████▌                      | 81 kB 5.9 MB/s eta 0:00:01[K     |██████████▊                     | 92 kB 6.2 MB/s eta 0:00:01[K     |███████████▉                    | 102 kB 5.3 MB/s eta 0:00:01[K     |█████████████                   | 112 kB 5.3 MB/s eta 0:00:01[K     |██████████████▎                 | 122 kB 5.3 MB/s eta 0:00:01[K     |███████████████▌                | 133 kB 5.3 MB/s eta 0:00:01[K     |█████████

In [None]:
from lime.lime_text import LimeTextExplainer
# Class order matches the [1 - p, p] columns returned by predict_proba below:
# index 0 -> 'not_duplicate', index 1 -> 'duplicate'.
class_names=['not_duplicate','duplicate']
explainer= LimeTextExplainer(class_names=class_names)

def predict_proba(sentence1, sentence2=None):
  """Score a batch of question-1 texts against one fixed question 2.

  Intended as the classifier function passed to LIME, which calls it with a
  list of text variants and expects one row of class probabilities per variant.

  Args:
    sentence1: list of question-1 strings (a single string is treated as a
      batch of one).
    sentence2: the question-2 string every text is paired with. Defaults to
      row 0 of train_df['question2_cleaned'], the value that was previously
      hard-coded.

  Returns:
    Array of shape (len(sentence1), 2) whose columns are [1 - p, p], where p
    is the first model output for that row.
  """
  if isinstance(sentence1, str):
    sentence1 = [sentence1]
  processed_one = list(sentence1)

  if sentence2 is None:
    sentence2 = train_df['question2_cleaned'][0]
  # Fix: the model needs one question-2 row per question-1 row. The previous
  # code iterated over the question-2 *string*, which produced one row per
  # character, so the two inputs had different batch sizes and predict()
  # raised ValueError.
  processed_two = [str(sentence2)] * len(processed_one)

  text1 = pad_sequences(loadedtokenizer.texts_to_sequences(processed_one), maxlen=input_length, padding='post')
  text2 = pad_sequences(loadedtokenizer.texts_to_sequences(processed_two), maxlen=input_length, padding='post')

  # The debug print of text1 was removed: LIME calls this with large batches.
  pred = modelA_loaded.predict([text1, text2])
  return np.array([[1 - row[0], row[0]] for row in pred])

# Smoke test: call predict_proba directly on a batch of two texts (row 0's question 1 and question 2).
predict_proba([train_df['question1_cleaned'][0], train_df['question2_cleaned'][0]])

[[   2    3    1  849   56  849 2153    6  679    8  862  373    8   38
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0]
 [   2    3    1  849   56  849 2153    6  679    8  862  373    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0]]


ValueError: ignored

In [None]:
# Explain the prediction for row 0's question 1 and render the explanation inline.
# Per the lime documentation, explain_instance calls predict_proba with a list of
# perturbed variants of this text and expects an (n_samples, n_classes) array back.
explainer.explain_instance(train_df['question1_cleaned'][0],predict_proba).show_in_notebook(text=True)


ValueError: ignored