<a href="https://colab.research.google.com/github/mehrseno/RetrievalBased-Chatbot/blob/main/RetrievalBased_Chatbot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install optuna
!pip install transformers
!pip install SentencePiece

In [None]:
import tensorflow as tf

# Pick the tf.distribute strategy that matches the available hardware.
tpu = None
try:
    # TPUClusterResolver needs no parameters when the TPU_NAME environment
    # variable is set (always the case on Kaggle). A ValueError raised here is
    # treated as "no TPU available".
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    tpu = None

if not tpu:
    # Default distribution strategy in Tensorflow. Works on CPU and single GPU.
    strategy = tf.distribute.get_strategy()
else:
    # Connect to the TPU cluster and initialise it before creating the strategy.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)

print("REPLICAS: ", strategy.num_replicas_in_sync)

In [None]:
import pandas as pd
import numpy as np
import re
from transformers import *
import random

In [None]:
# train
!gdown --id 1lct2GyNPE2UwI8geGXRha6j1yU8tksSw

# valid
!gdown --id 13wDQLk8mXorxghxWFNfhWvXPWPPI27dL

# test
!gdown --id 1St87-nfaqT5ZyiaDRY_NPy8PbRdTEPAY

In [None]:
# Load the three downloaded splits, in order, into their DataFrames.
train, valid, test = (
    pd.read_csv(csv_path) for csv_path in ('train.csv', 'valid.csv', 'test.csv')
)

In [None]:
# Checkpoint name used to load both the tokenizer and the encoder.
MODEL_NAME = 'bert-base-uncased'
# Every encoded pair is padded/truncated to this many tokens.
MAX_LEN = 64

In [None]:
def prepare_dataset(data, tokenizer, max_len):
  """Encode every (Input, Output) row of `data` into fixed-length features.

  Each row is turned into the single string "<Input> [SEP] <Output>" and passed
  to `tokenizer`, padded/truncated to `max_len`.

  Args:
    data: DataFrame with `Input`, `Output` and `Label` columns.
    tokenizer: callable returning a mapping with 'input_ids',
      'attention_mask' and 'token_type_ids'.
    max_len: target sequence length handed to the tokenizer.

  Returns:
    A dict of lists keyed by 'input_ids', 'attention_mask', 'token_type_ids'
    and 'labels', with one entry per row of `data`.
  """
  encoded = {
      'input_ids': [],
      'attention_mask': [],
      'token_type_ids': [],
      'labels': []
      }

  for row in data.itertuples():
    pair_text = ' [SEP] '.join((row.Input, row.Output))
    features = tokenizer(pair_text, None, max_length=max_len, padding='max_length', truncation=True, return_attention_mask=True, return_token_type_ids=True, add_special_tokens=True)

    for feature_name in ('input_ids', 'attention_mask', 'token_type_ids'):
      encoded[feature_name].append(features[feature_name])

    encoded['labels'].append(row.Label)

  return encoded

In [None]:
# Load the tokenizer that matches MODEL_NAME.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Register '[PAD]' as the tokenizer's padding token.
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Encode each split into fixed-length features of MAX_LEN tokens.
# NOTE: the misspelled name `testting_set` is referenced by later cells.
training_set = prepare_dataset(train, tokenizer, MAX_LEN)
validating_set = prepare_dataset(valid, tokenizer, MAX_LEN)
testting_set = prepare_dataset(test, tokenizer, MAX_LEN)

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import * 
from tensorflow.keras import optimizers, losses, layers
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
import matplotlib.pyplot as plt

In [None]:
def get_model(MAX_LEN, MODEL_NAME):
  """Build and compile the binary pair classifier on top of a pretrained encoder.

  Args:
    MAX_LEN: length of the token sequences fed to the model.
    MODEL_NAME: checkpoint name passed to `TFAutoModel.from_pretrained`.

  Returns:
    A compiled Keras `Model` that takes
    `[input_ids, attention_mask, token_type_ids]` and outputs one sigmoid score
    per example.
  """
  input_ids = Input(shape=(MAX_LEN,), dtype=tf.int32)
  attention_mask = Input(shape=(MAX_LEN,), dtype=tf.int32)
  token_type_ids = Input(shape=(MAX_LEN,), dtype=tf.int32)

  encoder = TFAutoModel.from_pretrained(MODEL_NAME, return_dict=True)

  # Do not pass `training=True` here: in a functional model an explicit value
  # is baked into the graph, which would keep the encoder's dropout active
  # during `predict`/`evaluate`. Leaving it unset lets Keras propagate the
  # training flag from `fit` / `predict`.
  pooler_outputs = encoder({"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": attention_mask}).pooler_output

  # Small classification head on the pooled encoder output.
  dense_layer = Dense(256, activation='relu', name='dense_layer1')(pooler_outputs)
  dropout = Dropout(0.25)(dense_layer)
  output = Dense(1, activation='sigmoid')(dropout)

  model = Model(inputs=[input_ids, attention_mask, token_type_ids], outputs=output)

  # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
  optimizer = optimizers.Adam(learning_rate=2e-5)
  loss = losses.binary_crossentropy

  model.compile(loss=loss, optimizer=optimizer, metrics=['accuracy'])

  return model

In [None]:
# Build the model inside the scope of the distribution strategy selected above.
with strategy.scope():
  model = get_model(MAX_LEN, MODEL_NAME)

model.summary()

# Also writes a diagram of the architecture to ./model.png.
tf.keras.utils.plot_model(model, to_file='./model.png', show_shapes=True)

In [None]:
def _as_model_inputs(encoded):
  """Return the three feature lists of `encoded` as integer arrays, in model input order."""
  return [
      np.array(encoded[feature_name], dtype=int)
      for feature_name in ('input_ids', 'attention_mask', 'token_type_ids')
  ]


# Model inputs for each split: [input_ids, attention_mask, token_type_ids].
x_train = _as_model_inputs(training_set)
x_valid = _as_model_inputs(validating_set)
x_test = _as_model_inputs(testting_set)

# Targets are the `Label` columns of the corresponding DataFrames.
y_train, y_valid, y_test = train.Label, valid.Label, test.Label

In [None]:
# Fine-tune the model on the training split and evaluate on the validation
# split after every epoch; the returned History object feeds the plots below.
history = model.fit(
    x_train,
    y_train,
    validation_data = (x_valid, y_valid),
    epochs=3,
    verbose=1,
    batch_size=32)

In [None]:
import matplotlib.pyplot as plt

plt.style.use('ggplot')

# Training vs. validation loss per epoch.
loss_fig, loss_ax = plt.subplots()
loss_ax.plot(history.history['loss'])
loss_ax.plot(history.history['val_loss'])
loss_ax.set_title('Model Loss')
loss_ax.set_ylabel('loss')
loss_ax.set_xlabel('epoch')
loss_ax.legend(['train', 'validation'], loc='upper left')
plt.show()

In [None]:
# Training vs. validation accuracy per epoch.
acc_fig, acc_ax = plt.subplots()
acc_ax.plot(history.history['accuracy'])
acc_ax.plot(history.history['val_accuracy'])
acc_ax.set_title('Model Accuracy')
acc_ax.set_ylabel('accuracy')
acc_ax.set_xlabel('epoch')
acc_ax.legend(['train', 'validation'], loc='upper left')
plt.show()

In [None]:
# Score every test pair with the model's sigmoid output.
predictions = model.predict(x_test)

In [None]:
import optuna

# Tune the decision threshold on the VALIDATION split. Selecting it against the
# test labels would leak test information into model selection and make the
# test accuracy reported afterwards optimistic.
valid_scores = np.asarray(model.predict(x_valid)).ravel()
valid_labels = np.asarray(validating_set['labels'])


def objective(trial):
  """Optuna objective: validation accuracy at a sampled sigmoid threshold.

  Args:
    trial: optuna trial used to sample `sig_threshold` from [0.0, 1.0].

  Returns:
    Accuracy of the thresholded validation predictions.
  """
  sig_threshold = trial.suggest_float("sig_threshold", 0.0, 1.0)
  print("sig_threshold", sig_threshold)

  pred_labels = (valid_scores > sig_threshold).astype(int)

  acc = accuracy_score(valid_labels, pred_labels)
  print("acc", acc)

  return acc


# A seeded sampler makes the search reproducible across kernel restarts.
study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(objective, n_trials=100)

In [None]:
# The tuned threshold is the best trial's *parameter*. `study.best_value` is the
# objective value (the accuracy) reached by that trial, not a threshold.
best_sig_threshold = study.best_params["sig_threshold"]

# Binarise the test scores with the tuned threshold.
test_scores = np.asarray(predictions).ravel()
pred_labels = (test_scores > best_sig_threshold).astype(int).tolist()

print('Test Accuracy:', accuracy_score(testting_set['labels'], pred_labels))

In [None]:
# Display the value stored in `best_sig_threshold`.
best_sig_threshold

In [None]:
def find_group_of_questions(questions, labels):
  """Split a sequence of questions into runs of consecutive identical questions.

  Elements are read with `questions[k]` / `labels[k]` for k in
  0..len(questions)-1, so for a pandas Series with an integer index the lookup
  is by index label, not by position.

  Args:
    questions: indexable sequence of questions.
    labels: indexable sequence of labels, aligned with `questions`.

  Returns:
    A tuple `(begins, ends, tag)`: `begins[g]` is the first index of run g,
    `ends[g]` is the index just past its last element, and `tag` holds every
    label converted to int.
  """
  begins = [0]
  ends = []
  tag = []
  previous_question = ''
  total = len(questions)

  for position in range(total):
    current_question = questions[position]
    tag.append(int(labels[position]))

    # The very first question opens the first run.
    if position == 0:
      previous_question = current_question

    # A change of question closes the current run and opens a new one.
    if previous_question != current_question:
      previous_question = current_question
      ends.append(position)
      begins.append(position)

  # Close the last run at the end of the sequence.
  ends.append(total)

  return begins, ends, tag

In [None]:
def calculate_measures(predictions, labels):
  """Reciprocal rank of the best-ranked positive candidate.

  Candidates are ordered by descending (prediction, label) pairs.

  Args:
    predictions: score of every candidate.
    labels: label of every candidate, aligned with `predictions`.

  Returns:
    1 / rank of the first candidate whose label equals 1 in that ordering, or
    0.0 when no candidate has label 1.
  """
  ranked_pairs = sorted(zip(predictions, labels), reverse=True)
  predictions, labels = zip(*ranked_pairs)

  for position, label in enumerate(labels, start=1):
    if label == 1:
      return 1 / position

  return 0.0

In [None]:
def calculate_MAP_and_MRR(begins, ends, labels, predictions) :
  """Mean reciprocal rank (MRR) over groups of candidate answers.

  Despite the name, only the MRR is computed and returned.

  Args:
    begins: start index (inclusive) of each group.
    ends: end index (exclusive) of each group, as produced by
      `find_group_of_questions`.
    labels: label of every candidate, in the same order as `predictions`.
    predictions: model score of every candidate.

  Returns:
    The mean of the per-group reciprocal ranks.
  """
  scores = list(predictions)

  # `ends` is exclusive, so each group is the slice [begin:end]. Slicing up to
  # end + 1 would pull the first candidate of the next group into this
  # group's ranking.
  reciprocal_ranks = [
      calculate_measures(scores[begin:end], labels[begin:end])
      for begin, end in zip(begins, ends)
  ]

  return np.mean(reciprocal_ranks)

In [None]:
# Sort the test rows by question text; the rows keep their original index labels.
sorted_test = test.sort_values('Input')

In [None]:
# `predictions` follows the row order of `test`, while the groups are built from
# `sorted_test`, which still carries the original index labels. Re-order the
# scores to the sorted row order and give the sorted frame a fresh 0..n-1 index,
# so that index k refers to the same row in questions, labels and scores.
sorted_positions = test.index.get_indexer(sorted_test.index)
sorted_scores = np.asarray(predictions).ravel()[sorted_positions]
grouped_test = sorted_test.reset_index(drop=True)

begins, ends, labels = find_group_of_questions(grouped_test.Input, grouped_test.Label)
MRR = calculate_MAP_and_MRR(begins, ends, labels, sorted_scores)
print("MRR:", MRR)

MRR: 0.740077157059129


In [None]:
def prepare_test_dataset(data, tokenizer, max_len):
  """Encode unlabeled (Input, Output) rows of `data` into fixed-length features.

  Each row is turned into the single string "<Input> [SEP] <Output>" and passed
  to `tokenizer`, padded/truncated to `max_len`.

  Args:
    data: DataFrame with `Input` and `Output` columns.
    tokenizer: callable returning a mapping with 'input_ids',
      'attention_mask' and 'token_type_ids'.
    max_len: target sequence length handed to the tokenizer.

  Returns:
    A dict of lists keyed by 'input_ids', 'attention_mask' and
    'token_type_ids', with one entry per row of `data`.
  """
  encoded = {
      'input_ids': [],
      'attention_mask': [],
      'token_type_ids': [],
      }

  for row in data.itertuples():
    pair_text = ' [SEP] '.join((row.Input, row.Output))
    features = tokenizer(pair_text, None, max_length=max_len, padding='max_length', truncation=True, return_attention_mask=True, return_token_type_ids=True, add_special_tokens=True)

    for feature_name in encoded:
      encoded[feature_name].append(features[feature_name])

  return encoded

In [None]:
# Hand-written (Input, Output) pairs, without labels, used to inspect the
# model's predictions by eye.
extra_test = pd.DataFrame([['what do you do for your weekend?', 'i am not quite sure yet . i like to go shopping.'],
                           ['i am not quite sure yet . i like to go shopping.', 'well , i am not sure what you are doing.'],
                           ['well , i am not sure what you are doing.', 'i am not sure . i am not quite sure yet.'], 
                           ['i am not sure . i am not quite sure yet.', 'well , i am not sure if you are interested , you would not have been fired.'],
                           ['well , i am not sure if you are interested , you would not have been fired.', 'i am sorry , but i cannot . i have got a sore throat.'],
                           ['i am sorry , but i cannot . i have got a sore throat.', 'oh , i see . but i have never been here before . i have been looking for a long time . it is really a nice neighborhood here.'],
                           ['oh , i see . but i have never been here before . i have been looking for a long time . it is really a nice neighborhood here.', 'i really appreciate your help.'],
                           ['i really appreciate your help.', 'thank you . i really appreciate your help.'],
                           ['thank you . i really appreciate your help.', 'you are welcome . i hope you will enjoy your rest and hope to make up.'], 
                           ['you are welcome . i hope you will enjoy your rest and hope to make up.', 'thank you . i will try to keep it to my li.'],
                           ['how are you?', 'Well, I love going to the cinema.'],
                           ['what do you want to eat?', 'I want to drink coffee.'],
                           ['what is your favorite color?', 'study.'],
                           ['what is your favorite color?', 'pink.'],
                           ['Where have you been?', 'i lived in england when i was at the conference . i always wanted to go out dancing.'],
                           ['Do you consider yourself a good mother?', 'Yes , I am a very good mother and successful career woman.'],
                           ['Yes , I am a very good mother and successful career woman.', 'How can you manage to do both ?'],
                           ['How can you manage to do both ?', 'I have a good manager.'],
                           ['I have a good manager.', 'How was your first day at work?'],
                           ['How was your first day at work?', 'I do not know.'],
                           ['I do not know.', 'great.'],
                           ['great.', 'I think the most important thing is that you should be nice to yourself.'],
                           ['I think the most important thing is that you should be nice to yourself.', 'Yeah, that is right.'],
                           ['Yeah, that is right.', 'You can only be nice to others when you can be nice to yourself.'],
                           ['You can only be nice to others when you can be nice to yourself.', 'I have two nice sisters.'],
                           ['I have two nice sisters.', 'Is he very understanding and supportive?']], 
                          columns=["Input", "Output"])

In [None]:
# Encode the hand-written pairs and stack the features in model input order.
extra_testting_set = prepare_test_dataset(extra_test, tokenizer, MAX_LEN)
extra_prepared_test = [
    np.array(extra_testting_set[feature_name], dtype=int)
    for feature_name in ('input_ids', 'attention_mask', 'token_type_ids')
]

predictions = model.predict(extra_prepared_test)

# Binarise each score against `best_sig_threshold`.
pred_labels = [1 if score > best_sig_threshold else 0 for score in predictions]

In [None]:
# Print every hand-written pair followed by the predicted class.
for index, row in extra_test.iterrows():

  print("Input:", row.Input)
  print("Output:", row.Output)

  verdict = "IsNext." if pred_labels[index] else "NotNext"
  print(verdict)

  print()

In [None]:
# search space
!gdown --id 1TtUSvIUjIF7mz49ZVTw7lt2fXQ726vaa

In [None]:
# Inputs for which the best reply is retrieved from the search space below.
test_inp = ["how are you?", "what do you want to eat?", "what is your favorite color?", 
            "where have you been?", "do you consider yourself a good mother?", "I have two nice sisters."]

In [None]:
# Read the candidate replies, one list entry per line of the file (line endings
# are still attached at this point).
with open("SearchSpace.txt") as search_space_file:
  unique_out = list(search_space_file)

In [None]:
# Pair every test input with every candidate reply from the search space.
# A dedicated name is used so that the `test` DataFrame loaded earlier is not
# overwritten, which would break re-running the evaluation cells.
candidate_pairs = [[inp, out.strip()] for inp in test_inp for out in unique_out]

extra_test = pd.DataFrame(candidate_pairs, columns=["Input", "Output"])

In [None]:
# Encode every (input, candidate reply) pair and stack the features in the
# order the model expects: input_ids, attention_mask, token_type_ids.
extra_testting_set = prepare_test_dataset(extra_test, tokenizer, MAX_LEN)
extra_prepared_test = [np.array(extra_testting_set['input_ids'], dtype=int), np.array(extra_testting_set['attention_mask'], dtype=int), np.array(extra_testting_set['token_type_ids'], dtype=int)]

# Score all pairs and store the score next to each pair (adds a column to
# `extra_test` in place).
predictions = model.predict(extra_prepared_test)
extra_test["Score"] = predictions

In [None]:
# Keep, for every input, the candidate row with the highest score. Selecting
# the rows through `idxmax` keeps the "Input" column in the result and avoids
# relying on `groupby.apply` handing the grouping column to the function, which
# is deprecated in recent pandas.
best_rows = extra_test.groupby("Input")["Score"].idxmax()
answer = extra_test.loc[best_rows].reset_index(drop=True)
answer

In [None]:
# Print each input with its retrieved reply, separated by a blank line.
for index, row in answer.iterrows():
  print("Input:", row.Input)
  print("Output:", row.Output, end="\n\n")