<a href="https://colab.research.google.com/github/nevemarpole/DissertationProject/blob/main/Retrieval_Agent.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#Install necessary libraries
!pip install pytorch-pretrained-bert pytorch-nlp

In [2]:
#Imports
import torch
import tensorflow as tf
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import csv
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import numpy as np
import random
from math import log10
from pytorch_pretrained_bert import BertTokenizer, BertConfig
from pytorch_pretrained_bert import BertAdam, BertForSequenceClassification
from keras.preprocessing.sequence import pad_sequences
import pickle

In [None]:
#Ask TensorFlow for the GPU it can see and refuse to continue without one
device_name = tf.test.gpu_device_name()
if device_name == '/device:GPU:0':
  print('Found GPU at: {}'.format(device_name))
else:
  raise SystemError('GPU device not found')

#Pick the PyTorch device the model and its input tensors are moved to
n_gpu = torch.cuda.device_count()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

#Last expression of the cell: shows the name of CUDA device 0 as the output
torch.cuda.get_device_name(0)

In [None]:
#Downloads
print("Downloading stop words:")
nltk.download('stopwords')
english_stopwords = stopwords.words('english')
sb_stemmer = SnowballStemmer('english')

print("Loading BERT tokenizer:")
b_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

#Load in saved BERT model
#NOTE: unpickling can execute arbitrary code, so only load a model file that
#was produced by a trusted source
#The with-block makes sure the file handle is closed once the model is read
with open('drive/MyDrive/Colab Notebooks/Dissertation/BERT/finetuned_BERT_model.pkl', 'rb') as model_file:
  bert_model = pickle.load(model_file)
bert_model.to(device)

In [5]:
def rework_text(text):
  """Tokenize, lower-case, stop-word filter and stem a collection of strings.

  Parameters:
    text: iterable of strings, one per document/utterance.

  Returns:
    A list with one entry per input string; each entry is the list of
    stemmed, lower-case tokens left after English stop words are removed.

  Relies on the module-level sb_stemmer and on the NLTK stop word corpus
  having been downloaded.
  """

  #Tokenize the text to make it a list of lists
  #Allowing words to be accessed individually
  tokenizer = nltk.RegexpTokenizer(r"\w+")
  tokenized_documents = [tokenizer.tokenize(string) for string in text]

  print("Text tokenized")

  #A set gives constant time membership tests for every word checked
  stop_words = set(stopwords.words('english'))

  #Lower-case BEFORE the stop word check: the NLTK list is lower case, so
  #capitalised stop words such as "I" or "The" would otherwise be kept
  changed_text = []
  for tok_document in tokenized_documents:
      lowered = (word.lower() for word in tok_document)
      changed_text.append([sb_stemmer.stem(word) for word in lowered if word not in stop_words])

  print("Stop words removed and stemmed")

  return changed_text
     





def get_similarity(user_text, a_input):
    """Determine how similar a user input is to an utterance in the data.

    Parameters:
      user_text: list of tokens from the user's input.
      a_input: list of tokens from one utterance of the file data.

    Returns:
      1 / (1 + d), where d is the manhattan distance between the TF-IDF
      weighted bag-of-words vectors of the two token lists. Identical token
      lists give 1; the value shrinks as the lists share fewer terms.
    """

    #Store the user's input and the file utterance in a list
    #Accessed one after another by the following loops
    strings = [user_text, a_input]

    #Create vocabulary for the user's input and file data
    #Maps each distinct token to its position in the vectors; a dict keeps the
    #first-seen ordering while avoiding a list search for every token
    vocabulary = {}
    for string in strings:
        for item in string:
            vocabulary.setdefault(item, len(vocabulary))

    #Create bag of words for the user's input and the input in the file
    bow = []
    for string in strings:
        vector = np.zeros(len(vocabulary))
        for item in string:
            vector[vocabulary[item]] += 1
        bow.append(vector)

    #Get TF-IDF, one weighted vector per token list
    tfidf_user, tfidf_data = tfidf_weight(bow[0], bow[1])

    #Get manhattan distance and use this to work out final similarity value
    distance = manhattan_distance(tfidf_user, tfidf_data)
    similarity = 1 / (1+distance)

    return similarity






def tfidf_weight(vector_1, vector_2):
    """Work out the TF-IDF weights for both count vectors passed to it.

    The two vectors are treated as a corpus of N = 2 documents. For every
    position i the weight is log10(1 + frequency) * log10(N / n), where n is
    the number of the two vectors with a non-zero count at i.

    Parameters:
      vector_1: sequence of term counts for the first document.
      vector_2: sequence of term counts for the second document, indexed by
        the same vocabulary as vector_1.

    Returns:
      A tuple (tfidf_vector_1, tfidf_vector_2) of numpy arrays.
    """

    N = 2
    tfidf_vector_1 = np.zeros(len(vector_1))
    tfidf_vector_2 = np.zeros(len(vector_2))

    for i in range(len(vector_1)):

        frequency_1 = vector_1[i]
        frequency_2 = vector_2[i]

        #Number of documents the term appears in
        n = int(frequency_1 != 0) + int(frequency_2 != 0)

        #A term found in neither vector has no weight; skipping it also
        #avoids dividing by zero in the inverse document frequency
        if n == 0:
            continue

        #The inverse document frequency is shared by both documents
        idf = log10(N/n)
        tfidf_vector_1[i] = log10(1+frequency_1) * idf
        tfidf_vector_2[i] = log10(1+frequency_2) * idf

    return tfidf_vector_1, tfidf_vector_2






def manhattan_distance(vector_1, vector_2):
  """Return the manhattan distance between the 2 vectors.

  The element-wise differences are made absolute and then summed. Both
  arguments must support element-wise subtraction (the caller in this
  notebook passes numpy arrays).
  """
  return abs(vector_1 - vector_2).sum()


In [6]:
def emotion_prediction(input_id, input_mask):
  """Run the loaded BERT model over the given ids and attention masks.

  Parameters:
    input_id: token ids, one row per sentence.
    input_mask: attention masks matching input_id row for row.

  Returns:
    A list holding one numpy array of logits per row of the input.

  Uses the module-level bert_model and device.
  """

  #Wrap the inputs as tensors and pair each id row with its mask
  ids_tensor = torch.tensor(input_id)
  masks_tensor = torch.tensor(input_mask)
  dataset = TensorDataset(ids_tensor, masks_tensor)

  #Rows are fed through in their original order, one per batch
  loader = DataLoader(dataset, sampler=SequentialSampler(dataset), batch_size=1)

  #Put model in evaluation mode
  bert_model.eval()

  prediction = []
  for batch in loader:
    #Move both tensors of the batch to the device in use
    b_input_ids, b_input_mask = (t.to(device) for t in batch)

    #Forward pass only, no gradients computed or stored
    with torch.no_grad():
      logits = bert_model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)

    #Bring the logits back to the CPU as a numpy array and keep them
    prediction.append(logits.detach().cpu().numpy())

  return prediction




def prepare_data(data):
  """Add the tags the BERT model will expect.

  Parameters:
    data: a single sentence, or a list/tuple of sentences. Anything that is
      not a list or tuple is converted with str() and treated as one sentence.

  Returns:
    A list of strings, each sentence wrapped as "[CLS] <sentence> [SEP]".
  """

  #Tag the sentences held in a list one by one; calling str() on the list
  #itself would put its brackets and quotes into the text given to BERT
  if isinstance(data, (list, tuple)):
    sentences = [str(sentence) for sentence in data]
  else:
    sentences = [str(data)]

  return ["[CLS] " + sentence + " [SEP]" for sentence in sentences]




def tokenize_data(data, max_len=128):
  """Tokenize, convert and pad the sentences.

  Parameters:
    data: iterable of sentences already wrapped in the BERT tags.
    max_len: the length every sentence will be padded or cut to. Defaults
      to 128, the value this notebook has always used.

  Returns:
    The array produced by pad_sequences: one row of max_len token ids per
    sentence, padded with 0 at the end.

  Uses the module-level b_tokenizer.
  """

  #Tokenize
  tokenized = [b_tokenizer.tokenize(section) for section in data]

  #Words converted to IDs and padding added, or input shortened
  #NOTE: truncating="post" cuts from the end, so a sentence longer than
  #max_len tokens loses its closing [SEP] tag
  input_ids = [b_tokenizer.convert_tokens_to_ids(x) for x in tokenized]
  input_ids = pad_sequences(input_ids, maxlen=max_len, dtype="long", truncating="post", padding="post")

  return input_ids




def apply_masks(input_ids):
  """Build the attention masks BERT requires.

  Every id greater than 0 is marked with 1.0 and every other position
  (the padding) with 0.0. Returns one list of floats per row of input_ids.
  """
  return [[float(token_id > 0) for token_id in row] for row in input_ids]

In [None]:
#To convert the numbered class BERT will provide to the corresponding emotion
classes = ["surprised", "excited", "angry", "proud", "sad", "annoyed", 
           "grateful", "lonely", "afraid", "terrified", "guilty", "impressed",
           "disgusted", "hopeful", "confident", "furious", "anxious", 
           "anticipating","joyful", "nostalgic", "disappointed","prepared", 
           "jealous", "content", "devastated", "embarrassed", "caring",
           "sentimental", "trusting", "ashamed", "apprehensive", "faithful"]

#The input comparison files, read in this order
DATA_FILES = ['drive/MyDrive/Colab Notebooks/Dissertation/Data/train.csv',
              'drive/MyDrive/Colab Notebooks/Dissertation/Data/valid.csv',
              'drive/MyDrive/Colab Notebooks/Dissertation/Data/test.csv']


def read_dialogue_file(path):
  """Read the relevant fields of one comparison file.

  Returns three lists of equal length: the utterance ids as numbers
  (column 1), the emotions (column 2) and the utterances (column 5).
  """
  ids = []
  labels = []
  texts = []

  #newline='' is what the csv module asks for when handed a file object
  with open(path, encoding='UTF-8', newline='') as csvDataFile:
    csvReader = csv.reader(csvDataFile)

    #Skip the header row by position. Removing it by value afterwards could
    #delete a real row with the same text and misalign the three lists
    next(csvReader, None)

    for row in csvReader:
      #Fix broken file data: an empty id is treated as 1
      #ID's are converted from strings to numbers as they are read
      ids.append(int(row[1]) if row[1] != '' else 1)
      labels.append(row[2])
      texts.append(row[5])

  return ids, labels, texts


#Arrays to hold relevant file data fields
u_id = []
emotions = []
utterances = []

#Read in the input comparison files' relevant fields
for data_file in DATA_FILES:
  file_ids, file_emotions, file_utterances = read_dialogue_file(data_file)
  u_id.extend(file_ids)
  emotions.extend(file_emotions)
  utterances.extend(file_utterances)


#tokenize and stem the file data
input_sentences = rework_text(utterances)

In [None]:
#Similarity an utterance with a matching emotion must beat to be used
EMOTION_MATCH_THRESHOLD = 0.5
#Similarity an utterance with any emotion must beat to be used
ANY_MATCH_THRESHOLD = 0.7


def classify_emotion(sentence):
  """Return the emotion label BERT predicts for a single sentence (a string)."""
  #Prepare input for BERT model
  tagged = prepare_data(sentence)
  input_id = tokenize_data(tagged)
  mask = apply_masks(input_id)

  #One array of logits comes back per sentence, holding a single row
  logits = emotion_prediction(input_id, mask)[0][0]

  #The highest value is the most likely emotional match
  #argmax works on the raw logits, so near ties are not lost by rounding to
  #integers and all-negative logits are still handled
  return classes[int(np.argmax(logits))]


def preprocess_query(raw_tokens):
  """Remove stop words from, and stem, the lower-case tokens of the user's input.

  This mirrors the stop word removal and stemming rework_text applies to the
  stored utterances, so both sides of the comparison use the same vocabulary.
  """
  return [sb_stemmer.stem(word) for word in raw_tokens if word not in english_stopwords]


def find_matches(query_tokens, emotion):
  """Search the dataset for the utterances most similar to the query.

  Returns (match, highest_sim, e_match, highest_e_sim): the position and
  similarity of the closest utterance overall, then the same for the closest
  utterance whose emotion equals `emotion`. A position is None if nothing
  was found.
  """
  match = None
  highest_sim = 0
  e_match = None
  highest_e_sim = 0

  for i in range(len(input_sentences)):
    sim_value = get_similarity(query_tokens, input_sentences[i])
    #Similarity to utterances with a matching emotional tag is tracked
    #separately from those with a different associated emotion
    if sim_value >= highest_e_sim and emotion == emotions[i]:
      e_match = i
      highest_e_sim = sim_value
    if sim_value >= highest_sim:
      match = i
      highest_sim = sim_value

  return match, highest_sim, e_match, highest_e_sim


def select_response(position):
  """Return the reply to give for the matched utterance at `position`.

  The next utterance is only used if it is a part of the same conversation
  (its id is higher). Otherwise, or when the match is the last row of the
  data, the matched utterance itself is returned.
  """
  following = position + 1
  if following < len(utterances) and u_id[position] < u_id[following]:
    return utterances[following]
  return utterances[position]


#Say hello to the user
print('Hi there!')

#Used to split the user's input into words
tokenizer = nltk.RegexpTokenizer(r"\w+")

#Until the user asks to stop the chatbot will talk to them
stop = False

while not stop:

  found = False

  #Ask the user for an input
  user_input = input('Please enter a sentence, or to bring this conversation to an end ask me to stop:\n')

  #Lower and tokenize the user's input to make it workable
  user_input = user_input.lower()
  token_list = tokenizer.tokenize(user_input)

  #If the input is stop then end the conversation
  if token_list == ["stop"]:
    print("Goodbye!")
    stop = True
    continue

  #The input isn't stop so a response must be chosen
  #Input without any words (empty or only punctuation) can't be matched
  if token_list:
    new_y = classify_emotion(user_input)

    #This print is here to gauge accuracy of classifier prediction
    print("Predicted emotion:", new_y)

    #Nothing is searched for if only stop words were entered
    query_tokens = preprocess_query(token_list)
    if query_tokens:
      match, highest_sim, e_match, highest_e_sim = find_matches(query_tokens, new_y)

      #If the emotions match use this response even if it has a slightly
      #lower similarity value
      if highest_e_sim > EMOTION_MATCH_THRESHOLD:
        found = True
        print(select_response(e_match), "\nData emotion:", emotions[e_match], "\n")

      #Even if the emotions don't match, if the similarity is high then
      #fetch the response
      elif highest_sim > ANY_MATCH_THRESHOLD:
        found = True
        print(select_response(match), "\nExpected emotion:", emotions[match], "\n")

  #If no response can be matched to an input the chatbot informs a user
  #that they don't know how to respond
  if not found:
    print("I'm sorry I don't understand, please say something else and I'll try again!\n")