<a href="https://colab.research.google.com/github/nevemarpole/DissertationProject/blob/main/BERT_Test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#Install necessary libraries
#pytorch-pretrained-bert provides the BertTokenizer/BertAdam/BertForSequenceClassification imports used below
#NOTE(review): nothing in the visible cells imports pytorch-nlp (torchnlp) - confirm it is still needed
!pip install pytorch-pretrained-bert pytorch-nlp

In [None]:
#Imports
import csv
import pickle

import numpy as np
import tensorflow as tf
import torch
from keras.preprocessing.sequence import pad_sequences
from pytorch_pretrained_bert import BertTokenizer, BertConfig
from pytorch_pretrained_bert import BertAdam, BertForSequenceClassification
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

In [None]:
#Ask TensorFlow which GPU is attached and stop the notebook if there is none
device_name = tf.test.gpu_device_name()
if device_name == '/device:GPU:0':
  print('Found GPU at: {}'.format(device_name))
else:
  raise SystemError('GPU device not found')

#Pick the PyTorch device the model and tensors will be moved to
if torch.cuda.is_available():
  device = torch.device("cuda")
else:
  device = torch.device("cpu")
n_gpu = torch.cuda.device_count()

#Last expression of the cell: displays the name of CUDA device 0
torch.cuda.get_device_name(0)

In [None]:
#Load the pretrained 'bert-base-uncased' tokenizer once for the whole notebook;
#tokenize_data() further down reuses this shared b_tokenizer for every utterance
print("Loading BERT tokenizer:")
b_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [None]:
#Load in saved BERT model with 1 epoch finetuning
#NOTE: pickle.load can execute arbitrary code, so only load checkpoint files you created yourself
#The with-block closes the file handle as soon as the model has been read
with open('drive/MyDrive/Colab Notebooks/Dissertation/BERT/finetuned_BERT_model.pkl', 'rb') as model_file:
  bert_model = pickle.load(model_file)
bert_model.to(device)

## OR ##
#Load in saved BERT model with 2 epoch finetuning
#with open('drive/MyDrive/Colab Notebooks/Dissertation/BERT/finetuned_BERT_model_2e.pkl', 'rb') as model_file:
#  bert_model = pickle.load(model_file)
#bert_model.to(device)

In [None]:
#Add the tags the BERT model will expect
def prepareData(dataFrame):
    """Wrap every prompt in the "[CLS] ... [SEP]" markers.

    Each value of the ``prompt`` column is converted with ``str`` before the
    tags are added, so non-string entries are tagged instead of raising.
    The values are only read: the DataFrame passed in is left unchanged.

    Parameters:
        dataFrame: object exposing a ``prompt`` column through
            ``dataFrame.prompt.values`` (e.g. a pandas DataFrame).

    Returns:
        A list with one tagged string per row, in row order.
    """
    return ["[CLS] " + str(sentence) + " [SEP]" for sentence in dataFrame.prompt.values]




#Convert string labels into numbers
def prepareLabels(dataFrame):
    """Encode the emotion names in the ``context`` column as class numbers.

    Every known emotion name is replaced by its class number (as a string);
    values that are not emotion names are left as they are, so a frame that
    was already encoded can be passed again. The encoded column is written
    back to ``dataFrame['context']``.

    Parameters:
        dataFrame: pandas DataFrame with a ``context`` column.

    Returns:
        A float32 NumPy array holding one class number per row.

    Raises:
        ValueError: if a value in the column is neither a known emotion name
            nor something NumPy can convert to a float.
    """
    emotion_ids = {"surprised": "0", "excited": "1", "angry": "2", "proud": "3",
                   "sad": "4", "annoyed": "5", "grateful": "6", "lonely": "7",
                   "afraid": "8", "terrified": "9", "guilty": "10", "impressed": "11",
                   "disgusted": "12", "hopeful": "13", "confident": "14",
                   "furious": "15", "anxious": "16", "anticipating": "17",
                   "joyful": "18", "nostalgic": "19", "disappointed": "20",
                   "prepared": "21", "jealous": "22", "content": "23",
                   "devastated": "24", "embarrassed": "25", "caring": "26",
                   "sentimental": "27", "trusting": "28", "ashamed": "29",
                   "apprehensive": "30", "faithful": "31"}

    #Assign the result back explicitly: replace(..., inplace=True) on
    #dataFrame['context'] can act on a temporary copy and leave the frame unchanged
    encoded = dataFrame['context'].replace(emotion_ids)
    dataFrame['context'] = encoded

    #tolist() hands NumPy plain Python values whatever dtype pandas stores the column in
    labels = np.array(encoded.tolist(), dtype='float32')

    return labels




#Tokenize, convert and pad the data
def tokenizeData(data):
    """Turn tagged sentences into fixed-length rows of BERT token ids.

    A 'bert-base-uncased' tokenizer is loaded on every call. All sections are
    tokenized first, then all token lists are converted to ids, and finally
    the id lists are padded or cut at the end to 128 entries.

    Parameters:
        data: iterable of strings to tokenize.

    Returns:
        The result of ``pad_sequences`` for the id lists.
    """
    #The length sentences will be padded or cut to
    MAX_LEN = 128

    #Use HuggingFace's BERT tokenizer
    bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

    #First pass: split every section into tokens
    token_lists = list(map(bert_tokenizer.tokenize, data))

    #Second pass: look up the id of every token
    id_lists = list(map(bert_tokenizer.convert_tokens_to_ids, token_lists))

    #Padding is added, or the input shortened, at the end of each sequence
    return pad_sequences(id_lists, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")




#Build the attention masks that accompany the padded token ids
def applyMasks(input_ids):
    """Create one attention mask per sequence of token ids.

    Every id greater than 0 is marked 1.0 and every other id is marked 0.0
    (0 is presumably the padding value - confirm against the padding step).

    Parameters:
        input_ids: iterable of sequences of token ids.

    Returns:
        A list of lists of floats with the same layout as ``input_ids``.
    """
    return [[float(token_id > 0) for token_id in id_row] for id_row in input_ids]

In [None]:
def emotion_prediction(input_id, input_mask):
  """Run the loaded BERT model over the given token ids and masks.

  Uses the notebook-level ``bert_model`` and ``device``. The inputs are fed
  through a sequential DataLoader one sequence at a time, with gradient
  tracking switched off.

  Parameters:
    input_id: token id sequences, in a form ``torch.tensor`` accepts.
    input_mask: attention masks matching ``input_id``.

  Returns:
    A list holding one NumPy array of logits per batch.
  """
  #One sequence per batch
  batch_size = 1

  #Wrap ids and masks as tensors and serve them in their original order
  ids_tensor = torch.tensor(input_id)
  masks_tensor = torch.tensor(input_mask)
  dataset = TensorDataset(ids_tensor, masks_tensor)
  loader = DataLoader(dataset, sampler=SequentialSampler(dataset), batch_size=batch_size)

  #Put model in evaluation mode
  bert_model.eval()

  #To store result
  prediction = []

  for batch_ids, batch_mask in loader:
    #Move this batch to the selected device
    b_input_ids = batch_ids.to(device)
    b_input_mask = batch_mask.to(device)

    #Forward pass without computing or storing gradients
    with torch.no_grad():
      logits = bert_model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)

    #Bring the logits back to the CPU as a NumPy array and keep them
    prediction.append(logits.detach().cpu().numpy())

  return prediction




#Add the tags the BERT model will expect
def prepare_data(data):
  """Wrap a single input in the "[CLS] ... [SEP]" markers.

  Parameters:
    data: the utterance; it is converted with ``str`` first.

  Returns:
    A one-element list containing the tagged string.
  """
  tagged_sentence = "[CLS] " + str(data) + " [SEP]"
  return [tagged_sentence]




#Tokenize, convert and pad the sentence
def tokenize_data(data):
  """Turn tagged sentences into fixed-length rows of BERT token ids.

  Uses the notebook-level ``b_tokenizer``. All sections are tokenized first,
  then all token lists are converted to ids, and finally the id lists are
  padded or cut at the end to 128 entries.

  Parameters:
    data: iterable of strings to tokenize.

  Returns:
    The result of ``pad_sequences`` for the id lists.
  """
  #The length sentences will be padded or cut to
  max_len = 128

  #First pass: split every section into tokens
  token_lists = list(map(b_tokenizer.tokenize, data))

  #Second pass: look up the id of every token
  id_lists = list(map(b_tokenizer.convert_tokens_to_ids, token_lists))

  #Padding is added, or the input shortened, at the end of each sequence
  return pad_sequences(id_lists, maxlen=max_len, dtype="long", truncating="post", padding="post")




#Bert requires masks
def apply_masks(input_ids):
  """Create one attention mask per sequence of token ids.

  Every id greater than 0 is marked 1.0 and every other id is marked 0.0
  (0 is presumably the padding value - confirm against the padding step).

  Parameters:
    input_ids: iterable of sequences of token ids.

  Returns:
    A list of lists of floats with the same layout as ``input_ids``.
  """
  return [[float(token_id > 0) for token_id in id_row] for id_row in input_ids]

In [None]:
#Expected emotion label and utterance text of every test example, index-aligned
emotions = []
utterances = []

#Read in test data from Rashkin. et al. file
#with open('drive/MyDrive/Colab Notebooks/Dissertation/Data/test.csv', encoding='UTF-8', newline='') as csvDataFile:
#  csvReader = csv.reader(csvDataFile)
#  for row in csvReader:
#    emotions.append(row[2])
#    utterances.append(row[3])
#emotions.remove('context')
#utterances.remove('prompt')


## OR ##
#Read in new test phrases file
#newline='' leaves line-ending handling to the csv module, as its documentation requires
with open('drive/MyDrive/Colab Notebooks/Dissertation/Data/new_test_phrases.csv', encoding='UTF-8', newline='') as csvDataFile:
  csvReader = csv.reader(csvDataFile)
  for row in csvReader:
    #Blank lines come back as empty rows; skip them instead of failing on row[1]
    if not row:
      continue
    #Column 0 holds the phrase, column 1 its emotion label (lower-cased for comparison)
    emotions.append(row[1].lower())
    utterances.append(row[0])

In [None]:
#Emotion name for every class number BERT provides:
#the position in this list is the class number
classes = [
    "surprised", "excited", "angry", "proud",
    "sad", "annoyed", "grateful", "lonely",
    "afraid", "terrified", "guilty", "impressed",
    "disgusted", "hopeful", "confident", "furious",
    "anxious", "anticipating", "joyful", "nostalgic",
    "disappointed", "prepared", "jealous", "content",
    "devastated", "embarrassed", "caring", "sentimental",
    "trusting", "ashamed", "apprehensive", "faithful",
]

In [None]:
#Number of correct predictions, number of utterances tested,
#and the predicted class number of every utterance in order
correct = 0
run = 0
predicted_emotions = []

for i, utterance in enumerate(utterances):
  #Prepare input for BERT model
  tagged = prepare_data(utterance)
  input_id = tokenize_data(tagged)
  mask = apply_masks(input_id)

  #emotion_prediction returns one logits array per batch; a single utterance
  #was passed in, so [0][0] is that utterance's row of class scores
  predictions = emotion_prediction(input_id, mask)[0][0]

  #The highest score is the most likely emotional match.
  #argmax compares the raw floating point scores, so close scores are not
  #merged by rounding and a class is always chosen, even when every score
  #is zero or negative
  emotion_position = int(predictions.argmax())

  #Fetch the label associated with this class number
  predicted = classes[emotion_position]

  #Collects total number of correct predictions
  if predicted == emotions[i]:
    correct = correct + 1

  #Collects total number of test
  run = run + 1

  predicted_emotions.append(emotion_position)

In [None]:
#An empty test set would otherwise fail with an unexplained ZeroDivisionError
if run == 0:
  raise ValueError("No utterances were evaluated, so accuracy cannot be computed")

#Percentage of tested utterances whose predicted emotion matched the label
accuracy = ((correct / run) * 100)

print("Accuracy:", accuracy)