<a href="https://colab.research.google.com/github/nevemarpole/DissertationProject/blob/main/Bert_Finetuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#Install necessary libraries
#pytorch-pretrained-bert provides the BertTokenizer, BertConfig, BertAdam and
#BertForSequenceClassification names imported by this notebook.
#NOTE(review): pytorch-nlp is installed here but no visible cell imports it -
#confirm it is still needed.
!pip install pytorch-pretrained-bert pytorch-nlp

In [None]:
#Imports
import tensorflow as tf
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from pytorch_pretrained_bert import BertTokenizer, BertConfig
from pytorch_pretrained_bert import BertAdam, BertForSequenceClassification
from tqdm import tqdm, trange
import pandas as pd
import io
import numpy as np
import time
import datetime
import matplotlib.pyplot as plt
import pickle

In [None]:
#Ask TensorFlow for the GPU device name and stop early if none is attached
device_name = tf.test.gpu_device_name()
if not device_name == '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

#Count the GPUs PyTorch can see and pick the device, preferring CUDA
n_gpu = torch.cuda.device_count()
if torch.cuda.is_available():
  device = torch.device("cuda")
else:
  device = torch.device("cpu")
#Last expression of the cell: looks up the name of GPU 0
torch.cuda.get_device_name(0)

In [None]:
#Read in data files
#Only these columns are loaded from each CSV
csv_columns = ('conv_id', 'utterance_idx', 'context', 'prompt', 'speaker_idx', 'utterance')

#Location of each split
csv_paths = {
    'train': "drive/MyDrive/Colab Notebooks/Dissertation/Data/train.csv",
    'valid': "drive/MyDrive/Colab Notebooks/Dissertation/Data/valid.csv",
    'test': "drive/MyDrive/Colab Notebooks/Dissertation/Data/test.csv",
}

df_train = pd.read_csv(csv_paths['train'], delimiter=',', usecols=csv_columns)
df_valid = pd.read_csv(csv_paths['valid'], delimiter=',', usecols=csv_columns)
df_test = pd.read_csv(csv_paths['test'], delimiter=',', usecols=csv_columns)

print("Data read in")

In [None]:
#Add the tags the BERT model will expect
def prepareData(dataFrame):
    """Wrap every prompt in the [CLS] ... [SEP] markers BERT expects.

    Each value of the ``prompt`` column is passed through ``str()`` first, so
    non-string entries (numbers, missing values) are converted to their
    string form instead of breaking the concatenation.

    Args:
        dataFrame: pandas DataFrame with a ``prompt`` column.

    Returns:
        list of str: one ``"[CLS] <prompt> [SEP]"`` string per row, in row
        order.
    """
    #Build new strings instead of writing back into ``.values``: assigning
    #into that array would modify the caller's DataFrame, fails on read-only
    #arrays, and cannot store strings when the column has a numeric dtype
    return ["[CLS] " + str(prompt) + " [SEP]" for prompt in dataFrame.prompt.values]




#Convert string labels into numbers
def prepareLabels(dataFrame):
    """Convert the emotion names in the ``context`` column into class ids.

    The ``context`` column of ``dataFrame`` is overwritten with the ids as
    strings ("0" to "31"); values that are not one of the known emotion names
    are left as they are. The same ids are returned as a float32 array.

    Args:
        dataFrame: pandas DataFrame with a ``context`` column holding the
            emotion name of each row.

    Returns:
        numpy.ndarray: float32 array with one class id per row.

    Raises:
        ValueError: if a value in ``context`` is a string that is neither a
            known emotion name nor parseable as a number.
    """
    #Position in this tuple is the class id, so the order must not change
    emotions = (
        "surprised", "excited", "angry", "proud",
        "sad", "annoyed", "grateful", "lonely",
        "afraid", "terrified", "guilty", "impressed",
        "disgusted", "hopeful", "confident", "furious",
        "anxious", "anticipating", "joyful", "nostalgic",
        "disappointed", "prepared", "jealous", "content",
        "devastated", "embarrassed", "caring", "sentimental",
        "trusting", "ashamed", "apprehensive", "faithful",
    )
    emotion_to_id = {emotion: str(index) for index, emotion in enumerate(emotions)}

    #Assign the result back to the column rather than calling
    #replace(..., inplace=True) on dataFrame['context']: that chained in-place
    #call acts on an intermediate object and is not guaranteed to reach the
    #DataFrame
    dataFrame['context'] = dataFrame['context'].replace(emotion_to_id)

    return np.array(dataFrame['context'].tolist(), dtype='float32')




#Tokenize, convert and pad the data
def tokenizeData(data, max_len=128, tokenizer=None):
    """Tokenize sentences, map the tokens to ids and pad/cut to a fixed length.

    Args:
        data: iterable of sentence strings.
        max_len: length every id sequence is padded or cut to. Defaults to
            128, the value that used to be hard-coded.
        tokenizer: optional tokenizer object offering ``tokenize`` and
            ``convert_tokens_to_ids``. When omitted, the lower-casing
            'bert-base-uncased' BertTokenizer is loaded, as before; passing
            one in lets callers load it once and reuse it across datasets.

    Returns:
        The result of ``pad_sequences``: one row of ``max_len`` ids per
        sentence, padded and truncated at the end ("post").
    """
    if tokenizer is None:
        #Use HuggingFace's BERT tokenizer
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

    #Tokenize each sentence and convert its tokens to vocabulary ids
    token_ids = [
        tokenizer.convert_tokens_to_ids(tokenizer.tokenize(section))
        for section in data
    ]

    #NOTE(review): truncating="post" cuts from the end of over-long inputs,
    #which presumably removes the closing [SEP] marker - confirm this is
    #acceptable for the model
    return pad_sequences(token_ids, maxlen=max_len, dtype="long", truncating="post", padding="post")
        



#BERT uses attention masks to know which inputs to look at and which to ignore
def applyMasks(input_ids):
    """Build an attention mask for every sequence of token ids.

    Args:
        input_ids: iterable of id sequences.

    Returns:
        list of lists of float: for each sequence, 1.0 where the id is
        greater than 0 and 0.0 otherwise.
    """
    return [
        [float(token_id > 0) for token_id in sequence]
        for sequence in input_ids
    ]




#Function to calculate time taken for finetuning
def format_time(elapsed):
    '''
    Takes a duration in seconds, rounds it to the nearest whole second and
    returns the string form of the matching datetime.timedelta,
    e.g. 65.4 -> '0:01:05'
    '''
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)
    
  
    

#Calculate the accuracy of predictions vs. labels
def flat_accuracy(preds, labels):
    """Return the fraction of rows whose highest-scoring class equals the label.

    Args:
        preds: 2-D scores, one row per example; the predicted class is the
            argmax along axis 1.
        labels: array of true class ids; it is flattened before comparison.

    Returns:
        Number of matching positions divided by the number of labels.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    true_classes = labels.flatten()
    hits = predicted_classes == true_classes
    return np.sum(hits) / len(true_classes)




#Wrap the prompts of every split in the opening and closing tokens
train_data, valid_data, test_data = [
    prepareData(frame) for frame in (df_train, df_valid, df_test)
]

print("Data prepared")


#Turn the emotion names of every split into numeric labels
train_labels, valid_labels, test_labels = [
    prepareLabels(frame) for frame in (df_train, df_valid, df_test)
]

print("Labels prepared")


#Tokenize, convert to ids and pad every split
print("Loading BERT tokenizer:")
input_ids_train, input_ids_valid, input_ids_test = [
    tokenizeData(sentences) for sentences in (train_data, valid_data, test_data)
]

print("Input ID's configured")


#Build the attention masks that go with each set of input ids
attention_masks_train, attention_masks_valid, attention_masks_test = [
    applyMasks(ids) for ids in (input_ids_train, input_ids_valid, input_ids_test)
]

print("Masks applied")


#Convert the training split into tensors
train_inputs = torch.tensor(input_ids_train)
train_masks = torch.tensor(attention_masks_train)
train_labels = torch.tensor(train_labels)

#Convert the validation split into tensors
validation_inputs = torch.tensor(input_ids_valid)
validation_masks = torch.tensor(attention_masks_valid)
validation_labels = torch.tensor(valid_labels)

print("Tensors created")


#Batch size for finetuning
batch_size = 32

#Training dataloader: batches are drawn with a random sampler
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(
    train_data,
    sampler=train_sampler,
    batch_size=batch_size,
)

#Validation dataloader: batches are read with a sequential sampler
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(
    validation_data,
    sampler=validation_sampler,
    batch_size=batch_size,
)


#Load BERT model
#num_labels=32 matches the 32 emotion classes (ids 0-31) produced by prepareLabels
print("Loading Model:")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=32)
model.to(device)

#Set parameters to pass to HuggingFace's Adam
#The model parameters are split into two groups by name: those whose name
#contains none of the no_decay substrings, and those whose name contains at
#least one of them
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.02},
    #NOTE(review): this no_decay group is given the same 0.02 rate as the
    #group above, so the split currently changes nothing - confirm whether
    #0.0 was intended here
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.02}
]

#Use BERTAdam, set learning rate
#NOTE(review): a warmup proportion is passed but no t_total (total number of
#training steps) - check the BertAdam documentation to confirm the warmup
#schedule is applied without it
optimizer = BertAdam(optimizer_grouped_parameters,
                     lr=2e-5,
                     warmup=.1)


In [None]:
###                ###
###   FINETUNING   ###
###                ###

#NOTE(review): t is never read in this cell; kept in case another cell uses it
t = []

#One dict of statistics per epoch (used to build the results table)
accuracy_info = []

#Loss of every training batch, in order (used for the loss plot)
train_loss_set = []

#Number of training epochs
epochs = 1

#For the number of epochs
for epoch in trange(epochs, desc="Epoch"):

  print("\n")

  ## TRAINING ##

  #Set model into training mode
  model.train()

  #Variables to track progress
  tr_loss = 0
  nb_tr_examples = 0
  nb_tr_steps = 0
  start_time = time.time()

  for step, batch in enumerate(train_dataloader):

    #Every 50 batches (but not on the first) print the elapsed time
    if step % 50 == 0 and not step == 0:
      elapsed = format_time(time.time() - start_time)
      print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

    #Move the batch to the device; every tensor (ids, masks, labels) is cast to int64
    batch = tuple(tensor.to(device, dtype=torch.int64) for tensor in batch)
    b_input_ids, b_input_mask, b_labels = batch

    #Clear the gradients left over from the previous step
    optimizer.zero_grad()

    #Forward pass: called with labels, the model's result is used as the loss
    loss = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
    batch_loss = loss.item()
    train_loss_set.append(batch_loss)

    #Backward pass
    loss.backward()

    #Update the parameters
    optimizer.step()

    #Update tracking variables
    tr_loss += batch_loss
    nb_tr_examples += b_input_ids.size(0)
    nb_tr_steps += 1

  #Mean loss per batch; the same value is printed and recorded below
  avg_train_loss = tr_loss/len(train_dataloader)

  #Print results from this epoch
  print("")
  print("Train loss: {}".format(avg_train_loss))
  time_taken = format_time(time.time() - start_time)
  print("Training epoch took:", time_taken)


  ## VALIDATION ##

  #Put model in evaluation mode
  model.eval()

  #Variables to track progress
  eval_loss = 0
  eval_correct = 0
  nb_eval_steps = 0
  nb_eval_examples = 0

  #Evaluate data for one epoch
  for batch in validation_dataloader:

    #Move the batch to the device
    batch = tuple(tensor.to(device, dtype=torch.int64) for tensor in batch)
    b_input_ids, b_input_mask, b_labels = batch

    #No gradients are needed for evaluation
    with torch.no_grad():
      #Forward pass and calculate logit predictions
      logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)

    #Move logits and labels to CPU
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()

    #flat_accuracy gives the fraction correct in this batch; weight it by the
    #batch size so a shorter final batch does not count as much as a full one
    examples_in_batch = len(label_ids)
    eval_correct += flat_accuracy(logits, label_ids) * examples_in_batch
    nb_eval_examples += examples_in_batch
    nb_eval_steps += 1

  #Accuracy over every validation example
  eval_accuracy = eval_correct/nb_eval_examples

  print("\nValidation Accuracy: {}".format(eval_accuracy))

  #Record all statistics from this epoch
  accuracy_info.append(
      {
          'epoch': epoch + 1,
          'Training Loss' : avg_train_loss,
          'Valid. Accur.' : eval_accuracy,
          'Training Time': time_taken,
      }
  )

#Finetuning finished
print("Completed")

In [None]:
#Plot the per-batch training loss on a single 15x8 figure
loss_fig, loss_ax = plt.subplots(figsize=(15,8))
loss_ax.set_title("Training loss")
loss_ax.set_xlabel("Batch")
loss_ax.set_ylabel("Loss")
loss_ax.plot(train_loss_set)
plt.show()

In [None]:
#Floats to three decimal places
#The fully qualified option name is used: the bare 'precision' shorthand
#depends on pattern matching against all pandas options and is not accepted
#by recent pandas versions
pd.set_option('display.precision', 3)

#Create a DataFrame from our training statistics
df_stats = pd.DataFrame(data=accuracy_info)

#Set row name to epoch
df_stats = df_stats.set_index('epoch')

#Display the table
df_stats

In [None]:
#Save tuned model to disk
#The whole model object is pickled, not just its weights
with open('finetuned_BERT_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)