In [None]:
!pip install transformers
!pip install torchmetrics
!pip install seqeval
!pip install evaluate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 25.2 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 101.2 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 68.8 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.25.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torchmetrics
  Downloading torchmetrics-0.11.0-py3-none-any.whl (512 kB)
[K     |████████████████████████████████

# **Data**

In [None]:
import json
import csv
import pandas as pd
import sklearn
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import transformers
from transformers import BertTokenizerFast
from transformers import DistilBertTokenizer, DistilBertForQuestionAnswering
import torch
from torch.utils.data import Dataset
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from sklearn.metrics import f1_score
from sklearn.preprocessing import MultiLabelBinarizer
from torch.utils.tensorboard import SummaryWriter
from torch.optim import AdamW
import numpy as np
from transformers import get_scheduler
import evaluate
from evaluate import load
from tqdm.auto import tqdm
from transformers import BertForQuestionAnswering
from transformers import BertTokenizer
import matplotlib.pyplot as plt
import scipy
from torch.utils.tensorboard import SummaryWriter
from datetime import datetime
import collections

In [None]:
# Load in json file and organize data into two dataframes for inputs and targets respectively
def get_data(file):
  """Read a QA json file into an inputs dataframe and a targets dataframe.

  Args:
    file: path of a json file whose top-level 'data' list holds items with
      'paragraphs'; each paragraph has 'utterances:' (speakers + utterance)
      and 'qas' (question + answers).

  Returns:
    (x_df, y_df): x_df has one row per question with columns 'context' and
    'question'; y_df has the matching 'start_pos' / 'end_pos' columns, which are
    CHARACTER offsets of the first answer inside the context string
    (end is exclusive), stored as torch tensors.

  Raises:
    ValueError: if an answer text does not occur in its context (str.index).
  """
  # Context manager so the file handle is always released (it was never closed before)
  with open(file, encoding='utf-8') as f:
    data = json.load(f)
  x_df = pd.DataFrame(columns = ['context','question'])
  y_df = pd.DataFrame(columns = ['start_pos','end_pos'])
  for dat in data.get('data'):
    for par in dat.get('paragraphs'):
      # get context
      context = ""
      # the trailing colon in 'utterances:' is the key as written in the data file
      for utts in par.get('utterances:'):
        # save string of format: "<SPEAKER> speaker <SPEAKER> words words words <SPEAKER> speaker <SPEAKER> words words ..." etc
        speakers = " ".join(utts.get('speakers'))
        if speakers == '#NOTE#':
          speakers = '<NOTE>'
        utterance = utts.get('utterance')
        #if necessary can remove speaker entirely
        joined = '<SPEAKER> ' + speakers + ' <SPEAKER> ' + utterance
        # utterances are concatenated without a separator between them
        context = context + joined

      # get question and answers corresponding to this context
      for qas in par.get('qas'):
        question = qas.get('question')
        # add input to this dataframe
        x_df.loc[len(x_df)] = [context, question]

        # use first possible answer as target for this question
        answer = qas.get('answers')[0]
        text = answer.get('answer_text')
        # first occurrence of the answer text in the context, as character offsets
        s_ind = context.index(text)
        e_ind = s_ind + len(text)

        # add target tuple to dataframe
        y_df.loc[len(y_df)] = [torch.tensor(s_ind),torch.tensor(e_ind)]
  return x_df,y_df

In [None]:
# Build the (inputs, targets) dataframe pair for the train, dev and test files, in that order
(x_train, y_train), (x_valid, y_valid), (x_test, y_test) = [
    get_data(split_file)
    for split_file in ('friendsqa_trn.json', 'friendsqa_dev.json', 'friendsqa_tst.json')
]

# Data Analysis

Plot is_speaker count distributions for each dataset split

In [None]:
# Plot counts of train targets that are marked as speakers
# NOTE(review): get_data() in this notebook only creates 'start_pos' and 'end_pos',
# so the 'is_speaker' column must be added to y_train elsewhere — confirm.
# sort_index() fixes the bar order by label; value_counts() alone orders by frequency.
trn_is_speaker_counts = y_train['is_speaker'].value_counts().sort_index()
print(trn_is_speaker_counts)
fig, ax = plt.subplots(figsize=(4, 4))
# Tick labels come from the index itself: hard-coding ['False','True'] mislabels the
# bars whenever True is the more frequent class and fails when only one class occurs.
trn_is_speaker_counts.plot(kind="bar", ax=ax, rot=0)
ax.set_xlabel('is_speaker')
ax.set_ylabel("Frequency")
ax.set_title('Training Targets is_speaker Frequency')
plt.show()

In [None]:
# Plot counts of validation targets that are marked as speakers
# NOTE(review): get_data() in this notebook only creates 'start_pos' and 'end_pos',
# so the 'is_speaker' column must be added to y_valid elsewhere — confirm.
# sort_index() fixes the bar order by label; value_counts() alone orders by frequency.
valid_is_speaker_counts = y_valid['is_speaker'].value_counts().sort_index()
print(valid_is_speaker_counts)
fig, ax = plt.subplots(figsize=(4, 4))
# Tick labels come from the index itself: hard-coding ['False','True'] mislabels the
# bars whenever True is the more frequent class and fails when only one class occurs.
valid_is_speaker_counts.plot(kind="bar", ax=ax, rot=0)
ax.set_xlabel('is_speaker')
ax.set_ylabel("Frequency")
ax.set_title('Validation Targets is_speaker Frequency')
plt.show()

In [None]:
# Plot counts of test targets that are marked as speakers
# NOTE(review): get_data() in this notebook only creates 'start_pos' and 'end_pos',
# so the 'is_speaker' column must be added to y_test elsewhere — confirm.
# sort_index() fixes the bar order by label; value_counts() alone orders by frequency.
tst_is_speaker_counts = y_test['is_speaker'].value_counts().sort_index()
print(tst_is_speaker_counts)

fig, ax = plt.subplots(figsize=(4, 4))
# Tick labels come from the index itself: hard-coding ['False','True'] mislabels the
# bars whenever True is the more frequent class and fails when only one class occurs.
tst_is_speaker_counts.plot(kind="bar", ax=ax, rot=0)
ax.set_xlabel('is_speaker')
ax.set_ylabel("Frequency")
ax.set_title('Test Targets is_speaker Frequency')
plt.show()

Plot context length distributions for each dataset split

In [None]:
# Get context lengths (number of whitespace-separated words) for training set
trn_word_counts = x_train['context'].str.split().str.len()
# Frequency table of lengths. The columns are named explicitly because the names that
# value_counts().reset_index() produces differ between pandas versions.
trn_context_lengths = trn_word_counts.value_counts().rename_axis('length').reset_index(name='frequency')

# Print length (number of words) of shortest context
print("Shortest context len: {}".format(trn_word_counts.min()))
# Print average length of context (over all rows, not over the set of distinct lengths)
print("Mean context len: {}".format(trn_word_counts.mean()))
# Print length of longest context
print("Longest context len: {}".format(trn_word_counts.max()))

# Plot context length distribution for training set.
# The figure is created before plotting so that figsize applies to the histogram
# (calling plt.figure afterwards only opened a second, empty figure).
plt.figure(figsize=(3, 9))
plt.hist(trn_word_counts,bins = 20)
plt.xlabel("Context Lengths")
plt.ylabel("Frequency")
plt.title('Training Context Length Frequencies')
plt.show()

In [None]:
# Get context lengths (number of whitespace-separated words) for validation set
valid_word_counts = x_valid['context'].str.split().str.len()
# Frequency table of lengths. The columns are named explicitly because the names that
# value_counts().reset_index() produces differ between pandas versions.
valid_context_lengths = valid_word_counts.value_counts().rename_axis('length').reset_index(name='frequency')

# Print length (number of words) of shortest context
print("Shortest context len: {}".format(valid_word_counts.min()))
# Print average length of context (over all rows, not over the set of distinct lengths)
print("Mean context len: {}".format(valid_word_counts.mean()))
# Print length of longest context
print("Longest context len: {}".format(valid_word_counts.max()))

# Plot context length distribution for validation set.
# The figure is created before plotting so that figsize applies to the histogram
# (calling plt.figure afterwards only opened a second, empty figure).
plt.figure(figsize=(3, 9))
plt.hist(valid_word_counts,bins = 20)
plt.xlabel("Context Lengths")
plt.ylabel("Frequency")
plt.title('Validation Context Length Frequencies')
plt.show()

In [None]:
# Get context lengths (number of whitespace-separated words) for test set
tst_word_counts = x_test['context'].str.split().str.len()
# Frequency table of lengths. The columns are named explicitly because the names that
# value_counts().reset_index() produces differ between pandas versions.
tst_context_lengths = tst_word_counts.value_counts().rename_axis('length').reset_index(name='frequency')

# Print length (number of words) of shortest context
print("Shortest context len: {}".format(tst_word_counts.min()))
# Print average length of context (over all rows, not over the set of distinct lengths)
print("Mean context len: {}".format(tst_word_counts.mean()))
# Print length of longest context
print("Longest context len: {}".format(tst_word_counts.max()))

# Plot context length distribution for test set.
# The figure is created before plotting so that figsize applies to the histogram
# (calling plt.figure afterwards only opened a second, empty figure).
plt.figure(figsize=(3, 9))
plt.hist(tst_word_counts,bins = 20)
plt.xlabel("Context Lengths")
plt.ylabel("Frequency")
plt.title('Test Context Length Frequencies')
plt.show()

Explore the range of context lengths in the SQuAD 1.1 training data to compare against the FriendsQA data

In [None]:
# Make dataframe of SQuAD training data
# Context manager so the file handle is released once the json is parsed
with open('train-v1.1.json', encoding='utf-8') as f:
  data = json.load(f)
# Collect every paragraph context first and build the frame once
# (growing a DataFrame row by row re-allocates on every insert).
squad_paragraph_contexts = [
    par.get('context')
    for dat in data.get('data')
    for par in dat.get('paragraphs')
]
squad_contexts = pd.DataFrame({'context': pd.Series(squad_paragraph_contexts, dtype=object)})
# number of paragraphs read
count = len(squad_contexts)

# Get context lengths (number of whitespace-separated words)
squad_word_counts = squad_contexts['context'].str.split().str.len()
# Frequency table of lengths, with explicit column names that do not depend on the pandas version
squad_context_lengths = squad_word_counts.value_counts().rename_axis('length').reset_index(name='frequency')

# Print length (number of words) of shortest context
print("Shortest context len: {}".format(squad_word_counts.min()))
# Print average length of context (over all paragraphs, not over the set of distinct lengths)
print("Mean context len: {}".format(squad_word_counts.mean()))
# Print length of longest context
print("Longest context len: {}".format(squad_word_counts.max()))

# BERT

In [None]:
class BERTBASEQA(nn.Module):
  """BERT encoder followed by a linear layer that scores each token as span start / end.

  Args:
    bert_type: name or path passed to transformers.BertModel.from_pretrained.
    hidden_size: size of the encoder's per-token output vectors (input of the linear head).
    num_labels: number of scores per token; forward() unpacks exactly two
      (start, end), so this is expected to be 2.
  """

  def __init__(self, bert_type, hidden_size, num_labels):
    super().__init__()
    self.bert_type = bert_type
    self.hidden_size = hidden_size
    self.num_labels = num_labels
    self.bert = transformers.BertModel.from_pretrained(self.bert_type)
    self.qa_outputs = nn.Linear(self.hidden_size, self.num_labels)

  def forward(self, ids, token_ids, attention_mask=None):
    """Return (start_logits, end_logits), each with one score per input token.

    Args:
      ids: token ids, passed to the encoder as input_ids.
      token_ids: segment ids, passed to the encoder as token_type_ids.
      attention_mask: optional mask forwarded to the encoder so padded positions
        can be excluded from attention. Defaults to None, which keeps the previous
        behaviour (no mask is supplied by this module).
    """
    output = self.bert(
                      input_ids = ids,
                      token_type_ids = token_ids,
                      attention_mask = attention_mask
                      )

    sequence_output = output[0]   #(None, seq_len, hidden_size)
    logits = self.qa_outputs(sequence_output) #(None, seq_len, hidden_size)*(hidden_size, 2)=(None, seq_len, 2)
    # split the last axis into its two columns, then drop that axis
    start_logits, end_logits = logits.split(1, dim=-1)    #(None, seq_len, 1), (None, seq_len, 1)
    start_logits = start_logits.squeeze(-1)  #(None, seq_len)
    end_logits = end_logits.squeeze(-1)    #(None, seq_len)

    return (start_logits, end_logits,)

In [None]:
def loss_func(out, s_target, e_target):
  """Sum of the cross-entropy losses for the start and the end position.

  Args:
    out: pair whose first element holds the start logits and second the end logits.
    s_target: class indices of the true start positions.
    e_target: class indices of the true end positions.

  Returns:
    Scalar tensor: start loss plus end loss (each mean-reduced).
  """
  start_logits, end_logits = out[0], out[1]
  return F.cross_entropy(start_logits, s_target) + F.cross_entropy(end_logits, e_target)

Dataloader

In [None]:
class BertDataset(Dataset):
  """Encodes (question, context, answer text) triples for extractive QA.

  Each item is a dict with fixed-length 'ids' and 'token_type_ids' tensors plus the
  token-level 'start_pos' / 'end_pos' (inclusive) of the answer inside 'ids'.
  When the answer tokens are not found in the encoded context, both positions are 0.

  Args:
    tokenizer: object with `encode(...)` and `sep_token_id` (e.g. a BERT tokenizer).
    context, question, text: indexable collections of strings, aligned by index.
    max_length: length every returned 'ids' / 'token_type_ids' tensor is padded or cut to.
  """

  # Longest question+context encoding requested from the tokenizer (value used by the
  # original pipeline). It is additionally capped by max_length so that an answer
  # position can never point past the returned 'ids'.
  ENCODE_MAX_LENGTH = 500

  def __init__(self, tokenizer, context, question, max_length, text):
    self.context = context
    self.question = question
    self.text = text
    self.tokenizer = tokenizer
    self.max_length = max_length

  def __len__(self):
    return len(self.context)

  def __getitem__(self, idx):
    context_ = self.context[idx]
    question_ = self.question[idx]
    text_ = self.text[idx]

    #encoding
    encode_len = min(self.max_length, self.ENCODE_MAX_LENGTH)
    input_ids = self.tokenizer.encode(question_, context_,padding=True,truncation=True,max_length=encode_len, add_special_tokens = True)
    # strip the leading/trailing special tokens so only the answer's own tokens remain
    answer_ids = self.tokenizer.encode(text_,padding=True,truncation=True,max_length=encode_len, add_special_tokens = True)[1:-1]

    # everything up to and including the first separator is the question segment (type 0)
    first_sep = input_ids.index(self.tokenizer.sep_token_id)
    token_type_ids = [0]*(first_sep + 1) + [1]*(len(input_ids) - first_sep - 1)

    #calculating start and end position of answer in input_ids
    # The search starts after the first separator: the answer has to be located in the
    # context, and a scan from index 0 could match the same words inside the question.
    s_pos, e_pos = 0, 0
    answer_len = len(answer_ids)
    if answer_len > 0:  # an empty answer would otherwise "match" everywhere
      for i in range(first_sep + 1, len(input_ids) - answer_len + 1):
        if input_ids[i: i+answer_len] == answer_ids:
          s_pos = i
          e_pos = i + answer_len - 1
          break

    assert s_pos < len(input_ids) and e_pos < len(input_ids) and s_pos <= e_pos

    # pad (or cut) both sequences to exactly max_length; padding uses id 0 and segment 0
    pad_len = max(self.max_length - len(input_ids), 0)
    ids = (input_ids + [0]*pad_len)[:self.max_length]
    token_ids = (token_type_ids + [0]*pad_len)[:self.max_length]

    return {'ids': torch.tensor(ids, dtype = torch.long),
            'token_type_ids': torch.tensor(token_ids, dtype = torch.long),
            'start_pos': torch.tensor(s_pos, dtype = torch.long),
            'end_pos': torch.tensor(e_pos, dtype = torch.long)}

Training BERT Baseline Model

In [None]:
def train(dataloader, model, optimizer, device, max_grad_norm, scheduler=None):
  """Run one pass over `dataloader`, updating `model` after every batch.

  Args:
    dataloader: yields dicts with 'ids', 'token_type_ids', 'start_pos' and 'end_pos'.
    model: called as model(ids, token_ids); its output is fed to loss_func.
    optimizer: optimizer stepped once per batch.
    device: device the batch tensors are moved to.
    max_grad_norm: accepted for interface compatibility; gradient clipping is
      currently not applied (the clipping call was disabled in the original code).
    scheduler: optional learning-rate scheduler, stepped once per batch.

  Side effects:
    Appends each batch loss (as a Python float) to the module-level
    `all_train_loss` list and prints the loss every 100 batches.
  """
  model.train()
  # `tqdm` is the tqdm.auto progress bar imported at the top of the notebook
  # (the previously used `notebook.tqdm` name is never imported).
  for bi, d in enumerate(tqdm(dataloader, desc="Iteration")):
    ids = d['ids'].to(device, dtype = torch.long)
    token_ids = d['token_type_ids'].to(device, dtype = torch.long)
    start_pos = d['start_pos'].to(device, dtype = torch.long)
    end_pos = d['end_pos'].to(device, dtype = torch.long)

    optimizer.zero_grad()
    start_and_end_scores = model(ids, token_ids)
    loss = loss_func(start_and_end_scores, start_pos, end_pos)
    loss.backward()
    optimizer.step()
    if scheduler is not None:
      scheduler.step()
    if bi%100==0:
      print (f"bi: {bi}, loss: {loss}")
    # store a plain float: keeping the tensor would keep its autograd graph alive
    all_train_loss.append(loss.item())

In [None]:
def eval(dataloader, model, device):
  """Evaluate `model` on `dataloader` and return loss plus predicted span positions.

  Note: this function shadows Python's built-in `eval` inside the notebook.

  Args:
    dataloader: yields dicts with 'ids', 'token_type_ids', 'start_pos' and 'end_pos'.
    model: called as model(ids, token_ids); returns (start scores, end scores).
    device: device the batch tensors are moved to.

  Returns:
    (eval_loss, pred_start, pred_end): mean of the per-batch losses, and numpy arrays
    with the argmax start / end index for every example, in dataloader order.

  Raises:
    ValueError: if the dataloader yields no batches.

  Side effects:
    Appends the mean loss to the module-level `all_val_loss` list.
  """
  model.eval()
  start_chunks = []
  end_chunks = []
  eval_loss = 0.0
  eval_steps = 0

  for d in dataloader:
    ids = d['ids'].to(device, dtype = torch.long)
    token_ids = d['token_type_ids'].to(device, dtype = torch.long)
    start_pos = d['start_pos'].to(device, dtype = torch.long)
    end_pos = d['end_pos'].to(device, dtype = torch.long)

    with torch.no_grad():
      start_and_end_scores = model(ids, token_ids)
      loss = loss_func(start_and_end_scores, start_pos, end_pos)
      eval_loss += loss.mean().item()

    eval_steps += 1
    # collect per-batch scores and join them once at the end; repeatedly calling
    # np.append re-copies everything gathered so far on every batch
    start_chunks.append(start_and_end_scores[0].detach().cpu().numpy())
    end_chunks.append(start_and_end_scores[1].detach().cpu().numpy())

  if eval_steps == 0:
    raise ValueError("eval() received a dataloader that yields no batches")

  eval_loss = eval_loss/eval_steps
  pred_start = np.argmax(np.concatenate(start_chunks, axis=0), axis=1)
  pred_end = np.argmax(np.concatenate(end_chunks, axis=0), axis=1)
  all_val_loss.append(eval_loss)
  return eval_loss, pred_start, pred_end

In [None]:
# Hyperparameters for the BERT baseline
MAX_SEQ_LENGTH = 512
TRAIN_BATCH_SIZE = 8
EVAL_BATCH_SIZE = 8
LEARNING_RATE = 1e-5
NUM_TRAIN_EPOCHS = 2
BERT_TYPE = "bert-base-uncased"
max_grad_norm = 1.0

# Loss histories that train() and eval() append to. They are created here so those
# functions work on a fresh kernel; re-running this cell resets the histories.
all_train_loss = []
all_val_loss = []

In [None]:
# Tokenizer matching the pretrained checkpoint named by BERT_TYPE
tokenizer = transformers.BertTokenizer.from_pretrained(BERT_TYPE)
# NOTE(review): `train_data` (with 'context', 'question' and 'text' columns) is not
# defined in the visible part of this notebook — confirm where it is created.
train_dataset = BertDataset(
    tokenizer = tokenizer,
    context = train_data['context'],
    question = train_data['question'],
    max_length = MAX_SEQ_LENGTH,
    text = train_data['text']
)

# Training batches are reshuffled every epoch
train_dataloader = DataLoader(train_dataset, batch_size = TRAIN_BATCH_SIZE, shuffle=True)

In [None]:
# NOTE(review): `valid_data` (with 'context', 'question' and 'text' columns) is not
# defined in the visible part of this notebook — confirm where it is created.
eval_dataset = BertDataset(
    tokenizer = tokenizer,
    context = valid_data['context'],
    question = valid_data['question'],
    max_length = MAX_SEQ_LENGTH,
    text = valid_data['text']
) 

# No shuffling: predictions are later matched to valid_data rows by position
eval_dataloader = DataLoader(eval_dataset, batch_size = EVAL_BATCH_SIZE, shuffle=False)

In [None]:
# Use the GPU when one is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

# 768 is the hidden size expected for BERT_TYPE; 2 scores per token (start, end)
model = BERTBASEQA(BERT_TYPE, 768, 2).to(device)

# `AdamW` is imported from torch.optim in this notebook, and that class has no
# `correct_bias` argument (passing it raises TypeError), so it is not passed here.
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)

# Kept for reference only: the constant schedule below does not take a total step count
NUM_TRAIN_STEPS = int(len(train_dataset)/TRAIN_BATCH_SIZE * NUM_TRAIN_EPOCHS)
# Learning rate ramps up during the first 500 optimizer steps, then stays constant
scheduler = transformers.get_constant_schedule_with_warmup(
                optimizer,
                num_warmup_steps=500,
                last_epoch=-1)

In [None]:
#training
# `trange` is never imported in this notebook; wrapping range() in the already imported
# tqdm gives the same per-epoch progress bar.
for epoch in tqdm(range(NUM_TRAIN_EPOCHS)):
  train(train_dataloader, model, optimizer, device, max_grad_norm, scheduler)

Evaluate Model

In [None]:
# Validation pass: res holds (mean loss, predicted start indices, predicted end indices)
res = eval(eval_dataloader, model, device)
validation_loss = res[0]
print(validation_loss)

In [None]:
# Re-encode the validation set to turn predicted indices back into text and to
# recover the gold token positions of every answer.
context_ = valid_data['context']
question_ = valid_data['question']
text_ = valid_data['text']
pred_start = res[1]
pred_end = res[2]
res_text_ = []
act_start = []
act_end = []


input_ids_list = [
    tokenizer.encode(q, c, padding=True,truncation=True,max_length=500, add_special_tokens = True)
    for q, c in zip(question_, context_)
]
answer_ids_list = [
    tokenizer.encode(t,padding=True,truncation=True,max_length=500, add_special_tokens = True)
    for t in text_
]

for i in range(len(input_ids_list)):
  # predicted end index is inclusive, hence the +1
  res_text_.append(tokenizer.decode(input_ids_list[i][pred_start[i]:pred_end[i]+1]))

  # answer tokens without the leading/trailing special tokens
  answer_tokens = answer_ids_list[i][1:-1]
  # The gold span is searched only after the first separator, i.e. inside the context;
  # scanning from index 0 could match the same words inside the question instead.
  context_begin = input_ids_list[i].index(tokenizer.sep_token_id) + 1
  s_pos, e_pos = 0, 0
  if answer_tokens:  # an empty answer would otherwise "match" everywhere
    for j in range(context_begin, len(input_ids_list[i]) - len(answer_tokens) + 1):
      if input_ids_list[i][j: j+len(answer_tokens)] == answer_tokens:
        s_pos = j
        e_pos = j + len(answer_tokens) - 1
        break
  # (0, 0) is recorded when the answer is not present in the encoded context
  act_start.append(s_pos)
  act_end.append(e_pos)

In [None]:
# Attach gold positions and predictions to the validation frame, one column at a time
for column_name, column_values in (
    ('start_pos', act_start),
    ('end_pos', act_end),
    ('predicted_text', res_text_),
    ('predicted_start_pos', pred_start),
    ('predicted_end_pos', pred_end),
):
  valid_data[column_name] = column_values

In [None]:
# Gold vs. predicted answers for the first 20 validation rows
show_columns = ['text', 'predicted_text', 'start_pos', 'end_pos', 'predicted_start_pos', 'predicted_end_pos']
valid_data.loc[:, show_columns].head(20)

In [None]:
# A prediction counts as incorrect when its span is inverted (cond1), starts after
# the gold span ends (cond2), or ends before the gold span starts (cond3).
cond1 = valid_data['predicted_start_pos'].gt(valid_data['predicted_end_pos'])
cond2 = valid_data['end_pos'].lt(valid_data['predicted_start_pos'])
cond3 = valid_data['start_pos'].gt(valid_data['predicted_end_pos'])

incorrect_mask = cond1 | cond2 | cond3
incorrect_pred = len(valid_data[incorrect_mask])
incorrect_pred

In [None]:
# Share of validation rows whose predicted span overlaps the gold span, in percent
t = len(valid_data)
accuracy_pct = (t - incorrect_pred)*100/t
print(f"accuracy = {accuracy_pct}")

# Finetuning BERT for QA

Format data for this model

In [None]:
# Dataset class
class BERTDataset(Dataset):
  """Thin Dataset wrapper around a sequence of 4-field records.

  Each record is read by position: field 0 is returned as the input ids, field 1 as
  the attention mask, field 2 as the start target and field 3 as the end target.
  """

  def __init__(self, data):
    self.data = data

  def __len__(self):
    return len(self.data)

  def __getitem__(self,i):
    record = self.data[i]
    # returned as a list in the order [input, attention, start, end]
    return [record[0], record[1], record[2], record[3]]

In [None]:
# Get lists of contexts for each split
context_trn = list(x_train['context'])
context_valid = list(x_valid['context'])
context_tst = list(x_test['context'])

# Get lists of questions for each split
question_trn = list(x_train['question'])
question_valid = list(x_valid['question'])
question_tst = list(x_test['question'])

# Encode data using the BERT tokenizer (context is the first sequence, question the second)
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

train_encodings = tokenizer(context_trn, question_trn, truncation=True, padding=True)
valid_encodings = tokenizer(context_valid, question_valid, truncation=True, padding=True)
test_encodings = tokenizer(context_tst, question_tst, truncation=True, padding=True)

def char_span_to_token_span(encodings, char_starts, char_ends):
  """Convert character-level answer spans into token-level spans.

  get_data() stores answer positions as character offsets into the context string,
  whereas the start/end labels used for training and for the `[start:end]` decoding
  slices index tokens of the encoded input.

  Args:
    encodings: fast-tokenizer output for the (context, question) pairs.
    char_starts: per-example character offset where the answer starts.
    char_ends: per-example character offset just past the answer's last character.

  Returns:
    (token_starts, token_ends): lists of 0-d long tensors. The start is inclusive and
    the end exclusive. Answers that were truncated away or cannot be aligned to a
    token are mapped to (0, 0).
  """
  token_starts, token_ends = [], []
  for row, (char_start, char_end) in enumerate(zip(char_starts, char_ends)):
    char_start, char_end = int(char_start), int(char_end)
    first_tok = last_tok = None
    if char_end > char_start:
      # sequence_index=0: the context is the first sequence of every encoded pair
      first_tok = encodings.char_to_token(row, char_start, sequence_index=0)
      last_tok = encodings.char_to_token(row, char_end - 1, sequence_index=0)
    if first_tok is None or last_tok is None:
      first_tok, end_tok = 0, 0
    else:
      end_tok = last_tok + 1
    token_starts.append(torch.tensor(first_tok))
    token_ends.append(torch.tensor(end_tok))
  return token_starts, token_ends

# Get lists of start and end positions, expressed as token indices
train_start, train_end = char_span_to_token_span(train_encodings, y_train['start_pos'], y_train['end_pos'])
valid_start, valid_end = char_span_to_token_span(valid_encodings, y_valid['start_pos'], y_valid['end_pos'])
test_start, test_end = char_span_to_token_span(test_encodings, y_test['start_pos'], y_test['end_pos'])

# Get data for training from tokenizer encodings
train_inputs = train_encodings['input_ids']
train_sents = train_encodings['token_type_ids']
train_attention = train_encodings['attention_mask']
train_tokens = [tokenizer.convert_ids_to_tokens(i) for i in train_inputs]

# Get data for validation from tokenizer encodings
valid_inputs = valid_encodings['input_ids']
valid_sents = valid_encodings['token_type_ids']
valid_attention = valid_encodings['attention_mask']
valid_tokens = [tokenizer.convert_ids_to_tokens(i) for i in valid_inputs]

# Get data for testing from tokenizer encodings
test_inputs = test_encodings['input_ids']
test_sents = test_encodings['token_type_ids']
test_attention = test_encodings['attention_mask']
test_tokens = [tokenizer.convert_ids_to_tokens(i) for i in test_inputs]

KeyboardInterrupt: ignored

In [None]:
# Make datasets for each split using one tenth of the data
def _first_tenth(sequence):
  """Return the leading tenth (rounded down) of a sequence."""
  return sequence[0:int(len(sequence)/10)]

def _build_split_dataset(inputs, attention, starts, ends):
  """Zip the leading tenth of each field into per-example records for BERTDataset."""
  return BERTDataset(list(zip(
      torch.tensor(_first_tenth(inputs)),
      torch.tensor(_first_tenth(attention)),
      torch.stack(_first_tenth(starts)),
      torch.stack(_first_tenth(ends)),
  )))

train_dataset = _build_split_dataset(train_inputs, train_attention, train_start, train_end)
valid_dataset = _build_split_dataset(valid_inputs, valid_attention, valid_start, valid_end)
test_dataset = _build_split_dataset(test_inputs, test_attention, test_start, test_end)

# Create dataloaders (only the training split is shuffled)
train_dataloader = DataLoader(dataset=train_dataset, batch_size=64, shuffle=True)
valid_dataloader = DataLoader(dataset=valid_dataset, batch_size=64, shuffle=False)
test_dataloader = DataLoader(dataset=test_dataset, batch_size=64, shuffle=False)

Define model and training parameters

In [None]:
# Model: pretrained BERT with a question-answering head, loaded from the named checkpoint
model_checkpoint = 'bert-base-uncased'
model = BertForQuestionAnswering.from_pretrained(model_checkpoint)

In [None]:
# Init parameters for training
num_epochs = 10
optimizer = AdamW(model.parameters(), lr=2e-5)

# one scheduler step per training batch, over all epochs
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

# Use the GPU when one is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
progress_bar = tqdm(range(num_training_steps))

Define functions for evaluation metrics

In [None]:
# Calculate F1 Score
def compute_f1(predictions, targets):
  """Average token-level F1 between predicted and target answer strings.

  Tokens are whitespace-separated and compared case-sensitively. The overlap of a
  pair is the multiset intersection of its tokens, so a token repeated in the
  prediction is credited at most as often as it occurs in the target (which keeps
  precision, recall and F1 within [0, 1]).

  Args:
    predictions: list of predicted answer strings.
    targets: list of target answer strings, aligned with `predictions`.

  Returns:
    Mean F1 over all pairs; 0.0 when `predictions` is empty.
  """
  if len(predictions) == 0:
    return 0.0

  total_f1 = 0
  for i in range(len(predictions)):
    pred_tokens = predictions[i].split()
    targ_tokens = targets[i].split()
    overlap = sum((collections.Counter(pred_tokens) & collections.Counter(targ_tokens)).values())

    # if no tokens are the same then f1 = 0 for this pair
    if overlap == 0:
      continue

    precision = overlap / len(pred_tokens)
    recall = overlap / len(targ_tokens)
    total_f1 += 2 * (precision * recall) / (precision + recall)
  return total_f1 / len(predictions)

In [None]:
# Compute exact match, ignoring case
def compute_em(predictions,targets):
  """Fraction of predictions equal to their target after lower-casing, rounded to 2 decimals."""
  matches = sum(
      1
      for i in range(len(predictions))
      if predictions[i].lower() == targets[i].lower()
  )
  return round(matches / len(predictions), 2)

Training

In [None]:
# Fine-tune the QA model: one optimizer and scheduler step per batch
model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        # batch fields, by position: input ids, attention mask, start target, end target
        inputs, attention, start_positions, end_positions = (
            batch[position].to(device) for position in range(4)
        )

        outputs = model(
            inputs,
            attention_mask=attention,
            start_positions=start_positions,
            end_positions=end_positions,
        )

        # Compute loss, gradients and update model
        loss = outputs.loss
        print("loss: {}".format(loss))
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        # clear gradients so they do not accumulate into the next batch
        optimizer.zero_grad()
        progress_bar.update(1)


Validation

In [None]:
# Init variables for keeping track of f1 and em during validation
v_count = 0
v_avg_f1 = 0
v_avg_em = 0
f1_val = []

model.eval()
for batch in valid_dataloader:
    with torch.no_grad():
        v_count += 1
        inputs = batch[0].to(device)
        attention = batch[1].to(device)
        start_positions = batch[2].to(device)
        end_positions = batch[3].to(device)

        # Get model predictions
        outputs = model(inputs,attention_mask = attention, start_positions=start_positions, end_positions=end_positions)

        # Get predicted start and end indices from model outputs
        start_logits = outputs.start_logits
        start_preds = torch.argmax(start_logits,dim=1)
        end_logits = outputs.end_logits
        end_preds = torch.argmax(end_logits,dim=1)

        # Convert predicted indices to answer spans.
        # Each span is cut from its own example (inputs[i]); slicing inputs[0] decoded
        # every span from the first example of the batch.
        predictions = []
        for i in range(len(start_preds)):
          answer_start = start_preds[i]
          answer_end = end_preds[i]
          answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs[i][answer_start:answer_end]))
          predictions.append(answer)

        # Convert target indices to answer spans (again per example)
        targets = []
        for i in range(len(start_positions)):
          answer_start = start_positions[i]
          answer_end = end_positions[i]
          answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs[i][answer_start:answer_end]))
          targets.append(answer)

        # Calculate f1
        v_f1 = compute_f1(predictions, targets)
        f1_val.append(v_f1)
        v_avg_f1 += v_f1
        print("F1 for this batch: {}".format(v_f1))
        print('f1 validation', f1_val)

        # Calculate exact match
        v_em = compute_em(predictions, targets)
        print("EM for this batch: {}".format(v_em))
        v_avg_em += v_em

# Print average F1 score and exact match.
# These are means of per-batch scores, so a smaller final batch weighs as much as a full one.
print("Average Validation F1: {}".format(v_avg_f1/v_count))
print("Average Validation EM: {}".format(v_avg_em/v_count))

Testing

In [None]:
# Init variables for keeping track of f1 and em during testing
t_count = 0
t_avg_f1 = 0
t_avg_em = 0
f1_test = []

model.eval()
for batch in test_dataloader:
    with torch.no_grad():
        t_count += 1
        inputs = batch[0].to(device)
        attention = batch[1].to(device)
        start_positions = batch[2].to(device)
        end_positions = batch[3].to(device)

        # Get model predictions
        outputs = model(inputs,attention_mask = attention, start_positions=start_positions, end_positions=end_positions)

        # Get predicted start and end indices from model outputs
        start_logits = outputs.start_logits
        start_preds = torch.argmax(start_logits,dim=-1)
        end_logits = outputs.end_logits
        end_preds = torch.argmax(end_logits,dim=-1)

        # Convert predicted indices to answer spans.
        # Each span is cut from its own example (inputs[i]); slicing inputs[0] decoded
        # every span from the first example of the batch.
        predictions = []
        for i in range(len(start_preds)):
          answer_start = start_preds[i]
          answer_end = end_preds[i]
          answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs[i][answer_start:answer_end]))
          predictions.append(answer)

        # Convert target indices to answer spans (again per example)
        targets = []
        for i in range(len(start_positions)):
          answer_start = start_positions[i]
          answer_end = end_positions[i]
          answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs[i][answer_start:answer_end]))
          targets.append(answer)

        # Calculate f1
        t_f1 = compute_f1(predictions, targets)
        f1_test.append(t_f1)
        t_avg_f1 += t_f1
        print("F1 for this batch: {}".format(t_f1))
        print('f1 test', f1_test)

        # Calculate exact match
        t_em = compute_em(predictions, targets)
        print("EM for this batch: {}".format(t_em))
        t_avg_em += t_em

# Print average F1 score and exact match.
# These are means of per-batch scores, so a smaller final batch weighs as much as a full one.
print("Average Test F1: {}".format(t_avg_f1/t_count))
print("Average Test EM: {}".format(t_avg_em/t_count))


In [None]:
# Write per-batch f1 scores from validation and testing into a csv, one row per batch:
# (validation f1, test f1). zip stops at the shorter list, so surplus batches of the
# longer split are not written.
# csv.writer needs a text-mode file ('wb' makes writerows raise in Python 3);
# newline='' lets the csv module control line endings itself.
with open('some.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerows(zip(f1_val, f1_test))

# Bi-LSTM

Define Bi-LSTM model

In [None]:
# Simple 2-Layer BiLSTM
class BiLSTM(nn.Module):
  """Embedding -> 2-layer bidirectional LSTM -> linear layer over the flattened sequence.

  Args:
    n_tokens: vocabulary size of the embedding table (index 0 is the padding index).
    emb_dim: embedding dimension.
    h_dim: LSTM hidden size per direction.
    l_in: sequence length of the inputs; the linear layer expects h_dim*2*l_in features,
      so every input must have exactly this many time steps.
    out_dim: number of output values per example.
  """

  def __init__(self, n_tokens, emb_dim, h_dim, l_in, out_dim):
    super().__init__()
    self.embedding = nn.Embedding(n_tokens, emb_dim, padding_idx=0)
    self.rnn = nn.LSTM(emb_dim,h_dim,bidirectional = True,batch_first=True,num_layers=2,dropout=0.5)
    self.fc = nn.Linear(h_dim*2*l_in, out_dim)

  def forward(self, x):
    """Map a batch of token-index sequences to `out_dim` values per example."""
    embedded = self.embedding(x)
    # training=self.training makes dropout a no-op after model.eval(); F.dropout
    # otherwise defaults to training=True and stays active at inference time.
    # The dropped-out embeddings are what the LSTM consumes (previously the dropout
    # result was computed and then discarded).
    embedded = F.dropout(embedded, p=0.2, training=self.training)
    out, _ = self.rnn(embedded)
    # merge the time and feature axes so the linear layer sees the whole sequence
    out = torch.flatten(out,start_dim=1,end_dim=2)
    out = self.fc(out)
    out = F.dropout(out, p=0.5, training=self.training)
    return out

Format data for this model

In [None]:
#Dataset class
class LSTMDataset(Dataset):
  """Thin Dataset wrapper around a sequence of (inputs, labels) records."""

  def __init__(self, data):
    # records are stored as given; no conversion happens here
    self.data = data

  def __len__(self):
    return len(self.data)

  def __getitem__(self,i):
    record = self.data[i]
    # returned as a list in the order [inputs, labels]
    return [record[0], record[1]]

In [None]:
# Create inputs for bi-LSTM model: context and question joined by a ' <sep> ' marker
# (a different format than for BERT, since the BERT tokenizer is not used here)
for split_frame in (x_train, x_valid, x_test):
  split_frame['inputs'] = split_frame['context'] + ' <sep> ' + split_frame['question']

x_train_lst = x_train['inputs'].tolist()
x_valid_lst = x_valid['inputs'].tolist()
x_test_lst = x_test['inputs'].tolist()

# Get lists of contexts for each split
context_trn = x_train['context'].tolist()
context_valid = x_valid['context'].tolist()
context_tst = x_test['context'].tolist()

# Get lists of start positions
train_start = list(y_train['start_pos'])
valid_start = list(y_valid['start_pos'])
test_start = list(y_test['start_pos'])

# Get lists of end positions
train_end = list(y_train['end_pos'])
valid_end = list(y_valid['end_pos'])
test_end = list(y_test['end_pos'])

# Use vectorizer to get vocab keys (fitted on the training inputs only)
vectorizer = CountVectorizer()
vectorizer.fit(x_train_lst)

# Make vocab: 'none' -> 0, then the vectorizer's terms numbered from 1 in their stored
# order, and '<unk>' one past the last term index
vocab = {'none': 0}
vocab.update(
    (term, position)
    for position, term in enumerate(vectorizer.vocabulary_.keys(), start=1)
)
vocab['<unk>'] = len(vectorizer.vocabulary_) + 1
vocab_size = len(vocab)

In [None]:
# Return padded utterance with each word encoded as the correct index from the vocab
# Default: <unk>
def encode_tokens(x,vocab,max_utterance_len):
  """Encode a whitespace-tokenised string as a fixed-length tensor of vocab indices.

  Args:
    x: input string; tokens are obtained with str.split().
    vocab: mapping from token to index, containing a '<unk>' entry for unknown tokens.
    max_utterance_len: length of the result; longer inputs are cut, shorter ones are
      left as 0 in the remaining positions.

  Returns:
    LongTensor of shape (max_utterance_len,).
  """
  unk_index = vocab.get('<unk>')
  # keep at most max_utterance_len tokens
  kept_tokens = x.split()[:max_utterance_len]
  encoded = torch.zeros(max_utterance_len)
  for position, token in enumerate(kept_tokens):
    encoded[position] = vocab.get(token, unk_index)
  return encoded.type(torch.LongTensor)

In [None]:
# Set max length (in tokens) of the encoded inputs to 1500
max_len = 1500

# Encode input for bi-LSTM model, one list of index tensors per split
encoded_train, encoded_valid, encoded_test = (
    [encode_tokens(x,vocab,max_len) for x in split_inputs]
    for split_inputs in (x_train_lst, x_valid_lst, x_test_lst)
)

In [None]:
# Combine the start and end targets of each split into one tensor with a (start, end)
# pair per example, for training, validation and testing (all examples are used here)
def _pair_targets(starts, ends):
  """Stack the start and end position lists side by side along dim 1."""
  return torch.stack((torch.stack(starts), torch.stack(ends)), dim=1)

train_targets = _pair_targets(train_start, train_end)
valid_targets = _pair_targets(valid_start, valid_end)
test_targets = _pair_targets(test_start, test_end)

In [None]:
# Make datasets for each split: every record pairs an encoded input with its targets
lstm_train_dataset, lstm_valid_dataset, lstm_test_dataset = (
    LSTMDataset(list(zip(encoded_split, split_targets)))
    for encoded_split, split_targets in (
        (encoded_train, train_targets),
        (encoded_valid, valid_targets),
        (encoded_test, test_targets),
    )
)

# Create dataloaders (only the training split is shuffled)
lstm_train_dataloader = DataLoader(dataset=lstm_train_dataset, batch_size=64, shuffle=True)
lstm_valid_dataloader = DataLoader(dataset=lstm_valid_dataset, batch_size=64, shuffle=False)
lstm_test_dataloader = DataLoader(dataset=lstm_test_dataset, batch_size=64, shuffle=False)

Define functions for evaluation metrics

In [None]:
# Calculate F1 Score
# NOTE: compute_f1 is also defined in the BERT section of this notebook; this later
# definition replaces that one once the cell runs.
def compute_f1(predictions, targets):
  """Average token-level F1 between predicted and target answer strings.

  Tokens are whitespace-separated and compared case-sensitively. The overlap of a
  pair is the multiset intersection of its tokens, so a token repeated in the
  prediction is credited at most as often as it occurs in the target (which keeps
  precision, recall and F1 within [0, 1]).

  Args:
    predictions: list of predicted answer strings.
    targets: list of target answer strings, aligned with `predictions`.

  Returns:
    Mean F1 over all pairs; 0.0 when `predictions` is empty.
  """
  if len(predictions) == 0:
    return 0.0

  total_f1 = 0
  for i in range(len(predictions)):
    pred_tokens = predictions[i].split()
    targ_tokens = targets[i].split()
    overlap = sum((collections.Counter(pred_tokens) & collections.Counter(targ_tokens)).values())

    # if no tokens are the same then f1 = 0 for this pair
    if overlap == 0:
      continue

    precision = overlap / len(pred_tokens)
    recall = overlap / len(targ_tokens)
    total_f1 += 2 * (precision * recall) / (precision + recall)
  return total_f1 / len(predictions)

In [None]:
# Compute exact match, ignoring case
def compute_em(predictions,targets):
  """Fraction of predictions that equal their target string, ignoring case.

  Args:
    predictions: sequence of predicted answer strings.
    targets: sequence of target answer strings, aligned with predictions.

  Returns:
    Exact-match rate rounded to 2 decimal places; 0.0 when predictions is empty.
  """
  # Nothing to score: avoid dividing by zero
  if len(predictions) == 0:
    return 0.0

  num_matches = sum(
    1
    for i in range(len(predictions))
    if predictions[i].lower() == targets[i].lower()
  )
  return round(num_matches / len(predictions), 2)

Define Training Functions

In [None]:
# Train loop for one epoch
def train_one_epoch(model,epoch_ind,loss_fn,optimizer,dataloader=None):
    """Run one optimisation pass over the training batches.

    Args:
        model: module to train; expected to already live on the device chosen
            below (CUDA when available, otherwise CPU).
        epoch_ind: 1-based epoch number supplied by the caller; not used here.
        loss_fn: callable taking (predictions, targets) and returning a scalar loss.
        optimizer: optimizer wrapping the model's parameters.
        dataloader: iterable of (inputs, targets) batches. Defaults to the
            notebook-level lstm_train_dataloader.

    Returns:
        Mean batch loss for the epoch as a float; 0.0 if there were no batches.
    """
    if dataloader is None:
        dataloader = lstm_train_dataloader

    # Enable use of GPU (loop-invariant, so resolved once per epoch)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # total_loss tracks loss of this epoch
    total_loss = 0.0
    num_batches = 0
    for inputs, targets in dataloader:
        inputs = inputs.to(device)
        # Targets are cast to float32 before being passed to the loss function
        targets = targets.to(device).to(torch.float32)

        # Get model's predictions
        predictions = model(inputs)

        # Compute loss, gradients and update model
        optimizer.zero_grad()

        loss = loss_fn(predictions,targets)
        # NOTE(review): retain_graph=True keeps each batch's autograd graph alive after
        # backward. It is only needed if the model carries graph state across forward
        # calls -- confirm against the model definition and drop it if not.
        loss.backward(retain_graph=True)
        optimizer.step()

        # Record loss of every batch
        total_loss += loss.item()
        num_batches += 1

    # An empty loader has no loss to average
    if num_batches == 0:
        return 0.0
    return total_loss / num_batches

In [None]:
# Training and validation loop
def train_model(model,num_epochs, loss_fn, optimizer):
    """Train `model` for `num_epochs` epochs, validating after each one.

    Every epoch calls train_one_epoch, then evaluates on the notebook-level
    lstm_valid_dataloader: validation loss, plus F1 / exact match between the
    predicted and target answer spans sliced out of context_valid. Per-epoch
    training and validation losses are written to './lstm_losses.csv'. The
    model's state_dict is saved whenever the validation loss improves.

    Assumes context_valid, valid_start and valid_end are in the same order as
    the examples served by lstm_valid_dataloader (which is not shuffled).

    Args:
        model: module to train, already on the device selected below.
        num_epochs: number of epochs to run.
        loss_fn: callable taking (predictions, targets) and returning a scalar loss.
        optimizer: optimizer wrapping the model's parameters.

    Returns:
        Path of the checkpoint from the epoch with the lowest validation loss,
        or None if no epoch produced a finite improvement (e.g. num_epochs == 0
        or the loss was NaN).
    """
    # Imported here so the function does not rely on another cell importing datetime
    from datetime import datetime

    timestamp = datetime.now().strftime('%m%d_%H%M')

    # Enable use of GPU
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Start from +inf so the first finite validation loss always yields a checkpoint
    best_vloss = float('inf')
    model_path = None

    # Write training loss and validation loss of each epoch to csv file;
    # the context manager guarantees the file is flushed and closed.
    with open('./lstm_losses.csv', 'w', newline='') as out:
        writer = csv.writer(out)
        writer.writerow(["Epoch","Training Loss","Validation Loss"])

        # For each epoch, train model and compare against validation loss
        for epoch_ind in range(num_epochs):
            model.train(True)
            avg_loss = train_one_epoch(model,epoch_ind + 1,loss_fn, optimizer)
            # model.train(False) switches the model to evaluation mode
            model.train(False)

            # Compute validation loss, F1 score, and EM
            total_vloss = 0.0
            v_avg_f1 = 0
            v_avg_em = 0
            num_batches = 0
            # Index of the current batch's first example within the validation split
            offset = 0

            # No gradients are needed for validation
            with torch.no_grad():
                for inputs, targets in lstm_valid_dataloader:
                    inputs = inputs.to(device)
                    # Same float32 cast as in training so the loss sees matching dtypes
                    targets = targets.to(device).to(torch.float32)

                    # Get model's predictions
                    outputs = model(inputs)

                    # Compute validation loss
                    vloss = loss_fn(outputs,targets)
                    total_vloss += vloss.item()

                    # Separate model output into start and end indices predictions
                    splitted = torch.tensor_split(outputs, 2, dim=1)
                    start_preds = splitted[0]
                    end_preds = splitted[1]

                    # Convert predicted and target indices to answer spans, addressing
                    # each example by its position in the whole split, not in the batch
                    predictions = []
                    target_spans = []
                    for k in range(len(start_preds)):
                        example = offset + k
                        context = context_valid[example]
                        predictions.append(context[int(start_preds[k]):int(end_preds[k])])
                        target_spans.append(context[valid_start[example]:valid_end[example]])
                    offset += len(start_preds)

                    # Calculate f1 score
                    f1 = compute_f1(predictions, target_spans)
                    v_avg_f1 += f1
                    print("F1 for this batch: {}".format(f1))

                    # Calculate exact match
                    em = compute_em(predictions, target_spans)
                    print("EM for this batch: {}".format(em))
                    v_avg_em += em

                    num_batches += 1

            #Print relevant statistics about this epoch of training
            print("epoch: {}".format(epoch_ind + 1))

            # Guard against an empty validation loader
            denom = max(num_batches, 1)
            v_avg_f1 = v_avg_f1 / denom
            v_avg_em = v_avg_em / denom
            print('Average F1 score: {}'.format(v_avg_f1))
            print('Average EM: {}'.format(v_avg_em))

            avg_vloss = total_vloss / denom
            print('LOSS train {} valid {}'.format(avg_loss, avg_vloss))

            # Recording average training and validation loss in csv file
            writer.writerow([epoch_ind + 1,avg_loss,avg_vloss])

            # Save the model's state from the epoch with lowest validation loss
            if avg_vloss < best_vloss:
                best_vloss = avg_vloss
                model_path = 'model_{}_{}_{}'.format(type(model).__name__, epoch_ind, timestamp)
                torch.save(model.state_dict(), model_path)
    return model_path

Define Testing Function

In [None]:
# Testing loop
def test_model(model):
    """Evaluate `model` on the notebook-level lstm_test_dataloader and print F1 / EM.

    For every batch the predicted start/end indices are turned into answer
    spans sliced from context_tst and compared with the spans given by
    test_start / test_end. Per-batch scores are printed, followed by their
    averages over all batches.

    Assumes context_tst, test_start and test_end are in the same order as the
    examples served by lstm_test_dataloader (which is not shuffled).

    Args:
        model: trained module, already on the device selected below.
    """
    # Enable use of GPU
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    t_avg_f1 = 0
    t_avg_em = 0
    # Counted explicitly so the inner per-example loop cannot clobber it
    num_batches = 0
    # Index of the current batch's first example within the test split
    offset = 0
    with torch.no_grad():
        for inputs, _ in lstm_test_dataloader:
            inputs = inputs.to(device)

            # Get model output
            outputs = model(inputs)

            # Separate model output into start and end indices predictions
            splitted = torch.tensor_split(outputs, 2, dim=1)
            start_preds = splitted[0]
            end_preds = splitted[1]

            # Convert predicted and target indices to answer spans, addressing
            # each example by its position in the whole split, not in the batch
            predictions = []
            target_spans = []
            for k in range(len(start_preds)):
                example = offset + k
                context = context_tst[example]
                predictions.append(context[int(start_preds[k]):int(end_preds[k])])
                target_spans.append(context[test_start[example]:test_end[example]])
            offset += len(start_preds)

            # Calculate f1
            f1 = compute_f1(predictions, target_spans)
            t_avg_f1 += f1
            print("F1 for this batch: {}".format(f1))

            # Calculate exact match
            em = compute_em(predictions, target_spans)
            print("EM for this batch: {}".format(em))
            t_avg_em += em

            num_batches += 1

    # Print relevant statistics (guarding against an empty test loader)
    denom = max(num_batches, 1)
    t_avg_f1 = t_avg_f1 / denom
    t_avg_em = t_avg_em / denom
    print('Average F1 score: {}'.format(t_avg_f1))
    print('Average EM: {}'.format(t_avg_em))

Train and Evaluate the LSTM model

In [None]:
# Pick the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Build the bi-LSTM and place it on the chosen device
model = BiLSTM(vocab_size+1,200,200,max_len,2).to(device)

# Training setup: MSE loss, Adam at a 1e-3 learning rate, 25 epochs
num_epochs = 25
loss_fn = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

In [None]:
# Run the full training/validation loop and keep the path returned by train_model
model_path = train_model(
    model=model,
    num_epochs=num_epochs,
    loss_fn=loss_fn,
    optimizer=optimizer,
)

In [None]:
# Restore the weights saved at model_path and switch the model to evaluation mode
model.load_state_dict(torch.load(model_path))
model.train(False)

# Score the restored model on the test split
test_model(model)