In [None]:
!pip install pytorch-transformers

In [None]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from pytorch_transformers import *
import json
import numpy as np
import os
import string
import copy
import random
from google.colab import drive
drive.mount('/data', force_remount=True)
BATCH_SIZE = 64
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

Mounted at /data
cuda


In [None]:
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

#find position of a word in a tokenized sentence
def position(w, ids):
  token=[]
  #encode word
  wtk = tokenizer.encode(w)
  if len(wtk) > 1: #if multiple tokens assign first one
    token = [wtk[0]]
  else:
    token = wtk
  for i in range(len(ids)):
    if ids[i:i+1] == token:
      return i
  return -1


#find positions of both words in a tokenized sentence
def word_positions(wL,ids):
  posL=[]
  i=0
  for w in wL:
    pos = position(w,ids[i:])
    posL.append(pos+i)
    i = posL[-1] + 1
  return posL


#read all data, store individual elements, tokenize each sentence
def read_and_tokenize(d):
  sentences=[]
  tk_sentences=[]
  words=[]
  idxs=[]
  sentences1=[]
  sentences2=[]
  labels=[]
  starts1=[]
  starts2=[]
  ends1=[]
  ends2=[]
  for i, line in enumerate(d):
    #store each element line-by-line in a list
    words.append(line['word'])
    idxs.append(i)
    sentences1.append(line['sentence1'])
    sentences2.append(line['sentence2'])
    if line['label']:
      labels.append(1)
    else:
      labels.append(0)
    starts1.append(line['start1'])
    starts2.append(line['start2'])
    ends1.append(line['end1'])
    ends2.append(line['end2'])
    #combine both sentences into a single one and store in a list
    s = "<s>"+line['sentence1']+"</s><s>"+line['sentence2']+"</s>"
    sentences.append(s)
    #tokenize and store in a list
    tk = tokenizer.encode(s)
    tk_sentences.append(tk)
  #find the max length of tokenized combined sentences
  max_len = len(max(tk_sentences, key=len))
  return max_len, sentences, tk_sentences, words, idxs, sentences1, sentences2, labels, starts1, starts2, ends1, ends2


#build one hot encoded word location vectors
def build_word_loc_vectors(sentences1, sentences2, starts1, starts2, ends1, ends2, tk_sentences, max_len):
  word1_locs_vector=[]
  word2_locs_vector=[]
  for s1, s2, st1, st2, e1, e2, tk in zip(sentences1, sentences2, starts1, starts2, ends1, ends2, tk_sentences):
    #split sentences into individual words
    s1_words = s1.split(' ')
    s2_words = s2.split(' ')
    # Find indexes of 'word' location by skipping one word at a time until 'start1' and 'start2' location is reached
    idx = 0
    loc1 = 0
    for k in range(0, len(s1_words)): #sentence1
      idx = idx+len(s1_words[k])+1
      if idx >= st1 and idx < e1:
        loc1=k+1
    idx = 0
    loc2 = 0
    for k in range(0, len(s2_words)): #sentence2
      idx = idx+len(s2_words[k])+1
      if idx >= st2 and idx < e2:
        loc2=k+1
    # Strip punctuation
    tb = str.maketrans('', '', string.punctuation)
    w1p = s1_words[loc1]
    w2p = s2_words[loc2]
    w1 = w1p.translate(tb)
    w2 = w2p.translate(tb)
    # Find word positions in a tokenized sentence
    word_pos = word_positions([w1, w2], tk)
    # Construct one hot encoded location vectors
    word1_loc = []
    word2_loc = []
    for j in range(0, max_len):
      word1_loc.append(0.0)
      word2_loc.append(0.0)
    word1_loc[word_pos[0]] = 1.0
    word2_loc[word_pos[1]] = 1.0
    #store in a list
    word1_locs_vector.append([word1_loc])
    word2_locs_vector.append([word2_loc])
  return word1_locs_vector, word2_locs_vector


#pad tokenized sentences to max_length and build their masks
def build_tk_sentence_mask(tk_sentences, max_len):
  tokenized_sentence_mask=[]
  tokenized_sentence=[]
  for tk in tk_sentences:
    # Pad with 0's to max_len
    tkp = tk.copy()
    zeros = [0] * (max_len - len(tk))
    tkp.extend(zeros)
    tokenized_sentence.append(tkp)
    #build binary mask
    mask = [1] * len(tk)
    mask.extend(zeros)
    tokenized_sentence_mask.append(mask)
  return tokenized_sentence, tokenized_sentence_mask


#build sentence location binary mask
def build_sentence_loc_mask(tk_sentences, max_len):
  sentence_loc_mask=[]
  for tk in tk_sentences:
    idx=0
    mask=[]
    #find index for second 0 in tokenized input indicating beginning of sentence2 (skip first 0 at the beginning of sentence1)
    for i in range(1, len(tk)):
      if tk[i] == 0:
        idx = i
    #build binary mask
    for i in range(0, idx):
      mask.append(0)
    for i in range(idx+1,len(tk)+1):
      mask.append(1)
    for i in range(len(tk)+2, max_len+2):
      mask.append(0)
    sentence_loc_mask.append(mask)
  return sentence_loc_mask




100%|██████████| 898823/898823 [00:00<00:00, 4714188.95B/s]
100%|██████████| 456318/456318 [00:00<00:00, 2670253.74B/s]


In [None]:
#load training data
train=[]
with open('/data/My Drive/data/train.jsonl', 'r') as f:
  for line in f:
    train.append(json.loads(line))

# Read and tokenize
train_max_len, train_sentences, train_tk_sentences, train_words, train_idxs, train_sentences1, train_sentences2, train_labels, train_starts1, train_starts2, train_ends1, train_ends2 = read_and_tokenize(train)

# Build one hot encoded word location vectors
train_word1_locs_vector, train_word2_locs_vector = build_word_loc_vectors(train_sentences1, train_sentences2, train_starts1, train_starts2, train_ends1, train_ends2, train_tk_sentences, train_max_len)

#pad tokenized sentences to max_length and build their masks
train_tokenized_sentence, train_tokenized_sentence_mask = build_tk_sentence_mask(train_tk_sentences, train_max_len)

#build sentence binary mask
train_sentence_loc_mask = build_sentence_loc_mask(train_tk_sentences, train_max_len)

# Tensorize and make tensor dataset, sampler and loader
train_t1 = torch.tensor(train_tokenized_sentence)
train_t2 = torch.tensor(train_sentence_loc_mask)
train_t3 = torch.tensor(train_tokenized_sentence_mask)
train_t4 = torch.tensor(train_labels)
train_t5 = torch.tensor(train_word1_locs_vector)
train_t6 = torch.tensor(train_word2_locs_vector)
train_t7 = torch.tensor(train_idxs)
train_data = TensorDataset(train_t1,train_t2,train_t3,train_t4,train_t5,train_t6,train_t7)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=BATCH_SIZE)

#load dev data
dev=[]
with open('/data/My Drive/data/dev.jsonl', 'r') as f:
  for line in f:
    dev.append(json.loads(line))

# Read and tokenize
dev_max_len, dev_sentences, dev_tk_sentences, dev_words, dev_idxs, dev_sentences1, dev_sentences2, dev_labels, dev_starts1, dev_starts2, dev_ends1, dev_ends2 = read_and_tokenize(dev)

# Build one hot encoded word location vectors
dev_word1_locs_vector, dev_word2_locs_vector = build_word_loc_vectors(dev_sentences1, dev_sentences2, dev_starts1, dev_starts2, dev_ends1, dev_ends2, dev_tk_sentences, dev_max_len)

#pad tokenized sentences to max_length and build their masks
dev_tokenized_sentence, dev_tokenized_sentence_mask = build_tk_sentence_mask(dev_tk_sentences, dev_max_len)

#build sentence binary mask
dev_sentence_loc_mask = build_sentence_loc_mask(dev_tk_sentences, dev_max_len)

# Tensorize and make tensor dataset, sampler and loader
dev_t1 = torch.tensor(dev_tokenized_sentence)
dev_t2 = torch.tensor(dev_sentence_loc_mask)
dev_t3 = torch.tensor(dev_tokenized_sentence_mask)
dev_t4 = torch.tensor(dev_labels)
dev_t5 = torch.tensor(dev_word1_locs_vector)
dev_t6 = torch.tensor(dev_word2_locs_vector)
dev_t7 = torch.tensor(dev_idxs)
dev_data = TensorDataset(dev_t1,dev_t2,dev_t3,dev_t4,dev_t5,dev_t6,dev_t7)
dev_sampler = RandomSampler(dev_data)
dev_dataloader = DataLoader(dev_data, sampler=dev_sampler, batch_size=BATCH_SIZE)

#load test data
test=[]
with open('/data/My Drive/data/test.jsonl', 'r') as f:
  for line in f:
    test.append(json.loads(line))

# Read and tokenize
test_max_len, test_sentences, test_tk_sentences, test_words, test_idxs, test_sentences1, test_sentences2, test_labels, test_starts1, test_starts2, test_ends1, test_ends2 = read_and_tokenize(test)

# Build one hot encoded word location vectors
test_word1_locs_vector, test_word2_locs_vector = build_word_loc_vectors(test_sentences1, test_sentences2, test_starts1, test_starts2, test_ends1, test_ends2, test_tk_sentences, test_max_len)

#pad tokenized sentences to max_length and build their masks
test_tokenized_sentence, test_tokenized_sentence_mask = build_tk_sentence_mask(test_tk_sentences, test_max_len)

#build sentence binary mask
test_sentence_loc_mask = build_sentence_loc_mask(test_tk_sentences, test_max_len)

# Tensorize and make tensor dataset, sampler and loader
test_t1 = torch.tensor(test_tokenized_sentence)
test_t2 = torch.tensor(test_sentence_loc_mask)
test_t3 = torch.tensor(test_tokenized_sentence_mask)
test_t4 = torch.tensor(test_labels)
test_t5 = torch.tensor(test_word1_locs_vector)
test_t6 = torch.tensor(test_word2_locs_vector)
test_t7 = torch.tensor(test_idxs)
test_data = TensorDataset(test_t1,test_t2,test_t3,test_t4,test_t5,test_t6,test_t7)
test_sampler = RandomSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=BATCH_SIZE)

In [None]:
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

#Train, validate and test Roberta
def train_val_test_roberta(model, epochs):

  #AdamW optimizer
  optimizer = AdamW(model.parameters(), lr = 2e-5, eps = 1e-8)

  #Train and validate
  epoch_number = 0
  while epoch_number < epochs:
    print("Epoch Number: ", epoch_number)
    epoch_number = epoch_number + 1

    # Training with train set
    model.train()
    tr_loss = 0
    nb_tr_steps = 0
    for step, batch in enumerate(train_dataloader):
      batch = tuple(t.to(device) for t in batch)
      # Load the data
      tokenized_sentence, sentence_loc_mask, tokenized_sentence_mask, labels, word1_locs_vector, word2_locs_vector, index = batch
      # Zero the gradients
      optimizer.zero_grad()
      # Forward pass
      loss, logits = model(tokenized_sentence, token_type_ids=None, attention_mask=tokenized_sentence_mask, labels=labels)
      # Backward pass
      loss.backward()
      # Optimizer step
      optimizer.step()
      tr_loss += loss.item()
      nb_tr_steps += 1
    print("Train loss: ", tr_loss/nb_tr_steps)

    # Validation with dev set
    model.eval()
    eval_accuracy = 0
    nb_eval_steps = 0
    nb_eval_examples = 0
    for batch in dev_dataloader:
      batch = tuple(t.to(device) for t in batch)
      # Load the data
      #b_input_ids, b_token_ids, b_input_mask, b_labels, b_word1, b_word2, b_index
      tokenized_sentence, sentence_loc_mask, tokenized_sentence_mask, labels, word1_locs_vector, word2_locs_vector, index = batch
      # Do not to compute or store gradients
      with torch.no_grad():
        # Forward pass
        (loss, logits) = model(tokenized_sentence, token_type_ids=None, attention_mask=tokenized_sentence_mask, labels=labels)
      # Move to CPU
      logits = logits.detach().cpu().numpy()
      label_ids = labels.to('cpu').numpy()
      # Compare output to labels
      tmp_eval_accuracy = flat_accuracy(logits, label_ids)
      eval_accuracy += tmp_eval_accuracy
      nb_eval_steps += 1
    print("Validation Accuracy: ", eval_accuracy/nb_eval_steps)


  # Test test with test set once training and validation loop is completed
  eval_loss = 0
  eval_accuracy = 0
  nb_eval_steps = 0
  model.eval()
  for batch in test_dataloader:
    batch = tuple(t.cuda() for t in batch)
    # Load the data
    tokenized_sentence, sentence_loc_mask, tokenized_sentence_mask, labels, word1_locs_vector, word2_locs_vector, index = batch
    # Do not to compute or store gradients
    with torch.no_grad():
      # Forward pass
      (loss, logits) = model(tokenized_sentence, token_type_ids=None, attention_mask=tokenized_sentence_mask, labels=labels)
    # Move to CPU
    logits = logits.detach().cpu().numpy()
    label_ids = labels.cpu().numpy()
    # Compare output to labels
    b_accuracy = flat_accuracy(logits, label_ids)
    eval_loss += loss.item()
    eval_accuracy += b_accuracy
    nb_eval_steps += 1
  print("Test: \tLoss = ", eval_loss/nb_eval_steps, "\tAccuracy = ", eval_accuracy/nb_eval_steps)

In [None]:
#train the top layer of RobertaForSequenceClassification
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2)
model = model.cuda()
train_val_test_roberta(model, 2)
#train_val_test_roberta(model, 3)



Epoch Number:  0
Train loss:  0.6768080426167838
Validation Accuracy:  0.6651785714285714
Epoch Number:  1
Train loss:  0.5578582283062271
Validation Accuracy:  0.7489853896103896
Test: 	Loss =  0.6018227159976959 	Accuracy =  0.7034274193548387


In [None]:
#add a tunable fully connected tunable top on top of roberta
class TunableTop1HL(torch.nn.Module):
    def __init__(self, basemodel, D_in=768, H=250, D_out=2):
        #Fully-connected network with one hidden layer
        super(TunableTop1HL, self).__init__()
        self.basemodel = basemodel
        self.D_in = D_in
        self.H = H
        self.D_out = D_out
        self.linear1 = torch.nn.Linear(D_in, H, bias = True)
        self.linear2 = torch.nn.Linear(H, D_out, bias = True)
        #experiment with different loss and activation functions
        self.loss_CrossEntropy = torch.nn.CrossEntropyLoss()
        self.loss_MSE = torch.nn.MSELoss()
        self.activation_ReLU = torch.nn.ReLU()
        self.activation_GELU = torch.nn.GELU()
        self.activation_tan = torch.nn.Tanh()
        self.activation_sigmoid = torch.nn.Sigmoid()
        self.activation_softmax = torch.nn.Softmax()

    def forward(self, tokenized_sentence, tokenized_sentence_mask, labels, word1_locs_vector, word2_locs_vector):
        output, _ = self.basemodel.roberta(input_ids=tokenized_sentence, attention_mask=tokenized_sentence_mask)
        size = word1_locs_vector.shape[0]
        w1 = torch.matmul(word1_locs_vector, output).view(size, self.D_in)
        w2 = torch.matmul(word2_locs_vector, output).view(size, self.D_in)
        logits1 = self.activation_ReLU(self.linear1(w1-w2))
        logits2 = self.activation_softmax(self.linear2(logits1))
        loss = self.loss_CrossEntropy(logits2.view(-1, 2), labels.view(-1))
        return (loss, logits2)

class TunableTop2HL(torch.nn.Module):
    def __init__(self, basemodel, D_in=768, H1=30, H2=10, D_out=2):
        #Fully-connected network with two hidden layers
        super(TunableTop2HL, self).__init__()
        self.basemodel = basemodel
        self.D_in = D_in
        self.H1 = H1
        self.H2 = H2
        self.D_out = D_out
        self.linear1 = torch.nn.Linear(D_in, H1, bias = True)
        self.linear2 = torch.nn.Linear(H1, H2, bias = True)
        self.linear3 = torch.nn.Linear(H2, D_out, bias = True)
        #experiment with different loss and activation functions
        self.loss_CrossEntropy = torch.nn.CrossEntropyLoss()
        self.loss_MSE = torch.nn.MSELoss()
        self.activation_ReLU = torch.nn.ReLU()
        self.activation_GELU = torch.nn.GELU()
        self.activation_tan = torch.nn.Tanh()
        self.activation_sigmoid = torch.nn.Sigmoid()
        self.activation_softmax = torch.nn.Softmax()

    def forward(self, tokenized_sentence, tokenized_sentence_mask, labels, word1_locs_vector, word2_locs_vector):
        output, _ = self.basemodel.roberta(input_ids=tokenized_sentence, attention_mask=tokenized_sentence_mask)
        size = word1_locs_vector.shape[0]
        w1 = torch.matmul(word1_locs_vector, output).view(size, self.D_in)
        w2 = torch.matmul(word2_locs_vector, output).view(size, self.D_in)
        logits1 = self.activation_ReLU(self.linear1(w1-w2))
        logits2 = self.activation_ReLU(self.linear2(logits1))
        logits3 = self.activation_softmax(self.linear3(logits2))
        loss = self.loss_CrossEntropy(logits3.view(-1, 2), labels.view(-1))
        return (loss, logits3)

In [None]:
#Train, validate and test custome model on top of Roberta
def train_val_test_custom(model, epochs):

  #AdamW optimizer
  optimizer = AdamW(model.parameters(), lr = 1e-5, eps = 1e-8)

  #Train and validate
  epoch_number = 0
  while epoch_number < epochs:
    print("Epoch Number: ", epoch_number)
    epoch_number = epoch_number + 1

    # Training with train set
    model.train()
    tr_loss = 0
    nb_tr_steps = 0
    for step, batch in enumerate(train_dataloader):
      batch = tuple(t.to(device) for t in batch)
      # Load the data
      tokenized_sentence, sentence_loc_mask, tokenized_sentence_mask, labels, word1_locs_vector, word2_locs_vector, index = batch
      # Zero the gradients
      optimizer.zero_grad()
      # Forward pass
      loss, logits = model(tokenized_sentence=tokenized_sentence, tokenized_sentence_mask=tokenized_sentence_mask, labels=labels, word1_locs_vector=word1_locs_vector, word2_locs_vector=word2_locs_vector)
      # Backward pass
      loss.backward()
      # Optimizer step
      optimizer.step()
      tr_loss += loss.item()
      nb_tr_steps += 1
    print("Train loss: ", tr_loss/nb_tr_steps)

    # Validation with dev set
    model.eval()
    eval_accuracy = 0
    nb_eval_steps = 0
    nb_eval_examples = 0
    for batch in dev_dataloader:
      batch = tuple(t.to(device) for t in batch)
      # Load the data
      #b_input_ids, b_token_ids, b_input_mask, b_labels, b_word1, b_word2, b_index
      tokenized_sentence, sentence_loc_mask, tokenized_sentence_mask, labels, word1_locs_vector, word2_locs_vector, index = batch
      # Do not to compute or store gradients
      with torch.no_grad():
        # Forward pass
        loss, logits = model(tokenized_sentence=tokenized_sentence, tokenized_sentence_mask=tokenized_sentence_mask, labels=labels, word1_locs_vector=word1_locs_vector, word2_locs_vector=word2_locs_vector)
      # Move to CPU
      logits = logits.detach().cpu().numpy()
      label_ids = labels.to('cpu').numpy()
      # Compare output to labels
      tmp_eval_accuracy = flat_accuracy(logits, label_ids)
      eval_accuracy += tmp_eval_accuracy
      nb_eval_steps += 1
    print("Validation Accuracy: ", eval_accuracy/nb_eval_steps)


  # Test test with test set once training and validation loop is completed
  eval_loss = 0
  eval_accuracy = 0
  nb_eval_steps = 0
  model.eval()
  for batch in test_dataloader:
    batch = tuple(t.cuda() for t in batch)
    # Load the data
    tokenized_sentence, sentence_loc_mask, tokenized_sentence_mask, labels, word1_locs_vector, word2_locs_vector, index = batch
    # Do not to compute or store gradients
    with torch.no_grad():
      # Forward pass
      loss, logits = model(tokenized_sentence=tokenized_sentence, tokenized_sentence_mask=tokenized_sentence_mask, labels=labels, word1_locs_vector=word1_locs_vector, word2_locs_vector=word2_locs_vector)
    # Move to CPU
    logits = logits.detach().cpu().numpy()
    label_ids = labels.cpu().numpy()
    # Compare output to labels
    b_accuracy = flat_accuracy(logits, label_ids)
    eval_loss += loss.item()
    eval_accuracy += b_accuracy
    nb_eval_steps += 1
  print("Test: \tLoss = ", eval_loss/nb_eval_steps, "\tAccuracy = ", eval_accuracy/nb_eval_steps)

In [None]:
#RoBERTa Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks.
basemodel = RobertaForSequenceClassification.from_pretrained('roberta-base')

In [None]:
print("One hidden layer classification head:")
model1 = TunableTop1HL(basemodel, D_in=768, H=250, D_out=2)
model1.cuda()
train_val_test_custom(model1, 10)

One hidden layer classification head:
Epoch Number:  0




Train loss:  0.4036568241028846
Validation Accuracy:  0.7713068181818182
Epoch Number:  1
Train loss:  0.37427076735074005
Validation Accuracy:  0.7702922077922079
Epoch Number:  2
Train loss:  0.3753939983965475
Validation Accuracy:  0.755275974025974
Epoch Number:  3
Train loss:  0.3612583196615871
Validation Accuracy:  0.7587256493506492
Epoch Number:  4
Train loss:  0.3559717279446276
Validation Accuracy:  0.78125
Epoch Number:  5
Train loss:  0.3496341999573044
Validation Accuracy:  0.7676542207792207
Epoch Number:  6
Train loss:  0.3460993970496745
Validation Accuracy:  0.7918019480519481
Epoch Number:  7
Train loss:  0.34648707094071785
Validation Accuracy:  0.7706980519480519
Epoch Number:  8
Train loss:  0.3475825918626182
Validation Accuracy:  0.7698863636363635
Epoch Number:  9
Train loss:  0.3451321079761167
Validation Accuracy:  0.7725243506493508
Test: 	Loss =  0.6037602365016937 	Accuracy =  0.7035786290322581


In [None]:
print("\nTwo hidden layer classification head:")
model2 = TunableTop2HL(basemodel, D_in=768, H1=30, H2=8, D_out=2)
model2.cuda()
train_val_test_custom(model2, 10)


Two hidden layer classification head:
Epoch Number:  0




Train loss:  0.4350385367870331
Validation Accuracy:  0.7834821428571429
Epoch Number:  1
Train loss:  0.3672994167744359
Validation Accuracy:  0.7637987012987013
Epoch Number:  2
Train loss:  0.3591215308708481
Validation Accuracy:  0.7790178571428571
Epoch Number:  3
Train loss:  0.35502799628656123
Validation Accuracy:  0.7650162337662338
Epoch Number:  4
Train loss:  0.35549914346465583
Validation Accuracy:  0.7767857142857143
Epoch Number:  5
Train loss:  0.3554455106016956
Validation Accuracy:  0.7688717532467532
Epoch Number:  6
Train loss:  0.3525361358364926
Validation Accuracy:  0.7778003246753247
Epoch Number:  7
Train loss:  0.34261779241924045
Validation Accuracy:  0.7857142857142857
Epoch Number:  8
Train loss:  0.33851926311661923
Validation Accuracy:  0.7830762987012987
Epoch Number:  9
Train loss:  0.3363152621667596
Validation Accuracy:  0.7790178571428571
Test: 	Loss =  0.6076936542987823 	Accuracy =  0.697429435483871
