# Preprocessing

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/f9/54/5ca07ec9569d2f232f3166de5457b63943882f7950ddfcc887732fc7fb23/transformers-4.3.3-py3-none-any.whl (1.9MB)
[K     |████████████████████████████████| 1.9MB 17.9MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 53.5MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/71/23/2ddc317b2121117bf34dd00f5b0de194158f2a44ee2bf5e47c7166878a97/tokenizers-0.10.1-cp37-cp37m-manylinux2010_x86_64.whl (3.2MB)
[K     |████████████████████████████████| 3.2MB 53.2MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.43-cp37-none-any.whl size=893262 sha256=b7e677d225

In [3]:
%cd '/content/drive/MyDrive/Colab Notebooks/NLPGroupCW'

/content/drive/.shortcut-targets-by-id/1qcQshV1qoMFlpNzFTKlWvvjZbTNz5V3t/NLPGroupCW


In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, random_split
from transformers import BertTokenizer, BertModel

from dataloaders import *
from processor import *

In [5]:
# Setting random seed and device
SEED = 1

torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if use_cuda else "cpu")

In [6]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

vocab_size = len(tokenizer.vocab)

max_sentence_length = tokenizer.max_model_input_sizes['bert-base-uncased']

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




In [7]:
def tokenize(corpus):
    return [tokenizer.tokenize(sentence) for sentence in corpus]

def to_ids(corpus):
    return [tokenizer.convert_tokens_to_ids(sentence) for sentence in corpus]

In [8]:
train_df = pd.read_csv('data/task-1/train.csv')
test_df = pd.read_csv('data/task-1/dev.csv')

training_data = train_df['original']
training_edits = train_df['edit']
test_data = test_df['original']
test_edits = test_df['edit']

training_grades = train_df['meanGrade']

edited_training = pd.Series(create_edited_sentences(training_data, training_edits))
edited_test = pd.Series(create_edited_sentences(test_data, test_edits))

In [9]:
training_tokens = tokenize(edited_training)
testing_tokens = tokenize(edited_test)

training_ids = to_ids(training_tokens)
testing_tokens = to_ids(testing_tokens)

In [10]:
print(training_tokens[100])
print(training_ids[100])

['trump', 'asked', 'du', '##ter', '##te', 'if', 'philippines', 'has', 'death', 'limit', ',', 'philippines', 'ambassador', 'says']
[8398, 2356, 4241, 3334, 2618, 2065, 5137, 2038, 2331, 5787, 1010, 5137, 6059, 2758]


In [11]:
train = Task1Dataset(training_ids, training_grades)
train_dataset, validation_dataset = dataset_split(train)

# Build Model

In [12]:
bert_model = BertModel.from_pretrained('bert-base-uncased')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




In [13]:
class BertGradePredictor(nn.Module):
    def __init__(self, bert_model, total_layers, hid_size, out_size, isBidir, drop):
        super().__init__()
        
        self.bert_model = bert_model

        self.isBidir = isBidir
        
        embed_size = bert_model.config.to_dict()['hidden_size']

        if total_layers < 3:
          drop = 0

        hid_output_size = hid_size
        if isBidir:
          hid_output_size = hid_output_size * 2
        
        self.drop = drop

        self.gru = nn.GRU(input_size=embed_size,
                          hidden_size=hid_size,
                          num_layers=total_layers,
                          bidirectional=isBidir,
                          batch_first=True,
                          dropout=drop)
        
        self.fc1 = nn.Linear(hid_output_size, out_size)
        
        
    def forward(self, x):

        isBidir = self.isBidir
        
        with torch.no_grad():
            x_embed = self.bert_model(x)
            x_embed = x_embed[0]
        
        cell, hid = self.gru(x_embed)
        hid_last = hid[-1,:,:]
        hid_snd_last = hid[-2,:,:]
        
        if isBidir:
            hid = F.dropout(torch.cat((hid_snd_last, hid_last), dim=1), self.drop)
        else:
            hid = F.dropout(hid_last, self.drop)
        
        out = self.fc1(hid)
        
        return out

In [14]:
batch_size = 64
learning_rate = 0.001
total_layers = 3
hid_size = 128
out_size = 1
drop = 0.3
isBidir = True

In [15]:
def model_performance(output, target, print_output=False):
    """
    Returns SSE and MSE per batch (printing the MSE and the RMSE)
    """

    sq_error = (output - target)**2

    sse = np.sum(sq_error)
    mse = np.mean(sq_error)
    rmse = np.sqrt(mse)

    if print_output:
        print(f'| MSE: {mse:.2f} | RMSE: {rmse:.2f} |')

    return sse, mse

In [16]:
def eval(data_iter, model):
    """
    Evaluating model performance on the dev set
    """
    model.eval()
    epoch_loss = 0
    epoch_sse = 0
    pred_all = []
    trg_all = []
    no_observations = 0

    with torch.no_grad():
        for batch in data_iter:
            feature, target = batch
            feature, target = feature.to(device), target.to(device)
            # for RNN:
            # model.batch_size = target.shape[0]
            no_observations = no_observations + target.shape[0]
            # model.hidden = model.init_hidden()

            predictions = model(feature).squeeze(1)
            loss = loss_fn(predictions, target)

            # We get the mse
            pred, trg = predictions.detach().cpu().numpy(), target.detach().cpu().numpy()
            sse, __ = model_performance(pred, trg)

            epoch_loss += loss.item()*target.shape[0]
            epoch_sse += sse
            pred_all.extend(pred)
            trg_all.extend(trg)

    return epoch_loss/no_observations, epoch_sse/no_observations, np.array(pred_all), np.array(trg_all)

In [17]:
def train(train_loader, validation_loader, model, number_epoch):
    """
    Training loop for the model, which calls on eval to evaluate after each epoch
    """
    print("Training model.")
    for epoch in range(1, number_epoch+1):
        model.train()
        epoch_loss = 0
        epoch_sse = 0
        no_observations = 0  # Observations used for training so far
        for batch in train_loader:
            feature, target = batch
            feature, target = feature.to(device), target.to(device)
            # for RNN:
            # model.batch_size = target.shape[0]
            no_observations = no_observations + target.shape[0]
            # model.hidden = model.init_hidden()
            predictions = model(feature).squeeze(1)
            optimizer.zero_grad()
            loss = loss_fn(predictions, target)
            sse, __ = model_performance(predictions.detach().cpu().numpy(), target.detach().cpu().numpy())
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()*target.shape[0]
            epoch_sse += sse

        # valid_loss, valid_mse, _, _ = eval(validation_loader, model)

        # epoch_loss, epoch_mse = epoch_loss / no_observations, epoch_sse / no_observations
        # print(f'| Epoch: {epoch:02} | Train Loss: {epoch_loss:.2f} | Train MSE: {epoch_mse:.2f} | Train RMSE: {epoch_mse**0.5:.2f} | \
        # Val. Loss: {valid_loss:.2f} | Val. MSE: {valid_mse:.2f} |  Val. RMSE: {valid_mse**0.5:.2f} |')

        epoch_loss, epoch_mse = epoch_loss / no_observations, epoch_sse / no_observations
        print(f'| Epoch: {epoch:02} | Train Loss: {epoch_loss:.2f} | Train MSE: {epoch_mse:.2f} | Train RMSE: {epoch_mse**0.5:.2f} |')

In [18]:
loss_fn = nn.MSELoss()
loss_fn = loss_fn.to(device)

In [19]:
total_layers_list = [3, 5]
hid_size_list = [64, 256]
drop_list = [0.2, 0.4]
batch_size_list = [64, 256]
learning_rates = [0.01, 0.001]

epochs = 20
best_batch_size = -1
best_hid_size = -1
best_total_layers = -1
best_drop = -1
best_learning_rate = -1
best_mse = 10000


for batch_size in batch_size_list:
  for hid_size in hid_size_list:
    for total_layers in total_layers_list:
      for drop in drop_list:
        for learning_rate in learning_rates:
          train_loader = torch.utils.data.DataLoader(train_dataset, shuffle=True, batch_size=batch_size, collate_fn=collate_fn_padd)
          validation_loader = torch.utils.data.DataLoader(validation_dataset, batch_size=batch_size, collate_fn=collate_fn_padd)

          model = BertGradePredictor(bert_model,
                         total_layers,
                         hid_size,
                         out_size,
                         isBidir,
                         drop)

          model = model.to(device)

          bert_layers = model.named_parameters()
          bert_layers = [(layer, parameter) for layer, parameter in bert_layers]
          for i in range(len(bert_layers)):
              layer_p = bert_layers[i]
              layer = layer_p[0]
              p = layer_p[1]
              if "bert_model" in layer:
                p.requires_grad = False

          optimizer = optim.Adam(model.parameters(), lr=learning_rate)

          train(train_loader, validation_loader, model, epochs)

          _, _, preds, labels = eval(validation_loader, model)

          _, mse = model_performance(preds, labels, print_output=True)

          rmse = np.sqrt(mse)

          print("Current Hyperparameters:")
          print("Batch Size: {}, Hidden Size: {}, Total Layers: {}, Dropout: {}, Learning Rate: {}".format(batch_size, hid_size, total_layers, drop, learning_rate))
          print("MSE: {}, RMSE: {}".format(mse, rmse))

          if mse < best_mse:
            best_mse = mse
            best_batch_size = batch_size
            best_hid_size = hid_size
            best_total_layers = total_layers
            best_drop = drop
            best_learning_rate = learning_rate
            print("Found better hyperparameters...")
            torch.save(model.state_dict(), "./bert.pt")
            
          print()

print("Best Hyperparameters and Metrics")
best_rmse = np.sqrt(best_mse)
print("Batch Size: {}, Hidden Size: {}, Total Layers: {}, Dropout: {}, Learning Rate: {}".format(best_batch_size, best_hid_size, best_total_layers, best_drop, best_learning_rate))
print("MSE: {}, RMSE: {}".format(best_mse, best_rmse))

Training model.
| Epoch: 01 | Train Loss: 0.49 | Train MSE: 0.49 | Train RMSE: 0.70 |
| Epoch: 02 | Train Loss: 0.35 | Train MSE: 0.35 | Train RMSE: 0.60 |
| MSE: 0.34 | RMSE: 0.59 |
Current Hyperparameters:
Batch Size: 64, Hidden Size: 64, Total Layers: 3, Dropout: 0.2, Learning Rate: 0.01
MSE: 0.3449362814426422, RMSE: 0.5873127579689026
Found better hyperparameters...

Training model.
| Epoch: 01 | Train Loss: 0.37 | Train MSE: 0.37 | Train RMSE: 0.61 |
| Epoch: 02 | Train Loss: 0.34 | Train MSE: 0.34 | Train RMSE: 0.59 |
| MSE: 0.33 | RMSE: 0.58 |
Current Hyperparameters:
Batch Size: 64, Hidden Size: 64, Total Layers: 3, Dropout: 0.2, Learning Rate: 0.001
MSE: 0.3308985233306885, RMSE: 0.5752378106117249
Found better hyperparameters...

Training model.
| Epoch: 01 | Train Loss: 0.48 | Train MSE: 0.48 | Train RMSE: 0.69 |


ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/IPython/core/interactiveshell.py", line 2882, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-19-227a62b4983b>", line 44, in <module>
    train(train_loader, validation_loader, model, epochs)
  File "<ipython-input-17-2b432e286dc8>", line 21, in train
    sse, __ = model_performance(predictions.detach().cpu().numpy(), target.detach().cpu().numpy())
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/IPython/core/interactiveshell.py", line 1823, in showtraceback
    stb = value._render_traceback_()
AttributeError: 'KeyboardInterrupt' object has no attribute '_render_traceback_'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/IPython/core/ultratb.py

KeyboardInterrupt: ignored