In [None]:
# Download the competition data archive from CodaLab and unpack it into the working directory.
!wget https://competitions.codalab.org/my/datasets/download/c748d2c0-d6be-4e36-9f12-ca0e88819c4d -O files.zip
!unzip files.zip
# Install the `transformers` package that the import cell below relies on.
!pip install transformers

In [None]:
import math
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import torch
import torch.nn.functional as F
from sklearn import preprocessing
from torch import nn

import BERT_utils
import transformers
import utils
from BERT_utils import (get_data_loader, get_data_loader_masked,
                        get_sentence_embeddings, get_tokenized, is_model_new,
                        tokenizer)
from BERThoven_model import BERThoven
from tqdm import tqdm_notebook as tqdm
from train_BERT import check_accuracy, get_test_labels, train_part
from transformers import (AdamW, get_constant_schedule_with_warmup,
                          get_linear_schedule_with_warmup)
from utils import augment_dataset, import_file, pad, remove_outliers

In [None]:
USE_GPU = True

# Use CUDA only when it is both requested and actually available; otherwise run on the CPU.
device = torch.device('cuda' if USE_GPU and torch.cuda.is_available() else 'cpu')

In [None]:
# Load the three dataset splits.
train_df = import_file("train")
dev_df = import_file("dev")
test_df = import_file("test")

# `n_quantiles` must be passed by keyword: it is keyword-only in current scikit-learn releases.
quantile_transformer = preprocessing.QuantileTransformer(n_quantiles=100)

# The train loader is built first with fit=True; the dev and test loaders reuse the same
# transformer object without the fit flag.
dataLoader_train = get_data_loader(train_df, batch_size=32, preprocessor=quantile_transformer, fit=True)
dataLoader_dev = get_data_loader(dev_df, batch_size=32, preprocessor=quantile_transformer)
dataLoader_test = get_data_loader(test_df, batch_size=32, test=True, preprocessor=quantile_transformer)


In [None]:
# Drop rows whose source text and machine translation are identical.
train_df_without_sames = train_df[train_df.src != train_df.mt]
# Quartiles / IQR are computed on the full training scores (including the rows dropped above).
q1 = train_df.scores.quantile(0.25)
q3 = train_df.scores.quantile(0.75)
iqr = q3 - q1

# Augment using two score predicates: below the first quartile and above the third quartile.
# NOTE(review): presumably rows matching the predicates are repeated (upsampled) — confirm in utils.augment_dataset.
train_df_upsampling = augment_dataset(train_df_without_sames,
                                      lambda score: score < q1,
                                      lambda score: score > q3)
# No fit flag here: the transformer object is the one already used for dataLoader_train.
data_loader_train_upsampled = get_data_loader(train_df_upsampling, batch_size=32, preprocessor=quantile_transformer)

# Keep only rows whose score is above q1 - 3 * IQR, i.e. discard the extreme low tail.
train_df_downsampling = train_df_without_sames[train_df_without_sames.scores > q1 - 3 * iqr]
data_loader_train_downsampled = get_data_loader(train_df_downsampling, batch_size=32, preprocessor=quantile_transformer)

In [None]:
# Take element 4 of the first batch and show its value range.
# NOTE(review): presumably index 4 is the (transformed) target tensor — confirm against the dataset's item layout.
x = next(iter(data_loader_train_downsampled))[4]
x.min(), x.max()

In [None]:
# Row counts in the extreme low tail (below q1 - 3*IQR) and the extreme high tail (above q3 + 2*IQR).
# Returned as a single tuple so that both results are displayed: a notebook cell only shows
# the value of its last expression.
(
    train_df[train_df.scores < q1 - 3 * iqr].count(),
    train_df[train_df.scores > q3 + 2 * iqr].count(),
)

In [None]:
# Draw one histogram per score series (original training scores, then the upsampled ones),
# each as its own figure with the same x-range so the two are directly comparable.
for score_series in (train_df.scores, train_df_upsampling.scores):
    plt.xlim((-8, 3))
    plt.hist(score_series, bins=40)
    plt.show()

In [None]:
# prod_df = pd.concat([train_df, dev_df])
# prod_df_aug = augment_dataset(prod_df,
#                             lambda score: score<-1,
#                             lambda score: score<-0.3,
#                             lambda score: score>0.55,
#                             lambda score: score>1,
#                             lambda score: score>1.3)
# dataLoader_prod = get_data_loader(prod_df, batch_size=32)
# dataLoader_prod_aug = get_data_loader(prod_df_aug, batch_size=32)

In [None]:
# Build a fresh model and evaluate it on dev once before any training.
nlp_model = BERThoven(cls=True, dropout=False, concat_outputs=False)
check_accuracy(dataLoader_dev, nlp_model, device=device, preprocessor=quantile_transformer);

epochs = 3
warmup_proportion = 0.1

# loss_function = lambda x, y: F.smooth_l1_loss(x*10, y*10)
loss_function = F.mse_loss
print_every = 60

# Masked variant of the upsampled loader; built here but not used by the active training call.
data_loader_train_upsampled_masked = get_data_loader_masked(train_df_upsampling, batch_size=32,
                                                          preprocessor=quantile_transformer)
# data_loader_train_masked = get_data_loader_masked(train_df, batch_size=32,preprocessor=quantile_transformer)


# Total optimizer steps = batches per epoch * number of epochs for the loader that is trained on.
# Tied to `epochs` (instead of a literal 3) so the schedule length follows the epoch count.
training_steps = 0
# training_steps += len(data_loader_train_masked)
# training_steps += len(data_loader_train_upsampled)
# training_steps += len(dataLoader_train)
training_steps += len(data_loader_train_downsampled) * epochs

warmup_steps = int(training_steps * warmup_proportion)

optimizer = AdamW(nlp_model.parameters(), lr=1e-5, eps=1e-8, correct_bias=True)
# scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=training_steps)
# scheduler = get_constant_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps)
# scheduler = torch.optim.lr_scheduler.StepLR(optimizer, steps_per_epoch, 0.3)
# scheduler = transformers.get_cosine_with_hard_restarts_schedule_with_warmup(optimizer, warmup_steps,training_steps,num_cycles=3)
# NOTE(review): the schedule is 5 steps longer than the planned run — presumably so the
# learning rate does not reach exactly zero on the final steps; confirm.
scheduler = transformers.get_cosine_schedule_with_warmup(optimizer, warmup_steps, training_steps + 5)

t_losses = []
v_losses = []


def train(dataloader, epochs):
    """Run `train_part` on `dataloader` for `epochs` epochs using the notebook-level state.

    The model, optimizer, scheduler, dev loader, preprocessor, loss function, logging
    interval and device are all read from module-level variables at call time.

    Returns:
        Whatever `train_part` returns (called with return_losses=True, return_metrics=False).
    """
    fixed_options = dict(
        val_loader=dataLoader_dev,
        val_every=3,
        return_losses=True,
        preprocessor=quantile_transformer,
        print_every=print_every,
        loss_function=loss_function,
        return_metrics=False,
        device=device,
    )
    return train_part(nlp_model, dataloader, optimizer, scheduler, epochs=epochs, **fixed_options)


# aug_epochs = int(aug_epoch_proportion*epochs)

# train(data_loader_train_upsampled_masked, epochs=epochs);
# Train on the downsampled loader for the configured number of epochs, which is the same
# `epochs` value that the setup part of this cell defines.
train(data_loader_train_downsampled, epochs=epochs);


# train(dataLoader_train, epochs=1);

In [None]:
def lr_lambda(num_cycles, progress):
    """Cosine multiplier with restarts: 1.0 at the start of each cycle, decaying towards 0.

    Args:
        num_cycles: number of cosine cycles spread over the whole progress range.
        progress: training progress as a fraction.

    Returns:
        A float that is never below 0.0.
    """
    cycle_position = (float(num_cycles) * progress) % 1.0
    cosine_value = 0.5 * (1.0 + math.cos(math.pi * cycle_position))
    return max(0.0, cosine_value)


lr_lambda(2, 0.5)

In [None]:
# `n_quantiles` must be passed by keyword: it is keyword-only in current scikit-learn releases.
quantile_transformer_prod = preprocessing.QuantileTransformer(n_quantiles=1000)

# "Production" data = train + dev. Defined here because the only other definition in the
# notebook is commented out, which makes this cell fail with NameError on a fresh kernel.
prod_df = pd.concat([train_df, dev_df])

prod_df_without_sames = prod_df[prod_df.src != prod_df.mt]
# Compute the quartiles once instead of recomputing them inside the predicates on every call.
prod_q1 = prod_df_without_sames.scores.quantile(0.25)
prod_q3 = prod_df_without_sames.scores.quantile(0.75)
prod_df_upsampling = augment_dataset(prod_df_without_sames,
                                     lambda score: score < prod_q1,
                                     lambda score: score > prod_q3)

# The full prod loader is built first with fit=True; the upsampled loader reuses the same transformer.
dataLoader_prod = get_data_loader(prod_df, batch_size=32, preprocessor=quantile_transformer_prod, fit=True)
dataLoader_prod_upsampled = get_data_loader(prod_df_upsampling, batch_size=32, preprocessor=quantile_transformer_prod)
# dataLoader_prod_mask = get_data_loader_masked(prod_df_upsampling, batch_size=32, preprocessor=quantile_transformer_prod)
# dataLoader_prod_mask = get_data_loader_masked(prod_df_upsampling, batch_size=32, preprocessor=quantile_transformer_prod)

In [None]:
# Fresh model for the production run (replaces the previously trained `nlp_model`).
nlp_model = BERThoven(cls=True, dropout=False, concat_outputs=False)
time.sleep(0.1)
# NOTE(review): unlike the baseline check in the first training cell, no `preprocessor`
# is passed here — confirm this is intended.
check_accuracy(dataLoader_dev, nlp_model, device=device);

epochs = 1
warmup_proportion = 0.1

loss_function = lambda x, y: F.smooth_l1_loss(x * 10, y * 10)
print_every = 60

# Size the schedule from the loader that is actually trained on at the end of this cell
# (dataLoader_prod). Sizing it from a different loader makes the linear decay finish too
# early or too late relative to the real number of optimizer steps.
training_steps = 0
# training_steps += len(dataLoader_prod_mask)
# training_steps += len(dataLoader_prod_upsampled)
training_steps += len(dataLoader_prod) * epochs

warmup_steps = int(training_steps * warmup_proportion)

optimizer = AdamW(nlp_model.parameters(), lr=1e-5, eps=1e-8, correct_bias=False)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=training_steps)


# scheduler = get_constant_schedule_with_warmup(optimizer, num_warmup_steps=0)
# scheduler = torch.optim.lr_scheduler.StepLR(optimizer, steps_per_epoch, 0.3)
# scheduler = transformers.get_cosine_with_hard_restarts_schedule_with_warmup(optimizer, warmup_steps,training_steps,num_cycles=2)

# scheduler = transformers.get_cosine_schedule_with_warmup(optimizer, warmup_steps,training_steps*3/2)


def train(dataloader, epochs):
    """Run `train_part` on `dataloader` for `epochs` epochs using the notebook-level state.

    Same wrapper shape as the earlier `train`, except that the preprocessor handed to
    `train_part` is `quantile_transformer_prod`. All other inputs are read from
    module-level variables at call time.

    Returns:
        Whatever `train_part` returns (called with return_losses=True, return_metrics=False).
    """
    fixed_options = dict(
        val_loader=dataLoader_dev,
        val_every=3,
        return_losses=True,
        preprocessor=quantile_transformer_prod,
        print_every=print_every,
        loss_function=loss_function,
        return_metrics=False,
        device=device,
    )
    return train_part(nlp_model, dataloader, optimizer, scheduler, epochs=epochs, **fixed_options)


# aug_epochs = int(aug_epoch_proportion*epochs)

# Train on the full production loader for the configured number of epochs, which is the
# same `epochs` value that the setup part of this cell defines.
train(dataLoader_prod, epochs=epochs);
# train(dataLoader_prod_upsampled, epochs=1);
# train(dataLoader_prod_mask, epochs=1)

In [None]:
# Persist only the model weights (state_dict), not the whole module object.
# NOTE(review): the file name looks like hand-entered metric values — confirm it matches the run being saved.
torch.save(nlp_model.state_dict(), 'nlp_model_mae800_mse539_p183.pt')

In [None]:
def get_data_loader(dataframe, batch_size=32, test=False):
    """Wrap `dataframe` in a `BERTHovenDataset` and return a DataLoader over it.

    Batches are shuffled unless `test` is true.

    NOTE(review): this definition shadows the `get_data_loader` imported from BERT_utils
    at the top of the notebook and does not accept the `preprocessor` / `fit` keywords
    used by the earlier cells; re-running those cells after this one will fail.
    """
    dataset = BERT_utils.BERTHovenDataset(dataframe, test=test)
    shuffle_batches = not test
    return torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=shuffle_batches)


In [None]:
# Rebuild the test loader with the get_data_loader defined in this notebook, which takes no preprocessor.
# This replaces the dataLoader_test created in the data-loading cell.
dataLoader_test = get_data_loader(test_df, batch_size=32, test=True)

In [None]:
def writeScores(scores, fn="predictions.txt"):
    """Write one score per line to a text file.

    Args:
        scores: iterable of values; each is formatted with an f-string and followed by a newline.
        fn: output path. Defaults to "predictions.txt" in the current working directory.
            An existing file at that path is overwritten.
    """
    print("")
    with open(fn, 'w') as output_file:
        output_file.writelines(f"{x}\n" for x in scores)

In [None]:
# NOTE(review): predictions are post-processed with `quantile_transformer`, while the most
# recent training cell used `quantile_transformer_prod` — confirm which one matches `nlp_model`.
scores = get_test_labels(dataLoader_test,nlp_model,device,preprocessor=quantile_transformer)

# numpy arrays have no `.describe()`; flatten the predictions and wrap them in a pandas
# Series to get the summary statistics.
pd.Series(np.array(scores).ravel()).describe()

In [None]:
# Sanity check: the value the transformer maps back from the 0.5 quantile, shown next to the
# training medians. `numeric_only=True` restricts the median to numeric columns; without it,
# the text columns (`src`, `mt`) raise TypeError on pandas >= 2.0.
quantile_transformer.inverse_transform([[0.5]]), train_df.median(numeric_only=True)

### Try using only embeddings

In [None]:
class LinearModel(nn.Module):
    """Three-layer MLP that regresses one value from a pair of 768-dimensional vectors.

    The two inputs are concatenated along dim 1 (768 * 2 features), passed through two
    hidden layers of width 1000 with LeakyReLU activations, and projected to one output.
    """

    def __init__(self):
        super(LinearModel, self).__init__()
        self.lin_layer1 = nn.Linear(768*2, 1000)
        self.relu1 = nn.LeakyReLU()
        self.lin_layer2 = nn.Linear(1000, 1000)
        self.relu2 = nn.LeakyReLU()
        self.lin_layer3 = nn.Linear(1000, 1)

    def forward(self, x1, x2):
        """Return one prediction per row of the inputs, as a tensor of shape (batch,).

        Args:
            x1: tensor of shape (batch, 768).
            x2: tensor of shape (batch, 768).
        """
        inp = torch.cat((x1,x2),1)
        out1 = self.relu1(self.lin_layer1(inp))
        out2 = self.relu2(self.lin_layer2(out1))
        out3 = self.lin_layer3(out2)
        # Squeeze only the trailing feature dim: a bare squeeze() would also drop the batch
        # dim for a batch of size 1 and return a 0-dim tensor.
        return out3.squeeze(-1)

In [None]:
print_every=500
def train_linear_model(model, dataloader, dataloader_val, optimizer, scheduler, epochs=1,max_grad_norm=1.0):
    """Train `model` with MSE loss and evaluate it on `dataloader_val` after every epoch.

    Args:
        model: module called as ``model(x1, x2)``.
        dataloader: sized iterable yielding ``((x1, x2), y)`` training batches.
        dataloader_val: loader passed to `check_accuracy_linear` at the end of each epoch.
        optimizer: optimizer over the model's parameters.
        scheduler: learning-rate scheduler, stepped once per batch.
        epochs: number of passes over `dataloader`.
        max_grad_norm: gradient-norm clipping threshold.

    Progress is printed every `print_every` iterations (module-level setting), together
    with an exponentially smoothed running loss.
    """
    avg_loss = 1
    momentum = 0.01

    model = model.to(device=device)  # move the model parameters to CPU/GPU
    for e in range(epochs):
        print(f"Iterations per epoch:{len(dataloader)}")
        # Evaluation at the end of the previous epoch may have left the model in eval mode.
        model.train()
        for t, ((x1,x2),y) in enumerate(dataloader):
            # Inputs must live on the same device as the model, just like the targets.
            x1 = x1.to(device=device)
            x2 = x2.to(device=device)
            y = y.to(device=device, dtype=torch.float32)

            scores = model(x1,x2)

            loss = F.mse_loss(scores, y)

            optimizer.zero_grad()

            loss.backward()

            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            optimizer.step()

            scheduler.step()
            l=loss.item()

            # Exponential moving average of the batch loss.
            avg_loss = l*momentum + avg_loss*(1-momentum)

            if t % print_every == 0:
                print()
                print('Epoch: %d, Iteration %d, loss = %.4f, avg_loss = %.4f' % (e, t, l,avg_loss), end="")
            print(".",end="")
        print()
        print("Avg loss %.3f" % (avg_loss))
        print("Checking accuracy on dev:")
        check_accuracy_linear(dataloader_val, model)
        # print("Saving the model.")
        # torch.save(model.state_dict(), 'nlp_model.pt')

def check_accuracy_linear(loader, model, max_sample_size=None):
    """Print the mean absolute error and mean squared error of `model` over `loader`.

    Args:
        loader: iterable yielding ``((x1, x2), y)`` batches.
        model: module called as ``model(x1, x2)``; it is moved to `device` and put in eval mode.
        max_sample_size: if given, evaluation stops after the first batch that brings the
            number of evaluated samples to at least this value.
    """
    model = model.to(device=device)
    model.eval()  # set model to evaluation mode
    num_samples = 0
    abs_error = 0.0
    sqr_error = 0.0

    with torch.no_grad():
        for (x1, x2), y in loader:
            # Inputs must live on the same device as the model, just like the targets.
            x1 = x1.to(device=device)
            x2 = x2.to(device=device)
            y = y.to(device=device, dtype=torch.float32)
            scores = model(x1, x2)
            diff = scores - y
            abs_error += diff.abs().sum().item()
            sqr_error += (diff ** 2).sum().item()
            # numel() also works when the model returns a 0-dim tensor for a single sample.
            num_samples += scores.numel()
            # Compare against the requested cap (not against num_samples itself).
            if max_sample_size is not None and num_samples >= max_sample_size:
                break

    if num_samples == 0:
        # Nothing was evaluated; avoid dividing by zero.
        print('No samples to evaluate.')
        return

    mae = abs_error / num_samples
    mse = sqr_error / num_samples
    # Argument order matches the labels: MAE first, then MSE.
    print('Mean Absolute Error: %.3f, Mean Squared Error %.3f' % (mae, mse))

In [None]:
# Precompute sentence embeddings for the train and dev splits.
# NOTE(review): `bert_model` is not defined or imported anywhere in the visible notebook —
# confirm where it comes from, otherwise this cell raises NameError on a fresh kernel.
train_embeddings = get_sentence_embeddings(train_df,bert_model)
val_embeddings = get_sentence_embeddings(dev_df,bert_model)

In [None]:
# Hyperparameters for the embeddings-only experiment.
epochs = 10
warmup_proportion = 0.1

# The scheduler is stepped once per batch, so the schedule length is batches-per-epoch * epochs.
# NOTE(review): assumes len(train_embeddings) is the number of batches, not samples — confirm.
steps_per_epoch = len(train_embeddings)
training_steps = steps_per_epoch*epochs
warmup_steps = int(training_steps*warmup_proportion)

linear_model=LinearModel()

optimizer = AdamW(linear_model.parameters(), lr=1e-5)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=training_steps)

# Train on the precomputed embeddings; dev metrics are printed after every epoch.
train_linear_model(linear_model, train_embeddings,val_embeddings,optimizer,scheduler, epochs=epochs)