In [None]:
import torch
from pytorch_metric_learning import losses
import data_handler
from siamese_network import SiameseNetwork, train
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader
from custom_losses import ContrastiveLoss
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [None]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules (e.g. data_handler, siamese_network) are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# Select the compute device. GPU 3 is kept as the preferred device (the value
# that used to be hard-coded here), but the notebook now also runs on machines
# with fewer GPUs (falls back to the first GPU) or with no GPU at all (CPU).
if torch.cuda.is_available():
    device = torch.device('cuda', 3 if torch.cuda.device_count() > 3 else 0)
else:
    device = torch.device('cpu')

# Data Management

In [None]:
# Load the training file ("dataset/train.csv", fields separated by '#') through
# the project's data_handler. load() returns two values; the second one is not
# needed in this notebook and is discarded.
df_data, _ = data_handler.load(path="dataset/", filename_train="train.csv", sep_char='#')

In [None]:
# Show the summary (.info()) of the loaded data.
df_data.info()

In [None]:
# Count how many rows carry each value of the 'label' column.
df_data['label'].value_counts()

In [None]:
# Bar chart of the per-label row counts in the full dataset.
df_data['label'].value_counts().plot.bar(title='Labels Proportions in Dataset')

In [None]:
# Split the labelled data into a training frame and a validation frame.
# perc_split=0.8 requests 80% training / 20% validation; per the original
# author's note the split is stratified (done inside
# data_handler.split_train_data, which is not visible here).
df_train, df_val = data_handler.split_train_data(df_data, perc_split=0.8)

In [None]:
# Show the summary (.info()) of the training split.
df_train.info()

In [None]:
# Show the summary (.info()) of the validation split.
df_val.info()

In [None]:
# Bar chart of the per-label row counts in the training split.
df_train['label'].value_counts().plot.bar(title='Labels Proportions in Training set')

In [None]:
# Bar chart of the per-label row counts in the validation split.
df_val['label'].value_counts().plot.bar(title='Labels Proportions in Validation set')

In [None]:
# Concatenate topics and key points, as stated in the paper. The same
# data_handler.concatenate_topics transformation is applied to both splits;
# each frame is rebound to the helper's return value.
df_train = data_handler.concatenate_topics(df_train)
df_val = data_handler.concatenate_topics(df_val)

In [None]:
# Show the summaries of both splits after the topic concatenation, separated
# by a dashed line.
df_train.info()
print('--------')
df_val.info()

In [None]:
# Work on a fixed 500-row window of the training split (positional rows
# 12850..13349) to keep experiments fast.
subset_start, subset_stop = 12850, 13350

# Guard against re-running the cell: once df_train has been reduced to the
# window, slicing it again would produce an empty frame and reset_index()
# would fail because an 'index' column already exists.
if len(df_train) > subset_stop - subset_start:
    df_train = df_train.iloc[subset_start:subset_stop]
    # reset_index() without drop=True keeps the previous row labels in a new
    # 'index' column, exactly as before.
    # NOTE(review): if nothing downstream reads that column, drop=True would
    # be cleaner — confirm against data_handler.tokenize_df.
    df_train = df_train.reset_index()

In [None]:
# Number of whitespace-separated words per argument / key point, for both
# splits. Every Series is re-indexed by position before the frame is built:
# assigning the columns one by one to an empty DataFrame aligns them on the
# index of the first column (the training rows), which silently drops or
# NaN-fills validation rows whose labels do not match. Columns of different
# length are NaN-padded at the end; the box plot ignores those NaNs.
n_words = pd.DataFrame({
    column_name: texts.str.split().apply(len).reset_index(drop=True)
    for column_name, texts in {
        'words_per_arg_train': df_train['args'],
        'words_per_arg_val': df_val['args'],
        'words_per_kp_train': df_train['key_points'],
        'words_per_kp_val': df_val['key_points'],
    }.items()
})

Various considerations... (TODO: write up the observations.)

In [None]:
# Box plot (10x5 inches) of the word-count columns collected in n_words.
n_words.plot.box(figsize=(10, 5))

In [None]:
# Load the tokenizer that matches our model (bert-base-uncased); input text is
# lower-cased before tokenization (do_lower_case=True).
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Tokenize the training frame with the project's helper.
# NOTE(review): only df_train is tokenized in the visible part of the
# notebook; df_val is not.
tokenized = data_handler.tokenize_df(df_train, tokenizer)

In [None]:
# Report the tokenizer's vocabulary size and maximum context length. The
# values are interpolated into a single f-string (the previous literal carried
# an `f` prefix without any placeholder); the printed text is unchanged.
print(f'Vocabulary size of tokenizer: {tokenizer.vocab_size} \nContext size: {tokenizer.model_max_length}')

In [None]:
# --- Hyperparameters -------------------------------------------------------
# Values suggested for BERT fine-tuning:
#   batch size:            16, 32
#   learning rate (Adam):  5e-5, 3e-5, 2e-5
#   number of epochs:      2, 3, 4 (the BERT authors recommend between 2 and 4)
batch_size = 8
epochs = 1

# --- Model -----------------------------------------------------------------
# Siamese network built around a pretrained bert-base-uncased encoder.
model = SiameseNetwork(
    bert_type=BertModel.from_pretrained('bert-base-uncased'),
)

# --- Data ------------------------------------------------------------------
# Shuffled mini-batches over the tokenized training samples.
train_loader = DataLoader(
    tokenized,
    batch_size=batch_size,
    shuffle=True,
    pin_memory=True,
)

# --- Loss ------------------------------------------------------------------
# Contrastive loss from pytorch_metric_learning.
train_loss = losses.ContrastiveLoss()

# --- Optimizer -------------------------------------------------------------
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=2e-5,   # learning rate
    eps=1e-8,  # Adam epsilon
)

# --- Learning-rate schedule ------------------------------------------------
# One scheduler step is taken per batch, so the total number of steps is
# (number of batches) x (number of epochs) -- not the number of samples.
total_steps = epochs * len(train_loader)

# Linear decay with no warm-up phase.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

In [None]:
def paper_contrastive(cosine, label, eps=1e-7):
    '''
    Element-wise loss as quoted from the paper:

        -y * log(^y) + (1 - y) * log(1 - ^y)

    where ^y is the cosine similarity of the embeddings,
    and y reflects whether a pair matches (1) or not (0).

    Args:
        cosine: tensor of cosine similarities.
        label: tensor of 0/1 labels, broadcastable against `cosine`.
        eps: similarities are clamped to [eps, 1 - eps] before the logarithms
            are taken, so values <= 0 or >= 1 give a finite loss.

    Returns:
        A float64 tensor holding the loss of every pair (no reduction).

    NOTE(review): the standard binary cross-entropy negates BOTH terms; the
    sign of the second term here follows the formula as quoted above —
    confirm against the paper before training with this loss.
    '''
    # Clamp instead of patching NaNs afterwards: log() of a negative cosine is
    # NaN and log(0) is -inf, and nan_to_num would turn those into huge
    # magnitudes that dominate the loss.
    cosine = cosine.double().clamp(min=eps, max=1 - eps)
    label = label.double()

    log_match = torch.log(cosine)
    log_mismatch = torch.log(1 - cosine)

    return -label * log_match + (1 - label) * log_mismatch

In [None]:
# Move the model's parameters and buffers to the selected device. The trailing
# semicolon keeps the notebook from echoing the (very long) module repr that
# .to() returns; it replaces the stray `5+2` expression that was used for the
# same purpose.
model.to(device);

In [None]:
# Fine-tune the siamese network by regressing the cosine similarity of the two
# pooled outputs onto the 0/1 match label with a mean-squared-error loss.
loss_obj = torch.nn.MSELoss()
loss_obj.to(device)

# Stateless module: build it once instead of once per batch.
cos = torch.nn.CosineSimilarity()

model.train()

# Iterate over `epochs` so the number of optimizer/scheduler steps matches the
# total_steps = len(train_loader) * epochs the scheduler was created with.
for epoch in range(1, epochs + 1):
    for batch_idx, encodings in enumerate(train_loader):
        # Each batch holds the tokenizer outputs for the arguments ('arg') and
        # the key points ('kp') plus the labels; move all of them to the device.
        args = {k: v.to(device) for k, v in encodings['arg'].items()}
        kps = {k: v.to(device) for k, v in encodings['kp'].items()}
        labels = encodings['label'].to(device)

        optimizer.zero_grad()
        output1, output2 = model(args, kps)

        # Average over dim 1 of each output.
        # NOTE(review): if dim 1 is the token axis, padding positions are
        # included in this mean — confirm whether an attention-mask-weighted
        # mean is needed.
        output1 = torch.mean(output1, 1)
        output2 = torch.mean(output2, 1)

        cosine_sim = cos(output1, output2)
        loss = loss_obj(cosine_sim.float(), labels.float())
        loss.backward()

        # Clip the norm of the gradients to 1.0 to help prevent the
        # "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Apply the parameter update, then advance the learning-rate schedule.
        optimizer.step()
        scheduler.step()

        # Log the scalar loss value rather than the tensor repr.
        print(f'Train Epoch: {epoch} batch: {batch_idx} loss: {loss.item()}')

In [None]:
# Loader used by the evaluation cell: default batch size (one sample per
# batch) and a fixed order, so the printed results are reproducible.
# NOTE(review): `tokenized` was built from df_train, so this iterates over the
# training samples, not over held-out data.
test = DataLoader(tokenized, shuffle=False)

In [None]:
# Evaluate the model on the `test` loader: print the cosine similarity next to
# the label and the per-sample squared error for every batch.

# Switch to inference behaviour (e.g. dropout disabled) for the evaluation and
# restore the previous mode afterwards so later training cells are unaffected.
was_training = model.training
model.eval()

# Stateless module: build it once instead of once per batch.
cos = torch.nn.CosineSimilarity()

try:
    with torch.no_grad():
        for batch_idx, encodings in enumerate(test):
            # Move the argument / key-point encodings and labels to the device.
            args = {k: v.to(device) for k, v in encodings['arg'].items()}
            kps = {k: v.to(device) for k, v in encodings['kp'].items()}
            labels = encodings['label'].to(device)

            output1, output2 = model(args, kps)

            # Average over dim 1 of each output, as in the training loop.
            output1 = torch.mean(output1, 1)
            output2 = torch.mean(output2, 1)

            cosine_sim = cos(output1, output2)
            print(f'cosine {cosine_sim} == {labels}')

            # Per-sample squared error, shaped (batch, 1) on the CPU like the
            # buffer that used to be filled element by element.
            loss = torch.nn.functional.mse_loss(
                cosine_sim.float(), labels.float(), reduction='none'
            ).unsqueeze(1).cpu()
            print(f'loss: {loss}')
finally:
    model.train(was_training)

In [None]:
# Run the project's train() helper once per epoch (epochs are numbered from 1).
# NOTE(review): the second positional argument is None — presumably the device
# slot; confirm against train()'s signature in siamese_network.
# NOTE(review): `ContrastiveLoss` resolves to whatever the name is bound to
# when this cell runs: the object imported from custom_losses on a fresh
# kernel, or the function defined in a later cell once that cell has been
# executed. It is passed as-is, without being called here.
for epoch in range(1, epochs + 1):
    train(model, None, train_loader, ContrastiveLoss, optimizer, epoch, scheduler)
    #test(model, device, test_loader)


In [None]:
def ContrastiveLoss(output1, output2, labels, margin=0.1):
    """Sum the pairwise contrastive loss over a batch.

    Args:
        output1: batch of embeddings for the first branch, indexed along dim 0.
        output2: batch of embeddings for the second branch, same batch size.
        labels: per-pair labels, indexed along dim 0.
        margin: value forwarded as the last argument of
            compute_contrastive_loss (previously hard-coded to 0.1).

    Returns:
        The sum of compute_contrastive_loss over all pairs; a zero tensor on
        output1's device when the batch is empty.

    NOTE(review): this function reuses the name of the `ContrastiveLoss`
    imported from custom_losses and shadows it once this cell has run.
    NOTE(review): compute_contrastive_loss is not defined in the visible part
    of the notebook — it must be in scope before this function is called.
    """
    # Start the accumulator on the same device as the embeddings so it can be
    # added to per-pair losses computed on the GPU.
    loss = torch.zeros((), device=output1.device)

    for i in range(output1.size(0)):
        # reshape(1) turns the 0-dim label into a 1-element tensor (replaces
        # the deprecated Tensor.resize). Accumulate out of place so autograd
        # never sees an in-place write on the accumulator.
        loss = loss + compute_contrastive_loss(
            output1[i], output2[i], labels[i].reshape(1), margin
        )

    return loss