In [1]:
import torch
from pytorch_metric_learning import losses
import data_handler
from siamese_network import SiameseNetwork, train
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader
from custom_losses import ContrastiveLoss

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
torch.zeros(1).cuda()
#print(f"torch version: {torch.__version__}")

DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

print(f"torch cuda available: {torch.cuda.is_available()}")

torch cuda available: True


In [4]:
df_data, _ = data_handler.load(path="dataset/", filename_train="train.csv", sep_char='#')

In [5]:
df_train, df_val = data_handler.split_train_data(df_data, perc_split=0.8)

zero_train:  13100
one_train:  3408
zero_val:  3275
one_val:  852


In [6]:
df_train = data_handler.concatenate_topics(df_train)
df_val = data_handler.concatenate_topics(df_val)

In [7]:
tokenized = data_handler.tokenize_df(df_train[:100], BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True))

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [8]:
model = SiameseNetwork(bert_type=BertModel.from_pretrained('bert-base-uncased'))

train_loader = DataLoader(tokenized, shuffle=False, batch_size=32)

#train_loss = ContrastiveLoss()
train_loss = losses.ContrastiveLoss()

optimizer = torch.optim.AdamW(model.parameters(),
                  lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
                )

# Batch size: 16, 32
# Learning rate (Adam): 5e-5, 3e-5, 2e-5
# Number of epochs: 2, 3, 4

# The BERT authors recommend between 2 and 4.
epochs = 1

# Total number of training steps is [number of batches] x [number of epochs]. 
# (Note that this is not the same as the number of training samples).
total_steps = len(train_loader) * epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
for epoch in range(1, epochs + 1):
    encoding = train(model, None, train_loader, ContrastiveLoss, optimizer, epoch, scheduler)
    #test(model, device, test_loader)


Train Epoch: 1 batch: 0 / 100 loss: -16.554628372192383


In [None]:
loss_func = ContrastiveLoss

In [141]:
from torch import nn
import torch.nn.functional

class ContrastiveLoss(nn.Module):
    """
    Contrastive loss
    Takes embeddings of two samples and a target label == 1 if samples are from the same class and label == 0 otherwise
    """

    def __init__(self, margin):
        super(ContrastiveLoss, self).__init__()
        self.margin = margin
        self.eps = 1e-9

    def forward(self, output1, output2, target, size_average=True):
        distances = (output2 - output1).pow(2).sum(1)  # squared distances
        losses = 0.5 * (torch.matmul(labels.float(), d) +
                        torch.matmul((1 + -1 * target).float(), torch.nn.functional.relu(self.margin - (distances + self.eps).sqrt()).pow(2)))
        return losses.mean() if size_average else losses.sum()

In [142]:
contrastive = ContrastiveLoss(margin=0.1)

In [143]:
l = contrastive(output1, output2, labels)
l

tensor(0., grad_fn=<MeanBackward0>)

In [179]:
def compute_contrastive_loss(left_feature, right_feature, label, margin):

    """
    Compute the contrastive loss as in


    L = 0.5 * Y * D^2 + 0.5 * (Y-1) * {max(0, margin - D)}^2

    **Parameters**
     left_feature: First element of the pair
     right_feature: Second element of the pair
     label: Label of the pair (0 or 1)
     margin: Contrastive margin

    **Returns**
     Return the loss operation

    """

    label = label.float()
    one = 1.0

    d = compute_euclidean_distance(left_feature, right_feature)
    d_sqrt = torch.sqrt(compute_euclidean_distance(left_feature, right_feature))
    first_part = torch.matmul(one-label, d)# (Y-1)*(d)

    max_margin = torch.maximum(margin-d_sqrt, torch.tensor(0))
    
    max_part = torch.square(max_margin)
    second_part = torch.matmul(label, max_part)  # (Y) * max(margin - d, 0)

    loss = 0.5 * torch.mean(first_part + second_part)

    return loss

def compute_euclidean_distance(x, y):
    """
    Computes the euclidean distance between two tensorflow variables
    """

    d = torch.sum(torch.square(torch.sub(x, y)),1)
    return d

In [180]:
compute_contrastive_loss(output1, output2, labels, 0.1)

tensor(963.6011, grad_fn=<MulBackward0>)

In [274]:
def compute_contrastive_loss(left_feature, right_feature, label, margin):

    """
    Compute the contrastive loss as in


    L = 0.5 * Y * D^2 + 0.5 * (Y-1) * {max(0, margin - D)}^2

    **Parameters**
     left_feature: First element of the pair
     right_feature: Second element of the pair
     label: Label of the pair (0 or 1)
     margin: Contrastive margin

    **Returns**
     Return the loss operation

    """
    
    # -y * log(sim) + (1-y)*log(1-sim)

    label = label.float()
    
    cosine = torch.nn.CosineSimilarity()
    
    sim = torch.mean(torch.square(cosine(left_feature, right_feature))).resize(1)
    one = 1.0
   
    loss = torch.matmul(-label, torch.log(sim)) + torch.matmul((one-label).double(), torch.log(one-sim).double())
    
    return loss

In [265]:
l = compute_contrastive_loss(output1[1], output2[1], labels[1].resize(1), 0.1)

torch.Size([1])
torch.Size([1])


In [267]:
l2 = l

In [268]:
l + l2

tensor(-0.9170, dtype=torch.float64, grad_fn=<AddBackward0>)

In [246]:
l = compute_contrastive_loss(output1, output2, labels, 0.1)

torch.Size([])
torch.Size([32])


RuntimeError: both arguments to matmul need to be at least 1D, but they are 1D and 0D

In [243]:
l.size()

torch.Size([768])

In [231]:
labels[1].size()

torch.Size([])

In [238]:
labels[1].resize(1).float().size()

torch.Size([1])

In [272]:
def ContrastiveLoss(output1, output2, labels):
    
    loss = torch.tensor(0.0)
    
    for i in range(output1.size(0)):
    
        loss += compute_contrastive_loss(output1[i], output2[i], labels[i].resize(1), 0.1)
    
    return loss

In [271]:
output1.size(0)

32

In [275]:
ContrastiveLoss(output1, output2, labels)

tensor(-16.8239, grad_fn=<AddBackward0>)