In [1]:
import pprint
import torch
import json
import os
import sys
import numpy as np

from torch import nn
from torch.nn import functional as F
from transformers import AutoModel, AutoTokenizer
from sklearn.decomposition import PCA

root_path = os.path.join(os.getcwd(), "..") # WARNING: might need to change
sys.path.append(root_path)

from src.models.conv_model import ConvModel

from src.data_loaders.blogposts import BlogDataset, BlogCollatorFn
from src.data_loaders.pan23 import PAN23Dataset, PAN23CollatorFn

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the experiment configuration. JSON is UTF-8 by specification, so the
# encoding is pinned instead of relying on the platform's locale default.
with open(os.path.join(root_path, "configs", "base-config.json"), encoding="utf-8") as f:
    config = json.load(f)
model_params = config["model_params"]
# Anchor both dataset directories at `root_path` so they resolve regardless of
# the directory the notebook is launched from.
config["task_dataset_root_dir"] = os.path.join(root_path, config["task_dataset_root_dir"])
config["pretrain_dataset_root_dir"] = os.path.join(root_path, config["pretrain_dataset_root_dir"])
config

{'model_params': {'conv_layers_params': [{'conv_params': {'in_channels': 32,
     'out_channels': 128,
     'kernel_size': 5,
     'padding': 'same'},
    'dim_feedforward': 128,
    'dropout_params': {'p': 0.1}}],
  'transformer_model': 'roberta-base',
  'projection_head_params': {'dropout_p': 0.1,
   'ff_dim': 256,
   'output_dim': 128}},
 'max_len': 512,
 'pretrain_params': {'batch_size': 64,
  'test_set_ratio': 0.1,
  'steps': 20000,
  'learning_rate': 0.0001,
  'weight_decay': 0.01,
  'unfrozen_layers': 2},
 'pan_train_params': {'batch_size': 8,
  'epochs': 50,
  'lr': 0.0001,
  'weight_decay': 0.01,
  'unfrozen_layers': 2},
 'prefix_file_name': 'conv_transformer_base',
 'out_dir': 'out',
 'task_dataset_root_dir': '/home/pablo/nlp-course/assignment/notebooks/../data/pan23/transformed',
 'pretrain_dataset_root_dir': '/home/pablo/nlp-course/assignment/notebooks/../data/blogposts',
 'device': 'cuda:2'}

In [3]:
# Use the configured accelerator when CUDA is usable, otherwise run on CPU.
# A missing "device" key falls back to the default CUDA device rather than
# silently producing `None`, which later `.to(device)` calls cannot target.
device = config.get("device", "cuda") if torch.cuda.is_available() else "cpu"
device

'cuda:2'

## Easy version

In this version, we assume that the model encodes a single text at a time, and that the resulting embeddings are then compared through a similarity metric. This single-text setup is required for contrastive pretraining, but not for the supervised part. However, since convolutional layers act locally on one text, they provide no comparison between the two texts. For the supervised task we would therefore need to add a component that compares the separate convolutional embeddings of both texts.

In [None]:
class ConvTransformer(nn.Module):
    """Encoder that concatenates a transformer's pooled output with a ConvModel output.

    The convolutional model's embedding table is initialised from a PCA
    projection of the transformer's pretrained word embeddings.

    Args:
        conv_layers_params: Convolutional layer settings, forwarded unchanged
            to ``ConvModel`` (a list of per-layer dicts in the base config).
        transformer_model: Checkpoint name or path passed to
            ``AutoModel.from_pretrained``.

    Attributes:
        transformer_model: The loaded pretrained transformer.
        conv_model: The ``ConvModel`` built over the same vocabulary.
        output_embedding_dim: Width of the pretrained word embeddings plus the
            ``out_channels`` of the last convolutional layer.
    """

    def __init__(self, conv_layers_params: list, transformer_model: str):
        super().__init__()
        self.transformer_model = AutoModel.from_pretrained(transformer_model)
        word_embeddings = self.transformer_model.embeddings.word_embeddings
        # `.cpu()` keeps the NumPy conversion valid wherever the weights live.
        pretrained_embeddings = word_embeddings.weight.detach().cpu().numpy()
        num_embeddings, transformed_embedding_dim = pretrained_embeddings.shape
        padding_idx = word_embeddings.padding_idx

        self.conv_model = ConvModel(num_embeddings, padding_idx, conv_layers_params)

        # initialize conv model embeddings with pretrained embeddings through PCA
        conv_embedding_dim = self.conv_model.conv_layers[0].conv.in_channels
        pca = PCA(n_components=conv_embedding_dim)
        conv_init_embedding = pca.fit_transform(pretrained_embeddings)
        # Only zero the padding row when one exists: indexing a NumPy array
        # with `None` selects (and would zero) the entire matrix.
        if padding_idx is not None:
            conv_init_embedding[padding_idx] = 0.

        # Keep the dtype and device of the existing parameter so the swapped-in
        # table stays compatible with the rest of the convolutional model.
        embedding_weight = self.conv_model.embeddings.weight
        embedding_weight.data = torch.tensor(
            conv_init_embedding,
            dtype=embedding_weight.dtype,
            device=embedding_weight.device,
        )

        # store embedding dimension
        self.output_embedding_dim = transformed_embedding_dim + self.conv_model.conv_layers[-1].conv.out_channels

    def forward(self, input_ids, attention_mask):
        """Encode a batch of token ids.

        Args:
            input_ids: Token ids, passed to both the transformer and the conv model.
            attention_mask: Attention mask, passed to the transformer only.

        Returns:
            The transformer's ``pooler_output`` concatenated with the conv
            model output along the last dimension.
        """
        # NOTE(review): for checkpoints that ship without pooler weights the
        # pooler is freshly initialised on load, so it needs training.
        x_transformed = self.transformer_model(input_ids, attention_mask=attention_mask).pooler_output
        x_conv = self.conv_model(input_ids)
        return torch.cat((x_transformed, x_conv), dim=-1)


In [None]:
# Build the combined encoder from the model section of the configuration.
conv_transformer = ConvTransformer(
    conv_layers_params=model_params["conv_layers_params"],
    transformer_model=model_params["transformer_model"],
)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Smoke test: push random token ids and a random 0/1 mask through the encoder
# and display the shape of the result.
input_ids = torch.randint(low=0, high=1000, size=(16, 128))
attention_mask = torch.randint(low=0, high=2, size=(16, 128))
conv_transformer(input_ids, attention_mask).shape

torch.Size([16, 896])

In [None]:
# Display the embedding width recorded by the encoder at construction time.
conv_transformer.output_embedding_dim

896

## Projection head

In [None]:
class _MultiInputSequential(nn.Sequential):
    """``nn.Sequential`` whose first stage may receive several arguments.

    The stock ``nn.Sequential.forward`` accepts a single input, which makes it
    unusable with an encoder called as ``model(input_ids, attention_mask)``.
    """

    def forward(self, *args, **kwargs):
        stages = iter(self)
        # Only the first stage sees the caller's arguments; every later stage
        # consumes the previous stage's output.
        outputs = next(stages)(*args, **kwargs)
        for stage in stages:
            outputs = stage(outputs)
        return outputs


def add_projection_head(model, input_dim, ff_dim, output_dim):
    """Stack a two-layer MLP projection head on top of ``model``.

    Args:
        model: Encoder module; it may take any number of positional or keyword
            arguments and must return a tensor whose last dimension is
            ``input_dim``.
        input_dim: Width of the encoder output.
        ff_dim: Hidden width of the projection head.
        output_dim: Width of the projected embedding.

    Returns:
        An ``nn.Sequential`` subclass (encoder, Linear, ReLU, Linear) that
        forwards all call arguments to ``model``.
    """
    return _MultiInputSequential(
        model,
        nn.Linear(input_dim, ff_dim),
        nn.ReLU(),
        nn.Linear(ff_dim, output_dim),
    )

## NTXent loss

In [None]:
def ntxent_loss(emb_1, emb_2, temperature):
    """Symmetric NT-Xent loss between two batches of paired embeddings.

    Row ``i`` of ``emb_1`` and row ``i`` of ``emb_2`` form the positive pair;
    every other row of the opposite batch acts as a negative.

    Args:
        emb_1: 2-D tensor of embeddings, one row per sample.
        emb_2: 2-D tensor of embeddings paired row-wise with ``emb_1``.
        temperature: Divisor applied to the cosine similarities.

    Returns:
        Scalar tensor: the average of the cross-entropy taken over rows and
        the cross-entropy taken over columns of the similarity matrix.
    """
    n_pairs, _ = emb_1.shape

    # Cosine similarity of every row of emb_1 against every row of emb_2.
    unit_1 = F.normalize(emb_1, dim=1)
    unit_2 = F.normalize(emb_2, dim=1)
    logits = torch.einsum("ax,bx->ab", unit_1, unit_2) / temperature

    # The matching pair sits on the diagonal of the similarity matrix.
    targets = torch.arange(n_pairs).to(emb_1.device)
    loss_1_to_2 = F.cross_entropy(logits, targets)
    loss_2_to_1 = F.cross_entropy(logits.T, targets)
    return 0.5 * loss_1_to_2 + 0.5 * loss_2_to_1


In [None]:
# Quick check of the loss on two batches of random embeddings.
emb_1 = torch.randn(16, 128)
emb_2 = torch.randn(16, 128)
temperature = 0.07
ntxent_loss(emb_1, emb_2, temperature)

## Training

In [None]:
# `model_with_projection_head` was never defined in this notebook, so this cell
# failed with a NameError on a fresh kernel. Build it here from the encoder and
# the projection-head sizes in the configuration.
projection_head_params = model_params["projection_head_params"]
model_with_projection_head = add_projection_head(
    conv_transformer,
    conv_transformer.output_embedding_dim,
    projection_head_params["ff_dim"],
    projection_head_params["output_dim"],
)
# Moving the wrapper also moves `conv_transformer` itself, because
# `nn.Module.to` relocates the parameters of all submodules in place.
model = model_with_projection_head.to(device)

In [None]:
# Load the tokenizer for the same checkpoint name the encoder's transformer uses.
tokenizer = AutoTokenizer.from_pretrained(model_params["transformer_model"])

In [None]:
# Hyperparameters for contrastive pretraining, taken from the loaded config.
# NOTE(review): the config displayed after loading lists 'steps' and
# 'learning_rate'; a saved output showing 'epochs' and 'lr' is from an older
# config — confirm and re-run.
pretrain_params = config["pretrain_params"]
pretrain_params

{'batch_size': 64,
 'test_set_ratio': 0.1,
 'epochs': 20,
 'lr': 0.0001,
 'weight_decay': 0.01,
 'unfrozen_layers': 2}

Let's freeze all transformer layers, and then unfreeze the ones we want to update.

In [None]:
# Freeze the whole transformer first.
transformer = conv_transformer.transformer_model
transformer.requires_grad_(False)

# Unfreeze the last `unfrozen_layers` encoder layers. Clamping at zero keeps
# the slice start non-negative, so asking for more layers than exist unfreezes
# all of them instead of wrapping around to a negative index.
layers = transformer.encoder.layer
frozen_layers = max(0, len(layers) - pretrain_params["unfrozen_layers"])
for layer in layers[frozen_layers:]:
    layer.requires_grad_(True)

# The encoder's forward pass reads `pooler_output`, and the checkpoint-loading
# warning reports the pooler weights as newly initialised, so the pooler has
# to stay trainable.
pooler = getattr(transformer, "pooler", None)
if pooler is not None:
    pooler.requires_grad_(True)

In [None]:
# The loaded config names the learning rate 'learning_rate'; the older 'lr'
# spelling is still accepted so either config version works.
lr_key = "learning_rate" if "learning_rate" in pretrain_params else "lr"
optimizer = torch.optim.AdamW(
    conv_transformer.parameters(),
    lr=pretrain_params[lr_key],
    weight_decay=pretrain_params["weight_decay"],
)

In [None]:
dataset = BlogDataset(config["pretrain_dataset_root_dir"])
# Hold out the fraction given by the config instead of a hard-coded 80/20 split.
test_size = int(pretrain_params["test_set_ratio"] * len(dataset))
train_dataset, test_dataset = torch.utils.data.random_split(
    dataset,
    [len(dataset) - test_size, test_size],
    # The split permutation is drawn on the CPU, so the generator must be a CPU
    # generator (a CUDA one is rejected). The fixed seed makes the split
    # reproducible across kernel restarts.
    generator=torch.Generator().manual_seed(0),
)

In [None]:
# One loader per split, each with its own collator instance; only the training
# split is shuffled.
train_loader, test_loader = (
    torch.utils.data.DataLoader(
        split,
        batch_size=pretrain_params["batch_size"],
        shuffle=is_train_split,
        collate_fn=BlogCollatorFn(tokenizer, config["max_len"]),
    )
    for split, is_train_split in ((train_dataset, True), (test_dataset, False))
)

In [None]:
def train(temperature=0.07):
    """Run one pass over ``train_loader`` and update ``conv_transformer``.

    Args:
        temperature: Temperature passed to ``ntxent_loss``.

    Returns:
        Mean of the per-batch training losses, or NaN when the loader yields
        no batches.
    """
    # Pretrained transformers are returned in evaluation mode by
    # `from_pretrained`; switch the whole encoder to training mode explicitly.
    conv_transformer.train()

    losses = []
    for batch in train_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        emb = conv_transformer(input_ids, attention_mask)
        # Even and odd rows are treated as the two sides of each pair.
        # NOTE(review): assumes the collator interleaves paired texts — confirm.
        emb_1, emb_2 = emb[::2], emb[1::2]
        loss = ntxent_loss(emb_1, emb_2, temperature)

        # Clear gradients before the backward pass so nothing left over from
        # earlier work leaks into this update.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        losses.append(loss.item())

    if not losses:
        return float("nan")
    return np.mean(losses)

In [None]:
def test(temperature=0.07):
    """Compute the mean contrastive loss of ``conv_transformer`` on ``test_loader``.

    The model is evaluated in eval mode (dropout disabled) without gradient
    tracking; every submodule's previous train/eval mode is restored afterwards.

    Args:
        temperature: Temperature passed to ``ntxent_loss``.

    Returns:
        Mean of the per-batch losses, or NaN when the loader yields no batches.
    """
    # Remember each submodule's mode so evaluation leaves the model as found.
    previous_modes = [(module, module.training) for module in conv_transformer.modules()]
    conv_transformer.eval()

    losses = []
    try:
        with torch.no_grad():
            for batch in test_loader:
                input_ids = batch["input_ids"].to(device)
                attention_mask = batch["attention_mask"].to(device)

                emb = conv_transformer(input_ids, attention_mask)
                # Even and odd rows are treated as the two sides of each pair.
                emb_1, emb_2 = emb[::2], emb[1::2]
                loss = ntxent_loss(emb_1, emb_2, temperature)

                losses.append(loss.item())
    finally:
        for module, was_training in previous_modes:
            module.training = was_training

    if not losses:
        return float("nan")
    return np.mean(losses)

In [None]:
# Alternate one training pass and one evaluation pass per epoch and report both losses.
for epoch in range(5):
    train_loss, test_loss = train(), test()
    print(f"Epoch: {epoch}, Train loss: {train_loss}, Test loss: {test_loss}")

## With ContrastivePretrainer

In [4]:
from src.models.conv_transformer_model import ConvTransformer
from src.heads.projection_head import ModelWithProjectionHead
from src.trainers.contrastive_pretrainer import ContrastivePretrainer

In [5]:
# Rebuild the encoder, this time from the packaged `ConvTransformer` imported
# from `src`, which replaces the class prototyped earlier in this notebook.
model = ConvTransformer(model_params["conv_layers_params"], model_params["transformer_model"])
# Wrap the encoder with the project's projection head; the head's settings come
# straight from the `projection_head_params` section of the config.
model_with_proj_head = ModelWithProjectionHead(
    model,
    model.output_embedding_dim,
    **model_params["projection_head_params"],
)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# Load the tokenizer for the same checkpoint name the encoder's transformer uses.
tokenizer = AutoTokenizer.from_pretrained(model_params["transformer_model"])

In [7]:
# Pretraining hyperparameters from the config, extended with the runtime-only
# entries (collator, checkpoint file name, device) handed to the pretrainer.
contrastive_pretrainer_config = dict(
    config["pretrain_params"],
    collator_fn=BlogCollatorFn(tokenizer, config["max_len"]),
    checkpoint_file="conv_transformer_pretrained.pt",
    device=device,
)
contrastive_pretrainer_config

{'batch_size': 64,
 'test_set_ratio': 0.1,
 'steps': 20000,
 'learning_rate': 0.0001,
 'weight_decay': 0.01,
 'unfrozen_layers': 2,
 'collator_fn': <src.data_loaders.blogposts.BlogCollatorFn at 0x7fd892cf4550>,
 'checkpoint_file': 'conv_transformer_pretrained.pt',
 'device': 'cuda:2'}

In [8]:
# Freeze the whole transformer first.
transformer = model.transformer_model
transformer.requires_grad_(False)

# Unfreeze the last `unfrozen_layers` encoder layers. Clamping at zero keeps
# the slice start non-negative, so asking for more layers than exist unfreezes
# all of them instead of wrapping around to a negative index.
layers = transformer.encoder.layer
frozen_layers = max(0, len(layers) - contrastive_pretrainer_config["unfrozen_layers"])
for layer in layers[frozen_layers:]:
    layer.requires_grad_(True)

# The checkpoint-loading warning reports the pooler weights as newly
# initialised, so keep the pooler trainable if the model has one.
# NOTE(review): assumes the imported ConvTransformer consumes `pooler_output`
# like the prototype in this notebook — confirm.
pooler = getattr(transformer, "pooler", None)
if pooler is not None:
    pooler.requires_grad_(True)

In [9]:
dataset = BlogDataset(config["pretrain_dataset_root_dir"])
# Hold out the fraction given by the config instead of a hard-coded 80/20 split.
test_size = int(config["pretrain_params"]["test_set_ratio"] * len(dataset))
train_dataset, test_dataset = torch.utils.data.random_split(
    dataset,
    [len(dataset) - test_size, test_size],
    # A seeded CPU generator makes the split reproducible across kernel restarts.
    generator=torch.Generator().manual_seed(0),
)

In [10]:
# Hand the config, the wrapped model and both dataset splits to the project's pretrainer.
pretrainer = ContrastivePretrainer(contrastive_pretrainer_config, model_with_proj_head, train_dataset, test_dataset)

In [11]:
# Start contrastive pretraining. The saved output of this cell ends in a
# KeyboardInterrupt, i.e. the recorded run was stopped manually.
pretrainer.run()

TRAIN - iter_num=100, mean_training_loss=3.3908960700035093, mean_eval_loss=2.5963680045358064, (1.22s)
Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "/home/pablo/.micromamba/envs/master-nlp/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3548, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/tmp/ipykernel_721625/1797640451.py", line 1, in <module>
    pretrainer.run()
  File "/home/pablo/nlp-course/assignment/notebooks/../src/trainers/contrastive_pretrainer.py", line 106, in run
    iter_time = tnow
  File "/home/pablo/nlp-course/assignment/notebooks/../src/trainers/contrastive_pretrainer.py", line 148, in __mean_eval_loss
  File "/home/pablo/nlp-course/assignment/notebooks/../src/losses/ntxent.py", line 11, in ntxent_loss
    labels = torch.arange(batch_size).to(device)
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/pablo/.micromamba/envs/master-nlp/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 2142, in showtraceback
    stb = self.