In [1]:
import torch
import sys
import json
import os

from transformers import AutoTokenizer

# Project root used by the later cells: the parent of the current working directory,
# resolved to a normalized absolute path (no dangling ".." component).
# WARNING: might need to change if the notebook is started from another directory.
root_path = os.path.abspath(os.path.join(os.getcwd(), ".."))
src_path = os.path.join(root_path, "src")
# Guard the append so that re-running this cell does not pile up duplicate entries.
if src_path not in sys.path:
    sys.path.append(src_path)

from models.conv_transformer_model import ConvTransformer
from heads.classification_head import ModelWithClassificationHead
from trainers.classification_trainer import ClassificationTrainer
from data_loaders.pan23 import PAN23Dataset, PAN23CollatorFn


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the base configuration shipped under <root>/configs.
with open(os.path.join(root_path, "configs", "base-config.json")) as f:
    config = json.load(f)

model_params = config["model_params"]

# The dataset directories stored in the config are joined onto the project root.
config.update(
    {
        dir_key: os.path.join(root_path, config[dir_key])
        for dir_key in ("task_dataset_root_dir", "pretrain_dataset_root_dir")
    }
)
config

{'model_params': {'conv_layers_params': [{'conv_params': {'in_channels': 32,
     'out_channels': 128,
     'kernel_size': 5,
     'padding': 'same'},
    'dim_feedforward': 128,
    'dropout_params': {'p': 0.1}}],
  'transformer_model': 'roberta-base',
  'projection_head_params': {'dropout_p': 0.1,
   'ff_dim': 256,
   'output_dim': 128},
  'classification_head_params': {'dropout_p': 0.1, 'ff_dim': 256}},
 'max_len': 512,
 'pretrain_params': {'batch_size': 64,
  'test_set_ratio': 0.1,
  'steps': 20000,
  'learning_rate': 0.0001,
  'unfrozen_layers': 2},
 'pan_train_params': {'batch_size': 16,
  'steps': 10000,
  'lr': 0.0001,
  'unfrozen_layers': 2},
 'prefix_file_name': 'conv_transformer_base',
 'out_dir': 'out',
 'task_dataset_root_dir': '/home/pablo/nlp-course/assignment/notebooks/../data/pan23/transformed',
 'pretrain_dataset_root_dir': '/home/pablo/nlp-course/assignment/notebooks/../data/blogposts',
 'device': 'cuda:2'}

In [3]:
# Use the device named in the config only when CUDA is usable; otherwise fall back to CPU.
if torch.cuda.is_available():
    device = config.get("device")
else:
    device = "cpu"
device

'cuda:2'

In [4]:
# Build the ConvTransformer from the conv-layer settings and the transformer
# checkpoint name ("roberta-base" in the loaded config).
model = ConvTransformer(model_params["conv_layers_params"], model_params["transformer_model"])

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
# Wrap the model with a classification head: the head's input size is the model's
# output embedding dimension, the remaining keyword arguments come from the config.
model_with_classification_head = ModelWithClassificationHead(
    model=model, 
    input_dim=model.output_embedding_dim,
    **model_params["classification_head_params"]
)

In [6]:
# One train and one validation dataset per PAN23 task (tasks 1 to 3); for each
# task the train split is loaded first, then the validation split.
train_datasets, test_datasets = [], []

for task in (1, 2, 3):
    for split, bucket in (("train", train_datasets), ("validation", test_datasets)):
        split_dir = os.path.join(config["task_dataset_root_dir"], f"pan23-task{task}-{split}")
        bucket.append(PAN23Dataset(split_dir))

In [7]:
# Merge the per-task training sets into a single dataset.
train_dataset = torch.utils.data.ConcatDataset(train_datasets)

In [8]:
# Size of the merged training set, followed by the size of each validation set.
len(train_dataset), list(map(len, test_datasets))

(60233, [2828, 7042, 4112])

In [9]:
# Tokenizer for the same transformer checkpoint that the model was built from.
tokenizer = AutoTokenizer.from_pretrained(model_params["transformer_model"])

In [10]:
# Start from a copy of the PAN training hyper-parameters, then add the
# notebook-specific entries on top.
trainer_config = dict(config["pan_train_params"])
trainer_config.update(
    collator_fn=PAN23CollatorFn(tokenizer, config["max_len"]),
    checkpoint_file="conv_transformer_pretrained.pt",
    device=device,
)
trainer_config

{'batch_size': 16,
 'steps': 10000,
 'lr': 0.0001,
 'unfrozen_layers': 2,
 'collator_fn': <data_loaders.pan23.PAN23CollatorFn at 0x7f246ce90340>,
 'checkpoint_file': 'conv_transformer_pretrained.pt',
 'device': 'cuda:2'}

In [11]:
def freeze_layers(transformer_model, num_unfrozen_layers):
    """Freeze ``transformer_model`` except for its last encoder layers.

    Every parameter of the model is first marked as non-trainable; the
    parameters of the final ``num_unfrozen_layers`` entries of
    ``transformer_model.encoder.layer`` are then re-enabled.

    Args:
        transformer_model: Object exposing ``parameters()`` and a sliceable
            ``encoder.layer`` sequence whose items also expose ``parameters()``.
        num_unfrozen_layers: How many trailing encoder layers stay trainable.
            Values above the number of layers unfreeze all of them; zero or
            negative values leave every layer frozen.
    """
    for param in transformer_model.parameters():
        param.requires_grad = False

    layers = transformer_model.encoder.layer
    # Clamp the split point at 0: a negative start would otherwise be read as an
    # offset from the end and leave the first layers frozen by mistake.
    first_unfrozen = max(len(layers) - num_unfrozen_layers, 0)
    for layer in layers[first_unfrozen:]:
        for param in layer.parameters():
            param.requires_grad = True

In [12]:
# Freeze the transformer inside the model, keeping only its last
# `unfrozen_layers` encoder layers trainable.
freeze_layers(model.transformer_model, trainer_config["unfrozen_layers"])

In [13]:
# The trainer receives the run settings, the model with its classification head,
# the merged training set and the list of per-task validation sets.
trainer = ClassificationTrainer(trainer_config, model_with_classification_head, train_dataset, test_datasets)

In [14]:
# Launch the trainer. NOTE(review): presumably this runs the fine-tuning loop for the
# configured number of steps; ClassificationTrainer.run is not shown here, so confirm.
trainer.run()

In [8]:
import numpy as np

from sklearn.metrics import confusion_matrix, f1_score

In [12]:
# Small hand-made binary example: four correct predictions and two false positives.
y_true = np.asarray([0, 1, 0, 0, 0, 0])
y_pred = np.asarray([0, 1, 0, 1, 1, 0])

matrix = confusion_matrix(y_true, y_pred)
matrix

array([[3, 2],
       [0, 1]])

In [13]:
# Unpack the flattened 2x2 confusion matrix.
TN, FP, FN, TP = matrix.ravel()

# Precision: share of predicted positives that are correct (0 when nothing was predicted positive).
if (TP + FP) == 0:
    precision = 0
else:
    precision = TP / (TP + FP)

# Recall: share of actual positives that were found (0 when there are no positives).
if (TP + FN) == 0:
    recall = 0
else:
    recall = TP / (TP + FN)

# F1: harmonic mean of precision and recall (0 when both are 0).
if (precision + recall) == 0:
    F1_Score = 0
else:
    F1_Score = 2 * (precision * recall) / (precision + recall)

F1_Score

0.5

In [14]:
# Cross-check the manual F1 computation against scikit-learn's f1_score.
f1_score(y_true, y_pred)

0.5

In [15]:
# Flattened confusion matrix; this is the order unpacked as TN, FP, FN, TP.
matrix.ravel()

array([3, 2, 0, 1])