In [1]:
# Enable IPython's autoreload extension in mode 2: imported modules are reloaded
# before each cell executes, so edits to the project's `src` package are picked
# up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [10]:
import numpy as np
from sklearn.metrics import f1_score
from tqdm.notebook import tqdm
import wandb

import torch
from torch.optim import AdamW
from torch.utils.data import DataLoader

from transformers import set_seed

from src.config import Config
from src.dataset import SBICDataset, SBICDatasetInference
from src.utils import PropertyDict
from src.train_utils import *

# Project-wide settings object. Later cells read CONFIG.wandbConfig.project,
# CONFIG.seed and CONFIG.train_params.device from it.
CONFIG:Config = Config()
# Authenticate with Weights & Biases before any run is started. Kept as the last
# expression of the cell so the login result is displayed.
wandb.login()

True

In [3]:
# Hyper-parameters for this experiment. The object is handed to
# wandb.init(config=...) and to the make_* helpers / train() in the cells below.
config = PropertyDict(
    # --- reproducibility ---
    # NOTE(review): the training cell seeds with CONFIG.seed, not this value.
    seed=42,

    # --- model / tokenizer ---
    checkpoint_name="distilgpt2",
    model_name="distilgpt2",
    padding_side="left",

    # --- data loading ---
    batch_size=32,
    val_batch_size=16,
    num_workers=0,

    # --- optimisation ---
    num_epochs=20,
    learning_rate=5e-4,
    scheduler="linear",
    warmup_fraction=0.1,
    accumulation_steps=1,
    gradient_clip=1.0,
    mixed_precision="fp16",

    # --- checkpointing / logging / device ---
    checkpoint_interval=1000,
    log_interval=1000,
    cpu=False,
)

## Train

In [4]:
# Run the whole training pipeline inside a W&B run so that the hyper-parameters
# and the logged metrics end up in the same run.
with wandb.init(project=CONFIG.wandbConfig.project, config=config):
    # Read the hyper-parameters back from the active run.
    # NOTE: this rebinds the notebook-level `config` to `wandb.config`.
    config = wandb.config

    # NOTE(review): seeding uses CONFIG.seed rather than config.seed (the value
    # logged to W&B) — confirm the two agree.
    set_seed(CONFIG.seed)

    # Make the tokenizer and the model
    tokenizer = make_tokinzer(config)
    model = make_model(config, tokenizer)

    # Make the data (only the first 1024 examples of each split are used)
    train_data = get_data("train")[:1024]
    train_dataset = SBICDataset(train_data, tokenizer)

    val_data = get_data("validation")[:1024]
    # The validation dataset must wrap val_data, not train_data; otherwise the
    # model is validated on examples it was trained on.
    val_dataset = SBICDataset(val_data, tokenizer)

    train_dataloader = make_dataloader(train_dataset, model, tokenizer, config, split="train")
    val_dataloader = make_dataloader(val_dataset, model, tokenizer, config, split="validation")

    # Make the optimizer and the scheduler; the scheduler is given the number
    # of batches per epoch.
    optimizer = make_optimizer(model, config)
    scheduler = make_scheduler(
        optimizer, steps_per_epoch=len(train_dataloader), config=config
    )

    train(
        model,
        train_dataloader,
        val_dataloader,
        optimizer,
        scheduler,
        config,
    )

List of all special token and its token_id:
 - ['<|endoftext|>', '<|sep|>', '<|pad|>', '<|offY|>', '<|offN|>', '<|sexY|>', '<|sexN|>', '<|intY|>', '<|intN|>', '<|grpY|>', '<|grpN|>', '<|ingrpN|>', '<|ingrpY|>']
 - [[50256], [50258], [50257], [50259], [50260], [50261], [50262], [50263], [50264], [50265], [50266], [50267], [50268]]
Model vocab resize: 50269
Model eos token: 50256
Model pad token: 50257
Model sep token: 50258


  0%|          | 0/640 [00:00<?, ?it/s]You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
100%|██████████| 640/640 [02:55<00:00,  3.66it/s]




0,1
lr,▁▃▅▆███▇▇▇▇▇▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▁▁▁
train_loss,█▃▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
lr,1e-05
train_loss,0.09683


## Eval

In [4]:
# Rebuild the tokenizer and the model from the run config, then restore the
# fine-tuned weights from disk.
tokenizer = make_tokinzer(config)
model = make_model(config, tokenizer)
# map_location="cpu" lets a checkpoint saved from a GPU run be loaded on any
# machine; the evaluation cell moves the model to CONFIG.train_params.device.
# NOTE: torch.load unpickles the file — only load checkpoints you trust.
model.load_state_dict(
    torch.load("checkpoints/distilgpt2_1024_32.pt", map_location="cpu")
)

List of all special token and its token_id:
 - ['<|endoftext|>', '<|sep|>', '<|pad|>', '<|offY|>', '<|offN|>', '<|sexY|>', '<|sexN|>', '<|intY|>', '<|intN|>', '<|grpY|>', '<|grpN|>', '<|ingrpN|>', '<|ingrpY|>']
 - [[50256], [50258], [50257], [50259], [50260], [50261], [50262], [50263], [50264], [50265], [50266], [50267], [50268]]
Model vocab resize: 50269
Model eos token: 50256
Model pad token: 50257
Model sep token: 50258


<All keys matched successfully>

In [None]:
# Score the classification tokens produced by generation against the reference
# labels, using a macro F1 per sample averaged over all evaluated samples.
model.eval()
device = CONFIG.train_params.device
model.to(device)

n_samples = 1024
split = "train"
# split = "validation"

data = get_data(split)[:n_samples]
dataset = SBICDataset(data, tokenizer, is_training=False)
dataloader = make_dataloader(dataset, model, tokenizer, config, split=split)

# One macro-F1 value per sample. Averaging per-sample scores (instead of
# per-batch means) keeps a shorter final batch from being over-weighted.
val_f1 = []
with torch.no_grad():
    # tqdm takes the total from len(dataloader); no dependency on loaders
    # created in other cells.
    for batch in tqdm(dataloader, leave=False):
        batch_labels = batch["labels"].numpy().astype(int)

        generated = model.generate(
            inputs=batch["input_ids"].to(device),
            max_new_tokens=50,
        )
        generated = generated.cpu().numpy()

        for label_ids, gen_ids in zip(batch_labels, generated):
            # Positions of the separator token in the generated / reference row.
            gen_sep = np.where(gen_ids == tokenizer.sep_token_id)[0]
            label_sep = np.where(label_ids == tokenizer.sep_token_id)[0]

            # Predicted classes: the 4 tokens right after the first separator.
            pred_classes = gen_ids[gen_sep[0] + 1 : gen_sep[0] + 5]
            # Reference classes: everything between the first two separators.
            true_classes = label_ids[label_sep[0] + 1 : label_sep[1]]

            val_f1.append(f1_score(true_classes, pred_classes, average="macro"))

print(f"F1-Score on classification task ({split} split): {np.mean(val_f1)}")