In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
from sklearn.metrics import f1_score
from tqdm.notebook import tqdm
import wandb

import torch
from transformers import set_seed

from src.config import CONFIG
from src.dataset import SBICDataset
from src.utils import PropertyDict
from src.train_utils import *

# wandb.login()

2023-10-10 11:26:23.538671: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
hp = PropertyDict(
    seed=42,
    checkpoint_name="distilgpt2",
    model_name="distilgpt2",
    padding_side="left",
    batch_size=16,
    val_batch_size=16,
    num_workers=0,
    num_epochs=5,
    learning_rate=5e-4,
    scheduler="linear",
    warmup_fraction=0.1,
    accumulation_steps=1,
    gradient_clip = 1.0,
    mixed_precision="fp16",
    checkpoint_interval=1000,
    log_interval=600,
    cpu=False
)

## Train

In [17]:
set_seed(CONFIG.seed)

# Make the model
tokenizer = make_tokinzer(hp)
model = make_model(hp, tokenizer)

# Make the data
train_data = get_data("train")[:8192]
# train_data = pd.read_pickle(CONFIG.dataset.train_data_preproc).to_numpy()[:1024]
train_dataset = SBICDataset(train_data, tokenizer)

val_data = get_data("validation")[:2048]
val_dataset = SBICDataset(train_data, tokenizer)

train_dataloader = make_dataloader(train_dataset, model, tokenizer, hp, split="train")
val_dataloader = make_dataloader(val_dataset, model, tokenizer, hp, split="validation")

# Make the loss, the optimizer and the scheduler
optimizer = make_optimizer(model, hp)
scheduler = make_scheduler(
    optimizer, steps_per_epoch=len(train_dataloader), config=hp
)

# model, train_dataloader, val_dataloader, loss_fn, optimizer, scheduler, metrics = make(config)
# print(model)

train(
    model,
    train_dataloader,
    val_dataloader,
    optimizer,
    scheduler,
    hp,
    monitor=False
)

List of all special token and its token_id:
 - ['<|endoftext|>', '<|sep|>', '<|pad|>', '<|offY|>', '<|offN|>', '<|sexY|>', '<|sexN|>', '<|intY|>', '<|intN|>', '<|grpY|>', '<|grpN|>', '<|ingrpN|>', '<|ingrpY|>']
 - [[50256], [50258], [50257], [50259], [50260], [50261], [50262], [50263], [50264], [50265], [50266], [50267], [50268]]


In [None]:
torch.save(model.state_dict(), "checkpoints/distilgpt2_1024_32_preproc.pt")

gc.collect()
torch.cuda.empty_cache()

In [None]:
with wandb.init(project=CONFIG.wandbConfig.project, config=hp):
    config = wandb.config

    set_seed(CONFIG.seed)

    # Make the model
    tokenizer = make_tokinzer(config)
    model = make_model(config, tokenizer)

    # Make the data
    train_data = get_data("train")[:1024]
    train_dataset = SBICDataset(train_data, tokenizer)

    val_data = get_data("validation")[:1024]
    val_dataset = SBICDataset(train_data, tokenizer)

    train_dataloader = make_dataloader(train_dataset, model, tokenizer, config, split="train")
    val_dataloader = make_dataloader(val_dataset, model, tokenizer, config, split="validation")

    # Make the loss, the optimizer and the scheduler
    optimizer = make_optimizer(model, config)
    scheduler = make_scheduler(
        optimizer, steps_per_epoch=len(train_dataloader), config=config
    )

    # model, train_dataloader, val_dataloader, loss_fn, optimizer, scheduler, metrics = make(config)
    # print(model)

    train(
        model,
        train_dataloader,
        val_dataloader,
        optimizer,
        scheduler,
        config,
    )

    # results = evaluate(model, tokenizer, train_data, val_data, config)
    torch.save(model.state_dict(), "checkpoints/distilgpt2_1024_32_preproc.pt")

gc.collect()
torch.cuda.empty_cache()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mmatteo-periani[0m. Use [1m`wandb login --relogin`[0m to force relogin


List of all special token and its token_id:
 - ['<|endoftext|>', '<|sep|>', '<|pad|>', '<|offY|>', '<|offN|>', '<|sexY|>', '<|sexN|>', '<|intY|>', '<|intN|>', '<|grpY|>', '<|grpN|>', '<|ingrpN|>', '<|ingrpY|>']
 - [[50256], [50258], [50257], [50259], [50260], [50261], [50262], [50263], [50264], [50265], [50266], [50267], [50268]]




VBox(children=(Label(value='0.003 MB of 0.003 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

KeyboardInterrupt: 

## Eval

In [4]:
tokenizer = make_tokinzer(hp)
model = make_model(hp, tokenizer)
model.load_state_dict(torch.load("checkpoints/distilgpt2_8192_16.pt"))
model.eval()
model.to(CONFIG.train_params.device)

List of all special token and its token_id:
 - ['<|endoftext|>', '<|sep|>', '<|pad|>', '<|offY|>', '<|offN|>', '<|sexY|>', '<|sexN|>', '<|intY|>', '<|intN|>', '<|grpY|>', '<|grpN|>', '<|ingrpN|>', '<|ingrpY|>']
 - [[50256], [50258], [50257], [50259], [50260], [50261], [50262], [50263], [50264], [50265], [50266], [50267], [50268]]
Model vocab resize: 50269
Model eos token: 50256
Model pad token: 50257
Model sep token: 50258


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50269, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-5): 6 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50269, bias=False)
)

In [20]:
from rouge import Rouge 

# model.eval()
n_samples = 1024
# split = "train"
split = "validation"

data = get_data(split)[:n_samples]
dataset = SBICDataset(data, tokenizer, is_training=False)
dataloader = make_dataloader(dataset, model, tokenizer, hp, split=split)


f1_classifiaction = []
rougel_f1_minority = []
rougel_f1_stereotype = []

rouge = Rouge()
with torch.no_grad():
    for n_iers, bacth in enumerate(tqdm(dataloader, leave=False, total=len(dataloader))):
        class_labels = bacth["class_labels"]
        minority_labels = tokenizer.batch_decode(bacth["minority_labels"])
        stereotype_labels = tokenizer.batch_decode(bacth["stereotype_labels"])

        inputs_ids = torch.tensor(bacth["input_ids"]).to(CONFIG.train_params.device)
        generate_out = model.generate(inputs = inputs_ids,
                                      max_new_tokens=1000)
        # select from generated output, only generated tokens (from 1st sep token to eos)
        predictions = [gen[np.where(gen == tokenizer.sep_token_id)[0][0]+1:np.where(gen == tokenizer.eos_token_id)[0][0]+1] for gen in generate_out.cpu().numpy()]
        
        class_preds = []
        minority_preds = []
        stereotype_preds = []

        for pred in predictions:
            sep_idx = np.where(pred == tokenizer.sep_token_id)[0]
            
            if len(sep_idx) > 0:
                class_preds.append(np.concatenate((pred[:sep_idx[0]], [pred[-2]])))
                
                if len(sep_idx) >= 2:
                    minority_preds.append(pred[sep_idx[0]+1:sep_idx[1]])

                    if len(sep_idx) >= 3:
                        stereotype_preds.append(pred[sep_idx[1]+1:sep_idx[2]])
                    else:
                        stereotype_preds.append([])
                else:
                    minority_preds.append([])

        batch_f1_class = []
        for labels, gen_tokens in zip(class_labels, class_preds):
            try:
                good_idx = [idx for idx,lbl in enumerate(labels) if lbl != tokenizer.pad_token_id]
                batch_f1_class.append(f1_score([labels[index] for index in good_idx],
                                            [gen_tokens[index] for index in good_idx],
                                            average="macro"))
            except:
                print("true:", tokenizer.decode(labels, skip_special_tokens=False))
                print("pred:", tokenizer.decode(gen_tokens, skip_special_tokens=False))
                raise

        
        minority_preds = tokenizer.batch_decode(minority_preds)
        stereotype_preds = tokenizer.batch_decode(stereotype_preds)

        batch_r_score_min = []
        for labels, preds in zip(minority_labels, minority_preds):
            if labels != '':
                if preds != '':
                    r_score = rouge.get_scores(preds, labels)[0]["rouge-l"]["f"]
                    batch_r_score_min.append(np.nan_to_num(r_score))
                else:
                    batch_r_score_min.append(0.)
        
        batch_r_score_strtp = []
        for labels, preds in zip(stereotype_labels, stereotype_preds):
            if labels != '':
                if preds != '':
                    r_score = rouge.get_scores(preds, labels)[0]["rouge-l"]["f"]
                    batch_r_score_strtp.append(np.nan_to_num(r_score))
                else:
                    batch_r_score_min.append(0.)

        
        f1_classifiaction.append(np.mean(batch_f1_class))
        if len(batch_r_score_min) >0:
            rougel_f1_minority.append(np.mean(batch_r_score_min))
        else:
            rougel_f1_minority.append(0.)
        if len(batch_r_score_strtp) > 0:
            rougel_f1_stereotype.append(np.mean(batch_r_score_strtp))
        else:
            rougel_f1_minority.append(0.)

print(f"Classification F1 on {split} set: {np.mean(f1_classifiaction):.3f}")
print(f"Minority RougeL-f1 on {split} set: {np.mean(rougel_f1_minority):.3f}")
print(f"Stereotype RougeL-f1 on {split} set: {np.mean(rougel_f1_stereotype):.3f}")

                                               

Classification F1 on validation set: 0.628
Minority RougeL-f1 on validation set: 0.598
Stereotype RougeL-f1 on validation set: 0.348


