In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import gc
from tqdm.notebook import tqdm
from collections import deque
import wandb

import torch
from torch.optim import AdamW
from torch.utils.data import DataLoader

from transformers import set_seed

from src.config import Config
from src.dataset import SBICDataCollator, SBICDataset
from src.utils import PropertyDict
from src.train_utils import *

# Project-wide settings object (wandb project name, seed, device, ...).
CONFIG: Config = Config()

# Authenticate with Weights & Biases; kept as the last expression so the cell shows the result.
wandb.login()

2023-10-02 18:30:04.135406: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mmatteo-periani[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [3]:
# Run configuration; handed to wandb.init(config=...) by the training cell.
hyperparameters = PropertyDict(
    # Reproducibility
    seed=42,
    # Model / tokenizer
    checkpoint_name="distilgpt2",
    model_name="distilgpt2",
    padding_side="left",
    # Data loading
    batch_size=32,
    val_batch_size=16,
    num_workers=0,
    # Optimisation
    num_epochs=20,
    learning_rate=5e-4,
    scheduler="linear",
    warmup_fraction=0.1,
    accumulation_steps=1,
    gradient_clip=1.0,
    mixed_precision="fp16",
    # Checkpointing / logging cadence
    checkpoint_interval=1000,
    log_interval=1000,
    # Hardware
    cpu=False,
)

In [4]:
# Train inside a W&B run: `hyperparameters` is registered as the run config and
# the run is finished when the `with` block exits.
with wandb.init(project=CONFIG.wandbConfig.project, config=hyperparameters):
    # Read the settings back through wandb.config so the values used are the ones logged.
    config = wandb.config

    # NOTE(review): seeding uses CONFIG.seed, not the logged config.seed — confirm they agree.
    set_seed(CONFIG.seed)

    # Make the tokenizer and the model
    tokenizer = make_tokinzer(config)
    model = make_model(config, tokenizer)

    # Make the data (only the first 1024 rows of each split are used here)
    train_data = get_data("train")[:1024]
    train_dataset = SBICDataset(train_data, tokenizer)

    val_data = get_data("validation")[:1024]
    # Fix: build the validation set from val_data. It was previously built from
    # train_data, so validation ran on training examples.
    val_dataset = SBICDataset(val_data, tokenizer)

    train_dataloader = make_dataloader(train_dataset, model, tokenizer, config, split="train")
    val_dataloader = make_dataloader(val_dataset, model, tokenizer, config, split="validation")

    # Make the optimizer and the scheduler; the scheduler is sized from the
    # number of training batches per epoch.
    optimizer = make_optimizer(model, config)
    scheduler = make_scheduler(
        optimizer, steps_per_epoch=len(train_dataloader), config=config
    )

    train(
        model,
        train_dataloader,
        val_dataloader,
        optimizer,
        scheduler,
        config,
    )

    # TODO: re-enable evaluation once it is ready
    # results = evaluate(model, tokenizer, train_data, val_data, config)

List of all special token and its token_id:
 - ['<|endoftext|>', '<|sep|>', '<|pad|>', '<|offY|>', '<|offN|>', '<|sexY|>', '<|sexN|>', '<|intY|>', '<|intN|>', '<|grpY|>', '<|grpN|>', '<|ingrpN|>', '<|ingrpY|>']
 - [[50256], [50258], [50257], [50259], [50260], [50261], [50262], [50263], [50264], [50265], [50266], [50267], [50268]]
Model vocab resize: 50269
Model eos token: 50256
Model pad token: 50257
Model sep token: 50258


  0%|          | 0/640 [00:00<?, ?it/s]You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
100%|██████████| 640/640 [02:55<00:00,  3.66it/s]




0,1
lr,▁▃▅▆███▇▇▇▇▇▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▁▁▁
train_loss,█▃▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
lr,1e-05
train_loss,0.09683


In [7]:
# Qualitative sanity check: compare a single forward pass with generate() on one training post.
# NOTE: relies on `model`, `tokenizer` and `train_data` left in the kernel by the training cell.
# model.load_state_dict(torch.load("checkpoints/gpt2_512_16.pt"))
model.eval()
model.to(CONFIG.train_params.device)

# Prompt = text field (index 5) of record 3, followed by the separator token.
input_str = train_data[3][5] + tokenizer.sep_token

# Tokenise (padded/truncated to 256 tokens) and move every tensor to the model's device.
inputs = {
    name: tensor.to(CONFIG.train_params.device)
    for name, tensor in tokenizer(
        input_str,
        truncation=True,
        padding="max_length",
        max_length=256,
        return_tensors="pt",
    ).items()
}

with torch.no_grad():
    # Forward pass: greedy (argmax) token at every position except the last one.
    forward_out = model(**inputs)
    logits = forward_out.logits[0, :-1, :]
    forward_tokens = logits.argmax(dim=-1)

    # Generation: up to 50 new tokens appended to the prompt.
    generate_out = model.generate(**inputs, max_new_tokens=50)
    generate_tokens = generate_out[0].cpu().numpy()

# Special tokens are kept in the decoded text so they stay visible.
print("Input string:", input_str)
print("Forward output: ", tokenizer.decode(forward_tokens, skip_special_tokens=False))
print("Generate output: ", tokenizer.decode(generate_tokens, skip_special_tokens=False))

Input string: RT @iBeZo: Stupid fucking nigger LeBron. You flopping stupid jungle bunny monkey faggot.<|sep|>
Forward output:  <|intY|><|intY|><|intY|><|intY|><|intY|><|intY|><|intY|><|intY|><|intY|><|intY|><|intY|><|intY|><|intY|><|intY|><|intY|><|intY|><|intY|><|intY|><|intY|><|intY|><|intY|><|intY|><|intY|><|intY|><|intY|><|intY|><|intY|><|intY|><|intY|><|intY|><|intY|><|intY|><|intY|><|intY|><|intY|><|intY|><|intY|><|intY|><|intY|><|intY|><|intY|><|intY|><|intY|><|intY|><|intY|><|intY|><|intY|><|intY|><|intY|><|intY|><|intY|><|intY|><|intY|><|intY|><|intY|><|intY|><|intY|><|intY|><|intY|><|intY|><|intY|><|intY|><|intY|><|intY|><|intY|><|intY|><|intY|><|intY|><|intY|><|intY|><|intY|><|intY|><|intY|><|intY|><|intY|><|intY|><|intY|><|intY|><|intY|><|intY|><|intY|><|intY|><|intY|><|intY|><|intY|><|intY|><|intY|><|intY|><|intY|><|intY|><|intY|><|intY|><|intY|><|intY|><|intY|><|intY|><|intY|><|intY|><|intY|><|intY|><|intY|><|intY|><|intY|><|intY|><|intY|><|intY|><|intY|><|intY|><|intY|><



In [8]:
# Display the full raw record whose text field (index 5) was used as the prompt in the previous cell.
train_data[3]

array([1.0, 1.0, 0.0, 1.0, 0.0,
       'RT @iBeZo: Stupid fucking nigger LeBron. You flopping stupid jungle bunny monkey faggot.',
       'black folks', 'race', 'all stupid', 't/davidson'], dtype=object)