# GPT2 Seq2Seq training with Wandb

This notebook is practice in setting up a training script that follows programming best practices - the main catalyst behind my OCD.



In [None]:
# project configuration for wandb: https://docs.wandb.ai/tutorials/pytorch/

import wandb
from datetime import datetime

project_name = "gpt2seqPref"

# The run id must be unique within the project and may not contain any of
# the characters / \ # ? % :  -- so the timestamp avoids ":" and goes down to
# seconds (with the year) to keep quick re-runs from colliding.
project_run_name = f"{project_name}-{datetime.now().strftime('%Y%m%d-%H%M%S')}"

# Start the tracking run; `name` is the label shown in the UI, `id` is the
# stable identifier.
wandb.init(
    project=project_name,
    id=project_run_name,
    name=project_run_name,
    notes="Experimenting with SEQ2SEQ language classification task. Useful for setting priors for Agents upon their observations."
)

  Expected `list[str]` but got `tuple` - serialized value may not be as expected
  return self.__pydantic_serializer__.to_python(


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011132536111108492, max=1.0…

KeyboardInterrupt: 

In [None]:
# Loads the file
import pandas as pd
from pathlib import Path

kaggle_data_name = 'wsdm-cup-multilingual-chatbot-arena'
sample_subset_size = 200
seed = 42

# Fail loudly on a missing dataset: every later cell needs `ds`,
# `submission_ds` and `OUTPUT_PATH`, so swallowing the error here would only
# move the failure to a less helpful NameError further down.
_dataset_dir = Path(kaggle_data_name)
if not _dataset_dir.is_dir():
    raise FileNotFoundError(f"Dataset directory not found: {_dataset_dir.resolve()}")

# Map each entry's stem (file name without extension) to its absolute path.
data_path = {entry.stem: entry.resolve() for entry in _dataset_dir.iterdir()}

_missing_splits = {'train', 'test'} - data_path.keys()
if _missing_splits:
    raise FileNotFoundError(
        f"Loading File Error: missing {sorted(_missing_splits)} in {_dataset_dir.resolve()}. Data: {data_path}"
    )

OUTPUT_PATH = Path("output").resolve()
OUTPUT_PATH.mkdir(exist_ok=True)
print('All data loaded: ', data_path, '\nOutput path: ', OUTPUT_PATH)

# Training data: draw a reproducible subset, capped at the number of rows
# actually available so a small train file does not make `sample` raise.
ds = pd.read_parquet(data_path['train'])
ds = (
    ds.sample(n=min(sample_subset_size, len(ds)), random_state=seed)
      .reset_index(drop=True)
)

# Submission/test data is loaded in full.
submission_ds = pd.read_parquet(data_path['test'])

In [None]:
import torch
from datasets import Dataset
from torch.utils.data import DataLoader
from sklearn.metrics import classification_report, accuracy_score
from transformers import (set_seed,
                          TrainingArguments,
                          Trainer,
                          GPT2Tokenizer,
                          AdamW,
                          get_linear_schedule_with_warmup,
                          GPT2ForSequenceClassification)

from tqdm.notebook import tqdm

In [23]:
# Both the tokenizer and the classifier are restored from the same
# pretrained checkpoint; the classifier is built with two output labels.
pretrained_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(pretrained_name)
model = GPT2ForSequenceClassification.from_pretrained(
    pretrained_name,
    num_labels=2,
)

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [24]:
# Display the module tree of the loaded classifier for inspection.
model

GPT2ForSequenceClassification(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (score): Linear(in_features=768, out_features=2, bias=False)
)

In [63]:
# Reuse the end-of-sequence id as the model's padding id.
# NOTE(review): presumably required so the classifier can handle padded
# batches -- confirm against the GPT2ForSequenceClassification docs.
model.config.pad_token_id = model.config.eos_token_id

In [69]:
# Mirror the model setting on the tokenizer: pad with the end-of-sequence id,
# which the `padding='max_length'` call in `prepareDataset` relies on.
tokenizer.pad_token_id = tokenizer.eos_token_id

In [70]:
# Inspect which special-token attribute names the tokenizer exposes.
tokenizer.SPECIAL_TOKENS_ATTRIBUTES

['bos_token',
 'eos_token',
 'unk_token',
 'sep_token',
 'pad_token',
 'cls_token',
 'mask_token',
 'additional_special_tokens']

In [None]:
def prepareDataset(row):
    """Turn one comparison row into two tokenized texts with paired labels.

    The row's prompt is concatenated (without a separator) with each of the
    two responses and the pair is stored on the row under ``'text'``. The
    texts are then tokenized with the module-level ``tokenizer``, padded to
    the maximum length and truncated.

    Args:
        row: mapping with ``'prompt'``, ``'response_a'``, ``'response_b'``
            and ``'winner'`` entries. It is mutated: ``row['text']`` is set.

    Returns:
        dict holding ``'labels'`` (two 2-element lists) plus every entry the
        tokenizer produced.

    NOTE(review): any ``winner`` value other than ``'model_a'`` gets the
    second label layout; confirm that the one-hot pair layout is what the
    training loss expects.
    """
    prompt = row['prompt']
    row['text'] = [prompt + row[side] for side in ('response_a', 'response_b')]

    a_is_winner = row['winner'] == 'model_a'
    labels = [[1, 0], [0, 1]] if a_is_winner else [[0, 1], [1, 0]]

    encoded = tokenizer(
        row['text'],
        padding='max_length',
        truncation=True,
        return_tensors='pt',
    )
    return {"labels": labels, **encoded}

In [72]:
# Raw columns that are dropped once `prepareDataset` has produced the
# tokenized features (each column listed exactly once).
raw_columns = ['prompt', 'response_a', 'response_b', 'model_a', 'model_b', 'language', 'winner', 'id']

data = Dataset.from_pandas(ds)
data = data.map(prepareDataset, remove_columns=raw_columns)
data

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Dataset({
    features: ['text', 'labels', 'input_ids', 'attention_mask'],
    num_rows: 200
})

In [73]:
# Seed the shuffle behind the split so the same rows land in train/eval on
# every run (`seed` is defined alongside the data-loading constants).
data = data.train_test_split(train_size=0.6, seed=seed)
train_data, eval_data = data['train'], data['test']

# Batches of 16, reshuffled each pass over the training split.
train_dataload = DataLoader(train_data, batch_size=16, shuffle=True)

In [None]:
class FineTuner:
    """Holds a model together with its optimizer, LR schedule and histories.

    NOTE(review): `train` is currently a stub -- it switches the model to
    training mode and iterates over the epochs without touching the data.
    """

    def __init__(self, gpt2, data_size: int, num_epochs: int = 3):
        """Create the optimizer and linear schedule for ``gpt2``.

        Args:
            gpt2: model exposing ``parameters()`` and ``train(mode=...)``.
            data_size: multiplied by ``num_epochs`` to get the scheduler's
                total step count.
                NOTE(review): presumably meant to be the number of batches
                per epoch -- confirm with the caller.
            num_epochs: number of epochs `train` iterates over.
        """
        self.optim_config = {"lr": 2e-5, "eps": 1e-8}

        warmup_steps = 0
        scheduled_steps = data_size * num_epochs
        self.train_config = {
            "num_epochs": num_epochs,
            "total_steps": scheduled_steps,
            "num_warmup_steps": warmup_steps,
        }

        self.model = gpt2
        self.optim = AdamW(self.model.parameters(), **self.optim_config)
        self.scheduler = get_linear_schedule_with_warmup(
            self.optim,
            num_warmup_steps=warmup_steps,
            num_training_steps=scheduled_steps,
        )

        # Per-epoch metric histories, filled in by the (future) training loop.
        self.train_loss_history, self.eval_loss_history = [], []
        self.train_acc_history, self.eval_acc_history = [], []

    def train(self, dataloader):
        """Put the model in training mode and step through the epochs.

        ``dataloader`` is accepted but not used yet.
        """
        self.model.train(mode=True)
        epoch_total = self.train_config['num_epochs']
        for _ in tqdm(range(epoch_total)):
            pass

In [None]:
# NOTE(review): this cell uses `train_dataloader`, `valid_dataloader`,
# `epochs`, `device`, `train`, `validation` and `plot_dict`, none of which are
# defined in the visible part of this notebook (the loader built earlier is
# named `train_dataload`) -- confirm where they come from before running.

# Note: AdamW is a class from the huggingface library (as opposed to pytorch)
# I believe the 'W' stands for "Weight Decay fix".
optimizer = AdamW(model.parameters(),
                  lr = 2e-5, # default is 5e-5, our notebook had 2e-5
                  eps = 1e-8 # default is 1e-8.
    )

# Total number of training steps is number of batches * number of epochs.
# `train_dataloader` contains batched data so `len(train_dataloader)` gives
# us the number of batches.
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)

# Store the average loss and accuracy after each epoch so we can plot them.
all_loss = {'train_loss':[], 'val_loss':[]}
all_acc = {'train_acc':[], 'val_acc':[]}

# Loop through each epoch.
print('Epoch')
for epoch in tqdm(range(epochs)):
  print()
  print('Training on batches...')
  # Perform one full pass over the training set.
  train_labels, train_predict, train_loss = train(train_dataloader, optimizer, scheduler, device)
  train_acc = accuracy_score(train_labels, train_predict)

  # Get predictions from the model on the validation data.
  print('Validation on batches...')
  valid_labels, valid_predict, val_loss = validation(valid_dataloader, device)
  val_acc = accuracy_score(valid_labels, valid_predict)

  # Print loss and accuracy values to see how training evolves.
  print("  train_loss: %.5f - val_loss: %.5f - train_acc: %.5f - valid_acc: %.5f"%(train_loss, val_loss, train_acc, val_acc))
  print()

  # Store the loss and accuracy values for plotting the learning curves.
  all_loss['train_loss'].append(train_loss)
  all_loss['val_loss'].append(val_loss)
  all_acc['train_acc'].append(train_acc)
  all_acc['val_acc'].append(val_acc)

# Plot loss curves.
plot_dict(all_loss, use_xlabel='Epochs', use_ylabel='Value', use_linestyles=['-', '--'])

# Plot accuracy curves.
plot_dict(all_acc, use_xlabel='Epochs', use_ylabel='Value', use_linestyles=['-', '--'])


In [None]:
# Checkpoints and logs for this project go under <OUTPUT_PATH>/<project_name>.
output_dir = OUTPUT_PATH / project_name
output_dir.mkdir(parents=True, exist_ok=True)

num_epochs = 5
train_config = TrainingArguments(
    # `output_dir` is documented as a string, so hand over a plain str
    # rather than a Path object.
    output_dir=str(output_dir),
    overwrite_output_dir=True,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=4,
    # `max_steps` is deliberately left at its default: a positive value
    # overrides `num_train_epochs`, and the previous value
    # (sample_subset_size * num_epochs) counted samples rather than
    # optimizer steps, so training ran far longer than `num_epochs` epochs.
    num_train_epochs=num_epochs,
    learning_rate=2e-05,
    weight_decay=0.0001,
    logging_strategy="epoch",
    # NOTE(review): `None` does not necessarily disable reporting -- in
    # transformers releases whose default is "all" it enables every installed
    # integration. Pass "none" to disable or "wandb" to be explicit.
    report_to=None,
    push_to_hub=True,
    hub_model_id=project_name,
    hub_strategy="checkpoint",
    gradient_checkpointing=True
)



In [None]:
# Assemble the Trainer from the model, its training arguments, the two
# dataset splits and the tokenizer. Everything is passed by keyword so the
# call does not depend on the positional order of Trainer's parameters.
trainer = Trainer(
    model=model,
    args=train_config,
    train_dataset=train_data,
    eval_dataset=eval_data,
    tokenizer=tokenizer,
)

  trainer = Trainer(


In [76]:
# Close the active wandb run.
# NOTE(review): by execution count this runs before `trainer.train()`, and the
# saved output of that cell shows wandb.init() being attempted again and
# timing out -- confirm whether this call belongs after training instead.
wandb.finish()

In [77]:
# Start fine-tuning with the Trainer configured above.
trainer.train()

  Expected `list[str]` but got `tuple` - serialized value may not be as expected
  return self.__pydantic_serializer__.to_python(


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01114626666660317, max=1.0)…

CommError: Run initialization has timed out after 90.0 sec. Please try increasing the timeout with the `init_timeout` setting: `wandb.init(settings=wandb.Settings(init_timeout=120))`.