In [1]:
!pip install -q optuna datasets evaluate
!pip install -U bitsandbytes

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/386.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m386.6/386.6 kB[0m [31m20.7 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/491.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m36.0 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/231.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m231.9/231.9 kB[0m [31m17.9 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
from tqdm import tqdm
import pandas as pd
import numpy as np
import json
import os
import shutil

from collections import defaultdict

import torch
import torch.nn as nn
import transformers
import re

import optuna
import evaluate

import pickle

from peft import LoraConfig, get_peft_model, PeftModel, PeftConfig

from datasets import Dataset
from transformers import (AutoTokenizer,
                          AutoModelForCausalLM,
                          BitsAndBytesConfig,
                          Trainer,
                          TrainingArguments,
                          TrainerCallback,
                          DataCollatorForMultipleChoice,
                          GenerationConfig)

os.environ['WANDB_DISABLED'] = 'true'

In [3]:
from google.colab import auth, drive
from googleapiclient.discovery import build

# Expose Google Drive under /content/drive (remounting if already mounted).
drive.mount('/content/drive', force_remount=True)

# Authenticate so the Drive API client can empty the trash during the grid
# search (deleted checkpoints would otherwise keep using Drive quota).
auth.authenticate_user()
drive_service = build('drive', 'v3')

Mounted at /content/drive


In [4]:
# Run configuration: data locations, output location and model choice.

version = 3 # version of model to save

path_to_nlu_dir = "/content/drive/MyDrive/Master's/Second Year Grad/NLU/NLU_FinalProject/"
data_dir = path_to_nlu_dir+"Data/JSONL_Formatted/"

# training / validation files, relative to data_dir
train_path = "RACE-H/sftc_RACE-H_v1_trn.jsonl"
val_path = "RACE-H/sftc_RACE-H_v1_dev.jsonl"

data_name = 'RACE-H'
save_dir = path_to_nlu_dir+"Results/ft_Results/"

# makedirs also creates missing parent folders and is a no-op when the
# directory already exists (os.mkdir would fail if "Results/" were missing).
os.makedirs(save_dir, exist_ok=True)

model_name = "openai-community/gpt2-xl"

# True when the training file is an 'sftc' variant; the final cell then runs a
# second training stage on the file with the 'sftc_' prefix stripped.
sftc = 'sftc' in train_path

# number of layers to train (taken from the last layers)
# if none, trains all layers
# NOTE: currently not read by init_model (the layer-unfreezing code is disabled).
NUM_TRAINING_LAYERS = 2

Load in tokenizer/model using LoRA/PEFT

In [5]:
# Tokenizer: when a sequence is too long, drop tokens from the left so the
# end of the prompt (question + answer) is kept; pad with the EOS token.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.truncation_side = 'left'
tokenizer.pad_token = tokenizer.eos_token

# Generation settings attached to the model in init_model: no new tokens.
gen_config = GenerationConfig.from_pretrained(model_name)
gen_config.max_new_tokens = 0

# 4-bit NF4 quantization settings (double quantization, fp16 compute).
# Only referenced from commented-out `quantization_config=` arguments.
bnb_config = BitsAndBytesConfig(
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        load_in_4bit=True,
)

def init_model():
  """Build a fresh LoRA-wrapped causal LM and move it to the GPU.

  Loads `model_name` without quantization (the `quantization_config`
  argument is commented out), freezes every base parameter, attaches LoRA
  adapters to the modules configured for the model family, installs
  `gen_config` as the generation config and prints the trainable/total
  parameter counts.

  Returns:
    The PEFT-wrapped model after `.to('cuda')`.

  Raises:
    ValueError: if `model_name` contains neither 'gpt' nor 'llama'
      (case-insensitive), i.e. no LoRA target modules are configured for it.
  """
  # Choose the LoRA targets first so an unsupported model fails before any
  # weights are loaded (previously this surfaced later as an unbound local).
  family = model_name.lower()
  if 'gpt' in family:
    lora_modules = ['c_attn', 'c_fc', 'c_proj', 'wpe', 'wte']
  elif 'llama' in family:
    lora_modules = ['q_proj', 'v_proj', 'k_proj', 'o_proj']
  else:
    raise ValueError(f"No LoRA target modules configured for model '{model_name}'")

  model = AutoModelForCausalLM.from_pretrained(model_name,
                                               #quantization_config=bnb_config,
                                               device_map=None)

  # freeze params
  for param in model.parameters():
    param.requires_grad = False

  # NOTE: selectively unfreezing the last NUM_TRAINING_LAYERS blocks (plus
  # the final layer norm and LM head) is currently not used; only the LoRA
  # adapters added below are trained.

  # PEFT model
  r = 16
  config = LoraConfig(
        r=r,
        lora_alpha=r,
        lora_dropout=0.02,
        target_modules=lora_modules,
        bias="none",
        task_type='CAUSAL_LM',
  )

  model = get_peft_model(model, config)
  model.generation_config = gen_config

  trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
  total = sum(p.numel() for p in model.parameters())
  print(f"Trainable params: {trainable} / {total} ({100 * trainable / total:.2f}%)\n")

  return model.to('cuda')

# Batch collator handed to every Trainer below; it receives the per-choice
# features produced by create_input.
data_collator = DataCollatorForMultipleChoice(tokenizer=tokenizer)

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/689 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Load and Process Training/Validation Dataset

In [6]:
def create_input(line, sys_prompt='', fs_demos=''):
  """Turn one raw multiple-choice record into tokenized model inputs.

  One prompt is built per answer option (A-D) by appending that option to the
  shared "Q: <context> <question>\\nA:" stem; `fs_demos` is prepended to the
  prompt and `sys_prompt` is inserted right after "A:".

  Args:
    line: record with keys 'context', 'question', 'answerA'..'answerD' and
      'correct' (one of the letters A-D).
    sys_prompt: text placed between "A:" and the answer option.
    fs_demos: text placed before the question.

  Returns:
    Dict with 'input_ids' and 'attention_mask' (one row per option, padded
    or truncated to 512 tokens) and 'label' (index of the correct option).
  """
  option_keys = ('answerA', 'answerB', 'answerC', 'answerD')
  stem = f"{fs_demos}Q: {line['context']} {line['question']}\nA:{sys_prompt} "

  prompts = []
  for key in option_keys:
    prompts.append(stem + f"{line[key]}")

  gold_index = 'ABCD'.index(line['correct'])

  # tokenize all four prompts to a fixed length
  encoded = tokenizer(prompts, padding='max_length', truncation=True, max_length=512)

  return {'input_ids': encoded['input_ids'],
          'attention_mask': encoded['attention_mask'],
          'label': gold_index}

In [7]:
# Number of examples taken from the head of each file.
N_TRAIN_EXAMPLES = 919
N_VAL_EXAMPLES = 131

# training data (blank lines in the JSONL file are skipped)
with open(data_dir + train_path, 'r') as f:
  train_data_raw = [json.loads(line) for line in f if line.strip()]

# Slicing (rather than indexing range(N)) tolerates files with fewer records.
train_data = Dataset.from_list([create_input(example)
                                for example in tqdm(train_data_raw[:N_TRAIN_EXAMPLES])])

# val data
with open(data_dir + val_path, 'r') as f:
  val_data_raw = [json.loads(line) for line in f if line.strip()]

val_data = Dataset.from_list([create_input(example)
                              for example in tqdm(val_data_raw[:N_VAL_EXAMPLES])])

print(f'\nTraining Size = {len(train_data)}')
print(f'Validation Size = {len(val_data)}')

100%|██████████| 919/919 [00:02<00:00, 397.94it/s]
100%|██████████| 131/131 [00:00<00:00, 435.08it/s]



Training Size = 919
Validation Size = 131


In [8]:
# Inspect the first training example: the raw record, then the shapes of its
# tokenized fields, then the index of the correct answer.
first_example = train_data[0]
print(train_data_raw[0])
for field in ('input_ids', 'attention_mask'):
  print(np.array(first_example[field]).shape)
print(first_example['label'])

{'context': 'You may have heard the term " the American Dream " . In 1848 , James W. Marshall found [[HL]] gold [[/HL]] in California and people began having golden dreams . That 19th century " American Dream " motivated    the [[HL]] Gold [[/HL]] Rush and [[HL]] gave [[/HL]] California its nickname of the " Golden State " . The American Dream drove not only 1800s [[HL]] gold [[/HL]] - rush prospectors but also waves of immigrants throughout that century and the next . People from Europe , and a large number of [[HL]] Chinese [[/HL]] , arrived in the US in the 19th century hoping that in America they would find [[HL]] gold [[/HL]] in the streets . But most , instead , worked as railroad labourers . They [[HL]] created [[/HL]] the [[HL]] oldest [[/HL]] [[HL]] Chinatown [[/HL]] , in [[HL]] San [[/HL]] [[HL]] Francisco [[/HL]] , and [[HL]] gave [[/HL]] the [[HL]] city [[/HL]] a [[HL]] Chinese [[/HL]] [[HL]] name [[/HL]] " the [[HL]] old [[/HL]] [[HL]] gold [[/HL]] [[HL]] hill [[/HL]] " . 

Set up `Trainer` class and hyperparameter tuning

In [9]:
# Per-token cross entropy (reduction="none") so losses can be summed per
# answer choice. Targets equal to the pad token id are ignored; because the
# pad token was set to the EOS token, EOS targets are ignored as well.
loss_fn = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id,
                              reduction="none")
# Accuracy metric from the `evaluate` library, used by compute_metrics.
accuracy = evaluate.load('accuracy')

def compute_metrics(eval_pred):
  """Multiple-choice accuracy: pick the answer with the lowest summed token loss.

  Args:
    eval_pred: iterable unpacking into (logits, labels, input ids). The input
      ids must have shape (batch, num_choices, seq_len); logits must reshape
      to (batch * num_choices, seq_len, vocab). NumPy arrays are converted to
      tensors.

  Returns:
    Whatever `accuracy.compute` returns for the predicted vs. gold indices.
  """
  with torch.no_grad():
    logits, labels, token_ids = [
        torch.tensor(item) if isinstance(item, np.ndarray) else item
        for item in eval_pred]

    batch_size, num_choices, seq_len = token_ids.shape
    n_rows = batch_size * num_choices
    flat_ids = token_ids.reshape(n_rows, -1)
    flat_logits = logits.reshape(n_rows, seq_len, -1)

    # Next-token alignment: the logits at position t score the token at t+1.
    pred_logits = flat_logits[:, :-1, :]   # (n_rows, seq_len-1, vocab)
    next_tokens = flat_ids[:, 1:]          # (n_rows, seq_len-1)

    vocab_size = pred_logits.size(-1)
    token_nll = loss_fn(pred_logits.reshape(-1, vocab_size),
                        next_tokens.reshape(-1))

    # Sum the token losses of each choice -> (batch, num_choices)
    choice_nll = token_nll.reshape(batch_size, num_choices, -1).sum(dim=-1)

    # The prediction is the choice the model finds most likely (smallest loss).
    preds = torch.argmin(choice_nll, dim=-1)

  return accuracy.compute(predictions=preds, references=labels)

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [10]:
# Arguments for the stand-alone trainer that is only used for the smoke-test
# prediction and the baseline evaluation in the next cells.
# NOTE(review): with prediction_loss_only=True the evaluation output below
# contains only eval_loss (no accuracy) -- confirm that this is intended.
training_args = TrainingArguments(
    output_dir=f"{save_dir}checkpoints/",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    fp16=True,
    eval_accumulation_steps=1,
    eval_strategy='epoch',
    save_total_limit=1,
    logging_strategy='epoch',
    include_for_metrics = ["inputs"],
    prediction_loss_only=True,
    # Disable logging integrations explicitly instead of relying on the
    # deprecated WANDB_DISABLED environment variable.
    report_to="none",
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [11]:
class CustomTrainer(Trainer):
  """Trainer whose loss is a classification over the answer choices.

  Each choice is scored by the summed next-token loss of its full prompt; the
  negated scores act as logits for a cross-entropy against the gold index.
  """

  def compute_loss(self, model, inputs,
                   return_outputs=False,
                   num_items_in_batch=None):
    """Return the multiple-choice loss (and the model outputs if requested).

    `inputs` must provide 'input_ids' and 'attention_mask' of shape
    (batch_size, num_choices, seq_len) and 'labels' of shape (batch_size,).
    `num_items_in_batch` is accepted but not used.
    """
    token_ids = inputs["input_ids"]
    mask = inputs["attention_mask"]
    gold = inputs["labels"]

    # Fold the choice dimension into the batch dimension for the forward pass.
    batch_size, num_choices, seq_len = token_ids.shape
    n_rows = batch_size * num_choices
    flat_ids = token_ids.reshape(n_rows, seq_len)
    flat_mask = mask.reshape(n_rows, seq_len)

    outputs = model(input_ids=flat_ids, attention_mask=flat_mask)

    # Next-token alignment: logits at position t score the token at t+1.
    pred_logits = outputs.logits[:, :-1, :]   # (n_rows, seq_len-1, vocab)
    next_tokens = flat_ids[:, 1:]             # (n_rows, seq_len-1)

    token_nll = loss_fn(pred_logits.reshape(-1, pred_logits.size(-1)),
                        next_tokens.reshape(-1))

    # Total loss of each choice -> (batch_size, num_choices)
    choice_nll = token_nll.reshape(batch_size, num_choices, -1).sum(dim=-1)

    # A smaller loss means a better choice, so the negated losses serve as
    # logits; CrossEntropyLoss averages over the batch by default.
    choice_criterion = nn.CrossEntropyLoss()
    final_loss = choice_criterion(-choice_nll, gold)

    if return_outputs:
      return (final_loss, outputs)
    return final_loss


# Stand-alone trainer used for the smoke test / baseline evaluation below.
# `processing_class` replaces the deprecated `tokenizer` argument.
trainer = CustomTrainer(
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    processing_class=tokenizer,
    model_init = init_model
)

  trainer = CustomTrainer(
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/6.43G [00:00<?, ?B/s]



Trainable params: 20532496 / 1578143696 (1.30%)



No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [12]:
# Smoke test: run prediction on the first two validation examples.
test = trainer.predict(val_data.select(range(2)))

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [13]:
# Baseline: validation loss before any training has been run.
trainer.evaluate()

{'eval_loss': 3.4317562580108643,
 'eval_model_preparation_time': 0.0251,
 'eval_runtime': 17.942,
 'eval_samples_per_second': 7.301,
 'eval_steps_per_second': 7.301}

In [14]:
# optuna callback
# Module-level record of the best finished trial; updated by OptunaCallback.
best_score = float('inf')
best_run_dir = None
def OptunaCallback(study, trial):
  """Keep only the best trial's checkpoint directory on disk.

  Called by Optuna after every trial. If the trial beats the best score seen
  so far, the previous best run directory is deleted and this trial becomes
  the new best; otherwise this trial's run directory is deleted. Trials
  without a usable score are left untouched. The Drive trash is emptied at
  the end of every call.

  Args:
    study: the running study (unused).
    trial: the finished trial; `trial.value` and `trial.number` are read.
  """
  global best_score, best_run_dir

  curr_score = trial.value
  curr_run_dir = f"{save_dir}checkpoints/run-{trial.number}"

  # A trial that did not complete has value None (Optuna records a NaN
  # objective as a failed trial as well), and np.isnan(None) raises
  # TypeError -- so check for None before checking for NaN.
  has_score = curr_score is not None and not np.isnan(curr_score)

  if has_score and curr_score < best_score:
    if best_run_dir is not None and os.path.exists(best_run_dir):
      print(f'Trial {trial.number} is new best. Deleting previous best at {best_run_dir}')
      shutil.rmtree(best_run_dir)

    best_score = curr_score
    best_run_dir = curr_run_dir
  elif has_score:
    if os.path.exists(curr_run_dir):
      print(f'Trial {trial.number} not best. Deleting...')
      shutil.rmtree(curr_run_dir)

  # empty trash bin (note: this empties the whole Drive trash)
  drive_service.files().emptyTrash().execute()

def objective(trial):
  """Optuna objective: fine-tune once and return the final validation loss.

  Samples the learning rate and number of epochs from the trial, trains a
  freshly initialised model (via `init_model`) and writes checkpoints to
  `{save_dir}checkpoints/run-<trial number>`.

  Args:
    trial: Optuna trial supplying `suggest_categorical` and `number`.

  Returns:
    The 'eval_loss' of an evaluation run after training has finished.
  """
  training_args = TrainingArguments(
    output_dir=f"{save_dir}checkpoints/run-{trial.number}",
    learning_rate = trial.suggest_categorical("learning_rate", [1e-3, 1e-4, 5e-5]),
    num_train_epochs = trial.suggest_categorical("num_train_epochs", [2, 3, 5]),
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    fp16=True,
    eval_accumulation_steps=1,
    eval_strategy='epoch',
    save_total_limit=1,
    logging_strategy='epoch',
    include_for_metrics = ["inputs"],
    prediction_loss_only=True,
    # Disable logging integrations explicitly instead of relying on the
    # deprecated WANDB_DISABLED environment variable.
    report_to="none",
  )

  # `processing_class` replaces the deprecated `tokenizer` argument.
  trainer = CustomTrainer(
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    processing_class=tokenizer,
    model_init = init_model
  )

  trainer.train()
  return trainer.evaluate()['eval_loss']

Train and save model

In [15]:
# Log in to the Hugging Face Hub; the final cell pushes the model there.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [16]:
# Train and save the best hyperparameters

# Grid to search; must match the choices suggested inside objective().
search_space = {'learning_rate': [1e-3, 1e-4, 5e-5],
                    'num_train_epochs': [2,3,5]}

# OptunaCallback tracks the best trial in these globals. Reset them so that
# re-running this cell (which starts a new study whose trial numbers begin
# at 0 again) does not compare against a previous study's best score.
best_score = float('inf')
best_run_dir = None

# NOTE(review): no `storage`/`study_name` is given, so the study is created
# in memory and `load_if_exists` presumably has no effect -- confirm.
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.GridSampler(search_space),
    load_if_exists=True
)

# One trial per grid point (3 x 3 = 9 for the grid above).
n_grid_points = int(np.prod([len(values) for values in search_space.values()]))
study.optimize(objective, n_trials=n_grid_points, callbacks=[OptunaCallback])

[I 2025-04-20 22:12:35,355] A new study created in memory with name: no-name-1dbfea8f-c088-466e-ac28-4578fb22f0e6
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = CustomTrainer(


Trainable params: 20532496 / 1578143696 (1.30%)



No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Trainable params: 20532496 / 1578143696 (1.30%)



Epoch,Training Loss,Validation Loss
1,7.8302,1.85753
2,3.7727,0.570348
3,2.2265,0.485175


[I 2025-04-20 22:29:30,778] Trial 0 finished with value: 0.4851752817630768 and parameters: {'learning_rate': 5e-05, 'num_train_epochs': 3}. Best is trial 0 with value: 0.4851752817630768.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = CustomTrainer(


Trainable params: 20532496 / 1578143696 (1.30%)



No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Trainable params: 20532496 / 1578143696 (1.30%)



Epoch,Training Loss,Validation Loss
1,63.1361,17.31967
2,26.4349,5.742028
3,7.7649,5.369783
4,0.0,5.369783
5,0.0,5.369783


[I 2025-04-20 22:57:15,859] Trial 1 finished with value: 5.369783401489258 and parameters: {'learning_rate': 0.001, 'num_train_epochs': 5}. Best is trial 0 with value: 0.4851752817630768.


Trial 1 not best. Deleting...


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = CustomTrainer(


Trainable params: 20532496 / 1578143696 (1.30%)



No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Trainable params: 20532496 / 1578143696 (1.30%)



Epoch,Training Loss,Validation Loss
1,13.6206,4.630993
2,7.1304,4.555271
3,6.298,4.96511


[I 2025-04-20 23:14:16,556] Trial 2 finished with value: 4.9651103019714355 and parameters: {'learning_rate': 0.001, 'num_train_epochs': 3}. Best is trial 0 with value: 0.4851752817630768.


Trial 2 not best. Deleting...


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = CustomTrainer(


Trainable params: 20532496 / 1578143696 (1.30%)



No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Trainable params: 20532496 / 1578143696 (1.30%)



Epoch,Training Loss,Validation Loss
1,6.4751,0.623745
2,1.9679,0.332655
3,0.7751,0.531771


[I 2025-04-20 23:31:15,419] Trial 3 finished with value: 0.5317707061767578 and parameters: {'learning_rate': 0.0001, 'num_train_epochs': 3}. Best is trial 0 with value: 0.4851752817630768.


Trial 3 not best. Deleting...


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = CustomTrainer(


Trainable params: 20532496 / 1578143696 (1.30%)



No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Trainable params: 20532496 / 1578143696 (1.30%)



Epoch,Training Loss,Validation Loss
1,7.7096,1.691523
2,3.5479,0.524607
3,1.4633,0.298625
4,0.9064,0.287697
5,0.6923,0.309282


[I 2025-04-20 23:59:22,471] Trial 4 finished with value: 0.30928248167037964 and parameters: {'learning_rate': 5e-05, 'num_train_epochs': 5}. Best is trial 4 with value: 0.30928248167037964.


Trial 4 is new best. Deleting previous best at /content/drive/MyDrive/Master's/Second Year Grad/NLU/NLU_FinalProject/Results/ft_Results/checkpoints/run-0


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = CustomTrainer(


Trainable params: 20532496 / 1578143696 (1.30%)



No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Trainable params: 20532496 / 1578143696 (1.30%)



Epoch,Training Loss,Validation Loss
1,7.9801,1.97658
2,4.5046,1.142118


[I 2025-04-21 00:10:55,299] Trial 5 finished with value: 1.1421178579330444 and parameters: {'learning_rate': 5e-05, 'num_train_epochs': 2}. Best is trial 4 with value: 0.30928248167037964.


Trial 5 not best. Deleting...


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = CustomTrainer(


Trainable params: 20532496 / 1578143696 (1.30%)



No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Trainable params: 20532496 / 1578143696 (1.30%)



Epoch,Training Loss,Validation Loss
1,6.6918,0.899303
2,2.4929,0.463933


[I 2025-04-21 00:22:24,990] Trial 6 finished with value: 0.4639325439929962 and parameters: {'learning_rate': 0.0001, 'num_train_epochs': 2}. Best is trial 4 with value: 0.30928248167037964.


Trial 6 not best. Deleting...


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = CustomTrainer(


Trainable params: 20532496 / 1578143696 (1.30%)



No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Trainable params: 20532496 / 1578143696 (1.30%)



Epoch,Training Loss,Validation Loss
1,22.0251,9.710381
2,5.8801,3.312644


[I 2025-04-21 00:33:54,675] Trial 7 finished with value: 3.3126440048217773 and parameters: {'learning_rate': 0.001, 'num_train_epochs': 2}. Best is trial 4 with value: 0.30928248167037964.


Trial 7 not best. Deleting...


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = CustomTrainer(


Trainable params: 20532496 / 1578143696 (1.30%)



No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Trainable params: 20532496 / 1578143696 (1.30%)



Epoch,Training Loss,Validation Loss
1,6.4186,0.961039
2,2.2308,0.458596
3,0.6967,0.584465
4,0.5012,0.330689
5,0.2599,0.334988




[I 2025-04-21 01:01:59,963] Trial 8 finished with value: 0.3349880278110504 and parameters: {'learning_rate': 0.0001, 'num_train_epochs': 5}. Best is trial 4 with value: 0.30928248167037964.


Trial 8 not best. Deleting...


In [17]:
# Locate the final checkpoint written by the best trial.
best_trial = study.best_trial
run_dir = os.path.join(save_dir, 'checkpoints', f"run-{best_trial.number}")

# Expected last step = examples * epochs. This only holds for the settings
# used in objective(): batch size 1 and the default gradient accumulation.
N = len(train_data)*best_trial.params['num_train_epochs']
checkpoint_dir = os.path.join(run_dir, f"checkpoint-{N}")

if not os.path.exists(checkpoint_dir) and os.path.isdir(run_dir):
  # Fall back to the most recent checkpoint that is actually on disk.
  last_checkpoint = transformers.trainer_utils.get_last_checkpoint(run_dir)
  if last_checkpoint is not None:
    checkpoint_dir = last_checkpoint

print(checkpoint_dir)

# Hub repository id the final model is pushed to.
model_dir = f"Salm00n/{model_name.split('/')[-1]}_{data_name}_v{version}"
print(model_dir)

os.path.exists(checkpoint_dir)

/content/drive/MyDrive/Master's/Second Year Grad/NLU/NLU_FinalProject/Results/ft_Results/checkpoints/run-4/checkpoint-4595
Salm00n/gpt2-xl_RACE-H_v3


True

In [None]:
# Save best model to HuggingFace account

def _load_jsonl_dataset(path, n_examples):
  """Read a JSONL file and tokenize its first `n_examples` records.

  Returns (raw_records, dataset); `raw_records` holds every record in the
  file, `dataset` the tokenized head. Blank lines are skipped.
  """
  with open(path, 'r') as f:
    raw_records = [json.loads(line) for line in f if line.strip()]
  dataset = Dataset.from_list([create_input(example)
                               for example in tqdm(raw_records[:n_examples])])
  return raw_records, dataset


# load in best model with same PEFT setup
# Note: uses last saved checkpoint from best parameter trial
config = PeftConfig.from_pretrained(checkpoint_dir)
base_model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path,
                                            #quantization_config=bnb_config,
                                            device_map='auto')
# The adapter only needs to stay trainable when a second training stage
# follows (sftc runs); otherwise it is loaded frozen, as is the default.
model = PeftModel.from_pretrained(base_model, checkpoint_dir, torch_dtype=torch.float16,
                                  is_trainable=sftc)

if not sftc:
  # push best model to hugging face
  model.push_to_hub(model_dir)

else:
  # Second stage: continue training on the dataset without the 'sftc_' prefix,
  # using as many examples as the first stage did.
  n_train_examples = len(train_data)
  n_val_examples = len(val_data)

  # training data
  train_path_orig = data_dir + train_path.replace('sftc_','')
  print(f'Second Training Set at {train_path_orig}')
  train_data_raw, train_data = _load_jsonl_dataset(train_path_orig, n_train_examples)

  # val data
  val_path_orig = data_dir + val_path.replace('sftc_','')
  print(f'Second Validation Set at {val_path_orig}')
  val_data_raw, val_data = _load_jsonl_dataset(val_path_orig, n_val_examples)

  training_args = TrainingArguments(
    output_dir = model_dir,
    # Target repository for trainer.push_to_hub() below.
    hub_model_id = model_dir,
    learning_rate = study.best_trial.params['learning_rate'],
    num_train_epochs = study.best_trial.params['num_train_epochs'],
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    fp16=True,
    eval_accumulation_steps=1,
    eval_strategy='epoch',
    save_total_limit=1,
    logging_strategy='epoch',
    include_for_metrics = ["inputs"],
    prediction_loss_only=True,
    # Disable logging integrations explicitly instead of relying on the
    # deprecated WANDB_DISABLED environment variable.
    report_to="none",
  )

  # `processing_class` replaces the deprecated `tokenizer` argument.
  trainer = CustomTrainer(
    model = model,
    processing_class = tokenizer,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
  )

  trainer.train()
  # push best model to hugging face
  # Trainer.push_to_hub's first positional parameter is the commit message,
  # not the repository id; the repository comes from `hub_model_id` above.
  trainer.push_to_hub()

In [None]:
# Release the Colab runtime once everything above has finished.
from google.colab import runtime
runtime.unassign()