## Imports

In [1]:
# Gemma-2 support requires transformers>=4.42.3, so show which version is installed.
import transformers as trsf

print(f"Transformers: {trsf.__version__}")

# To upgrade the environment, uncomment:
#%pip install -U "transformers>=4.42.3" bitsandbytes accelerate peft

Transformers: 4.46.3


In [2]:
import os
import copy
from dataclasses import dataclass
import pickle

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
from torch import Tensor

#from datasets import Dataset

from tqdm import tqdm

from transformers import (
    BitsAndBytesConfig,
    Gemma2ForSequenceClassification,
    Gemma2Model,
    GemmaTokenizerFast,
    Gemma2Config,
    AutoTokenizer,
    AutoModel,
    PreTrainedTokenizerBase, 
    EvalPrediction,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
)

from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType

from sklearn.metrics import log_loss, accuracy_score
from sklearn.model_selection import train_test_split

import ModelsUtils as Utils
import Configurations as Configs
#import wsdm_modelutils as Utils

In [4]:
# Show the installed PEFT version next to the transformers one.
import peft as pft

print(f"Peft: {pft.__version__}")

Peft: 0.14.0


In [5]:
# Report the installed PyTorch version and choose the compute device for the run.
print('Torch version:', torch.__version__)
# torch.cuda.is_available() says whether a CUDA device can be used right now,
# not how the wheel was compiled, so the label states exactly that.
print('CUDA available:', torch.cuda.is_available())
# Fall back to the CPU when no usable GPU is present.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'Torch device : {device}')
print('------------------------------')

Torch version: 2.5.1+cu118
Torch is build with CUDA: True
Torch device : cuda
------------------------------


## Config

In [6]:
# Build the configuration manager from the project's config file.
config_file = "Configs.py"
manager = Configs.ConfigManager(config_file)

# Active run configuration (alternative: manager.runpod_1).
config = manager.micro_test

# Configuration the checkpoint is loaded from
# (alternative: manager.save_load_gemma2_2b_fp16).
load_from_config = manager.save_load_gemma2_2b_fp16_hidden_512

# Display the name of the active configuration.
config.config_name

'micro_gemma2_2b_fp16'

### Paths

In [7]:
# Base model location, used by AutoModel / AutoTokenizer.
# Kaggle alternatives:
#   '/kaggle/input/bge-multilingual-gemma2-fp16/pytorch/default/1/bge-multilingual-gemma2-fp16'
#   '/kaggle/input/bge-multilingual-gemma2-fp16/pytorch/default/1/bge-multilingual-gemma2-fp16-4bit'
base_model_path = config.basemodel_path

# Locally saved PEFT checkpoint directory and checkpoint name.
# Kaggle alternative: '/kaggle/input/peftchkpt_original/pytorch/default/1/'
peft_model_path = "../Checkpoints/"
checkpoint_name = "Original_notrain"

# Training CSV, taken from the active configuration.
# Kaggle alternatives:
#   '/kaggle/input/wsdm-preprocessed-full-original/train_preprocessed_FULL_original.csv'
#   '/kaggle/input/train-preprocessed-mini-original/train_preprocessed_MINI_original.csv'
dataframe_path = config.train_data


## Training Args

In [8]:
#training_args = TrainingArguments(
#    output_dir="output",
#    overwrite_output_dir=True,
#    report_to="none",
#    num_train_epochs=config.n_epochs,
#    per_device_train_batch_size=config.train_batch,
#    gradient_accumulation_steps=config.gradient_accumulation_steps,
#    per_device_eval_batch_size=config.eval_batch,
#    logging_steps=10,
#    eval_strategy="epoch",
#    save_strategy="steps",
#    save_steps=200,
#    optim=config.optim_type,
#    fp16=True,
#    learning_rate=config.lr,
#    warmup_steps=config.warmup_steps,
#)

## LoRA Config

In [9]:
# LoRA adapter settings; rank, scaling, dropout and bias mode come from the active config.
lora_config = LoraConfig(
    task_type=TaskType.FEATURE_EXTRACTION,  # alternative: SEQ_CLS
    # Only the self-attention query/key/value projections receive adapters.
    target_modules=["q_proj", "k_proj", "v_proj"],
    r=config.lora_r,
    lora_alpha=config.lora_alpha,
    lora_dropout=config.lora_dropout,
    bias=config.lora_bias,
    # Optional restriction to the upper layers (disabled):
    # layers_to_transform=[i for i in range(42) if i >= config.freeze_layers],
)

___________________________________________________________________________

## Data

In [10]:
# Read the preprocessed training table and preview its first row.
df = pd.read_csv(filepath_or_buffer=dataframe_path)
df.head(n=1)

Unnamed: 0,id,prompt,response_a,response_b,winner,model_a,model_b,language,encode_fail,class_label,prompt_len,response_a_len,response_b_len
0,00072026c68f5418ef2da238394e418ce72a534b9b22d5...,"현재 추천된 탑 3 종목인 Cabaletta Bio (CABA), Rocket Ph...","죄송하지만 저는 금융 조언을 제공할 수 없습니다. 저는 AI 모델이며, 투자 결정에...",현재 추천된 탑 3 종목에 순위를 매기기 위해서는 여러 가지 요소들을 고려해야 합니...,model_b,gemma-2-2b-it,llama-3.1-nemotron-70b-instruct,English,False,1,99,477,1799


In [11]:
# Force the three text columns to plain Python strings in a single cast.
df = df.astype({"prompt": str, "response_a": str, "response_b": str})

## Tokenize

In [12]:
# Load the tokenizer stored alongside the base model.
tokenizer = AutoTokenizer.from_pretrained(base_model_path)
# Ask for an <eos> token at the end of each encoded sequence.
# NOTE(review): this is set after construction; whether it takes effect depends
# on the concrete tokenizer class -- confirm by inspecting an encoded sample.
tokenizer.add_eos_token = True      # We'll add <eos> at the end
# Pad on the right-hand side of each sequence.
tokenizer.padding_side = "right"

In [13]:
# Keep a seeded random fraction of the rows. This rebinds `df` to its own
# sample, so running the cell again subsamples the already-sampled frame.
df = df.sample(
    random_state=config.random_seed,
    frac=config.sample_size,
)

In [14]:
# (rows, columns) of the frame after sampling.
df.shape

(2521, 13)

## Split

In [15]:
# Hold out 10% of the sampled rows for validation, seeded from the run config.
df_train, df_valid = train_test_split(
    df,
    random_state=config.random_seed,
    test_size=0.1,
)

## Data Loader

In [16]:
# Prepare dataset and dataloader
dataset_train = Utils.ChatbotArenaDataset(df_train, tokenizer, max_length=config.max_length)
dataloader_train = Utils.DataLoader(dataset_train, batch_size=config.train_batch, shuffle=True)

dataset_valid = Utils.ChatbotArenaDataset(df_valid, tokenizer, max_length=config.max_length)
dataloader_valid = Utils.DataLoader(dataset_valid, batch_size=config.eval_batch, shuffle=True)

In [17]:
# Inspect the first encoded training example.
dataset_train[0]

{'input_ids': tensor([     2, 235322,  39038,  78880,   1212, 235248, 235310,  32741,  33334,
           1134,    614,    573,   1546,  14242,    774,   1853,   1156, 235269,
           1693,   2183,  64199,    611,    476,  17109,  10198,    109, 235322,
           4250, 235298, 235250,  78880,   3194,  10241,    476,  17109,  10198,
            575,   4373,  28826,   1570,   3782,   1249,  40768, 235269,  22623,
          14242,  33334,    603,  20305,    604,  26936,   1853,   9701,    689,
           2778,    603,   6645, 135823, 235265,   5698,    708,   2785,  33334,
            674,    708,   7475,  14242,    774,    974,   2550, 235292,    109,
         235274, 235265,   5231,  25956,  83747, 235303, 235253, 235303,  28581,
            688,    108, 235284, 235265,   5231,  32158,  83747, 235303, 235256,
         235303,  28581,    688,    108, 235304, 235265,   5231,  58367,   3930,
          83747,  58902, 235303,  28581,    688,    108, 235310, 235265,   5231,
          59737

## Model

In [18]:
# Build the preference model and restore its weights from a saved checkpoint.
# The checkpoint name comes from `checkpoint_name` in the Paths section rather
# than a second hard-coded copy of the same string, so the two cannot drift.
predictionModel = Utils.custom_load_model_chkpt(
    config,
    loadFrom=load_from_config,
    checkpointName=checkpoint_name,
    device=device,
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [19]:
# Display the module tree of the loaded model.
predictionModel

PreferencePredictionModel(
  (gemma_model): PeftModelForFeatureExtraction(
    (base_model): LoraModel(
      (model): Gemma2Model(
        (embed_tokens): Embedding(256000, 2304, padding_idx=0)
        (layers): ModuleList(
          (0-25): 26 x Gemma2DecoderLayer(
            (self_attn): Gemma2Attention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=2304, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2304, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
 

In [20]:
# AdamW with two parameter groups that share the same weight decay:
# a small step size for the Gemma backbone and a larger one for the classifier head.
optimizer = optim.AdamW(
    [
        dict(params=predictionModel.gemma_model.parameters(), lr=2e-6),
        dict(params=predictionModel.classifier.parameters(), lr=1e-3),
    ],
    weight_decay=0.01,
)

#optimizer = optim.AdamW(weight_decay=0.01)
#optimizer = optim.Adam(predictionModel.parameters())

## Train

In [21]:
# Pull a single batch from the training loader for manual inspection.
batch = next(iter(dataloader_train))


In [22]:
# Move the batch labels to the compute device and display them.
labels = batch['label'].to(device)
labels

tensor([[0.],
        [0.]], device='cuda:0')

In [23]:
# Take the last column of the labels; for the (batch, 1) tensor displayed by
# the previous cell this gives one value per example.
labels[:, -1]

tensor([0., 0.], device='cuda:0')

In [24]:
# Run one forward pass over the sampled batch with gradients disabled.
predictionModel.to(device)
predictionModel.eval()
with torch.no_grad():
    # Only the token ids and attention mask are fed to the model; the optional
    # `features` input stays disabled.
    logits = predictionModel(
        **{key: batch[key].to(device) for key in ("input_ids", "attention_mask")}
    )

KeyboardInterrupt: 

In [91]:
# Display the raw model output for the batch.
logits

tensor([[0.9129],
        [0.8722]], device='cuda:0')

In [92]:
# Take the last column of the model output.
logits[:, -1]

tensor([0.9129, 0.8722], device='cuda:0')

In [93]:
# Turn the model outputs into hard 0/1 predictions with a 0.5 cut-off.
# NOTE(review): despite the name, `logits` looks like probabilities (it is
# later passed to BCELoss) -- confirm the model ends in a sigmoid.
normLogits = logits.gt(0.5).float()
normLogits

tensor([[1.],
        [1.]], device='cuda:0')

In [94]:
# Alias the thresholded outputs and the labels under metric-friendly names
# (no copies are made; both names refer to the existing tensors).
predictions = normLogits
true_labels = labels

In [102]:
# Size of the first dimension of the labels tensor.
labels.size(0)

2

In [95]:
# Display the hard predictions.
predictions

tensor([[1.],
        [1.]], device='cuda:0')

In [96]:
# Display the ground-truth labels for comparison with the predictions.
true_labels

tensor([[1.],
        [0.]], device='cuda:0')

In [103]:
# Batch accuracy: number of matching entries divided by the size of the first dimension.
predictions.eq(true_labels).sum().item() / labels.shape[0]

0.5

In [98]:
# Mean binary cross-entropy between the model outputs and the labels
# (functional form of nn.BCELoss with its default reduction).
loss = F.binary_cross_entropy(logits, labels)
loss

tensor(1.0743, device='cuda:0')

In [99]:
# Hidden size reported by the backbone's configuration.
predictionModel.gemma_model.config.hidden_size

2304

In [100]:
# `stop` is not defined in this notebook, so evaluating it raises NameError.
# NOTE(review): presumably a deliberate guard that halts "Run All" before the
# training cells below -- confirm, and consider an explicit raise with a message.
stop

NameError: name 'stop' is not defined

In [None]:
# Launch training through the project's training helper, without a learning-rate
# scheduler, and keep the value it returns.
history = Utils.train_model(
    predictionModel,
    dataloader_train,
    dataloader_valid,
    optimizer,
    config,
    scheduler=None,
    device=device,
)

In [None]:
# Plot the history returned by Utils.train_model.
# NOTE(review): the title string is spelled "Trainning"; left unchanged here
# because it is runtime text.
Utils.plot_model_history(history, "Trainning History")

In [None]:
#!runpodctl remove pod $RUNPOD_POD_ID