## Imports

In [1]:
# gemma-2 is available from transformers>=4.42.3

import transformers as trsf
print("Transformers:", trsf.__version__)

#!pip install -U "transformers>=4.42.3" bitsandbytes accelerate peft

Transformers: 4.47.1


In [2]:
import os
import copy
from dataclasses import dataclass
import pickle

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
from torch import Tensor

#from datasets import Dataset

from tqdm import tqdm

from transformers import (
    BitsAndBytesConfig,
    Gemma2ForSequenceClassification,
    Gemma2Model,
    GemmaTokenizerFast,
    Gemma2Config,
    AutoTokenizer,
    AutoModel,
    PreTrainedTokenizerBase, 
    EvalPrediction,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
)

from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType

from sklearn.metrics import log_loss, accuracy_score
from sklearn.model_selection import train_test_split

import ModelsUtils as Utils
import Configurations as Configs
#import wsdm_modelutils as Utils

In [3]:
import peft as pft
print("Peft:", pft.__version__)

Peft: 0.14.0


In [4]:
# Pick the compute device used by the rest of the notebook and report the
# PyTorch environment. CUDA availability is probed once and reused.
# Note: `torch.cuda.is_available()` reports whether a CUDA device is usable at
# runtime, which is what the second printed line actually shows.
cuda_available = torch.cuda.is_available()
if cuda_available:
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

print('Torch version:', torch.__version__)
print('Torch is build with CUDA:', cuda_available)
print(f'Torch device : {device}')
print('------------------------------')

Torch version: 2.5.1+cu118
Torch is build with CUDA: True
Torch device : cuda
------------------------------


## Config

In [5]:
# Build the configuration manager from the config file name and select the
# profile that drives every later cell (paths, batch sizes, LoRA settings, ...).
config_file = 'Configs.py'
manager = Configs.ConfigManager(config_file)

# Active profile. The commented-out lines are alternative profiles kept for
# quick switching.
config = manager.micro
#config = manager.runpod_1
#load_from_config = manager.save_load_gemma2_2b_fp16
#load_from_config = manager.save_load_gemma2_2b_fp16_hidden_512
config.config_name  # shown as the cell output so the active profile is visible

'micro_gemma2_2b_fp16_4bit'

### Paths

In [6]:
# Base model location used by AutoModel / AutoTokenizer, taken from the active
# config. The commented-out lines are earlier hard-coded Kaggle locations.
#base_model_path = '/kaggle/input/bge-multilingual-gemma2-fp16/pytorch/default/1/bge-multilingual-gemma2-fp16'
#base_model_path = '/kaggle/input/bge-multilingual-gemma2-fp16/pytorch/default/1/bge-multilingual-gemma2-fp16-4bit'
base_model_path = config.basemodel_path


# PEFT checkpoint previously saved locally: directory and checkpoint name.
#peft_model_path = '/kaggle/input/peftchkpt_original/pytorch/default/1/'
peft_model_path = '../Checkpoints/'
checkpoint_name = "Original_notrain"

# Path of the training CSV (read with pd.read_csv in the Data section), taken
# from the active config.
#df = pd.read_csv('/kaggle/input/wsdm-preprocessed-full-original/train_preprocessed_FULL_original.csv')
#dataframe_path = '/kaggle/input/train-preprocessed-mini-original/train_preprocessed_MINI_original.csv'
dataframe_path = config.train_data


## Training Args

In [7]:
#training_args = TrainingArguments(
#    output_dir="output",
#    overwrite_output_dir=True,
#    report_to="none",
#    num_train_epochs=config.n_epochs,
#    per_device_train_batch_size=config.train_batch,
#    gradient_accumulation_steps=config.gradient_accumulation_steps,
#    per_device_eval_batch_size=config.eval_batch,
#    logging_steps=10,
#    eval_strategy="epoch",
#    save_strategy="steps",
#    save_steps=200,
#    optim=config.optim_type,
#    fp16=True,
#    learning_rate=config.lr,
#    warmup_steps=config.warmup_steps,
#)

## LoRA Config

In [8]:
# LoRA adapter settings. Rank, scaling, dropout and bias handling all come from
# the active `config` profile; only the target modules and task type are fixed here.
lora_config = LoraConfig(
    task_type=TaskType.FEATURE_EXTRACTION,  # alternative considered: SEQ_CLS
    # adapt only the self-attention query/key/value projections
    target_modules=["q_proj", "k_proj", "v_proj"],
    r=config.lora_r,
    lora_alpha=config.lora_alpha,
    lora_dropout=config.lora_dropout,
    bias=config.lora_bias,
    # NOTE(review): the disabled option below hard-codes 42 layers; confirm the
    # layer count of the backbone in use before re-enabling it.
    #layers_to_transform=[i for i in range(42) if i >= config.freeze_layers],
)

___________________________________________________________________________

## Data

In [9]:
df = pd.read_csv(dataframe_path)
df.head(1)

  df = pd.read_csv(dataframe_path)


Unnamed: 0,id,prompt,response_a,response_b,winner,language,class_label,prompt_len,response_a_len,response_b_len,...,prompt_chinese,prompt_round_balance,prompt_curly_balance,prompt_json,prompt_sentiment,response_a_sentiment,response_b_sentiment,cosine_similarity_a,cosine_similarity_b,cosine_similarity_diff
0,53567,What is the difference between marriage licens...,A marriage license is a legal document that al...,A marriage license and a marriage certificate ...,model_b,,1,192,3096,3592,...,0.0,0,0,0,0.077778,0.058469,0.139458,0.705321,0.629803,0.075518


In [10]:
# Force the three free-text columns to string values so tokenization never
# receives a non-string cell.
# NOTE(review): depending on the pandas version, missing values may become the
# literal text 'nan' here — confirm that is acceptable for the model input.
for text_column in ('prompt', 'response_a', 'response_b'):
    df[text_column] = df[text_column].astype(str)

## Tokenize

In [11]:
# Load the tokenizer stored alongside the base model, then adjust two of its
# attributes after loading.
tokenizer = AutoTokenizer.from_pretrained(base_model_path)
tokenizer.add_eos_token = True      # We'll add <eos> at the end
tokenizer.padding_side = "right"    # pad sequences on the right-hand side

In [12]:
# Keep a `config.sample_size` fraction of the rows, seeded for repeatability.
# NOTE(review): this rebinds `df`, so re-running the cell samples from the
# already-sampled frame; re-run the load cell first to start from the full data.
df = df.sample(frac=config.sample_size, random_state=config.random_seed)

In [22]:
def tokenize_df(row):
    """Tokenize a single dataframe row with ``Utils.tokenize``.

    The prompt and both responses are each wrapped in a one-element list
    before being handed to ``Utils.tokenize`` together with the notebook-level
    ``tokenizer`` and ``config.max_length``.

    Args:
        row: A dataframe row exposing 'prompt', 'response_a' and 'response_b'.

    Returns:
        Whatever ``Utils.tokenize`` returns for this single example.
    """
    prompts = [row['prompt']]
    responses_a = [row['response_a']]
    responses_b = [row['response_b']]
    return Utils.tokenize(
        tokenizer,
        prompts,
        responses_a,
        responses_b,
        max_length=config.max_length,
    )

# Row-wise application: one tokenization result per example, stored in 'tokens'.
df['tokens'] = df.apply(tokenize_df, axis=1)

In [23]:
df.shape

(175, 71)

## Split

In [24]:
# Hold out 10% of the rows for validation; the seed makes the split repeatable.
df_train, df_valid = train_test_split(df, test_size=0.1, random_state=config.random_seed)

## Data Loader

In [25]:
# Prepare datasets and dataloaders.
# NOTE(review): the dataset length cap below differs from `config.max_length`
# used when tokenizing the dataframe — confirm which one the dataset relies on.
DATASET_MAX_LENGTH = 2048

# Training data is reshuffled so batches differ between epochs.
dataset_train = Utils.ChatbotArenaDataset(df_train, tokenizer, max_length=DATASET_MAX_LENGTH)
dataloader_train = Utils.DataLoader(dataset_train, batch_size=config.train_batch, shuffle=True)

# Validation data keeps a fixed order: shuffling adds nothing to evaluation and
# makes per-batch validation output non-reproducible between runs.
dataset_valid = Utils.ChatbotArenaDataset(df_valid, tokenizer, max_length=DATASET_MAX_LENGTH)
dataloader_valid = Utils.DataLoader(dataset_valid, batch_size=config.eval_batch, shuffle=False)

In [26]:
len(dataloader_train)

79

## Model

In [27]:
# Load the prediction model for the active config from a saved checkpoint.
# The checkpoint is selected through `checkpoint_name` (defined in the Paths
# cell) instead of repeating the literal, so there is a single place to change.
predictionModel = Utils.custom_load_model_chkpt(
    config,
    checkpointName=checkpoint_name,
    device=device,
)

Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.


In [28]:
predictionModel

PreferencePredictionModel(
  (gemma_model): PeftModel(
    (base_model): LoraModel(
      (model): Gemma2Model(
        (embed_tokens): Embedding(256000, 2304, padding_idx=0)
        (layers): ModuleList(
          (0-25): 26 x Gemma2DecoderLayer(
            (self_attn): Gemma2Attention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=2304, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2304, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
     

In [29]:
# Two parameter groups with separate learning rates, one shared weight decay.
backbone_group = {
    'params': predictionModel.gemma_model.parameters(),
    'lr': 2e-6,  # lower learning rate for the transformer layers
}
head_group = {
    'params': predictionModel.classifier.parameters(),
    'lr': 1e-3,  # higher learning rate for the custom layers
}
optimizer = optim.AdamW([backbone_group, head_group], weight_decay=0.01)

#optimizer = optim.AdamW(weight_decay=0.01)
#optimizer = optim.Adam(predictionModel.parameters())

## Train

In [30]:
batch = next(iter(dataloader_train))


In [31]:
feats = batch['features'].to(device)
feats

tensor([[ 3.3400e+02,  2.7320e+03,  9.7900e+02,  6.3000e+01,  5.0000e+02,
          2.2000e+02,  8.0000e+00,  4.5000e+01,  1.8000e+01,  1.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  1.0000e+00,  5.0000e+00,
          0.0000e+00,  3.0000e+00,  0.0000e+00,  5.0000e+00,  1.6000e+01,
          9.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,  0.0000e+00,
          1.0000e+00,  0.0000e+00,  2.0000e+00,  3.0000e+00,  6.0000e+00,
          2.0000e+00,  3.0000e+00,  1.2000e+01,  8.3000e+01,  6.1900e+02,
          3.0200e+02,  1.1000e+01,  3.9000e+01,  5.3000e+01,  7.0359e-01,
          7.3792e-01,  5.9142e-01,  1.4970e-02,  2.0132e-02,  2.3493e-02,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
         -6.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00, -3.7500e-02,  5.6452e-02, -1.5357e-01,
          5.7135e-01,  7.3935e-01, -1.6800e-01],
        [ 1.2300e+02,  2.0190e+03,  5.4190e+03,  2.2000e+01,  3

In [32]:
labels = batch['label'].to(device)
labels

tensor([[1.],
        [0.]], device='cuda:0')

In [33]:
labels[:, -1]

tensor([1., 0.], device='cuda:0')

In [34]:
# Sanity-check forward pass on the sampled batch: model in eval mode, gradient
# tracking disabled, and only the three model inputs moved to the device.
predictionModel.to(device)
predictionModel.eval()

model_inputs = {
    key: batch[key].to(device)
    for key in ('input_ids', 'attention_mask', 'features')
}
with torch.no_grad():
    logits = predictionModel(**model_inputs)

The 'batch_size' argument of HybridCache is deprecated and will be removed in v4.49. Use the more precisely named 'max_batch_size' argument instead.
The 'batch_size' attribute of HybridCache is deprecated and will be removed in v4.49. Use the more precisely named 'self.max_batch_size' attribute instead.


In [35]:
logits

tensor([[0.4768],
        [0.4752]], device='cuda:0')

In [36]:
logits[:, -1]

tensor([0.4768, 0.4752], device='cuda:0')

In [37]:
# Threshold the model outputs at 0.5 to obtain hard 0/1 predictions.
# NOTE(review): despite the name, `logits` appears to hold probabilities (the
# values shown lie in (0, 1) and BCELoss is applied to them later) — confirm
# that the model's final layer already applies a sigmoid.
normLogits = (logits>0.5).float()
normLogits

tensor([[0.],
        [0.]], device='cuda:0')

In [38]:
predictions = normLogits
true_labels = labels

In [39]:
labels.size(0)

2

In [40]:
predictions

tensor([[0.],
        [0.]], device='cuda:0')

In [41]:
true_labels

tensor([[1.],
        [0.]], device='cuda:0')

In [42]:
# Batch accuracy: count of predictions equal to the label, divided by the
# number of examples in the batch.
(predictions == true_labels).sum().item() / labels.size(0)

0.5

In [43]:
# Binary cross-entropy between the model outputs and the labels for this batch.
criterion = nn.BCELoss()
loss = criterion(logits, labels)
loss

tensor(0.6927, device='cuda:0')

In [44]:
predictionModel.gemma_model.config.hidden_size

2304

In [45]:
tokenizer.decode(batch['input_ids'][0])

'<bos><prompt>: A long cylindrical conductor (radius = 1.0 mm) carries a charge density of +8 pC/m and is inside a coaxial, hollow, cylindrical conductor (inner radius = 3.0 mm, outer radius = 5.0 mm) that has a total charge density of -2 pC/m.\n\nWhat is the magnitude of the electric field 2.0 mm from the axis of these conductors?\nAnswer for part 1\n\n\n<response_a>: ## Step 1:  Identify the type of problem and relevant equations.\nThis problem involves finding the electric field outside a long cylindrical conductor due to a charge distribution. We will use the method of images to solve this. The electric field due to a long cylindrical conductor is given by the Biot-Savart law and is proportional to 1/r. For a coaxial conductor, we also need to consider the field due to the inner conductor.\n\n## Step 2:  Determine the charge distribution and the distance to calculate the electric field.\nThe long cylindrical conductor has a charge density of +8 pC/m, and the coaxial conductor has a

In [46]:
# `stop` is an undefined name, so this cell raises NameError.
# NOTE(review): presumably a deliberate guard to keep "Run All" from reaching
# the training cells below — confirm, and consider an explicit flag instead.
stop

NameError: name 'stop' is not defined

In [None]:
# Run training with the train/validation loaders and the optimizer built above;
# no learning-rate scheduler is passed. The return value is kept as `history`
# for plotting.
history = Utils.train_model(predictionModel, dataloader_train, dataloader_valid, optimizer, config, scheduler=None, device=device)

In [None]:
# Plot the training history returned by the training call.
# NOTE(review): the string argument is misspelled ("Trainning"); it is left
# unchanged because how Utils.plot_model_history uses it is not visible here.
Utils.plot_model_history(history, "Trainning History")

In [None]:
#!runpodctl remove pod $RUNPOD_POD_ID