## Imports

In [1]:
# gemma-2 is available from transformers>=4.42.3

import transformers as trsf
print("Transformers:", trsf.__version__)

#!pip install -U "transformers>=4.42.3" bitsandbytes accelerate peft

Transformers: 4.47.1


In [2]:
import os
import copy
from dataclasses import dataclass
import pickle

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
from torch import Tensor

#from datasets import Dataset

from tqdm import tqdm

from transformers import (
    BitsAndBytesConfig,
    Gemma2ForSequenceClassification,
    Gemma2Model,
    GemmaTokenizerFast,
    Gemma2Config,
    AutoTokenizer,
    AutoModel,
    AutoModelForSequenceClassification,
    PreTrainedTokenizerBase, 
    EvalPrediction,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
    get_cosine_schedule_with_warmup,
)

from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType

from sklearn.metrics import log_loss, accuracy_score
from sklearn.model_selection import train_test_split

import ModelsUtils as Utils
import Configurations as Configs
#import wsdm_modelutils as Utils

In [3]:
import peft as pft
print("Peft:", pft.__version__)

Peft: 0.14.0


In [4]:
print('Torch version:', torch.__version__)
print('Torch is build with CUDA:', torch.cuda.is_available())
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'Torch device : {device}')
print('------------------------------')

Torch version: 2.5.1+cu118
Torch is build with CUDA: True
Torch device : cuda
------------------------------


## Config

In [5]:
config_file = 'Configs.py'
manager = Configs.ConfigManager(config_file)

config = manager.micro
#config = manager.runpod_1
#load_from_config = manager.save_load_gemma2_2b_fp16
#load_from_config = manager.save_load_gemma2_2b_fp16_hidden_512
config.config_name

'micro_gemma2_2b_fp16_4bit'

### Paths

In [6]:
base_model_path = config.basemodel_path

peft_model_path = '../Checkpoints/'
checkpoint_name = "Original_notrain"
dataframe_path = config.train_data


## LoRA Config

In [7]:
lora_config = LoraConfig(
    r=config.lora_r,
    lora_alpha=config.lora_alpha,
    # only target self-attention
    target_modules=["q_proj", "k_proj", "v_proj"],
    #layers_to_transform=[i for i in range(42) if i >= config.freeze_layers],
    lora_dropout=config.lora_dropout,
    bias=config.lora_bias,
    task_type=TaskType.SEQ_CLS
)

# lora_config = LoraConfig(
#     r=config.lora_r,
#     lora_alpha=config.lora_alpha,
#     # only target self-attention
#     target_modules=["q_proj", "k_proj", "v_proj"],
#     #target_modules=["all-linear"],
#     #target_modules=["self_attn"],
#     layers_to_transform=[i for i in range(config.max_layers) if i >= config.freeze_layers],
#     #layers_to_transform=[0],
#     layers_pattern="layers",
#     lora_dropout=config.lora_dropout,
#     bias=config.lora_bias,
#     #task_type=TaskType.FEATURE_EXTRACTION, #SEQ_CLS
# )

## Data

In [8]:
df = pd.read_csv(dataframe_path)
df.head(1)

  df = pd.read_csv(dataframe_path)


Unnamed: 0,id,prompt,response_a,response_b,winner,language,class_label,prompt_len,response_a_len,response_b_len,...,prompt_chinese,prompt_round_balance,prompt_curly_balance,prompt_json,prompt_sentiment,response_a_sentiment,response_b_sentiment,cosine_similarity_a,cosine_similarity_b,cosine_similarity_diff
0,53567,What is the difference between marriage licens...,A marriage license is a legal document that al...,A marriage license and a marriage certificate ...,model_b,,1,192,3096,3592,...,0.0,0,0,0,0.077778,0.058469,0.139458,0.705321,0.629803,0.075518


In [9]:
df['prompt'] = df['prompt'].astype(str)
df['response_a'] = df['response_a'].astype(str)
df['response_b'] = df['response_b'].astype(str)

## Tokenize

In [10]:
tokenizer = AutoTokenizer.from_pretrained(base_model_path)
tokenizer.add_eos_token = True      # We'll add <eos> at the end
tokenizer.padding_side = "right"

In [11]:
df = df.sample(frac=config.sample_size, random_state=config.random_seed)

In [12]:
df.shape

(175, 70)

In [13]:
def tokenize_df(row):
    return Utils.tokenize(tokenizer, [row['prompt']], [row['response_a']], [row['response_b']], max_length=config.max_length)

df['tokens'] = df.apply(tokenize_df, axis=1)

In [14]:
df

Unnamed: 0,id,prompt,response_a,response_b,winner,language,class_label,prompt_len,response_a_len,response_b_len,...,prompt_round_balance,prompt_curly_balance,prompt_json,prompt_sentiment,response_a_sentiment,response_b_sentiment,cosine_similarity_a,cosine_similarity_b,cosine_similarity_diff,tokens
52573,469a23f177362d53b509d6c3242f35da8d152d2d0773e1...,根据下面的描述生成一个病历：\n\n扁鹊见蔡桓公，立有间，扁鹊曰：“君有疾在腠理，不治将恐深...,**患者姓名**: 蔡桓公 \n**性别**: 男 \n**年龄**: 不详 \n**...,## 蔡桓公病历\n\n**患者姓名:** 蔡桓公\n\n**性别:** 男\n\n**年龄...,model_b,Chinese,1,289,543,711,...,0,0,0,0.000000,0.000000,0.000000,0.000000,0.122169,-0.122169,"{'input_ids': [[tensor(2), tensor(235322), ten..."
6383,695419302,Please exmplain what are the various form of t...,Transfer learning is a technique in machine le...,Transfer learning is a machine learning techni...,model_a,,0,116,2439,2637,...,0,0,0,0.000000,0.123044,0.086212,0.709314,0.726687,-0.017373,"{'input_ids': [[tensor(2), tensor(235322), ten..."
37366,4093550403,I am a physically challenged girl and an inte...,As a physically challenged and internally disp...,Title: Unleashing Potential Through Entreprene...,model_b,,1,1122,5164,5814,...,0,0,0,-0.069118,0.074009,0.053008,0.867659,0.838861,0.028798,"{'input_ids': [[tensor(2), tensor(235322), ten..."
20324,2206386265,Pretend you are a girl that is reacting to you...,"Mother: ""Dear, we need to talk about your outf...","Mother: ""Sweetie, I've picked out the perfect ...",model_b,,1,200,1028,939,...,0,0,0,-0.800000,0.241898,0.187642,0.731589,0.733099,-0.001510,"{'input_ids': [[tensor(2), tensor(235322), ten..."
57177,5ed610ae2199a489f0f03453d394061308e5fb48bacac1...,Какие автотрассировщики есть для Altium 24?,"Для Altium Designer 24, как и для предыдущих в...",В Altium Designer 24 для автоматической прокла...,model_a,Russian,0,43,1762,1317,...,0,0,0,0.000000,0.066667,0.000000,0.606370,0.792594,-0.186224,"{'input_ids': [[tensor(2), tensor(235322), ten..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
86315,f9215e484f2f9520bf0060ecdf0d677699f721681a90bf...,"Help me rephrase the paragraph below: \n\n""""""\...",Here's a rephrased version of the paragraph:\n...,Certainly! Here's a rephrased version of the p...,model_a,English,0,1714,1268,1745,...,0,0,0,0.198052,0.053571,0.104345,0.700518,0.924211,-0.223693,"{'input_ids': [[tensor(2), tensor(235322), ten..."
38667,4234203318,What does it mean when a girl looks at you str...,"Ah, my dear anime fan, as a guru anime girl lo...",Oh ho ho! It seems you've caught the eye of a ...,model_a,,0,319,4909,2109,...,0,0,0,0.067500,0.086618,0.293948,0.748433,0.537206,0.211227,"{'input_ids': [[tensor(2), tensor(235322), ten..."
42446,10e2c4f5dedc8ab49f090b7de2c5996782f3233f757637...,What is tres leche,"Tres Leches Cake, or ""Pastel de Tres Leches"" i...",Tres leches is a Latin American dessert that t...,model_b,English,1,18,2867,949,...,0,0,0,0.000000,0.178636,0.219444,0.707107,0.643726,0.063380,"{'input_ids': [[tensor(2), tensor(235322), ten..."
22992,2493251791,Teach me about probability,Probability is a branch of mathematics that de...,Probability is a branch of mathematics that de...,model_b,,1,26,1471,2474,...,0,0,0,0.000000,0.100794,0.105864,0.570214,0.500000,0.070214,"{'input_ids': [[tensor(2), tensor(235322), ten..."


## Split

In [15]:
df_train, df_valid = train_test_split(df, test_size=0.1, random_state=config.random_seed)

## Data Loader

In [16]:

#-------------------------------------------------------------------
class ChatbotArenaDataset(Dataset):
    def __init__(self, dataframe, tokenizer, test=False, max_length=256):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.num_classes = 2
        self.test = test

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]

        # Tokenize the text
        #tokens = Utils.tokenize(self.tokenizer, [row['prompt']], [row['response_a']], [row['response_b']], max_length=self.max_length)
        tokens = row['tokens']
        
        if not self.test:
            # Label
            label = torch.nn.functional.one_hot(torch.tensor(row['class_label']), num_classes=self.num_classes).float()
            #label = torch.tensor([row['class_label']]).float()

            return {
                'input_ids': tokens['input_ids'].squeeze(0),
                'attention_mask': tokens['attention_mask'].squeeze(0),
                'label': label
            }
        else:
            return {
                'input_ids': tokens['input_ids'].squeeze(0),
                'attention_mask': tokens['attention_mask'].squeeze(0),
            }



In [17]:
# Prepare dataset and dataloader
dataset_train = ChatbotArenaDataset(df_train, tokenizer, max_length=config.max_length)
dataloader_train = Utils.DataLoader(dataset_train, batch_size=config.train_batch, shuffle=True)

dataset_valid = ChatbotArenaDataset(df_valid, tokenizer, max_length=config.max_length)
dataloader_valid = Utils.DataLoader(dataset_valid, batch_size=config.eval_batch, shuffle=True)

## Model

In [18]:
quantization_config=BitsAndBytesConfig(
    load_in_4bit=True,
    #bnb_4bit_compute_dtype=torch.bfloat16,
    #bnb_4bit_quant_type='nf4',
    #bnb_4bit_use_double_quant=False,
    )

model_base = AutoModelForSequenceClassification.from_pretrained(config.transformers_basemodel_path, 
            #torch_dtype=torch.float16,
            device_map=device, 
            quantization_config=quantization_config,
            )

model_base.config.use_cache = False
model_base = prepare_model_for_kbit_training(model_base)
lora_model = get_peft_model(model_base, lora_config)

# predictionModel = Utils.custom_load_model_chkpt(
#                         config,
#                         checkpointName="Original_notrain",
#                         device=device
#                         )

Some weights of Gemma2ForSequenceClassification were not initialized from the model checkpoint at unsloth/gemma-2-2b and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
lora_model.print_trainable_parameters()


trainable params: 4,583,936 || all params: 2,618,930,432 || trainable%: 0.1750


In [20]:
lora_model

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): Gemma2ForSequenceClassification(
      (model): Gemma2Model(
        (embed_tokens): Embedding(256000, 2304, padding_idx=0)
        (layers): ModuleList(
          (0-25): 26 x Gemma2DecoderLayer(
            (self_attn): Gemma2Attention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=2304, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2304, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDic

In [21]:

#-------------------------------------------------------------------
# Evaluation (used for trainning)
def evaluate_model(model, dataloader, device="cuda"):
    model = model.to(device)
    model.eval()
    total_loss = 0.0
    correct = 0
    total_samples = 0

    loss_fn = nn.CrossEntropyLoss()

    with torch.no_grad():
        for batch in tqdm(dataloader, total=len(dataloader), unit='row'):

            logits = model(
                input_ids=batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
            ).logits

            labels = batch['label'].to(device)  # One-hot encoded labels

            # Compute loss
            loss = loss_fn(logits, labels)
            total_loss += loss.item()

            # Compute predictions and accuracy
            normLogits = (logits>0.5).float()
            predictions = torch.argmax(logits, dim=1)  # Class with highest score
            true_labels = torch.argmax(labels, dim=1)  # Convert one-hot to class indices
            
            correct += (predictions == true_labels).sum().item()
            total_samples += labels.size(0)

    # Calculate average loss and accuracy
    avg_loss = total_loss / len(dataloader)
    accuracy = correct / total_samples

    return {
        'loss': avg_loss,
        'accuracy': accuracy
    }



#-------------------------------------------------------------------
def train_model(model, dataloader, valid_dataloader, optimizer, config, scheduler=None,  device="cuda"):
    model = model.to(device)
    model.train()
    min_val_loss = float('inf') #checkpoint
    min_acc = 0
    history = {"train_accum_loss" : [], "train_accum_accuracy" : [], "valid_loss" : [], 
                "valid_accuracy" : []}
    history["best_epoch"]=0
    history["best_loss"]=0
    history["best_acc"]=0

    loss_fn = nn.CrossEntropyLoss()

    for epoch in range(config.n_epochs):
        total_loss = 0
        model.train()
        correct = 0
        total_samples = 0
        
        for batch in tqdm(dataloader, total=len(dataloader), unit='row'):
            optimizer.zero_grad()
            
            logits = model(
                input_ids=batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
            ).logits
            
            # One-hot labels
            labels = batch['label'].to(device)
        
            loss = loss_fn(logits, labels)
        
            loss.backward()
            optimizer.step()
            if scheduler is not None:
                scheduler.step()
            
            total_loss += loss.item()
            
            # Compute predictions and accuracy
            normLogits = (logits>0.5).float()
            predictions = torch.argmax(logits, dim=1)  # Class with highest score
            true_labels = torch.argmax(labels, dim=1)  # Convert one-hot to class indices
            
            correct += (predictions == true_labels).sum().item()
            total_samples += labels.size(0)
        
        # add date and hour + epochs in checkpoint_name
        # Calculate average loss and accuracy
        avg_loss = total_loss / len(dataloader)
        accuracy = correct / total_samples

        metrics = evaluate_model(model, valid_dataloader, device=device)
        
        print(f"Epoch {epoch + 1} Finished")
        print(f"Accumulated Train Loss: {avg_loss}")
        print(f"Accumulated Train Accuracy: {accuracy}")
        print(f"Valid Loss: {metrics['loss']}, Valid Accuracy : {metrics['accuracy']}")
        
    return history

## Train

In [22]:
optimizer = optim.AdamW(lora_model.parameters(), weight_decay=0.01)


In [23]:
history = train_model(lora_model, dataloader_train, dataloader_valid, optimizer, config, scheduler=None, device=device)

  0%|          | 0/79 [00:00<?, ?row/s]

  return fn(*args, **kwargs)
 33%|███▎      | 26/79 [01:28<02:59,  3.40s/row]


KeyboardInterrupt: 

In [22]:
Utils.plot_model_history(history, "Trainning History")

KeyError: 'config_name'

In [23]:
#!runpodctl remove pod $RUNPOD_POD_ID