## Imports

In [1]:
# Gemma-2 is only available from transformers>=4.42.3, so print the installed
# version to record the environment this run used.

import transformers as trsf
print("Transformers:", trsf.__version__)

# If the installed version is too old, upgrade from inside the kernel:
#%pip install -U "transformers>=4.42.3" bitsandbytes accelerate peft

Transformers: 4.47.1


In [2]:
import os
import copy
from dataclasses import dataclass
import pickle

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
from torch import Tensor

#from datasets import Dataset

from tqdm import tqdm

from transformers import (
    BitsAndBytesConfig,
    Gemma2ForSequenceClassification,
    Gemma2Model,
    GemmaTokenizerFast,
    Gemma2Config,
    AutoTokenizer,
    AutoModel,
    PreTrainedTokenizerBase, 
    EvalPrediction,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
)

from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType

from sklearn.metrics import log_loss, accuracy_score
from sklearn.model_selection import train_test_split

import ModelsUtils as Utils
import Configurations as Configs
#import wsdm_modelutils as Utils

In [3]:
# Record the installed PEFT version alongside the transformers version above.
import peft as pft
print("Peft:", pft.__version__)

Peft: 0.14.0


In [4]:
# Pick the compute device once (GPU when CUDA is usable, CPU otherwise) and
# report the PyTorch build, so later cells all move data to the same place.
cuda_available = torch.cuda.is_available()
if cuda_available:
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

print(
    f'Torch version: {torch.__version__}',
    f'Torch is build with CUDA: {cuda_available}',
    f'Torch device : {device}',
    '------------------------------',
    sep='\n',
)

Torch version: 2.5.1+cu118
Torch is build with CUDA: True
Torch device : cuda
------------------------------


## Config

In [5]:
# ConfigManager is a project helper (Configurations module); presumably it
# exposes the named run configurations found in `config_file` - confirm there.
config_file = 'Configs.py'
manager = Configs.ConfigManager(config_file)

# Active run configuration. Swap the assignment below to change experiments;
# the commented lines are alternative configurations kept for reference.
config = manager.micro_test
#config = manager.runpod_1
#load_from_config = manager.save_load_gemma2_2b_fp16
#load_from_config = manager.save_load_gemma2_2b_fp16_hidden_512
config.config_name

'micro_gemma2_2b_fp16_4bit'

### Paths

In [6]:
# Base model location, used by the tokenizer loader below. Taken from the
# active config; the commented lines are the Kaggle equivalents.
#base_model_path = '/kaggle/input/bge-multilingual-gemma2-fp16/pytorch/default/1/bge-multilingual-gemma2-fp16'
#base_model_path = '/kaggle/input/bge-multilingual-gemma2-fp16/pytorch/default/1/bge-multilingual-gemma2-fp16-4bit'
base_model_path = config.basemodel_path


# Locally saved PEFT checkpoints: directory, and the checkpoint name (the same
# name is passed to the model loader in the Model section).
#peft_model_path = '/kaggle/input/peftchkpt_original/pytorch/default/1/'
peft_model_path = '../Checkpoints/'
checkpoint_name = "Original_notrain"

# Preprocessed training CSV; the path comes from the active config.
#df = pd.read_csv('/kaggle/input/wsdm-preprocessed-full-original/train_preprocessed_FULL_original.csv')
#dataframe_path = '/kaggle/input/train-preprocessed-mini-original/train_preprocessed_MINI_original.csv'
dataframe_path = config.train_data


## Training Args

In [7]:
#training_args = TrainingArguments(
#    output_dir="output",
#    overwrite_output_dir=True,
#    report_to="none",
#    num_train_epochs=config.n_epochs,
#    per_device_train_batch_size=config.train_batch,
#    gradient_accumulation_steps=config.gradient_accumulation_steps,
#    per_device_eval_batch_size=config.eval_batch,
#    logging_steps=10,
#    eval_strategy="epoch",
#    save_strategy="steps",
#    save_steps=200,
#    optim=config.optim_type,
#    fp16=True,
#    learning_rate=config.lr,
#    warmup_steps=config.warmup_steps,
#)

## LoRA Config

In [8]:
# LoRA adapters are attached to the self-attention query/key/value
# projections only.
lora_target_modules = ["q_proj", "k_proj", "v_proj"]

# Rank, scaling, dropout and bias handling all come from the active config.
lora_config = LoraConfig(
    task_type=TaskType.FEATURE_EXTRACTION,  # alternative: SEQ_CLS
    target_modules=lora_target_modules,
    r=config.lora_r,
    lora_alpha=config.lora_alpha,
    lora_dropout=config.lora_dropout,
    bias=config.lora_bias,
    # Optional: restrict the adapters to the layers that are not frozen.
    #layers_to_transform=[i for i in range(42) if i >= config.freeze_layers],
)

___________________________________________________________________________

## Data

In [9]:
# Load the preprocessed training table and preview its first row.
df = pd.read_csv(dataframe_path)
df.head(1)

  df = pd.read_csv(dataframe_path)


Unnamed: 0,id,prompt,response_a,response_b,winner,language,class_label,prompt_len,response_a_len,response_b_len,...,prompt_chinese,prompt_round_balance,prompt_curly_balance,prompt_json,prompt_sentiment,response_a_sentiment,response_b_sentiment,cosine_similarity_a,cosine_similarity_b,cosine_similarity_diff
0,53567,What is the difference between marriage licens...,A marriage license is a legal document that al...,A marriage license and a marriage certificate ...,model_b,,1,192,3096,3592,...,0.0,0,0,0,0.077778,0.058469,0.139458,0.705321,0.629803,0.075518


In [10]:
# Force the three text columns to plain strings so that missing or non-string
# cells cannot reach the tokenizer as floats/NaN objects.
for text_column in ('prompt', 'response_a', 'response_b'):
    df[text_column] = df[text_column].astype(str)

## Tokenize

In [11]:
# Load the tokenizer stored with the base model.
tokenizer = AutoTokenizer.from_pretrained(base_model_path)
tokenizer.add_eos_token = True      # ask the tokenizer to append <eos> to each sequence
tokenizer.padding_side = "right"    # pad tokens go after the real tokens

In [12]:
# Keep a `config.sample_size` fraction of the rows (seeded, so repeatable).
# NOTE: this rebinds `df`; re-running the cell with a fraction below 1 shrinks
# the frame again, so re-run the load cell first.
df = df.sample(frac=config.sample_size, random_state=config.random_seed)

In [13]:
df.shape

(8765, 70)

## Split

In [14]:
# Hold out 10% of the sampled rows for validation; the split is seeded from
# the config so it is the same on every run.
df_train, df_valid = train_test_split(df, test_size=0.1, random_state=config.random_seed)

## Data Loader

In [38]:
# Prepare dataset and dataloader
dataset_train = Utils.ChatbotArenaDataset(df_train, tokenizer, max_length=2048)
dataloader_train = Utils.DataLoader(dataset_train, batch_size=config.train_batch, shuffle=True)

dataset_valid = Utils.ChatbotArenaDataset(df_valid, tokenizer, max_length=2048)
dataloader_valid = Utils.DataLoader(dataset_valid, batch_size=config.eval_batch, shuffle=True)

In [39]:
len(dataloader_train)

3944

In [40]:
dataset_train[0]

{'input_ids': tensor([     2, 235322,  39038,  ...,      0,      0,      0]),
 'attention_mask': tensor([1, 1, 1,  ..., 0, 0, 0]),
 'features': tensor([ 1.1900e+02,  9.6200e+02,  4.6900e+02,  2.3000e+01,  2.1800e+02,
          1.0600e+02,  1.0000e+00,  1.4000e+01,  4.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  4.0000e+00,  3.0000e+00,  1.0000e+00,  2.2000e+01,
          3.4000e+01,  0.0000e+00,  0.0000e+00,  8.0000e+00,  0.0000e+00,
          0.0000e+00,  8.0000e+00,  0.0000e+00,  1.0000e+01,  9.0000e+00,
          0.0000e+00,  1.0000e+01,  9.0000e+00,  2.5000e+01,  3.0900e+02,
          2.3600e+02,  4.0000e+00,  5.7000e+01,  6.3000e+01,  7.3950e-01,
          6.0499e-01,  3.4968e-01,  1.6807e-02,  1.4553e-02,  1.2793e-02,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  1.7500e

## Model

In [41]:
# Build the preference model and load the saved checkpoint through the project
# helper. The checkpoint name is taken from `checkpoint_name` (Paths section)
# so the value is declared in exactly one place.
predictionModel = Utils.custom_load_model_chkpt(
    config,
    checkpointName=checkpoint_name,
    device=device,
)

Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.


In [42]:
predictionModel

PreferencePredictionModel(
  (gemma_model): PeftModelForFeatureExtraction(
    (base_model): LoraModel(
      (model): Gemma2Model(
        (embed_tokens): Embedding(256000, 2304, padding_idx=0)
        (layers): ModuleList(
          (0-25): 26 x Gemma2DecoderLayer(
            (self_attn): Gemma2Attention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=2304, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2304, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
 

In [43]:
# Two parameter groups with separate learning rates: a very small step for the
# Gemma backbone (with its LoRA adapters) and a larger one for the custom
# classifier head. Weight decay is shared by both groups.
backbone_group = {'params': predictionModel.gemma_model.parameters(), 'lr': 2e-6}
head_group = {'params': predictionModel.classifier.parameters(), 'lr': 1e-3}
optimizer = optim.AdamW([backbone_group, head_group], weight_decay=0.01)

# Alternatives tried earlier, kept for reference:
#optimizer = optim.AdamW(weight_decay=0.01)
#optimizer = optim.Adam(predictionModel.parameters())

## Train

In [44]:
# Pull a single batch from the training loader for the manual sanity checks below.
batch = next(iter(dataloader_train))


In [45]:
# Per-example feature vectors of the sample batch, moved to the compute device.
feats = batch['features'].to(device)
feats

tensor([[ 1.7700e+03,  1.5870e+03,  2.4040e+03,  2.1300e+02,  2.0500e+02,
          3.4500e+02,  5.6000e+01,  4.4000e+01,  5.6000e+01,  1.0000e+00,
          0.0000e+00,  0.0000e+00,  6.0000e+01,  0.0000e+00,  2.0000e+01,
          2.1000e+01,  2.0000e+00,  4.0000e+00,  1.7000e+01,  5.0000e+00,
          1.9000e+01,  5.0000e+00,  0.0000e+00,  0.0000e+00,  5.0000e+00,
          0.0000e+00,  0.0000e+00,  4.2000e+01,  2.0000e+00,  1.6000e+01,
          4.2000e+01,  2.0000e+00,  1.6000e+01,  5.0800e+02,  2.9600e+02,
          5.7400e+02,  1.5000e+01,  1.1000e+01,  6.0000e+00,  6.5480e-01,
          6.8683e-02,  1.3977e-01,  2.7119e-02,  2.1424e-02,  2.6206e-02,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  7.0000e+00,
          2.0000e+00,  2.0000e+00, -5.0000e-01,  0.0000e+00, -5.3333e-01,
          3.1565e-01,  5.6146e-01, -2.4582e-01],
        [ 1.3600e+02,  6.4100e+02,  1.4300e+03,  2.1000e+01,  1

In [46]:
# Targets of the sample batch, moved to the compute device.
labels = batch['label'].to(device)
labels

tensor([[1.],
        [1.]], device='cuda:0')

In [47]:
# Take the last column to view the labels as one flat value per example.
labels[:, -1]

tensor([1., 1.], device='cuda:0')

In [48]:
# Forward pass on the sample batch in evaluation mode, with gradients disabled
# because this is only a sanity check of the model output.
predictionModel.to(device)
predictionModel.eval()

with torch.no_grad():
    # Move the three model inputs to the compute device and pass them by keyword.
    model_inputs = {
        key: batch[key].to(device)
        for key in ('input_ids', 'attention_mask', 'features')
    }
    logits = predictionModel(**model_inputs)

In [49]:
logits

tensor([[0.4935],
        [0.4934]], device='cuda:0')

In [50]:
# Same flattening as for the labels: one model output per example.
logits[:, -1]

tensor([0.4935, 0.4934], device='cuda:0')

In [51]:
# Turn the model outputs into hard 0/1 predictions with a 0.5 threshold.
# NOTE(review): despite the name, `logits` looks like probabilities here (it is
# thresholded at 0.5 and later fed to BCELoss) - confirm in ModelsUtils.
normLogits = (logits>0.5).float()
normLogits

tensor([[0.],
        [0.]], device='cuda:0')

In [52]:
# Aliases used by the accuracy check below.
predictions = normLogits
true_labels = labels

In [53]:
labels.size(0)

2

In [54]:
predictions

tensor([[0.],
        [0.]], device='cuda:0')

In [55]:
true_labels

tensor([[1.],
        [1.]], device='cuda:0')

In [56]:
# Batch accuracy: number of matching predictions divided by the batch size.
(predictions == true_labels).sum().item() / labels.size(0)

0.0

In [57]:
# Binary cross-entropy between the model outputs and the targets.
# nn.BCELoss expects inputs in [0, 1], so the model output is presumably
# already sigmoid-activated - confirm in ModelsUtils.
loss = nn.BCELoss()(logits, labels)
loss

tensor(0.7063, device='cuda:0')

In [58]:
predictionModel.gemma_model.config.hidden_size

2304

In [59]:
# Decode the first example's token ids back to text to check the input formatting.
tokenizer.decode(batch['input_ids'][0])

'<bos><prompt>: Что делает этот скрипт?\n\nimport datetime\nimport json\nfrom time import sleep\nfrom typing import Optional\n\nfrom fastapi import FastAPI, HTTPException, Request, Header\nfrom fastapi.responses import RedirectResponse\nfrom deta import Deta\nimport telebot\n\nfrom bot import bot\nfrom config import config\n\nHOST = config.get(\'host\', \'https://0hkqon.deta.dev\')  # \'https://0hkqon.deta.dev\', \'https://y99nge.deta.dev\'\n\ndeta = Deta()\ndb = deta.Base("stats2")\napp = FastAPI()\n\n\n@app.get("/")\ndef read_root():\n\treturn RedirectResponse(\'/docs\')\n\n\n@app.get("/debug")\ndef go_to_visor():\n\treturn RedirectResponse(\'https://web.deta.sh/home/baterflyrity/default/micros/parking_bot_demo\')\n\n\n@app.get(\'/hook\')\ndef set_webhook():\n\tbot.remove_webhook()\n\tsleep(0.1)\n\tbot.set_webhook(url=f\'{HOST}/reply\')\n\treturn \'Webhook was set.\'\n\n\n@app.get(\'/unhook\')\ndef remove_webhook():\n\tbot.remove_webhook()\n\treturn \'Webhook was removed.\'\n\n\n@app

In [60]:
# Deliberate stop: abort "Run All" here so the long training run below only
# starts when its cell is executed by hand. An explicit raise is used rather
# than a reference to an undefined name, so the halt states its purpose and
# cannot be defeated by a variable that happens to exist in the kernel.
raise RuntimeError("Stopping before training - run the cells below manually.")

NameError: name 'stop' is not defined

In [None]:
# Run the project training loop on the train/validation loaders with the
# optimizer built above; no learning-rate scheduler is used (scheduler=None).
# The returned value is kept as `history` for plotting.
history = Utils.train_model(predictionModel, dataloader_train, dataloader_valid, optimizer, config, scheduler=None, device=device)

In [None]:
# Plot the history returned by the training call, using the project helper.
Utils.plot_model_history(history, "Trainning History")

In [None]:
#!runpodctl remove pod $RUNPOD_POD_ID