In [1]:
import os
import copy
from dataclasses import dataclass

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
from torch import Tensor

#from datasets import Dataset

from tqdm import tqdm

from transformers import (
    BitsAndBytesConfig,
    Gemma2ForSequenceClassification,
    Gemma2Model,
    GemmaTokenizerFast,
    Gemma2Config,
    AutoTokenizer,
    AutoModel,
    PreTrainedTokenizerBase, 
    EvalPrediction,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
)

from peft import LoraModel, PeftModel, LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType

from sklearn.metrics import log_loss, accuracy_score
from sklearn.model_selection import train_test_split

import ModelsUtils as Utils

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Probe CUDA once, derive the compute device from it, then report the environment.
cuda_available = torch.cuda.is_available()
device = torch.device("cuda" if cuda_available else "cpu")

print('Torch version:', torch.__version__)
print('Torch is build with CUDA:', cuda_available)
print(f'Torch device : {device}')
print('------------------------------')

Torch version: 2.5.1+cu118
Torch is build with CUDA: True
Torch device : cuda
------------------------------


## Config

In [3]:
# Run-level switches.
MINI_RUN = True   # True -> the "MINI" preprocessed file is used, False -> "FULL"
SEED = 707        # seed applied below to PyTorch and NumPy
ORIGINAL = True   # True -> the "original" file variant is used, False -> "custom"

# Apply the seed so stochastic steps (e.g. adapter initialisation, dropout) are repeatable
# after a kernel restart. SEED was previously defined but never used.
# np.random.seed comes last because it returns None, so the cell shows no output.
torch.manual_seed(SEED)
np.random.seed(SEED)

In [4]:
# File-name fragments derived from the run switches; both are interpolated into the CSV path.
runType = {True: "MINI", False: "FULL"}[bool(MINI_RUN)]
original = {True: "original", False: "custom"}[bool(ORIGINAL)]

In [5]:
#---------------------------------------------------------------------------------------------------
@dataclass
class Config:
    """Hyper-parameters and paths for this notebook.

    Declared as a dataclass so that individual fields can be overridden at
    construction time (e.g. ``Config(max_length=512)``); ``Config()`` with no
    arguments yields the same values as before.
    """
    output_dir: str = "output"
    checkpoint: str = "unsloth/gemma-2-9b-it-bnb-4bit"  # 4-bit quantized gemma-2-9b-instruct
    max_length: int = 256
    n_splits: int = 5
    fold_idx: int = 0
    optim_type: str = "adamw_8bit"
    per_device_train_batch_size: int = 2
    # Effective batch size per device is 2 * 2 = 4. The earlier note said the global batch
    # size is 8, which only holds with two devices - TODO confirm the device count.
    gradient_accumulation_steps: int = 2
    per_device_eval_batch_size: int = 8
    n_epochs: int = 1
    # There are 42 layers in total; the intent is to add no adapters to the first 16 layers.
    # NOTE: currently not applied - `layers_to_transform` is commented out in the LoRA config.
    freeze_layers: int = 16
    lr: float = 2e-4
    warmup_steps: int = 20
    lora_r: int = 16
    lora_alpha: float = lora_r * 2  # evaluated once in the class body: 32 for the default lora_r
    lora_dropout: float = 0.05
    lora_bias: str = "none"
    # Whether to apply max_length//3 on each input or max_length on the concatenated input.
    spread_max_length: bool = False

config = Config()

## LoRA Config

In [6]:
# LoRA adapter settings; every numeric knob comes from `config`.
lora_config = LoraConfig(
    task_type=TaskType.FEATURE_EXTRACTION,  # alternative kept from the original note: SEQ_CLS
    r=config.lora_r,
    lora_alpha=config.lora_alpha,
    lora_dropout=config.lora_dropout,
    bias=config.lora_bias,
    # adapters are attached to the self-attention projections only
    target_modules=["q_proj", "k_proj", "v_proj"],
    #layers_to_transform=[i for i in range(42) if i >= config.freeze_layers],
)

___________________________________________________________________________

## Data

In [7]:
# The file name is selected by the run switches (MINI/FULL and original/custom).
train_csv_path = f'../Data/Preprocessed/train_preprocessed_{runType}_{original}.csv'
df = pd.read_csv(train_csv_path)
df.head(1)

Unnamed: 0,id,prompt,response_a,response_b,winner,model_a,model_b,language,encode_fail,class_label,prompt_len,response_a_len,response_b_len
0,6e09f2c1136cd1540b627393f8ec1f583618a68af19b4a...,tạo cho tôi filter giá tăng giần hoặc giảm giầ...,Dưới đây là một ví dụ về cách tạo bộ lọc giá t...,"```jsx\nimport React, { useState, useEffect } ...",model_b,claude-3-5-sonnet-20240620,gemini-1.5-pro-002,Vietnamese,False,1,73,2063,3469


In [8]:
# NOTE(review): disabled preprocessing step, kept for reference only.
# As written it would eval() each text field (with `null` mapped to "") and join the
# resulting items with single spaces.
# eval() on file contents is unsafe; if this is ever revived, a real parser should be
# used instead - TODO confirm the stored field format first.
#def process_text(text: str) -> str:
#    return " ".join(eval(text, {"null": ""}))
#
#df.loc[:, 'prompt'] = df['prompt'].apply(process_text)
#df.loc[:, 'response_a'] = df['response_a'].apply(process_text)
#df.loc[:, 'response_b'] = df['response_b'].apply(process_text)
#
#display(df.head(5))

In [9]:
# Force the three text columns to Python strings.
# Note that astype(str) turns missing values into the literal text "nan".
for text_column in ('prompt', 'response_a', 'response_b'):
    df[text_column] = df[text_column].astype(str)

## Tokenize

In [10]:
%%time

#tokenizer = GemmaTokenizerFast.from_pretrained(config.checkpoint)
tokenizer = AutoTokenizer.from_pretrained('BAAI/bge-multilingual-gemma2')
tokenizer.add_eos_token = True      # We'll add <eos> at the end
tokenizer.padding_side = "right"

CPU times: total: 891 ms
Wall time: 1.03 s


In [11]:
# NOTE(review): `stop` is not defined anywhere, so this cell raises NameError.
# It appears to be a deliberate way of aborting "Run All" before the model-building
# cells below - confirm, and consider an explicit guard instead.
stop

NameError: name 'stop' is not defined

## Model

### Base Model (BAAI/bge-multilingual-gemma2)

In [None]:
# 4-bit settings; only referenced by the commented-out `quantization_config=` arguments below.
quantization_config = BitsAndBytesConfig(load_in_4bit=True)

# Candidate checkpoints: "google/gemma-2-9b-it" "BAAI/bge-multilingual-gemma2"
#model = Gemma2Model.from_pretrained("BAAI/bge-multilingual-gemma2", quantization_config=quantization_config)

# Load the base model in half precision on the CPU.
gemma_base = AutoModel.from_pretrained(
    'BAAI/bge-multilingual-gemma2',
    device_map="cpu",
    torch_dtype=torch.float16,
    #quantization_config=quantization_config
)

# Alternative kept for reference: automatic device placement.
#gemma_base = AutoModel.from_pretrained('BAAI/bge-multilingual-gemma2', 
#            torch_dtype=torch.float16, 
#            device_map="auto", 
#            #quantization_config=quantization_config
#            )


Loading checkpoint shards: 100%|██████████| 4/4 [01:38<00:00, 24.67s/it]


In [None]:
#save base model
# The directory below is the one later passed as `baseModelPath` in the Load section.

save_path = '../BaseModel/bge-multilingual-gemma2-fp16'
#save_path = '../BaseModel/bge-multilingual-gemma2-fp16-4bit'

# Model weights and tokenizer files are written into the same directory.
gemma_base.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

('../BaseModel/bge-multilingual-gemma2-fp16\\tokenizer_config.json',
 '../BaseModel/bge-multilingual-gemma2-fp16\\special_tokens_map.json',
 '../BaseModel/bge-multilingual-gemma2-fp16\\tokenizer.model',
 '../BaseModel/bge-multilingual-gemma2-fp16\\added_tokens.json',
 '../BaseModel/bge-multilingual-gemma2-fp16\\tokenizer.json')

In [None]:
# Turn off `use_cache` on the model config before wrapping the model for fine-tuning.
gemma_base.config.use_cache = False
# NOTE(review): the load cell above has `quantization_config` commented out, so the model
# is not loaded quantized here - confirm prepare_model_for_kbit_training is still intended.
# This line rebinds `gemma_base`, so re-running the cell feeds the already-prepared model back in.
gemma_base = prepare_model_for_kbit_training(gemma_base)
# Attach the LoRA adapters described by `lora_config`.
lora_model = get_peft_model(gemma_base, lora_config)
#lora_model

In [None]:
# Report how many parameters are trainable after the LoRA wrap versus the total count.
lora_model.print_trainable_parameters()

trainable params: 12,730,368 || all params: 9,254,443,520 || trainable%: 0.1376


In [None]:
# Wrap the LoRA-adapted model in the project's PreferencePredictionModel (from ModelsUtils).
# The meaning of feature_dim=4 and num_classes=2 is defined in that module, not in this notebook.
predictionModel_original = Utils.PreferencePredictionModel(gemma_model=lora_model, feature_dim=4, num_classes=2)

In [None]:
# Display the wrapped module tree to inspect where the LoRA layers were injected.
predictionModel_original.gemma_model.base_model

LoraModel(
  (model): Gemma2Model(
    (embed_tokens): Embedding(256002, 3584, padding_idx=0)
    (layers): ModuleList(
      (0-41): 42 x Gemma2DecoderLayer(
        (self_attn): Gemma2Attention(
          (q_proj): lora.Linear(
            (base_layer): Linear(in_features=3584, out_features=4096, bias=False)
            (lora_dropout): ModuleDict(
              (default): Dropout(p=0.05, inplace=False)
            )
            (lora_A): ModuleDict(
              (default): Linear(in_features=3584, out_features=16, bias=False)
            )
            (lora_B): ModuleDict(
              (default): Linear(in_features=16, out_features=4096, bias=False)
            )
            (lora_embedding_A): ParameterDict()
            (lora_embedding_B): ParameterDict()
            (lora_magnitude_vector): ModuleDict()
          )
          (k_proj): lora.Linear(
            (base_layer): Linear(in_features=3584, out_features=2048, bias=False)
            (lora_dropout): ModuleDict(
           

## Save

In [None]:
# Save through the project helper in ModelsUtils. The Load section below passes the same
# directory ("../Checkpoints/") and checkpoint name ("test3") to read it back.
Utils.custom_save_model_chkpt(predictionModel_original, savePath="../Checkpoints/", checkpointName="test3")

In [None]:
# Adam over every parameter of the wrapper, with the optimizer's default hyper-parameters.
# NOTE(review): `config.lr` and `config.optim_type` are not used here - confirm this is intended.
optimizer = optim.Adam(predictionModel_original.parameters())

## Load

In [12]:
# Quantization is explicitly switched off for the reload.
quantization_config = BitsAndBytesConfig(load_in_4bit=False)

# Rebuild the prediction model from the saved base model plus the "test3" checkpoint.
# No optimizer is passed to the project loader.
predictionModelLoaded = Utils.custom_load_model_chkpt(
    baseModelPath='../BaseModel/bge-multilingual-gemma2-fp16',
    peftModelPath="../Checkpoints/",
    checkpointName="test3",
    quantization_config=quantization_config,
    optimizer=None,
)


Loading checkpoint shards: 100%|██████████| 4/4 [00:00<00:00,  7.46it/s]
  checkpoint = torch.load(f'{loadPath}/PreferencePredictionModel.pt')


In [13]:
# Sanity check: the reloaded model should report the same trainable/total counts as before saving.
predictionModelLoaded.gemma_model.print_trainable_parameters()

trainable params: 12,730,368 || all params: 9,254,443,520 || trainable%: 0.1376


In [14]:
# Display the full module tree of the reloaded model for visual comparison with the original.
predictionModelLoaded

PreferencePredictionModel(
  (gemma_model): PeftModelForFeatureExtraction(
    (base_model): LoraModel(
      (model): Gemma2Model(
        (embed_tokens): Embedding(256002, 3584, padding_idx=0)
        (layers): ModuleList(
          (0-41): 42 x Gemma2DecoderLayer(
            (self_attn): Gemma2Attention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=3584, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=3584, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
         