In [1]:
!pip install accelerate==0.33.0
!pip install bitsandbytes==0.43.3
!pip install peft==0.12.0 
!pip install transformers==4.44.0

Collecting accelerate==0.33.0
  Downloading accelerate-0.33.0-py3-none-any.whl.metadata (18 kB)
Downloading accelerate-0.33.0-py3-none-any.whl (315 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m315.1/315.1 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0mta [36m0:00:01[0m
[?25hInstalling collected packages: accelerate
  Attempting uninstall: accelerate
    Found existing installation: accelerate 0.34.2
    Uninstalling accelerate-0.34.2:
      Successfully uninstalled accelerate-0.34.2
Successfully installed accelerate-0.33.0
Collecting bitsandbytes==0.43.3
  Downloading bitsandbytes-0.43.3-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Downloading bitsandbytes-0.43.3-py3-none-manylinux_2_24_x86_64.whl (137.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m137.5/137.5 MB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.43.3
Collecting peft==0.1

In [2]:
import os
# Make GPUs 0 and 1 visible to CUDA for this process.
os.environ.update(CUDA_VISIBLE_DEVICES="0,1")

# Version tag; it is embedded in the training output directory and the saved model name.
VER = 157

# The final solution used: USE_QLORA=False, TRAIN_100_PERCENT=True, ADD_33K=True, DEBUG=False.
# The values below are the lightweight debugging setup.
USE_QLORA = True           # load the base model with a 4-bit quantization config
TRAIN_100_PERCENT = False  # True: train on every row instead of holding one fold out
ADD_33K = False            # True: append the extra dataframe `df2` to the training data
DEBUG = True               # True: keep only the first 64 rows

In [3]:
import os
import copy
from dataclasses import dataclass

import numpy as np
import torch
from datasets import Dataset
from transformers import (
    BitsAndBytesConfig,
    Gemma2ForSequenceClassification,
    GemmaTokenizerFast,
    Gemma2Config,
    PreTrainedTokenizerBase, 
    EvalPrediction,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType
from sklearn.metrics import log_loss, accuracy_score

In [4]:
@dataclass
class Config:
    """Hyper-parameters and paths shared by the tokenizer, LoRA and Trainer setup cells."""

    # Output directory name, tagged with the notebook version.
    output_dir: str = f"output-{VER}"
    # Local directory of the pretrained checkpoint (model and tokenizer are both loaded from it).
    checkpoint: str = "/kaggle/input/gemma2-9b-it-fp16"  
    # Token budget per example; longer inputs are truncated by the tokenizer.
    max_length: int = 2048
    # Number of folds (rows are assigned by `index % n_splits`) and the fold held out for evaluation.
    n_splits: int = 5
    fold_idx: int = 0
    # Optimizer name handed to TrainingArguments(optim=...).
    optim_type: str = "adamw_8bit"
    per_device_train_batch_size: int = 2
    # 2 sequences x 4 accumulation steps = 8 sequences per optimizer step (per training process).
    gradient_accumulation_steps: int = 4  # global batch size is 8 
    per_device_eval_batch_size: int = 4
    n_epochs: int = 1
    # Number of leading decoder layers that get NO LoRA adapters (there are 42 layers in total).
    # With 0, every layer is adapted.
    freeze_layers: int = 0
    # Peak learning rate and number of warmup steps.
    lr: float = 2e-4
    warmup_steps: int = 20
    # LoRA hyper-parameters, forwarded unchanged to LoraConfig.
    lora_r: int = 64
    lora_alpha: float = 4 
    lora_dropout: float = 0.05
    lora_bias: str = "none"
    
# Single shared instance used by the cells below.
config = Config()

In [5]:
# Trainer settings: one evaluation per epoch, no checkpoints, fp16 mixed precision.
training_args = TrainingArguments(
    # Reuse the directory name from Config instead of rebuilding the same f-string here.
    output_dir=config.output_dir,
    overwrite_output_dir=True,
    report_to="none",
    num_train_epochs=config.n_epochs,
    per_device_train_batch_size=config.per_device_train_batch_size,
    gradient_accumulation_steps=config.gradient_accumulation_steps,
    per_device_eval_batch_size=config.per_device_eval_batch_size,
    logging_steps=10,
    eval_strategy="epoch",
    save_strategy="no",  # don't save any checkpoints
    optim=config.optim_type,
    fp16=True,
    learning_rate=config.lr,
    warmup_steps=config.warmup_steps,
    # gradient_checkpointing=True,  # this doesn't work correctly for some reason
    # Must name a metric that compute_metrics actually returns: the winner head's
    # log loss is reported as "log_loss_classify" (there is no plain "log_loss" key),
    # so the old value would fail as soon as best-model tracking is switched on.
    metric_for_best_model="log_loss_classify",
    greater_is_better=False,
)

In [6]:
# LoRA adapter definition for sequence classification.
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=config.lora_r,
    lora_alpha=config.lora_alpha,
    lora_dropout=config.lora_dropout,
    bias=config.lora_bias,
    # Adapters go on the attention projections (q/k/v/o) AND on the
    # gate/up/down projections, not on self-attention alone.
    target_modules=[
        "q_proj", "k_proj", "v_proj",
        "down_proj", "up_proj", "o_proj", "gate_proj",
    ],
    # Skip the first `freeze_layers` of the 42 decoder layers.
    layers_to_transform=list(range(max(config.freeze_layers, 0), 42)),
    # The three classification heads are trained in full and saved with the adapter.
    modules_to_save=["score", "classifier_head1", "classifier_head2"],
)

In [7]:
# Load the fast Gemma tokenizer from the same directory as the model checkpoint.
tokenizer = GemmaTokenizerFast.from_pretrained(config.checkpoint)
tokenizer.add_eos_token = True  # We'll add <eos> at the end
# Pad on the right: the model's forward pass locates the last real token as
# sum(attention_mask) - 1, which is only correct when padding follows the text.
tokenizer.padding_side = "right"

In [8]:
# Extra keyword arguments for `from_pretrained`; stays empty when QLoRA is disabled.
qlora = {}
if USE_QLORA:
    # BitsAndBytesConfig is already imported with the other transformers names at the
    # top of the notebook, so it is not re-imported here.
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",  # nf4 or fp4
        bnb_4bit_use_double_quant=False,
        bnb_4bit_compute_dtype=torch.float16,
        # Module names excluded from quantization: the three classification heads.
        llm_int8_skip_modules=["score", "classifier_head1", "classifier_head2"],
    )
    qlora["quantization_config"] = bnb_config
    print("Using QLoRA")

Using QLoRA


In [9]:
import torch
import torch.nn as nn
from transformers import Gemma2ForSequenceClassification, Gemma2Config

class CustomGemma2ForSequenceClassification(Gemma2ForSequenceClassification):
    """Gemma2 sequence classifier with two extra linear heads on the last token.

    The inherited `score` head produces the main logits. `classifier_head1` and
    `classifier_head2` are bias-free linear layers applied to the final-layer hidden
    state of the last attended token of each sequence.

    Args:
        config: Gemma2 configuration; `config.hidden_size` sizes the extra heads.
        num_labels_head1: Number of classes predicted by `classifier_head1`.
        num_labels_head2: Number of classes predicted by `classifier_head2`.
    """

    def __init__(self, config, num_labels_head1=60, num_labels_head2=60):
        super().__init__(config)
        self.num_labels_head1 = num_labels_head1
        self.num_labels_head2 = num_labels_head2
        self.classifier_head1 = nn.Linear(config.hidden_size, num_labels_head1, bias=False)
        self.classifier_head2 = nn.Linear(config.hidden_size, num_labels_head2, bias=False)

    def forward(self, input_ids, attention_mask=None, labels=None, **kwargs):
        """Run the base classifier and both auxiliary heads.

        Args:
            input_ids: Token ids; their device is used as the device of all results.
            attention_mask: Optional 0/1 mask. The last token is taken at index
                `sum(mask) - 1`, which assumes right padding. When omitted, the
                final position of every sequence is used.
            labels: Optional tensor with three columns: column 0 is the target of the
                main head, columns 1 and 2 are the targets of head 1 and head 2.
            **kwargs: Accepted for caller compatibility and ignored.

        Returns:
            dict with "logits" = (main_logits, head1_logits, head2_logits) and, when
            `labels` is given, "loss" = main_loss + 0.1 * head1_loss + 0.1 * head2_loss.
        """
        device = input_ids.device

        main_labels = None
        if labels is not None:
            labels = labels.to(device)
            main_labels = labels[:, 0]

        # Hidden states must be requested on BOTH paths: the auxiliary heads read them
        # even at inference time. Previously the no-label path omitted the flag and
        # failed when indexing `outputs.hidden_states`.
        outputs = super().forward(
            input_ids,
            attention_mask=attention_mask,
            labels=main_labels,
            output_hidden_states=True,
        )

        # The final hidden state may live on another device when the model is sharded.
        final_hidden = outputs.hidden_states[-1].to(device)
        if attention_mask is None:
            # No mask: every position is real, so the last position is the last token.
            last_token_outputs = final_hidden[:, -1]
        else:
            last_token_indices = (torch.sum(attention_mask, dim=1) - 1).to(device)
            last_token_outputs = final_hidden[
                torch.arange(final_hidden.shape[0], device=device), last_token_indices]

        outputs_head1 = self.classifier_head1(last_token_outputs).to(device)
        outputs_head2 = self.classifier_head2(last_token_outputs).to(device)

        if labels is not None:
            labels_head1 = labels[:, 1].to(device)
            labels_head2 = labels[:, 2].to(device)

            loss_head1 = nn.CrossEntropyLoss()(outputs_head1, labels_head1)
            loss_head2 = nn.CrossEntropyLoss()(outputs_head2, labels_head2)
            # The auxiliary losses are down-weighted so the main task dominates.
            loss = outputs.loss.to(device) + 0.1 * loss_head1 + 0.1 * loss_head2
            return {"loss": loss, "logits": (outputs.logits, outputs_head1, outputs_head2)}
        else:
            return {"logits": (outputs.logits, outputs_head1, outputs_head2)}

# Start from the checkpoint's own configuration and set the built-in head to
# two classes (the winner is either model_a or model_b).
config2 = Gemma2Config.from_pretrained(config.checkpoint)
config2.num_labels = 2
# Load the pretrained weights into the custom class. `num_labels_head1/2` are the
# extra arguments of CustomGemma2ForSequenceClassification.__init__; `**qlora` adds
# the quantization config when USE_QLORA is set and nothing otherwise.
# The three heads (score, classifier_head1, classifier_head2) are not in the
# checkpoint and are newly initialized, as the load warning reports.
model = CustomGemma2ForSequenceClassification.from_pretrained(
    config.checkpoint,
    config=config2,
    num_labels_head1=60,
    num_labels_head2=60,
    torch_dtype=torch.float16,
    device_map="auto",
    **qlora
)

# Disable the generation cache; it is not used for classification training.
model.config.use_cache = False
# NOTE(review): this PEFT helper is called even when USE_QLORA is False, i.e. for a
# non-quantized model too. Confirm that this is intended.
model = prepare_model_for_kbit_training(model)
# Wrap the model with the LoRA adapters described by `lora_config`.
model = get_peft_model(model, lora_config)
# Display the resulting module tree.
model

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some weights of CustomGemma2ForSequenceClassification were not initialized from the model checkpoint at /kaggle/input/gemma2-9b-it-fp16 and are newly initialized: ['classifier_head1.weight', 'classifier_head2.weight', 'score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): CustomGemma2ForSequenceClassification(
      (model): Gemma2Model(
        (embed_tokens): Embedding(256000, 3584, padding_idx=0)
        (layers): ModuleList(
          (0-41): 42 x Gemma2DecoderLayer(
            (self_attn): Gemma2SdpaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=3584, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=3584, out_features=64, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=64, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector):

In [10]:
# Report trainable vs. total parameter counts of the PEFT-wrapped model.
model.print_trainable_parameters()

trainable params: 216,509,440 || all params: 9,458,652,672 || trainable%: 2.2890


In [11]:
import pandas as pd

# Read the competition training set and make sure ids are strings.
df = pd.read_parquet("/kaggle/input/wsdm-cup-multilingual-chatbot-arena/train.parquet")
df = df.astype({"id": "str"})
print('Competition data has shape', df.shape)
# Number of original competition rows; the fold cell only evaluates on rows below this index.
LN = len(df)
df.head(1)

Competition data has shape (48439, 8)


Unnamed: 0,id,prompt,response_a,response_b,winner,model_a,model_b,language
0,00007cff95d7f7974642a785aca248b0f26e60d3312fac...,vieš po Slovensky?,"Áno, hovorím po slovensky. Ako vám môžem pomôcť?","Áno, veď som tu! Môžem ti pomôcť s otázkami al...",model_a,o1-preview,reka-core-20240904,Slovak


In [12]:
if ADD_33K:
    # `df2` (the extra rows) is not created by any cell of this notebook, so fail
    # with an actionable message instead of a bare "name 'df2' is not defined".
    if "df2" not in globals():
        raise NameError(
            "ADD_33K=True requires the extra dataframe `df2` to be loaded "
            "before this cell, but it is not defined"
        )
    # Append the extra rows after the competition rows (indices >= LN).
    df = pd.concat([df, df2], axis=0, ignore_index=True)
if DEBUG:
    # Small subset for a quick end-to-end smoke test.
    df = df.iloc[:64].copy()
print("We will use train data with shape", df.shape )

We will use train data with shape (64, 8)


In [13]:
import numpy as np
# Collect every model name that appears on either side of a comparison.
m1 = df["model_a"].unique()
m2 = df["model_b"].unique()
m = sorted(np.union1d(m1, m2))
print(f"There are {len(m)} unique models:")

# Model name -> integer class id, numbered in sorted-name order.
MAP = {name: class_id for class_id, name in enumerate(m)}
print(MAP)

# Replace the name columns with their integer ids; these become the targets
# of the two auxiliary classification heads.
df["model_a"] = df["model_a"].map(MAP).astype("int32")
df["model_b"] = df["model_b"].map(MAP).astype("int32")
df.head(1)

There are 47 unique models:
{'chatgpt-4o-latest-20240808': 0, 'chatgpt-4o-latest-20240903': 1, 'claude-3-5-sonnet-20240620': 2, 'claude-3-5-sonnet-20241022': 3, 'claude-3-haiku-20240307': 4, 'claude-3-opus-20240229': 5, 'command-r-08-2024': 6, 'command-r-plus-08-2024': 7, 'deepseek-v2.5': 8, 'gemini-1.5-flash-002': 9, 'gemini-1.5-flash-8b-001': 10, 'gemini-1.5-flash-8b-exp-0827': 11, 'gemini-1.5-flash-exp-0827': 12, 'gemini-1.5-pro-001': 13, 'gemini-1.5-pro-002': 14, 'gemini-1.5-pro-exp-0827': 15, 'gemma-2-27b-it': 16, 'gemma-2-2b-it': 17, 'gemma-2-9b-it': 18, 'gpt-4-0125-preview': 19, 'gpt-4-1106-preview': 20, 'gpt-4-turbo-2024-04-09': 21, 'gpt-4o-2024-05-13': 22, 'gpt-4o-2024-08-06': 23, 'gpt-4o-mini-2024-07-18': 24, 'grok-2-2024-08-13': 25, 'grok-2-mini-2024-08-13': 26, 'internlm2_5-20b-chat': 27, 'jamba-1.5-mini': 28, 'llama-3.1-405b-instruct-bf16': 29, 'llama-3.1-405b-instruct-fp8': 30, 'llama-3.1-70b-instruct': 31, 'llama-3.1-8b-instruct': 32, 'llama-3.1-nemotron-70b-instruct': 3

Unnamed: 0,id,prompt,response_a,response_b,winner,model_a,model_b,language
0,00007cff95d7f7974642a785aca248b0f26e60d3312fac...,vieš po Slovensky?,"Áno, hovorím po slovensky. Ako vám môžem pomôcť?","Áno, veď som tu! Môžem ti pomôcť s otázkami al...",model_a,38,42,Slovak


In [14]:
# Convert the pandas frame into a `datasets.Dataset` so it can be tokenized with `.map`.
ds = Dataset.from_pandas(df)

In [15]:
import json

class CustomTokenizer:
    """Batch encoder for `Dataset.map(batched=True)`.

    Formats each (prompt, response_a, response_b) triple into one text, tokenizes
    it with truncation, and attaches a three-part label per row.
    """

    # Winner string -> class id of the main (winner) head.
    WINNER_TO_LABEL = {"model_a": 0, "model_b": 1}

    def __init__(
        self,
        tokenizer: PreTrainedTokenizerBase,
        max_length: int
    ) -> None:
        """Store the tokenizer and the maximum token length used for truncation."""
        self.tokenizer = tokenizer
        self.max_length = max_length

    def prepare_text(self, prompts, responses_a, responses_b):
        """Return the model input text for ONE example.

        Despite the plural parameter names, each argument is a single value that is
        interpolated as-is into its own `<start_of_turn>...<end_of_turn>` section.
        """
        return (
            f"<start_of_turn>prompt\n{prompts}<end_of_turn>\n"
            f"<start_of_turn>response_a\n{responses_a}<end_of_turn>\n"
            f"<start_of_turn>response_b\n{responses_b}<end_of_turn>"
        )

    def __call__(self, batch: dict) -> dict:
        """Tokenize a batch and build its labels.

        Args:
            batch: Column-oriented dict with keys "prompt", "response_a",
                "response_b", "winner", "model_a" and "model_b".

        Returns:
            The tokenizer output plus "labels": one `(winner_id, model_a, model_b)`
            tuple per row, where winner_id is 0 for "model_a" and 1 for "model_b".

        Raises:
            ValueError: If a row's winner is neither "model_a" nor "model_b".
                Previously such a row silently reused the label of the preceding row.
        """
        texts = [
            self.prepare_text(p, r_a, r_b)
            for p, r_a, r_b in zip(batch["prompt"], batch["response_a"], batch["response_b"])
        ]

        tokenized = self.tokenizer(texts, max_length=self.max_length, truncation=True)

        labels = []
        for win, c, d in zip(batch["winner"], batch["model_a"], batch["model_b"]):
            if win not in self.WINNER_TO_LABEL:
                raise ValueError(
                    f"Unexpected winner value {win!r}; expected 'model_a' or 'model_b'"
                )
            labels.append((self.WINNER_TO_LABEL[win], c, d))
        return {**tokenized, "labels": labels}

In [16]:
# Build the batch encoder with the configured truncation length.
encode = CustomTokenizer(tokenizer, max_length=config.max_length)
# Tokenize and label the whole dataset in batches using 8 worker processes.
# Note that `ds` is rebound to the encoded dataset.
ds = ds.map(encode, batched=True, num_proc=8)

  self.pid = os.fork()


Map (num_proc=8):   0%|          | 0/64 [00:00<?, ? examples/s]

In [17]:
def compute_metrics(eval_preds: EvalPrediction) -> dict:
    """Compute accuracy and log loss for each of the three heads.

    Args:
        eval_preds: `predictions` holds three logit arrays (winner head, model_a
            head, model_b head); `label_ids` has one column per head in that order.

    Returns:
        dict with "acc_<head>" and "log_loss_<head>" for the heads "classify",
        "model_a" and "model_b".
    """
    preds = eval_preds.predictions
    labels = np.array(eval_preds.label_ids)

    metrics = {}
    # One (logits, label column) pair per head, in the order the model returns them.
    for head_idx, head_name in enumerate(("classify", "model_a", "model_b")):
        logits = preds[head_idx]
        y_true = labels[:, head_idx]

        probs = torch.from_numpy(logits).float().softmax(-1).numpy()
        # Take the class count from the logits width instead of hard-coding 2/60/60,
        # so the metric keeps working if a head is resized. Passing `labels`
        # explicitly keeps log_loss valid when some classes are absent from y_true.
        class_ids = list(range(logits.shape[-1]))

        metrics[f"acc_{head_name}"] = accuracy_score(y_true=y_true, y_pred=logits.argmax(-1))
        metrics[f"log_loss_{head_name}"] = log_loss(y_true=y_true, y_pred=probs, labels=class_ids)

    return metrics

In [18]:
def _make_folds(n_rows, n_splits, n_eval_rows, train_on_all):
    """Return one (train_indices, eval_indices) pair per fold.

    Row `i` belongs to fold `i % n_splits`. A fold's eval set is limited to rows
    with index below `n_eval_rows`. The train set is either every row
    (`train_on_all`) or every row outside the fold.
    """
    splits = []
    for fold in range(n_splits):
        held_out = [i for i in range(n_rows) if i % n_splits == fold and i < n_eval_rows]
        if train_on_all:
            train_part = list(range(n_rows))
        else:
            train_part = [i for i in range(n_rows) if i % n_splits != fold]
        splits.append((train_part, held_out))
    return splits


# Evaluation is always restricted to the first LN rows (the original competition data).
folds = _make_folds(len(ds), config.n_splits, LN, TRAIN_100_PERCENT)
if TRAIN_100_PERCENT:
    print("We are training with 100% data")

In [19]:
# Pick the train/eval row indices of the configured fold.
train_idx, eval_idx = folds[config.fold_idx]

# Assemble the Trainer: the PEFT-wrapped model, the fold's train/eval subsets,
# the per-head metrics, and a collator that pads each batch with the tokenizer.
trainer = Trainer(
    args=training_args, 
    model=model,
    tokenizer=tokenizer,
    train_dataset=ds.select(train_idx),
    eval_dataset=ds.select(eval_idx),
    compute_metrics=compute_metrics,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
)
# Run training; evaluation happens once per epoch (eval_strategy="epoch").
trainer.train()

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Epoch,Training Loss,Validation Loss,Acc Classify,Log Loss Classify,Acc Model A,Log Loss Model A,Acc Model B,Log Loss Model B
0,No log,3.254423,0.461538,2.209075,0.076923,5.105834,0.0,5.348514


TrainOutput(global_step=6, training_loss=4.134893417358398, metrics={'train_runtime': 439.3599, 'train_samples_per_second': 0.116, 'train_steps_per_second': 0.014, 'total_flos': 3365690881093632.0, 'train_loss': 4.134893417358398, 'epoch': 0.9230769230769231})

In [None]:
# Save the trained model to a directory tagged with the notebook version.
# NOTE(review): for a PEFT-wrapped model this presumably stores the adapter weights
# and the `modules_to_save` heads rather than the full base model; confirm.
trainer.save_model(f"LoRA-v{VER}")