# Kaggle Predict

## Imports

In [32]:
import transformers as trsf
print("Transformers:", trsf.__version__)

Transformers: 4.47.1


In [33]:
import os
import copy
from dataclasses import dataclass
import pickle
from timeit import default_timer as timer
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
from torch import Tensor

#from datasets import Dataset

from tqdm import tqdm

from transformers import (
    BitsAndBytesConfig,
    GemmaTokenizerFast,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    AutoModel,
    get_cosine_schedule_with_warmup,
)

from peft import PeftModel, LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType

from sklearn.metrics import log_loss, accuracy_score
from sklearn.model_selection import train_test_split

import ModelsUtils as Utils
import Configurations as Configs
#import wsdm_modelutils as Utils

In [34]:
import peft as pft
print("Peft:", pft.__version__)

Peft: 0.14.0


In [35]:
print('Torch version:', torch.__version__)
print('Torch is build with CUDA:', torch.cuda.is_available())
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f'Torch device : {device}')
print('------------------------------')

Torch version: 2.5.1+cu118
Torch is build with CUDA: True
Torch device : cuda:0
------------------------------


In [36]:
device_ids = list(range(torch.cuda.device_count()))
device_ids

[0]

## Config

In [37]:
config_file = 'Configs.py'
manager = Configs.ConfigManager(config_file)

config = manager.micro

config.config_name

'micro_gemma2_2b_fp16_4bit'

In [38]:
basemodel_path = config.basemodel_path #config.basemodel_path #../BaseModel/gemma2_9b_fp16_4bit'
#basemodel_path = '/kaggle/input/gemma2_9b_4bit_fp32/other/default/1/gemma2_9b_4bit_fp32' #"unsloth/gemma-2-9b-it-bnb-4bit"

peft_model_path = '../Checkpoints/'

checkpoint_name = "01-24-2025_22-04_1024_train_lossBest"
#kaggle_PeftPath = '/kaggle/input/trainned_06/pytorch/default/1'

In [39]:
#dataframe_path = config.train_data

#df_test = pd.read_csv(dataframe_path)


df_train = pd.read_parquet('../Data/Original/wsdm-cup-multilingual-chatbot-arena/train.parquet', engine='pyarrow') # original
df_test = df_train.iloc[:10200].copy()
#df_test = df_train.iloc[:5100].copy()
#df_test = df_train.iloc[:1002].copy()
#df_test = df_train.iloc[:8].copy()

local_max_length = config.max_length
#local_max_length = 2048
local_batch_size = config.eval_batch
#local_batch_size = 8

df_test.shape

(10200, 8)

In [40]:
df_test['id'] = df_test['id'].astype(str)
df_test['prompt'] = df_test['prompt'].astype(str)
df_test['response_a'] = df_test['response_a'].astype(str)
df_test['response_b'] = df_test['response_b'].astype(str)

In [41]:
#df_test = df_test.sample(frac=config.sample_size, random_state=config.random_seed)


In [42]:
df_test.head(1)

Unnamed: 0,id,prompt,response_a,response_b,winner,model_a,model_b,language
0,00007cff95d7f7974642a785aca248b0f26e60d3312fac...,vieš po Slovensky?,"Áno, hovorím po slovensky. Ako vám môžem pomôcť?","Áno, veď som tu! Môžem ti pomôcť s otázkami al...",model_a,o1-preview,reka-core-20240904,Slovak


In [43]:
df_test.shape

(10200, 8)

In [44]:
df_test = df_test.apply(Utils.reencode, axis=1)  # Apply the make_pairs function to each row in df
#df_test = df_test.fillna('')

In [45]:
df_test["class_label"] = df_test.winner.map(Utils.name2label)


In [46]:
%%time

for col in ["prompt", "response_a", "response_b"]:
    df_test[f"{col}_len"] = df_test[f"{col}"].str.len()

df_test = Utils.extract_all_features(df_test)

Error processing document 727: empty vocabulary; perhaps the documents only contain stop words
Error processing document 1345: empty vocabulary; perhaps the documents only contain stop words
Error processing document 3289: empty vocabulary; perhaps the documents only contain stop words
Error processing document 3907: empty vocabulary; perhaps the documents only contain stop words
Error processing document 5182: empty vocabulary; perhaps the documents only contain stop words
Error processing document 6770: empty vocabulary; perhaps the documents only contain stop words
Error processing document 7439: empty vocabulary; perhaps the documents only contain stop words
Error processing document 7630: empty vocabulary; perhaps the documents only contain stop words
CPU times: total: 1min 13s
Wall time: 1min 13s


In [47]:
df_test.isnull().T.any().sum()

0

## Tokenize

In [48]:
tokenizer = GemmaTokenizerFast.from_pretrained(basemodel_path)
#tokenizer = AutoTokenizer.from_pretrained(basemodel_path)
tokenizer.add_eos_token = True      # We'll add <eos> at the end
tokenizer.padding_side = "right"

In [49]:
df_test.shape

(10200, 73)

In [50]:
#-------------------------------------------------------------------
# Tokenization function
def tokenize(tokenizer, prompt, response_a, response_b, max_length=256, spread_max_length=False):
    prompt = ["<prompt>: " + p for p in prompt]
    response_a = ["\n\n<response_a>: " + r_a for r_a in response_a]
    response_b = ["\n\n<response_b>: " + r_b for r_b in response_b]

    if spread_max_length:
        prompt = tokenizer(prompt, max_length=max_length//3, return_tensors="pt", truncation=True,  padding="max_length").input_ids
        response_a = tokenizer(response_a, max_length=max_length//3, return_tensors="pt", truncation=True, padding="max_length").input_ids
        response_b = tokenizer(response_b, max_length=max_length//3, return_tensors="pt", truncation=True, padding="max_length").input_ids
        input_ids = [p + r_a + r_b for p, r_a, r_b in zip(prompt, response_a, response_b)]
        attention_mask = [[1]* len(i) for i in input_ids]
    else:
        text = [p + r_a + r_b for p, r_a, r_b in zip(prompt, response_a, response_b)]
        #tokenized = tokenizer(text, max_length=max_length, return_tensors="pt", padding=False, truncation=True) # padding=False
        tokenized = tokenizer(text, max_length=max_length, return_tensors="pt", padding="max_length", truncation=True) # padding=False
        input_ids = tokenized.input_ids
        attention_mask = tokenized.attention_mask

    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        #'token_len': [len(item) for item in input_ids],
    }
    #return input_ids, attention_mask

In [51]:
%%time
def tokenize_df(row):
    return tokenize(tokenizer, [row['prompt']], [row['response_a']], [row['response_b']], max_length=local_max_length)

df_test['tokens'] = df_test.apply(tokenize_df, axis=1)
#df_test["input_ids"], df_test["attention_mask"] = tokenize(tokenizer, df_test['prompt'], df_test['response_a'], df_test['response_b'], max_length=local_max_length)


CPU times: total: 31.1 s
Wall time: 30.6 s


In [52]:
df_test['len'] = df_test['prompt_len'] + df_test['response_a_len'] + df_test['response_b_len']


In [53]:
df_test = df_test.sort_values('len', ascending=False).reset_index(drop=True)
#df_test

## Data Loader

In [54]:
#-------------------------------------------------------------------
class ChatbotArenaDataset(Dataset):
    def __init__(self, dataframe, tokenizer, test=False, max_length=256):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.num_classes = 1
        self.test = test

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]

        # Tokenize the text
        #tokens = Utils.tokenize(self.tokenizer, [row['prompt']], [row['response_a']], [row['response_b']], max_length=self.max_length)
        tokens = row['tokens']

         # Extract features
        features = torch.tensor([], dtype=torch.float)

        for feat in Utils.feature_list_bycol:
            feat = torch.tensor([
                row[f'prompt{feat}'],
                row[f'response_a{feat}'],
                row[f'response_b{feat}']
                ], dtype=torch.float)
            features = torch.cat((features, feat))

        similarity_feat = torch.tensor([
                row[f'cosine_similarity_a'],
                row[f'cosine_similarity_b'],
                row[f'cosine_similarity_diff']
                ], dtype=torch.float)

        features = torch.cat((features, similarity_feat))

        rid = row['id']
        
        if not self.test:
            # Label
            label = torch.tensor([row['class_label']]).float()

            return {
                'input_ids': tokens['input_ids'].squeeze(0),
                'attention_mask': tokens['attention_mask'].squeeze(0),
                'features': features,
                'label': label,
                'id' : rid
            }
        else:
            return {
                'input_ids': tokens['input_ids'].squeeze(0),
                'attention_mask': tokens['attention_mask'].squeeze(0),
                'features': features,
                'id' : rid
            }

In [55]:
sub_0 = df_test.iloc[0::2].copy()
sub_1 = df_test.iloc[1::2].copy()

# Prepare dataset and dataloader
dataset_test_0 = ChatbotArenaDataset(sub_0, tokenizer, max_length=local_max_length, test=False)
dataloader_test_0 = Utils.DataLoader(dataset_test_0, batch_size=local_batch_size, shuffle=False)

dataset_test_1 = ChatbotArenaDataset(sub_1, tokenizer, max_length=local_max_length, test=False)
dataloader_test_1 = Utils.DataLoader(dataset_test_1, batch_size=local_batch_size, shuffle=False)

dataset_test_full = ChatbotArenaDataset(df_test, tokenizer, max_length=local_max_length, test=False)
dataloader_test_full = Utils.DataLoader(dataset_test_full, batch_size=local_batch_size, shuffle=False)

In [56]:
tost = next(iter(dataloader_test_0))['id']
###test.to('cuda:0')
tost

['03f17f43a1bf31be8fbc7ea000fbe1f4504e876df6e67a625ca1ae53e931516d',
 '30db249876adb6d4ecac30bdf2706a4171963efd763af6454eeefc3081f0195d']

## Model

In [57]:
def load_model(device):

    baseModel = AutoModel.from_pretrained(
                basemodel_path,
                torch_dtype=torch.float16,
                device_map=device,
                #quantization_config=quantization_config,
                )

    #baseModel.eval()
    #baseModel = prepare_model_for_kbit_training(baseModel)
    #baseModel.eval()

    kaggle_PeftPath = ""
    peftModelPath=f"{config.checkpoints_path}/{config.config_name}/"
    loadPath = kaggle_PeftPath if kaggle_PeftPath != "" else peftModelPath + checkpoint_name

    print(loadPath)

    # load peft from base
    loraModel_load = PeftModel.from_pretrained(
        baseModel,
        f'{loadPath}/PEFT',
        is_trainable=False,
        use_cache=False,
        device_map=device)

    #loraModel_load.load_adapter(f'{loadPath}/PEFT', adapter_name='default', device_map=device)

    #loraModel_load = loraModel_load.merge_and_unload(safe_merge=True)
    
    predictionModelLoaded = Utils.PreferencePredictionModel(
        loraModel_load,
        feature_dim=config.feature_dims,
        num_classes=config.num_classes,
        hidden_dim=config.hidden_dim,
        compute_feats=config.compute_feats
        )

    checkpoint = torch.load(f'{loadPath}/PreferencePredictionModel.pt', weights_only=True)

    predictionModelLoaded.feature_fc.load_state_dict(checkpoint['feature_fc_state_dict'])
    predictionModelLoaded.classifier.load_state_dict(checkpoint['classifier_state_dict'])
    return predictionModelLoaded

model_0 = load_model(f"cuda:{device_ids[0]}")
#model_1 = load_model(f"cuda:{device_ids[1]}")
#model_1
model_0

Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.


../Checkpoints/micro_gemma2_2b_fp16_4bit/01-24-2025_22-04_1024_train_lossBest


PreferencePredictionModel(
  (gemma_model): PeftModel(
    (base_model): LoraModel(
      (model): Gemma2Model(
        (embed_tokens): Embedding(256000, 2304, padding_idx=0)
        (layers): ModuleList(
          (0-25): 26 x Gemma2DecoderLayer(
            (self_attn): Gemma2Attention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=2304, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2304, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
     

## Predict

In [58]:
@torch.no_grad()
@torch.amp.autocast('cuda')
def predict_multi_gpu(model, dataloader, device_id=0):
    model = model.to(f"cuda:{device_id}")
    model.eval()
    
    predictions = []
    pred_ids = []

    total_loss = 0
    correct = 0
    total_samples = 0

    print(f"cuda:{device_id}")
    #loss_fn = nn.BCELoss()


    #with torch.amp.autocast():
    #with torch.no_grad():
    for batch in tqdm(dataloader, total=len(dataloader), unit='row') if device_id==0 else dataloader:
    #for batch in tqdm(dataloader, total=len(dataloader), unit='row'):
        # Move the batch to the primary device
        input_ids = batch['input_ids'].to(f"cuda:{device_id}")
        attention_mask = batch['attention_mask'].to(f"cuda:{device_id}")
        features = batch['features'].to(f"cuda:{device_id}")
        ids = batch['id']
        labels = batch['label'].to(f"cuda:{device_id}")
        # Perform inference
        logits = model(input_ids=input_ids, attention_mask=attention_mask, features=features)
        #logits = model(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"]).logits

        #loss = loss_fn(logits, labels)
        
        #total_loss += loss.item()

        # Normalize logits for binary classification
        norm_logits = (logits > 0.5).float()

        preds = norm_logits    #torch.argmax(logits, dim=1)  # Class with highest score
        true_labels = labels 
        correct += (preds == true_labels).sum().item()
        total_samples += labels.size(0)
        
        predictions.extend(norm_logits.cpu().tolist())
        pred_ids.extend(ids)

    #avg_loss = total_loss / len(current_loader)
    accuracy = correct / total_samples

    #print(f"Accumulated Loss: {avg_loss}")
    print(f"Accumulated Accuracy: {accuracy}")
    
    return {
        'winner': predictions, 
        'id': pred_ids, 
    }
    

In [None]:
from concurrent.futures import ThreadPoolExecutor

#with ThreadPoolExecutor(max_workers=2) as executor:
#    results_dict = executor.map(predict_multi_gpu, (model_0, model_1), (dataloader_test_0, dataloader_test_1), (0, 1))


predictions = predict_multi_gpu(model_0, dataloader_test_full, device_id=0)

cuda:0


  0%|          | 2/5100 [00:02<1:50:40,  1.30s/row]

In [29]:
#list(results_dict)

In [30]:
#result_df = pd.concat(list(results), axis=0)
#result_df

In [31]:
results = list(results_dict)

NameError: name 'results_dict' is not defined

In [None]:
res_0 = pd.DataFrame.from_dict(results[0])
res_1 = pd.DataFrame.from_dict(results[1])
res = pd.concat((res_0, res_1), ignore_index=True)
#res = res.reset_index()
res

In [None]:
def transcript(x):
    if x[0] > 0.5:
        return 'model_b'
    else:
        return 'model_a'

res['winner'] = res['winner'].apply(transcript)
res


In [None]:
sub_df = df_test[["id"]].copy()
sub_df = sub_df.merge(res, how='inner', on='id')
sub_df.head()

In [None]:
sub_df.to_csv("submission.csv", index=False)