# Kaggle Predict

## Imports

In [1]:
import transformers as trsf
print("Transformers:", trsf.__version__)

Transformers: 4.47.1


In [2]:
import os
import copy
from dataclasses import dataclass
import pickle
from timeit import default_timer as timer
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
from torch import Tensor

#from datasets import Dataset

from tqdm import tqdm

from transformers import (
    BitsAndBytesConfig,
    GemmaTokenizerFast,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    AutoModel,
    get_cosine_schedule_with_warmup,
)

from peft import PeftModel, LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType

from sklearn.metrics import log_loss, accuracy_score
from sklearn.model_selection import train_test_split

#import ModelsUtils as Utils
import wsdm_modelutils as Utils

In [3]:
import peft as pft
print("Peft:", pft.__version__)

Peft: 0.14.0


In [4]:
# Report the installed torch version and whether CUDA is usable, then pick the
# default device: the first CUDA GPU when available, otherwise the CPU.
print('Torch version:', torch.__version__)
print('Torch is build with CUDA:', torch.cuda.is_available())
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f'Torch device : {device}')
print('------------------------------')

Torch version: 2.4.1+cu121
Torch is build with CUDA: True
Torch device : cuda:0
------------------------------


In [5]:
# Indices of every CUDA device visible to this process. Later cells index
# device_ids[0] and device_ids[1], so two GPUs are expected.
device_ids = list(range(torch.cuda.device_count()))
device_ids

[0, 1]

## Config

In [6]:
# Build the project config manager from the Kaggle-hosted config file.
config_file = '/kaggle/input/configs/Configs.py'
manager = Utils.ConfigManager(config_file)

# Configuration entry used for this prediction run.
config = manager.gemma2_9b_fp16_4bit_h1536

# Display the selected configuration name as the cell output.
config.config_name

'gemma2_9b_fp16_4bit_h1536'

In [7]:
# Location of the base model (also used to load the tokenizer).
basemodel_path = '/kaggle/input/gemma2_9b_fp16_4bit/transformers/default/1' #config.basemodel_path #../BaseModel/gemma2_9b_fp16_4bit'
#basemodel_path = '/kaggle/input/gemma2_9b_4bit_fp32/other/default/1/gemma2_9b_4bit_fp32' #"unsloth/gemma-2-9b-it-bnb-4bit"

# checkpoint_name is only used by load_model when kaggle_PeftPath is empty;
# otherwise kaggle_PeftPath is the directory the trained weights are read from.
checkpoint_name = "Trainned"
kaggle_PeftPath = '/kaggle/input/trainned_2/pytorch/default/1'

In [8]:
# Competition test set.
df_test = pd.read_parquet('/kaggle/input/wsdm-cup-multilingual-chatbot-arena/test.parquet', engine='pyarrow') # original

# Alternative for local checks: score a slice of the training set instead.
#df_train = pd.read_parquet('/kaggle/input/wsdm-cup-multilingual-chatbot-arena/train.parquet', engine='pyarrow') # original
#df_test = df_train.iloc[:10200].copy()
#df_test = df_train.iloc[:102].copy()

# Notebook-local overrides of the configured token budget and batch size.
#local_max_length = config.max_length
local_max_length = 1950
#local_batch_size = config.eval_batch
local_batch_size = 4

df_test.shape

(3, 5)

In [9]:
# Force the three text columns to plain strings before any text processing.
for text_col in ('prompt', 'response_a', 'response_b'):
    df_test[text_col] = df_test[text_col].astype(str)

In [10]:
# Run the project's reencode helper on every row, then replace any remaining
# missing values with empty strings.
df_test = df_test.apply(Utils.reencode, axis=1)
df_test = df_test.fillna('')

In [11]:
%%time

# Character length of each text column, stored as <column>_len.
for col in ["prompt", "response_a", "response_b"]:
    df_test[f"{col}_len"] = df_test[f"{col}"].str.len()

# Add the engineered feature columns produced by the project helper.
df_test = Utils.extract_all_features(df_test)

CPU times: user 84.1 ms, sys: 7.21 ms, total: 91.3 ms
Wall time: 101 ms


## Tokenize

In [12]:
# Load the tokenizer stored alongside the base model.
tokenizer = GemmaTokenizerFast.from_pretrained(basemodel_path)
tokenizer.add_eos_token = True      # We'll add <eos> at the end
tokenizer.padding_side = "right"    # padding goes after the real tokens

In [13]:
df_test.shape

(3, 69)

In [14]:
#-------------------------------------------------------------------
# Tokenization function
def tokenize(tokenizer, prompt, response_a, response_b, max_length=256, spread_max_length=False):
    """Tokenize prompt/response triples into padded model inputs.

    Each text is prefixed with its role tag (``<prompt>: ``, ``<response_a>: ``,
    ``<response_b>: ``) before tokenization.

    Args:
        tokenizer: Callable tokenizer (Hugging Face style) that accepts a list
            of strings plus ``max_length``, ``return_tensors``, ``truncation``
            and ``padding`` and returns an object exposing ``input_ids`` and
            ``attention_mask`` tensors.
        prompt: Sequence of prompt strings.
        response_a: Sequence of first responses, aligned with ``prompt``.
        response_b: Sequence of second responses, aligned with ``prompt``.
        max_length: Total token budget per example.
        spread_max_length: When True, each of the three segments is tokenized
            on its own with a budget of ``max_length // 3`` and the segments
            are concatenated along the sequence axis, so a long prompt cannot
            crowd out the responses. When False, the three texts are joined
            and tokenized once with the full budget.

    Returns:
        dict with ``'input_ids'`` and ``'attention_mask'`` tensors as produced
        by the tokenizer. In spread mode both have a sequence length of
        ``3 * (max_length // 3)`` and padding may appear between segments; the
        attention mask marks exactly the non-padding positions.
    """
    prompt = ["<prompt>: " + p for p in prompt]
    response_a = ["\n\n<response_a>: " + r_a for r_a in response_a]
    response_b = ["\n\n<response_b>: " + r_b for r_b in response_b]

    if spread_max_length:
        segment_length = max_length // 3
        segments = [
            tokenizer(
                texts,
                max_length=segment_length,
                return_tensors="pt",
                truncation=True,
                padding="max_length",
            )
            for texts in (prompt, response_a, response_b)
        ]
        # Concatenate along the sequence axis. Adding the id tensors with `+`
        # would sum token ids element-wise and produce meaningless ids.
        input_ids = torch.cat([segment.input_ids for segment in segments], dim=1)
        # Reuse the tokenizer's masks so padded positions stay masked out.
        attention_mask = torch.cat([segment.attention_mask for segment in segments], dim=1)
    else:
        text = [p + r_a + r_b for p, r_a, r_b in zip(prompt, response_a, response_b)]
        tokenized = tokenizer(
            text,
            max_length=max_length,
            return_tensors="pt",
            padding="max_length",
            truncation=True,
        )
        input_ids = tokenized.input_ids
        attention_mask = tokenized.attention_mask

    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
    }

In [15]:
%%time
def tokenize_df(row):
    """Tokenize one dataframe row as a batch holding a single example."""
    prompt_text, answer_a, answer_b = (
        row[key] for key in ('prompt', 'response_a', 'response_b')
    )
    return tokenize(
        tokenizer,
        [prompt_text],
        [answer_a],
        [answer_b],
        max_length=local_max_length,
    )

# Store the tokenizer output of every row in the 'tokens' column.
df_test['tokens'] = df_test.apply(tokenize_df, axis=1)
#df_test["input_ids"], df_test["attention_mask"] = tokenize(tokenizer, df_test['prompt'], df_test['response_a'], df_test['response_b'], max_length=local_max_length)


CPU times: user 11.9 ms, sys: 1.73 ms, total: 13.6 ms
Wall time: 12.6 ms


In [16]:
# Combined size of prompt and both responses, built from the *_len columns
# (character counts, not token counts); used only to order the rows.
df_test['len'] = df_test['prompt_len'] + df_test['response_a_len'] + df_test['response_b_len']


In [17]:
# Longest examples first, with a fresh 0..n-1 index.
df_test = df_test.sort_values('len', ascending=False).reset_index(drop=True)
df_test

Unnamed: 0,id,prompt,response_a,response_b,scored,encode_fail,prompt_len,response_a_len,response_b_len,response_a_spaces,...,prompt_curly_balance,prompt_json,prompt_sentiment,response_a_sentiment,response_b_sentiment,cosine_similarity_a,cosine_similarity_b,cosine_similarity_diff,tokens,len
0,327228,Caso Clínico: Un hombre de 70 años con anteced...,**Diagnóstico Diferencial de Anemia en Pacient...,"Basándonos en el caso clínico presentado, pode...",False,False,855,1961,3412,322,...,0,0,-0.325,0.15,0.0,0.80195,0.683307,0.118643,"{'input_ids': [[tensor(2), tensor(235322), ten...",6228
1,1235630,Há um grave problema com o relógio da torre da...,Dois problemas interessantes!\n\n**Problema 1:...,Vamos resolver os dois problemas em sequência....,False,False,802,1997,1658,381,...,0,0,0.0,0.029688,0.15,0.667627,0.691822,-0.024195,"{'input_ids': [[tensor(2), tensor(235322), ten...",4457
2,1139415,Peel Company received a cash dividend from a ...,The correct answer is **(a) No No**. Here's ...,The correct answer is **(a) No No**. Here's wh...,False,False,278,893,831,142,...,0,0,0.366667,0.257143,0.386667,0.610054,0.616745,-0.006691,"{'input_ids': [[tensor(2), tensor(235322), ten...",2002


## Data Loader

In [18]:

#-------------------------------------------------------------------
class ChatbotArenaDataset(Dataset):
    """Dataset over a dataframe whose rows already carry tokenized inputs.

    Every item holds the row's cached ``tokens`` (with the leading batch axis
    squeezed away), a flat float feature vector and the row ``id``. A one-hot
    ``label`` built from ``class_label`` is included unless the dataset was
    created with ``test=True``.
    """

    def __init__(self, dataframe, tokenizer, test=False, max_length=256):
        # tokenizer and max_length are stored but not used by __getitem__,
        # which reads the pre-computed 'tokens' column instead.
        self.data, self.tokenizer = dataframe, tokenizer
        self.max_length = max_length
        self.test = test
        self.num_classes = 2

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        record = self.data.iloc[idx]
        encoded = record['tokens']

        # One (prompt, response_a, response_b) triple per feature suffix,
        # in the order given by Utils.feature_list_bycol ...
        chunks = [
            torch.tensor(
                [
                    record[f'prompt{suffix}'],
                    record[f'response_a{suffix}'],
                    record[f'response_b{suffix}'],
                ],
                dtype=torch.float,
            )
            for suffix in Utils.feature_list_bycol
        ]
        # ... followed by the cosine-similarity triple.
        chunks.append(
            torch.tensor(
                [
                    record['cosine_similarity_a'],
                    record['cosine_similarity_b'],
                    record['cosine_similarity_diff'],
                ],
                dtype=torch.float,
            )
        )
        features = torch.cat(chunks)

        item = {
            'input_ids': encoded['input_ids'].squeeze(0),
            'attention_mask': encoded['attention_mask'].squeeze(0),
            'features': features,
        }
        if not self.test:
            # One-hot target over num_classes, as floats.
            target = torch.tensor(record['class_label'])
            item['label'] = torch.nn.functional.one_hot(
                target, num_classes=self.num_classes
            ).float()
        item['id'] = record['id']
        return item

In [19]:
# df_test was sorted longest-first, so alternating rows gives each shard a
# similar mix of long and short examples (one shard per GPU).
sub_0 = df_test.iloc[0::2].copy()
sub_1 = df_test.iloc[1::2].copy()

# Prepare dataset and dataloader
# test=True: items carry no label; shuffle=False keeps the row order.
dataset_test_0 = ChatbotArenaDataset(sub_0, tokenizer, max_length=local_max_length, test=True)
dataloader_test_0 = Utils.DataLoader(dataset_test_0, batch_size=local_batch_size, shuffle=False)

dataset_test_1 = ChatbotArenaDataset(sub_1, tokenizer, max_length=local_max_length, test=True)
dataloader_test_1 = Utils.DataLoader(dataset_test_1, batch_size=local_batch_size, shuffle=False)

## Model

In [20]:
def load_model(device):
    """Assemble the preference model for inference on ``device``.

    Steps, in order: load the base model with 4-bit NF4 quantization, attach
    the trained LoRA adapter from ``<weights dir>/PEFT`` and merge it into the
    base weights, wrap the result in ``Utils.PreferencePredictionModel`` and
    restore the ``feature_fc`` and ``classifier`` state dicts from
    ``<weights dir>/PreferencePredictionModel.pt``.

    The weights directory is ``kaggle_PeftPath`` when that is non-empty,
    otherwise the configured checkpoint path plus ``checkpoint_name``.

    Args:
        device: Value passed as ``device_map`` when loading the base model.

    Returns:
        The populated ``Utils.PreferencePredictionModel``.
    """
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.bfloat16,  # bfloat16 is recommended
        bnb_4bit_use_double_quant=False,
        bnb_4bit_quant_type='nf4',
    )

    backbone = AutoModel.from_pretrained(
        basemodel_path,
        device_map=device,
        quantization_config=bnb_config,
        use_cache=False,
    )

    # Same preparation call as the original pipeline; eval mode is set both
    # before and after it, exactly as before.
    backbone.eval()
    backbone = prepare_model_for_kbit_training(backbone)
    backbone.eval()
    backbone.use_cache = False

    default_dir = f"{config.checkpoints_path}/{config.config_name}/"
    if kaggle_PeftPath != "":
        weights_dir = kaggle_PeftPath
    else:
        weights_dir = default_dir + checkpoint_name

    print(weights_dir)

    # Load the adapter on top of the base model, then fold it into the weights.
    adapted = PeftModel.from_pretrained(
        backbone,
        f'{weights_dir}/PEFT',
        is_trainable=False,
    )
    merged = adapted.merge_and_unload()
    merged.use_cache = False

    predictor = Utils.PreferencePredictionModel(
        merged,
        feature_dim=config.feature_dims,
        num_classes=config.num_classes,
        hidden_dim=config.hidden_dim,
    )
    predictor.use_cache = False

    # Restore the two trained heads that live outside the PEFT adapter.
    head_state = torch.load(f'{weights_dir}/PreferencePredictionModel.pt', weights_only=True)
    predictor.feature_fc.load_state_dict(head_state['feature_fc_state_dict'])
    predictor.classifier.load_state_dict(head_state['classifier_state_dict'])
    return predictor

# One complete model replica per GPU so both data shards can be scored at once.
model_0 = load_model(f"cuda:{device_ids[0]}")
model_1 = load_model(f"cuda:{device_ids[1]}")
#model_1
model_0

Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

/kaggle/input/trainned_2/pytorch/default/1


Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

/kaggle/input/trainned_2/pytorch/default/1


PreferencePredictionModel(
  (gemma_model): Gemma2Model(
    (embed_tokens): Embedding(256000, 3584, padding_idx=0)
    (layers): ModuleList(
      (0-41): 42 x Gemma2DecoderLayer(
        (self_attn): Gemma2Attention(
          (q_proj): Linear4bit(in_features=3584, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=3584, out_features=2048, bias=False)
          (v_proj): Linear4bit(in_features=3584, out_features=2048, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=3584, bias=False)
          (rotary_emb): Gemma2RotaryEmbedding()
        )
        (mlp): Gemma2MLP(
          (gate_proj): Linear4bit(in_features=3584, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=3584, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=3584, bias=False)
          (act_fn): PytorchGELUTanh()
        )
        (input_layernorm): Gemma2RMSNorm((3584,), eps=1e-06)
        (post_atte

## Predict

In [21]:
def predict_multi_gpu(model, dataloader, device_id=0):
    """Score every batch of ``dataloader`` with ``model`` on one device.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...,
            features=...)``; its output is thresholded at 0.5.
        dataloader: Iterable of batches, each a dict with tensor entries
            ``'input_ids'``, ``'attention_mask'``, ``'features'`` and ``'id'``.
        device_id: Index of the CUDA device to run on. Only the worker with
            ``device_id == 0`` shows a progress bar. When CUDA is unavailable
            the function runs on the CPU instead.

    Returns:
        dict with ``'winner'`` (one thresholded output per row, as nested
        lists of 0.0/1.0) and ``'id'`` (the matching row ids).
    """
    # Resolve the device once; fall back to the CPU like the notebook-level
    # device selection does when no GPU is present.
    device = torch.device(f"cuda:{device_id}" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    model.eval()

    predictions = []
    pred_ids = []

    print(device)

    # A single progress bar is enough; the other worker iterates silently.
    batches = tqdm(dataloader, total=len(dataloader), unit='row') if device_id == 0 else dataloader

    # autocast takes the device *type* ("cuda"), not an indexed device string.
    # Mixed precision is only enabled on CUDA.
    with torch.amp.autocast(device.type, enabled=(device.type == "cuda")), torch.no_grad():
        for batch in batches:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            features = batch['features'].to(device)

            # Perform inference
            logits = model(input_ids=input_ids, attention_mask=attention_mask, features=features)

            # Binarize the model output: 1.0 where it exceeds 0.5, else 0.0.
            norm_logits = (logits > 0.5).float()
            predictions.extend(norm_logits.cpu().tolist())
            pred_ids.extend(batch['id'].tolist())

    return {
        'winner': predictions,
        'id': pred_ids,
    }
    

In [22]:
from concurrent.futures import ThreadPoolExecutor

# One prediction worker per GPU. executor.map returns a lazy iterator of the
# per-worker result dicts (in argument order); leaving the with-block waits
# for both workers to finish.
with ThreadPoolExecutor(max_workers=2) as executor:
    results_dict = executor.map(predict_multi_gpu, (model_0, model_1), (dataloader_test_0, dataloader_test_1), (0, 1))


#predictions = predict_multi_gpu(model_0, dataloader_test, device_id=0)

cuda:0


  0%|          | 0/1 [00:00<?, ?row/s]

cuda:1


100%|██████████| 1/1 [00:06<00:00,  6.49s/row]


In [23]:
# Materialize the worker results: results[0] is GPU 0, results[1] is GPU 1.
results = list(results_dict)

res_0 = pd.DataFrame.from_dict(results[0])
res_1 = pd.DataFrame.from_dict(results[1])
# Stack both shards; each keeps its own index, so index values repeat.
res = pd.concat((res_0, res_1))
res

Unnamed: 0,winner,id
0,[1.0],327228
1,[0.0],1139415
0,[0.0],1235630


In [24]:
def transcript(x):
    """Map a one-element prediction to its submission label.

    A first element strictly greater than 0.5 yields 'model_b'; any other
    value yields 'model_a'.
    """
    score = x[0]
    return 'model_b' if score > 0.5 else 'model_a'

# Replace each numeric prediction with its submission label.
# NOTE: this overwrites 'winner' in place, so the cell must not be re-run
# without rebuilding `res` first.
res['winner'] = res['winner'].apply(transcript)
res


Unnamed: 0,winner,id
0,model_b,327228
1,model_a,1139415
0,model_a,1235630


In [25]:
# Attach the predicted label to every test id, matching rows on 'id'.
# The outer join keeps ids found on either side, so an id without a
# prediction would appear with a missing 'winner'.
sub_df = df_test[["id"]].copy()
sub_df = sub_df.merge(res, how='outer', on='id')
sub_df.head()

Unnamed: 0,id,winner
0,327228,model_b
1,1235630,model_a
2,1139415,model_a


In [26]:
# Write the submission file (id, winner) without the dataframe index.
sub_df.to_csv("submission.csv", index=False)