In [1]:
!pip install transformers peft accelerate bitsandbytes \
    -U --no-index --find-links /kaggle/input/wsdm2025-package

Looking in links: /kaggle/input/pm-74300417-at-01-08-2025-22-41-03/, /kaggle/input/wsdm2025-package


In [2]:
import time
from dataclasses import dataclass
from concurrent.futures import ThreadPoolExecutor

import torch
import sklearn
import polars as pl
import numpy as np
import pandas as pd
from tqdm import tqdm
from transformers import Gemma2ForSequenceClassification, GemmaTokenizerFast, BitsAndBytesConfig
from transformers.data.data_collator import pad_without_fast_tokenizer_warning
from peft import PeftModel

# config

In [3]:
@dataclass
class Config:
    """Settings shared by the tokenization, model-loading and inference cells.

    NOTE(review): none of the attributes below has a type annotation, so
    ``@dataclass`` does not register them as fields; they remain plain class
    attributes. Code later in the notebook reads ``Config.batch_size_list``
    directly from the class, which only works because of that.
    """
    # Directory passed to from_pretrained for both the tokenizer and the base model.
    gemma_dir = '/kaggle/input/gemma-2/transformers/gemma-2-9b-it-4bit/1/gemma-2-9b-it-4bit'
    # Directory passed to PeftModel.from_pretrained as the adapter location.
    lora_dir = '/kaggle/input/wsdm2025-03-gemma2b-lora-train/wsdm2025-multilingual-chatbot-arena/lora_param/'
    # Default truncation length (in tokens) used by tokenize().
    max_length = 1900
    # Batch size per length bucket: entry 0 is used for inputs longer than
    # 1024 tokens, entry 1 for inputs of at most 1024 tokens.
    batch_size_list = [2, 16]
    # When True, every sample is also scored with response_a/response_b swapped
    # and the two scores are averaged.
    tta = True
# Single shared settings instance used by the rest of the notebook.
config = Config()

# Tokenizer

In [4]:
# Load the tokenizer from the same directory the base model is loaded from.
tokenizer = GemmaTokenizerFast.from_pretrained(config.gemma_dir)
tokenizer.add_eos_token = True  # We'll add <eos> at the end
# Pad on the right-hand side when batches are padded to a common length.
tokenizer.padding_side = "right"

# Read Data

In [5]:
# Directory holding the competition parquet files.
INPUT_DIR = "/kaggle/input/wsdm-cup-multilingual-chatbot-arena"

# Only the test split is needed for inference; the train read stays disabled.
# train = pd.read_parquet(f"{INPUT_DIR}/train.parquet")
test = pd.read_parquet(f"{INPUT_DIR}/test.parquet")

In [6]:
def process_text(text):
    """Remove every literal ``"null"`` substring from ``text`` and trim it.

    The removal is a plain substring replacement, so ``"null"`` is also
    removed when it occurs inside a longer word.

    Args:
        text: String to clean, or ``None`` for a missing value.

    Returns:
        The cleaned string with leading and trailing whitespace stripped.
        ``None`` yields an empty string instead of raising.
    """
    if text is None:
        # A missing value has nothing to clean; treat it as empty text.
        return ""
    return text.replace("null", "").strip()

In [7]:
def tokenize_test(row, tokenizer, max_length=1024):
    """Tokenize a batch of prompt/response triples into padded model inputs.

    Each sample is rendered as
    ``<prompt>: ...\\n\\n<response_a>: ...\\n\\n<response_b>: ...`` after every
    part has been cleaned with ``process_text``.

    Args:
        row: Mapping with ``"prompt"``, ``"response_a"`` and ``"response_b"``
            entries, each an iterable of strings of equal length.
        tokenizer: Callable tokenizer; it receives the list of rendered texts
            together with ``max_length``, ``truncation=True`` and
            ``padding=True``.
        max_length: Truncation length in tokens. Defaults to 1024, the value
            that used to be hard-coded here.

    Returns:
        Whatever ``tokenizer`` returns for the rendered texts.
    """
    texts = [
        "<prompt>: " + process_text(p)
        + "\n\n<response_a>: " + process_text(a)
        + "\n\n<response_b>: " + process_text(b)
        for p, a, b in zip(row["prompt"], row["response_a"], row["response_b"])
    ]

    # Truncate to max_length and pad the whole batch to a common length.
    return tokenizer(texts, max_length=max_length, truncation=True, padding=True)

In [8]:
def tokenize(
    tokenizer, prompt, response_a, response_b, max_length=config.max_length):
    """Render prompt/response triples as text and tokenize them unpadded.

    Args:
        tokenizer: Callable tokenizer whose result supports lookup by
            ``'input_ids'`` and ``'attention_mask'``.
        prompt: Iterable of prompt strings.
        response_a: Iterable of first-response strings, aligned with ``prompt``.
        response_b: Iterable of second-response strings, aligned with ``prompt``.
        max_length: Truncation length in tokens.

    Returns:
        Tuple ``(input_ids, attention_mask)`` as returned by the tokenizer.
    """
    texts = []
    for p, r_a, r_b in zip(prompt, response_a, response_b):
        # Same layout for every sample: prompt, then both tagged responses.
        texts.append(
            "<prompt>: " + p
            + "\n\n<response_a>: " + r_a
            + "\n\n<response_b>: " + r_b
        )
    encoded = tokenizer(texts, max_length=max_length, truncation=True)
    return encoded['input_ids'], encoded['attention_mask']

In [9]:
def _encode_pairs(first_response, second_response):
    """Build a frame of ids, token ids, attention masks and token counts.

    ``first_response`` is placed in the <response_a> slot and
    ``second_response`` in the <response_b> slot of the rendered text.
    """
    frame = pd.DataFrame()
    frame["id"] = test["id"]
    frame["input_ids"], frame["attention_mask"] = tokenize(
        tokenizer, test["prompt"], first_response, second_response
    )
    # Token count per sample; used later for length sorting and bucketing.
    frame["length"] = frame["input_ids"].apply(len)
    return frame


# Inputs in the original response order.
data = _encode_pairs(test["response_a"], test["response_b"])

# swap response_a & response_b
aug_data = _encode_pairs(test["response_b"], test["response_a"])

In [10]:
# ds_test = Dataset.from_pandas(test)
# ds_test = ds_test.map(
#     tokenize_test,
#     fn_kwargs={'tokenizer': tokenizer},
#     batched=True,
# )

# # Drop the original text columns to save memory (optional)
# ds_test = ds_test.remove_columns(['prompt', 'response_a', 'response_b'])

# preds_test = trainer.predict(ds_test).predictions

# Model Setting

In [11]:
def _load_base_model(device):
    """Load the base Gemma-2 classifier onto ``device`` with a 3-way head.

    The ``score`` head is replaced by a new bias-free linear layer mapping
    3584 features to 3 outputs.
    NOTE(review): the new head starts with fresh weights; presumably the
    trained head weights arrive with the adapter loaded later - confirm.
    """
    model = Gemma2ForSequenceClassification.from_pretrained(
        config.gemma_dir,
        device_map=device,
        use_cache=False,
    )
    head = torch.nn.Linear(in_features=3584, out_features=3, bias=False)
    model.score = head.to(device)
    return model


# Load base model on GPU 0
device_0 = torch.device('cuda:0')
model_0 = _load_base_model(device_0)

# Load base model on GPU 1
device_1 = torch.device('cuda:1')
model_1 = _load_base_model(device_1)

Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [12]:
# Wrap each base model with the adapter stored in config.lora_dir, keep it on
# its own GPU and switch it to evaluation mode.
model_0 = PeftModel.from_pretrained(model_0, model_id=config.lora_dir)
model_0 = model_0.to(device_0)
model_0.eval()

model_1 = PeftModel.from_pretrained(model_1, model_id=config.lora_dir)
model_1 = model_1.to(device_1)
model_1.eval()

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): Gemma2ForSequenceClassification(
      (model): Gemma2Model(
        (embed_tokens): Embedding(256000, 3584, padding_idx=0)
        (layers): ModuleList(
          (0-15): 16 x Gemma2DecoderLayer(
            (self_attn): Gemma2SdpaAttention(
              (q_proj): Linear4bit(in_features=3584, out_features=4096, bias=False)
              (k_proj): Linear4bit(in_features=3584, out_features=2048, bias=False)
              (v_proj): Linear4bit(in_features=3584, out_features=2048, bias=False)
              (o_proj): Linear4bit(in_features=4096, out_features=3584, bias=False)
              (rotary_emb): Gemma2RotaryEmbedding()
            )
            (mlp): Gemma2MLP(
              (gate_proj): Linear4bit(in_features=3584, out_features=14336, bias=False)
              (up_proj): Linear4bit(in_features=3584, out_features=14336, bias=False)
              (down_proj): Linear4bit(in_features=14336, out_features=3584,

In [13]:
@torch.no_grad()
@torch.autocast(device_type="cuda")
def inference(df, model, device, batch_size, max_length=config.max_length):
    """Score every row of ``df`` with ``model`` and store the result in place.

    Rows are processed in consecutive chunks of ``batch_size``. Each chunk is
    padded to its own longest sequence with the notebook-level ``tokenizer``,
    moved to ``device`` and run through ``model`` without gradients and under
    CUDA autocast.

    Args:
        df: DataFrame with ``"input_ids"`` and ``"attention_mask"`` columns
            holding per-row token lists.
        model: Callable model whose output exposes ``.logits``.
        device: Device the padded batch is moved to before the forward pass.
        batch_size: Number of rows per forward pass.
        max_length: Unused; kept so existing callers keep working.

    Returns:
        The same ``df`` object, mutated in place with a new ``"winner"`` column
        holding the softmax probability at class index 1 for each row.
        NOTE(review): which label class index 1 stands for depends on the
        training label mapping, which is not visible here - confirm.
    """
    winners = []

    for start_idx in range(0, len(df), batch_size):
        # iloc slicing clips at the end of the frame, so the last chunk may be short.
        batch = df.iloc[start_idx:start_idx + batch_size]
        inputs = pad_without_fast_tokenizer_warning(
            tokenizer,
            {
                "input_ids": batch["input_ids"].to_list(),
                "attention_mask": batch["attention_mask"].to_list(),
            },
            padding="longest",
            pad_to_multiple_of=None,
            return_tensors="pt",
        )
        outputs = model(**inputs.to(device))
        proba = outputs.logits.softmax(-1).cpu()

        winners.extend(proba[:, 1].tolist())

    df['winner'] = winners
    return df

  @torch.cuda.amp.autocast()


In [14]:
st = time.time()

# With TTA the swapped-response rows are appended after the originals, so after
# the index reset rows [0, n) are the originals and rows [n, 2n) their swaps.
if config.tta:
    data = pd.concat([data, aug_data]).reset_index(drop=True)
# Remember each row's position so the original order can be restored later.
data['index'] = np.arange(len(data), dtype=np.int32)
# sort by input length to fully leverage dynamic padding
data = data.sort_values("length", ascending=False)

# Inputs longer than this many tokens go to bucket 0, the rest to bucket 1;
# config.batch_size_list holds one batch size per bucket in that order.
LONG_INPUT_THRESHOLD = 1024
data_dict = {
    0: data[data["length"] > LONG_INPUT_THRESHOLD].reset_index(drop=True),
    1: data[data["length"] <= LONG_INPUT_THRESHOLD].reset_index(drop=True),
}

bucket_results = []
for i, batch_size in enumerate(config.batch_size_list):
    if len(data_dict[i]) == 0:
        continue
    # Interleave rows between the two GPUs so both get a similar length mix.
    sub_1 = data_dict[i].iloc[0::2].copy()
    sub_2 = data_dict[i].iloc[1::2].copy()

    # One worker thread per GPU; each thread drives its own model replica.
    with ThreadPoolExecutor(max_workers=2) as executor:
        results = list(executor.map(
            inference,
            (sub_1, sub_2),
            (model_0, model_1),
            (device_0, device_1),
            (batch_size, batch_size),
        ))

    bucket_results.append(pd.concat(results, axis=0))

# Restore the pre-sort row order.
result_df = pd.concat(bucket_results).sort_values('index').reset_index(drop=True)
if config.tta:
    n_original = len(aug_data)
    proba = result_df['winner'].values[:n_original]
    # NOTE(review): the swapped-order score is mirrored with 1 - p before
    # averaging. The classification head has three outputs, so this treats
    # all probability mass outside class 1 as the opposite outcome - confirm
    # this matches the training label scheme.
    tta_proba = 1 - result_df['winner'].values[n_original:]
    proba = (proba + tta_proba) / 2
    result_df = result_df[:n_original].copy()
    result_df['winner'] = proba
print(f"elapsed time: {time.time() - st}")

elapsed time: 4.961940288543701


In [15]:
# Turn the score into a label (above 0.5 -> 'model_a', otherwise 'model_b'),
# write the submission file and show the result.
submission_df = result_df[['id', 'winner']].assign(
    winner=lambda frame: np.where(frame['winner'] > 0.5, 'model_a', 'model_b')
)
submission_df.to_csv('submission.csv', index=False)
display(submission_df[['id', 'winner']])

Unnamed: 0,id,winner
0,327228,model_a
1,1139415,model_b
2,1235630,model_a


In [16]:
# prob = torch.from_numpy(preds_test).float().softmax(dim=-1).numpy()
# class_mapping = {0: 'model_a', 1: 'model_b'} 
# test["winner"] = np.argmax(prob, axis=1)
# test['winner'] = test['winner'].map(class_mapping)


In [17]:
# sub=test[["id","winner"]]
# sub.head()

In [18]:
# sub.to_csv("submission.csv",index=False)

In [1]:
## studying docker