In [1]:
import time
# Wall-clock start of the notebook run; `inference` compares the elapsed time
# since this moment against TIME_LIMIT to decide when to stop early.
started_time = time.time()

In [2]:
# Offline install: --no-index together with --find-links makes pip resolve the
# packages only from the wheel files in the attached dataset directory.
!pip install transformers peft accelerate bitsandbytes \
    -U --no-index --find-links /kaggle/input/lmsys-wheel-files

Looking in links: /kaggle/input/lmsys-wheel-files
Processing /kaggle/input/lmsys-wheel-files/transformers-4.42.3-py3-none-any.whl
Processing /kaggle/input/lmsys-wheel-files/peft-0.11.1-py3-none-any.whl
Processing /kaggle/input/lmsys-wheel-files/accelerate-0.32.1-py3-none-any.whl
Processing /kaggle/input/lmsys-wheel-files/bitsandbytes-0.43.1-py3-none-manylinux_2_24_x86_64.whl
Installing collected packages: bitsandbytes, accelerate, transformers, peft
  Attempting uninstall: accelerate
    Found existing installation: accelerate 0.30.1
    Uninstalling accelerate-0.30.1:
      Successfully uninstalled accelerate-0.30.1
  Attempting uninstall: transformers
    Found existing installation: transformers 4.41.2
    Uninstalling transformers-4.41.2:
      Successfully uninstalled transformers-4.41.2
Successfully installed accelerate-0.32.1 bitsandbytes-0.43.1 peft-0.11.1 transformers-4.42.3


In [3]:
# Checkpoint directory that both the tokenizer and the PEFT/LoRA adapter are loaded from.
LORA_DIR = "/kaggle/input/peft-weights-lmsys/checkpoint-2440"

In [4]:
import json
import time
from dataclasses import dataclass
from concurrent.futures import ThreadPoolExecutor

import torch
import sklearn
import numpy as np
import pandas as pd
from transformers import Gemma2ForSequenceClassification, GemmaTokenizerFast, BitsAndBytesConfig
from transformers.data.data_collator import pad_without_fast_tokenizer_warning
from peft import PeftModel
# from tqdm import tqdm

# The notebook places one model copy on cuda:0 and another on cuda:1, so it
# refuses to run unless exactly two GPUs are visible.
assert torch.cuda.device_count() == 2

# Inference budget in seconds, measured from `started_time`. Kept at 8.5 hours
# (as the original comment stated) so that, once the guard in `inference`
# fires, there is still time left to assemble and write the submission; the
# previous value of 9 * 3600 contradicted that comment and left no headroom.
TIME_LIMIT = int(8.5 * 3600)  # 8.5 hours in seconds

@dataclass
class Config:
    """Inference settings shared by the tokenization and inference cells.

    Every attribute carries a type annotation so that ``@dataclass`` actually
    registers it as a field (un-annotated class attributes are ignored by the
    decorator). ``Config()`` still yields the same defaults as before, and
    individual values can now also be overridden by keyword.
    """

    # Directory of the base Gemma-2 checkpoint passed to `from_pretrained`.
    gemma_dir: str = '/kaggle/input/gemma-2/transformers/gemma-2-9b-it-4bit/1/gemma-2-9b-it-4bit'
    # Directory of the LoRA adapter passed to `PeftModel.from_pretrained`.
    lora_dir: str = LORA_DIR
    # Token budget used as the default `max_length` of `tokenize`.
    max_length: int = 1856
    # Number of rows per forward pass in `inference`.
    batch_size: int = 4
    device: torch.device = torch.device("cuda")
    tta: bool = True  # test time augmentation. <prompt>-<model-b's response>-<model-a's response>
    spread_max_length: bool = False  # whether to apply max_length//3 on each input or max_length on the concatenated input

cfg = Config()

2024-08-05 13:42:09.820224: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-05 13:42:09.820323: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-05 13:42:09.924344: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


# Load & pre-process Data 

In [5]:
# Raw competition test set; the cells below read its `id`, `prompt`,
# `response_a` and `response_b` columns.
test = pd.read_csv('/kaggle/input/lmsys-chatbot-arena/test.csv')

def process_text(text: str) -> str:
    """Flatten a JSON-encoded list of conversation turns into one string.

    The cell is parsed with ``json.loads`` instead of ``eval`` so that the
    contents of the CSV can never be executed as Python code. JSON ``null``
    entries become empty strings, matching the previous ``{"null": ""}``
    substitution.

    Args:
        text: JSON array of strings (entries may be ``null``).

    Returns:
        The turns joined with a single space.

    Raises:
        ValueError: if ``text`` is not valid JSON (``json.JSONDecodeError``).
    """
    turns = json.loads(text)
    return " ".join("" if turn is None else turn for turn in turns)

# Replace each JSON-encoded conversation column with its flattened text.
for column in ('prompt', 'response_a', 'response_b'):
    test.loc[:, column] = test[column].apply(process_text)

# display(test.head(5))

# Tokenize

In [6]:
def tokenize(
    tokenizer, prompt, response_a, response_b, max_length=cfg.max_length, spread_max_length=cfg.spread_max_length
):
    """Turn (prompt, response_a, response_b) triples into model inputs.

    Each text is prefixed with its role tag, then either

    * concatenated and tokenized once, truncated to ``max_length``
      (``spread_max_length=False``), or
    * tokenized separately with a budget of ``max_length // 3`` per segment,
      after which the three token-id lists are concatenated and given an
      all-ones attention mask (``spread_max_length=True``).

    Args:
        tokenizer: callable tokenizer returning an object with ``input_ids``
            (and ``attention_mask``) attributes.
        prompt, response_a, response_b: iterables of strings of equal length.
        max_length: truncation budget in tokens.
        spread_max_length: selects between the two modes above.

    Returns:
        ``(input_ids, attention_mask)``, one entry per input triple.
    """
    prompts = ["<prompt>: " + text for text in prompt]
    answers_a = ["\n\n<response_a>: " + text for text in response_a]
    answers_b = ["\n\n<response_b>: " + text for text in response_b]

    if not spread_max_length:
        # Single pass over the concatenated text; the tokenizer supplies the mask.
        joined = [p + a + b for p, a, b in zip(prompts, answers_a, answers_b)]
        encoded = tokenizer(joined, max_length=max_length, truncation=True, padding=False)
        return encoded.input_ids, encoded.attention_mask

    per_segment = max_length // 3

    def _encode(texts):
        # Tokenize one segment type with its share of the budget.
        return tokenizer(texts, max_length=per_segment, truncation=True, padding=False).input_ids

    prompt_ids = _encode(prompts)
    a_ids = _encode(answers_a)
    b_ids = _encode(answers_b)
    input_ids = [p + a + b for p, a, b in zip(prompt_ids, a_ids, b_ids)]
    attention_mask = [[1] * len(ids) for ids in input_ids]
    return input_ids, attention_mask

In [7]:
%%time

tokenizer = GemmaTokenizerFast.from_pretrained(LORA_DIR)
# tokenizer.add_eos_token = True
# tokenizer.padding_side = "right"


def _tokenized_frame(first_response, second_response):
    """Tokenize `test` with the two response columns in the given order."""
    frame = pd.DataFrame()
    frame["id"] = test["id"]
    frame["input_ids"], frame["attention_mask"] = tokenize(
        tokenizer, test["prompt"], test[first_response], test[second_response]
    )
    frame["length"] = frame["input_ids"].apply(len)
    return frame


# Original ordering: <prompt> - <response_a> - <response_b>
data = _tokenized_frame("response_a", "response_b")
# Augmented ordering with response_a & response_b swapped
aug_data = _tokenized_frame("response_b", "response_a")

CPU times: user 675 ms, sys: 125 ms, total: 800 ms
Wall time: 992 ms


# Load model

In [8]:
import torch.nn as nn
class CustomClassificationHead(nn.Module):
    """Bias-free stack of three linear layers used as the classification head.

    Widths: ``hidden_size -> 2 * hidden_size -> hidden_size // 2 -> num_labels``.
    No activation is applied between the layers.
    """

    def __init__(self, config):
        super().__init__()
        hidden = config.hidden_size
        wide, narrow = hidden * 2, hidden // 2
        # Attribute names (fc1/fc2/fc3) define the state-dict keys; keep them.
        self.fc1 = nn.Linear(hidden, wide, bias=False)
        self.fc2 = nn.Linear(wide, narrow, bias=False)
        self.fc3 = nn.Linear(narrow, config.num_labels, bias=False)

    def forward(self, features):
        """Project ``features`` (last dim ``hidden_size``) to ``num_labels`` logits."""
        return self.fc3(self.fc2(self.fc1(features)))

In [9]:
# Load base model on GPU 0
# One independent copy of the base model per GPU so both can run in parallel.
device_0, device_1 = torch.device('cuda:0'), torch.device('cuda:1')

# Copy for GPU 0; its embedding table is resized to the tokenizer's length.
model_0 = Gemma2ForSequenceClassification.from_pretrained(
    cfg.gemma_dir, device_map=device_0, use_cache=False
)
model_0.resize_token_embeddings(len(tokenizer))

# Copy for GPU 1, prepared the same way.
model_1 = Gemma2ForSequenceClassification.from_pretrained(
    cfg.gemma_dir, device_map=device_1, use_cache=False
)
model_1.resize_token_embeddings(len(tokenizer))

Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Embedding(256000, 3584, padding_idx=0)

In [10]:
# Swap each model's `score` module for a freshly constructed custom head and
# move it onto the same GPU as the model it belongs to.
# NOTE(review): the head is randomly initialised here; presumably its trained
# weights are restored when the adapter checkpoint is loaded - confirm.
model_0.score = CustomClassificationHead(model_0.config).to('cuda:0')
model_1.score = CustomClassificationHead(model_1.config).to('cuda:1')

#### Load LoRA adapter

In [11]:
# Wrap both base models with the adapter stored in cfg.lora_dir (model_0 first).
model_0, model_1 = (
    PeftModel.from_pretrained(base_model, cfg.lora_dir) for base_model in (model_0, model_1)
)

# Inference


In [12]:
# # Disabled experiment, kept for reference: cache of per-batch softmax outputs keyed by the padded input_ids
# lora_cache = {}
# @torch.no_grad()
# @torch.cuda.amp.autocast()
# def inference(df, model, device, batch_size=cfg.batch_size, max_length=cfg.max_length):
#     a_win, b_win, tie = [], [], []
    
#     for start_idx in range(0, len(df), batch_size):
#         end_idx = min(start_idx + batch_size, len(df))
#         tmp = df.iloc[start_idx:end_idx]
#         input_ids = tmp["input_ids"].to_list()
#         attention_mask = tmp["attention_mask"].to_list()
        
#         inputs = pad_without_fast_tokenizer_warning(
#             tokenizer,
#             {"input_ids": input_ids, "attention_mask": attention_mask},
#             padding="longest",
#             pad_to_multiple_of=None,
#             return_tensors="pt",
#         )
        
#         # Create a cache key based on inputs
#         cache_key = tuple(inputs['input_ids'].cpu().numpy().flatten())
        
#         if cache_key in lora_cache:
#             proba = lora_cache[cache_key]
#         else:
#             outputs = model(**inputs.to(device))
#             proba = outputs.logits.softmax(-1).cpu()
#             lora_cache[cache_key] = proba
        
#         a_win.extend(proba[:, 0].tolist())
#         b_win.extend(proba[:, 1].tolist())
#         tie.extend(proba[:, 2].tolist())
    
#     df["winner_model_a"] = a_win
#     df["winner_model_b"] = b_win
#     df["winner_tie"] = tie
    
#     return df

In [13]:
@torch.no_grad()
@torch.autocast(device_type="cuda")  # replaces the deprecated torch.cuda.amp.autocast()
def inference(df, model, device, batch_size=cfg.batch_size, max_length=cfg.max_length):
    """Run batched inference over ``df`` and store class probabilities in it.

    Rows are consumed in their current order, ``batch_size`` at a time. Each
    batch is padded to its longest sequence, passed through ``model`` on
    ``device``, and the softmax of the logits is recorded.

    Before every batch the elapsed time since the global ``started_time`` is
    compared with ``TIME_LIMIT``; when exceeded, the loop stops and all rows
    not yet processed keep the sentinel value ``-1.0`` in every probability
    column so callers can detect them.

    Args:
        df: frame with ``input_ids`` and ``attention_mask`` columns (lists of
            ints per row). It is modified in place.
        model: sequence-classification model whose output has ``.logits``.
        device: device the padded batch is moved to.
        batch_size: number of rows per forward pass.
        max_length: unused; kept for interface compatibility.

    Returns:
        The same ``df`` with ``winner_model_a``, ``winner_model_b`` and
        ``winner_tie`` columns added (or overwritten).
    """
    proba_columns = ("winner_model_a", "winner_model_b", "winner_tie")
    predicted_rows = []

    for start_idx in range(0, len(df), batch_size):
        if time.time() - started_time > TIME_LIMIT:
            print("Time limit reached, stopping inference.")
            break

        # iloc clamps the slice at the end of the frame, so no explicit min().
        batch = df.iloc[start_idx:start_idx + batch_size]
        inputs = pad_without_fast_tokenizer_warning(
            tokenizer,
            {
                "input_ids": batch["input_ids"].to_list(),
                "attention_mask": batch["attention_mask"].to_list(),
            },
            padding="longest",
            pad_to_multiple_of=None,
            return_tensors="pt",
        )
        outputs = model(**inputs.to(device))
        proba = outputs.logits.softmax(-1).cpu()
        # Columns 0, 1, 2 are read as a-wins, b-wins, tie respectively.
        predicted_rows.extend(proba[:, :3].tolist())

    # Start from the sentinel everywhere, then overwrite the leading rows that
    # were actually predicted (predictions are produced in row order).
    scores = np.full((len(df), len(proba_columns)), -1.0)
    if predicted_rows:
        scores[:len(predicted_rows)] = predicted_rows

    for col_idx, column in enumerate(proba_columns):
        df[column] = scores[:, col_idx]
    return df

In [14]:
st = time.time()

# Longest inputs first, so that dynamic padding wastes as little as possible.
data = data.sort_values("length", ascending=False)
# Deal the sorted rows out alternately: both halves then carry roughly the
# same total number of tokens.
sub_1, sub_2 = (data.iloc[offset::2].copy() for offset in (0, 1))
# sub_1 = data.iloc[:len(data)//2:].copy()
# sub_2 = data.iloc[len(data)//2::].copy()

# One worker thread per GPU; each runs `inference` on its own half.
with ThreadPoolExecutor(max_workers=2) as executor:
    results = list(
        executor.map(inference, (sub_1, sub_2), (model_0, model_1), (device_0, device_1))
    )

result_df = pd.concat(results, axis=0)
proba = result_df[["winner_model_a", "winner_model_b", "winner_tie"]].values

print(f"elapsed time: {time.time() - st}")

elapsed time: 4.785816669464111


In [15]:
st = time.time()

# Start from the first-pass predictions so `result_array` is always defined,
# including when TTA is switched off (the submission cell reads it).
result_array = proba.copy()

if cfg.tta:
    data = aug_data.sort_values("length", ascending=False)  # sort by input length to boost speed
    sub_1 = data.iloc[0::2].copy()
    sub_2 = data.iloc[1::2].copy()
#     sub_1 = data.iloc[:len(data)//2:].copy()
#     sub_2 = data.iloc[len(data)//2::].copy()

    with ThreadPoolExecutor(max_workers=2) as executor:
        results = executor.map(inference, (sub_1, sub_2), (model_0, model_1), (device_0, device_1))

    tta_result_df = pd.concat(list(results), axis=0)
    # The augmented frame is sorted by its OWN token lengths, so its row order
    # is not guaranteed to match `result_df`. Realign on the shared index
    # before combining; averaging by position could mix different samples.
    tta_result_df = tta_result_df.loc[result_df.index]
    # recall TTA's order is flipped
    tta_proba = tta_result_df[["winner_model_b", "winner_model_a", "winner_tie"]].values

    proba_length = proba.shape[0]
    assert tta_proba.shape[0] == proba_length

    # `inference` leaves -1.0 in every column of a row it did not reach.
    base_done = proba[:, 0] != -1.0
    tta_done = tta_proba[:, 0] != -1.0

    # average original result and TTA result where both exist
    both_done = base_done & tta_done
    result_array[both_done] = (proba[both_done] + tta_proba[both_done]) / 2
    # only the TTA prediction exists: use it as is
    tta_only = ~base_done & tta_done
    result_array[tta_only] = tta_proba[tta_only]
    # (rows with only a first-pass prediction already hold it from the copy above)

# Rows that were never predicted still hold the sentinel; fall back to a
# uniform distribution so the submission contains valid probabilities.
result_array[result_array[:, 0] == -1.0] = 1.0 / 3

print(f"elapsed time: {time.time() - st}")

elapsed time: 4.2168309688568115


In [16]:
# Copy the final probabilities into result_df column by column (rows of
# result_array are in result_df's order), then write the submission file.
proba_columns = ["winner_model_a", "winner_model_b", "winner_tie"]
for position, column in enumerate(proba_columns):
    result_df.loc[:, column] = result_array[:, position]

submission_df = result_df[["id", *proba_columns]]
submission_df.to_csv('submission.csv', index=False)
display(submission_df)

Unnamed: 0,id,winner_model_a,winner_model_b,winner_tie
2,1233961,0.117177,0.68762,0.195203
0,136060,0.002575,0.976472,0.020953
1,211333,0.346457,0.316414,0.33713
