# Install libraries

In [None]:
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git 
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q datasets

# Import libraries

In [None]:
import json
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from datasets import Dataset
from peft import get_peft_model, LoraConfig, TaskType, prepare_model_for_kbit_training, PeftConfig, PeftModel
from sklearn.metrics import accuracy_score, log_loss
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import AutoTokenizer, AutoModelForSequenceClassification, BitsAndBytesConfig, Trainer, TrainingArguments

# For reproducibility

In [None]:
# One seed shared by NumPy and PyTorch (CPU and every CUDA device).
seed = 42
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
np.random.seed(seed)

# Ask cuDNN for deterministic kernels and switch off its autotuner.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# Turn off the flash and memory-efficient scaled-dot-product-attention backends.
torch.backends.cuda.enable_flash_sdp(False)
torch.backends.cuda.enable_mem_efficient_sdp(False)

# Prepare data

In [None]:
# Read the training CSV and preview its first three rows.
df_train = pd.read_csv(filepath_or_buffer='/kaggle/input/lmsys-chatbot-arena/train.csv')
display(df_train.head(n=3))

In [None]:
# Concatenate strings in list
def process(input_str):
    """Flatten a JSON-encoded list of strings into one space-joined string.

    The value is decoded with ``json.loads`` so that escape sequences
    (``\\n``, ``\\"``, ``\\uXXXX``), ``null`` entries and any separator
    spacing are handled correctly. If the value is not a JSON list, the
    original bracket/quote stripping is used as a fallback.

    Args:
        input_str: Text such as ``'["first turn","second turn"]'``.

    Returns:
        The list items joined by single spaces. ``null`` items become empty
        strings, and unpaired surrogates are replaced with ``?`` so the
        result can always be encoded as UTF-8.
    """
    try:
        parsed = json.loads(input_str)
    except ValueError:
        # json.JSONDecodeError is a ValueError subclass.
        parsed = None

    if isinstance(parsed, list):
        sentences = ['' if item is None else str(item) for item in parsed]
    else:
        # Not a JSON list: fall back to plain bracket/quote stripping.
        stripped_str = input_str.strip('[]')
        sentences = [s.strip('"') for s in stripped_str.split('","')]

    joined = ' '.join(sentences)
    # A lone "\ud83d"-style escape decodes to an unpaired surrogate, which
    # cannot be UTF-8 encoded; replace such characters.
    return joined.encode('utf-8', errors='replace').decode('utf-8')

# Run `process` over each conversation column, writing the result back in place.
for _column in ('prompt', 'response_a', 'response_b'):
    df_train.loc[:, _column] = df_train[_column].apply(process)

display(df_train.head(3))

In [None]:
# Assemble the model input: instruction, user prompt, then both responses.
df_train['text'] = (
    'You are an expert at predicting user preference. \nUser prompt: '
    + df_train['prompt']
    + '\n\nModel A :\n'
    + df_train['response_a']
    + '\n\n--------\n\nModel B:\n'
    + df_train['response_b']
)
# Show the first assembled example.
print(df_train['text'][0])

# Parameter setting

In [None]:
# Base model to fine-tune.
model_id = "/kaggle/input/mistral/pytorch/7b-instruct-v0.1-hf/1"

# Whether to start from a previously saved adapter, and where it is stored.
from_ckpt = True
peft_model_id = '/kaggle/input/lmsus-mistral-model/lmsys/model'
peft_tokenizer_id = '/kaggle/input/lmsus-mistral-model/lmsys/tokenizer'

# Number of output classes and the default tokenized sequence length.
NUM_TARGETS = 3
MAX_LENGTH = 512

# Tokenize the text data

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Reuse EOS as the padding token. This is the only pad setting in effect:
# assigning `pad_token` also determines `pad_token_id`, so no separate id
# assignment is needed.
tokenizer.pad_token = tokenizer.eos_token

def tokenize_texts(texts, tokenizer, max_length=MAX_LENGTH):
    """Tokenize a batch of texts to fixed-length tensors.

    Args:
        texts: A pandas Series / NumPy array (anything with ``.tolist()``),
            a plain iterable of strings, or a single string.
        tokenizer: Callable tokenizer; called with ``padding='max_length'``,
            ``truncation=True`` and ``return_tensors='pt'``.
        max_length: Length every sequence is padded or truncated to.

    Returns:
        Tuple ``(input_ids, attention_mask)`` taken from the tokenizer output.
    """
    if isinstance(texts, str):
        # A bare string would otherwise be split into characters by list().
        text_list = [texts]
    elif hasattr(texts, 'tolist'):
        text_list = texts.tolist()
    else:
        text_list = list(texts)

    # NOTE(review): with truncation=True anything beyond max_length is cut;
    # for long prompts this may drop part of the text - confirm acceptable.
    tokens = tokenizer(
        text_list,
        padding='max_length',
        max_length=max_length,
        truncation=True,
        return_tensors='pt',
    )
    return tokens['input_ids'], tokens['attention_mask']


from torch.utils.data import Dataset
class TextDataset(Dataset):
    """Index-addressable dataset of tokenized examples with optional labels.

    Each item is a dict with ``'input_ids'`` and ``'attention_mask'``, plus
    ``'labels'`` when labels were supplied.
    """

    def __init__(self, input_ids, attention_masks, labels=None):
        """Store the parallel sequences; ``labels`` may be omitted."""
        self.input_ids = input_ids
        self.attention_masks = attention_masks
        self.labels = labels

    def __len__(self):
        """Return the number of examples (length of ``input_ids``)."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the example at ``idx`` as a dict."""
        sample = {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_masks[idx],
        }
        if self.labels is None:
            return sample
        sample['labels'] = self.labels[idx]
        return sample
    

# For every row take the name of the winner column holding the largest value,
# then encode those names as integer class ids.
target_columns = ['winner_model_a', 'winner_model_b', 'winner_tie']
y = df_train[target_columns].idxmax(axis=1)
label_encoder = LabelEncoder().fit(y)
y_encoded = label_encoder.transform(y)

# Hold out 10% of the rows for validation.
x_train, x_val, y_train, y_val = train_test_split(
    df_train['text'],
    y_encoded,
    test_size=0.1,
    random_state=42,
)
print(x_train.shape, x_val.shape, y_train.shape, y_val.shape)

# Tokenize both splits.
input_ids_train, attention_masks_train = tokenize_texts(x_train, tokenizer)
input_ids_val, attention_masks_val = tokenize_texts(x_val, tokenizer)

# Wrap the tensors and labels in datasets for training and evaluation.
train_dataset = TextDataset(
    input_ids=input_ids_train,
    attention_masks=attention_masks_train,
    labels=torch.tensor(y_train),
)
val_dataset = TextDataset(
    input_ids=input_ids_val,
    attention_masks=attention_masks_val,
    labels=torch.tensor(y_val),
)

# Load model in 4-bit using BitsAndBytesConfig

In [None]:
# Quantization settings used while loading the weights: 4-bit NF4 storage with
# nested (double) quantization, and bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
)

# Load the base model with a NUM_TARGETS-way classification head; the
# device_map entry {'': 0} is passed through unchanged.
model = AutoModelForSequenceClassification.from_pretrained(
    model_id,
    device_map={'': 0},
    quantization_config=bnb_config,
    num_labels=NUM_TARGETS,
)


# Prepare the quantized base model first. prepare_model_for_kbit_training
# freezes the parameters it is given, so it must run before any adapter is
# attached; otherwise the adapter weights would be frozen as well.
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

if from_ckpt:
    # Continue training the saved adapter. is_trainable=True is required:
    # by default a loaded adapter is frozen for inference. The model must not
    # be wrapped with get_peft_model afterwards, which would add/reset an
    # adapter instead of training the loaded one.
    print("Load from checkpoint...")
    config = PeftConfig.from_pretrained(peft_model_id)
    model = PeftModel.from_pretrained(model, peft_model_id, is_trainable=True)
else:
    # Fresh LoRA adapter on the attention output and value projections.
    config = LoraConfig(
        r=8,
        lora_alpha=32,
        target_modules=["o_proj", "v_proj"],
        lora_dropout=0.05,
        bias="none",
        task_type="SEQ_CLS"
    )
    model = get_peft_model(model, config)

model.print_trainable_parameters()

# Trainer

In [None]:
output_dir = "lmsys"
ckpt_dir = os.path.join(output_dir, "checkpoints")
log_file = os.path.join(output_dir, "training_log.txt")

# The log file lives directly in output_dir and is appended to at the first
# logging event, so the directory has to exist before training starts.
os.makedirs(output_dir, exist_ok=True)

# Kept so this cell also works when run on its own.
tokenizer.pad_token = tokenizer.eos_token

# NOTE(review): logging_steps=2 and save_steps=3 look like debugging values.
# No eval_steps is given; per the TrainingArguments documentation it then
# falls back to logging_steps, i.e. an evaluation every 2 steps - confirm
# this is intended before a full run.
# NOTE(review): fp16=True is combined with a bfloat16 compute dtype in the
# quantization config - confirm the mix is intended.
training_args = TrainingArguments(
    output_dir=ckpt_dir,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=4,
    save_strategy="steps",
    save_steps=3,
    save_total_limit=3,
    logging_steps = 2,
    metric_for_best_model='accuracy',
    eval_strategy="steps",
    report_to = 'none',
    fp16=True,  # Enable mixed precision training
    gradient_checkpointing=True  # Enable gradient checkpointing
)

def compute_metrics(pred):
    """Compute accuracy for an evaluation prediction object.

    Args:
        pred: Object exposing ``predictions`` (scores whose last axis is
            arg-maxed to obtain the predicted class) and ``label_ids``.

    Returns:
        ``{'accuracy': <accuracy_score of labels vs. predicted classes>}``.
    """
    predicted_classes = pred.predictions.argmax(-1)
    return {'accuracy': accuracy_score(pred.label_ids, predicted_classes)}

from datetime import datetime
from transformers import TrainerCallback
class LoggingCallback(TrainerCallback):
    """Trainer callback that writes each log record to a file and to stdout.

    Every record is written as one line:
    ``<timestamp> - Step: <global_step> - <logs>``.
    """

    def __init__(self, log_file):
        """Remember the log path and make sure its directory exists.

        Args:
            log_file: Path of the text file that log lines are appended to.
        """
        self.log_file = log_file
        # The file is opened in append mode on every log event; without the
        # directory the first write would raise FileNotFoundError.
        log_dir = os.path.dirname(log_file)
        if log_dir:
            os.makedirs(log_dir, exist_ok=True)

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Append the current log record to the file and print it."""
        if logs is None:
            return
        timestamp = datetime.now().strftime('%Y/%m/%d %H-%M-%S')
        line = f"{timestamp} - Step: {state.global_step} - {logs}"
        with open(self.log_file, "a", encoding="utf-8") as f:
            f.write(line + "\n")
        print(line)


In [None]:
import gc
# Force a garbage-collection pass; as the cell's last expression, the number
# returned by collect() is shown as the cell output.
gc.collect()

In [None]:
# The classification head needs a padding id for batches larger than one
# ("ValueError: Cannot handle batch sizes > 1 if no padding token is defined.").
# Take it from the tokenizer so the id matches the padding actually present in
# the inputs, instead of a hard-coded id from a different vocabulary.
model.config.pad_token_id = tokenizer.pad_token_id

# Disable the generation cache for training (gradient checkpointing is on).
model.config.use_cache = False

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[LoggingCallback(log_file)]
)

# train model
trainer.train()

# save the tokenizer and the trained model next to the checkpoints
tokenizer.save_pretrained(os.path.join(output_dir, "tokenizer"))
model.save_pretrained(os.path.join(output_dir, "model"))