## This is from https://www.kaggle.com/code/arabidopsisthalian/fine-tune

In [9]:
import os
import gc
import re
import random
from time import time
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.auto import tqdm

import torch
import transformers
from sklearn.metrics import accuracy_score
from transformers import AutoTokenizer, LlamaModel, LlamaForSequenceClassification
from peft import get_peft_model, PeftModel, PeftConfig, get_peft_config, LoraConfig, TaskType
import torch.nn.functional as F

tqdm.pandas()

In [14]:
class CFG:
    NUM_EPOCH = 1
    BATCH_SIZE = 16
    DROPOUT = 0.05
    MODEL_NAME = 'meta-llama/Meta-Llama-3-8B'
    # MODEL_NAME = '/kaggle/input/llama-3/transformers/8b-hf/1'
    SEED = 2024
    MAX_LENGTH = 1024
    NUM_WARMUP_STEPS = 128
    LR_MAX = 5e-5
    NUM_LABELS = 3
    LORA_RANK = 16
    LORA_ALPHA = 32
    LORA_MODULES = ['q_proj', 'v_proj']


device = "cpu"

In [10]:
def set_seeds(seed):
    """Set seeds for reproducibility"""
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)


set_seeds(seed=CFG.SEED)

In [15]:
tokenizer = AutoTokenizer.from_pretrained(CFG.MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"
tokenizer.add_eos_token = True

tokenizer.save_pretrained("tokenizer")

('tokenizer/tokenizer_config.json',
 'tokenizer/special_tokens_map.json',
 'tokenizer/tokenizer.json')

In [16]:
def get_token_lengths(texts):
    # tokenize and receive input_ids for reach text
    input_ids = tokenizer(texts.tolist(), return_tensors='np')['input_ids']
    # return length of inputs_ids for each text
    return [len(t) for t in input_ids]

In [22]:
train: pd.DataFrame = pd.read_csv('../../kaggle/LLM_Classification_FineTuning/train.csv')


def process(input_str):
    stripped_str = input_str.strip('[]')
    sentences = [s.strip('"') for s in stripped_str.split('","')]
    return ' '.join(sentences)


train.loc[:, 'prompt'] = train['prompt'].apply(process)  # safer than train['prompt'] = ...
train.loc[:, 'response_a'] = train['response_a'].apply(process)
train.loc[:, 'response_b'] = train['response_b'].apply(process)

# Drop 'Nul' for training
indexes = train[(train.response_a == 'null') & (train.response_b == 'null')].index
train.drop(indexes, inplace=True)
train.reset_index(inplace=True, drop=True)

In [23]:
train.head(5)

Unnamed: 0,id,model_a,model_b,prompt,response_a,response_b,winner_model_a,winner_model_b,winner_tie
0,30192,gpt-4-1106-preview,gpt-4-0613,Is it morally right to try to have a certain p...,The question of whether it is morally right to...,"As an AI, I don't have personal beliefs or opi...",1,0,0
1,53567,koala-13b,gpt-4-0613,What is the difference between marriage licens...,A marriage license is a legal document that al...,A marriage license and a marriage certificate ...,0,1,0
2,65089,gpt-3.5-turbo-0613,mistral-medium,explain function calling. how would you call a...,Function calling is the process of invoking or...,Function calling is the process of invoking a ...,0,0,1
3,96401,llama-2-13b-chat,mistral-7b-instruct,How can I create a test set for a very rare ca...,Creating a test set for a very rare category c...,When building a classifier for a very rare cat...,1,0,0
4,198779,koala-13b,gpt-3.5-turbo-0314,What is the best way to travel from Tel-Aviv t...,The best way to travel from Tel Aviv to Jerusa...,The best way to travel from Tel-Aviv to Jerusa...,0,1,0


In [24]:
train['text'] = 'User prompt: ' + train['prompt'] + '\n\nModel A :\n' + train[
    'response_a'] + '\n\n--------\n\nModel B:\n' + train['response_b']
print(train['text'][4])

User prompt: What is the best way to travel from Tel-Aviv to Jerusalem? Car? Bus? Plane?

Model A :
The best way to travel from Tel Aviv to Jerusalem depends on your personal preference and the availability of transportation options. All the options you have mentioned are valid options, but here are some details to help you make your decision:\n\n*   By car: Traveling by car is the quickest way to get from Tel Aviv to Jerusalem, as the distance between the two cities is only about 60 kilometers (37 miles). It takes around 45 minutes to drive from Tel Aviv to Jerusalem by car, depending on the traffic.\n*   By bus: There are several bus lines that run from Tel Aviv to Jerusalem, and the journey takes around 1 hour and 30 minutes by bus. The buses are comfortable and reliable, and they offer a scenic view of the beautiful Israeli countryside.\n*   By plane: There are no direct flights from Tel Aviv to Jerusalem, so you need to take a flight from Tel Aviv's Ben Gurion International Airpor

In [25]:
train.loc[:, 'token_count'] = get_token_lengths(train['text'])

In [26]:
train.head(5)

Unnamed: 0,id,model_a,model_b,prompt,response_a,response_b,winner_model_a,winner_model_b,winner_tie,text,token_count
0,30192,gpt-4-1106-preview,gpt-4-0613,Is it morally right to try to have a certain p...,The question of whether it is morally right to...,"As an AI, I don't have personal beliefs or opi...",1,0,0,User prompt: Is it morally right to try to hav...,1206
1,53567,koala-13b,gpt-4-0613,What is the difference between marriage licens...,A marriage license is a legal document that al...,A marriage license and a marriage certificate ...,0,1,0,User prompt: What is the difference between ma...,1393
2,65089,gpt-3.5-turbo-0613,mistral-medium,explain function calling. how would you call a...,Function calling is the process of invoking or...,Function calling is the process of invoking a ...,0,0,1,User prompt: explain function calling. how wou...,664
3,96401,llama-2-13b-chat,mistral-7b-instruct,How can I create a test set for a very rare ca...,Creating a test set for a very rare category c...,When building a classifier for a very rare cat...,1,0,0,User prompt: How can I create a test set for a...,1008
4,198779,koala-13b,gpt-3.5-turbo-0314,What is the best way to travel from Tel-Aviv t...,The best way to travel from Tel Aviv to Jerusa...,The best way to travel from Tel-Aviv to Jerusa...,0,1,0,User prompt: What is the best way to travel fr...,479


In [27]:
train.loc[:, 'label'] = np.argmax(train[['winner_model_a', 'winner_model_b', 'winner_tie']].values, axis=1)

# Display data
display(train.head())

Unnamed: 0,id,model_a,model_b,prompt,response_a,response_b,winner_model_a,winner_model_b,winner_tie,text,token_count,label
0,30192,gpt-4-1106-preview,gpt-4-0613,Is it morally right to try to have a certain p...,The question of whether it is morally right to...,"As an AI, I don't have personal beliefs or opi...",1,0,0,User prompt: Is it morally right to try to hav...,1206,0
1,53567,koala-13b,gpt-4-0613,What is the difference between marriage licens...,A marriage license is a legal document that al...,A marriage license and a marriage certificate ...,0,1,0,User prompt: What is the difference between ma...,1393,1
2,65089,gpt-3.5-turbo-0613,mistral-medium,explain function calling. how would you call a...,Function calling is the process of invoking or...,Function calling is the process of invoking a ...,0,0,1,User prompt: explain function calling. how wou...,664,2
3,96401,llama-2-13b-chat,mistral-7b-instruct,How can I create a test set for a very rare ca...,Creating a test set for a very rare category c...,When building a classifier for a very rare cat...,1,0,0,User prompt: How can I create a test set for a...,1008,0
4,198779,koala-13b,gpt-3.5-turbo-0314,What is the best way to travel from Tel-Aviv t...,The best way to travel from Tel Aviv to Jerusa...,The best way to travel from Tel-Aviv to Jerusa...,0,1,0,User prompt: What is the best way to travel fr...,479,1


In [33]:
train.label.value_counts()

label
0    20061
1    19648
2    17749
Name: count, dtype: int64

In [37]:
train['token_count'].describe().to_frame().astype(int)

Unnamed: 0,token_count
count,57458
mean,732
std,790
min,17
25%,291
50%,566
75%,895
max,31035


In [38]:
np.percentile(train['token_count'], 90)

np.float64(1397.0)

In [40]:
tokens = tokenizer(
    train['text'].tolist(),
    padding='max_length',
    max_length=CFG.MAX_LENGTH,
    truncation=True,
    return_tensors='np'
)

# input IDs are the token IDs
INPUT_IDS = tokens['input_ids']
# Attention masks to Ignore padding tokens
ATTENTION_MASKS = tokens['attention_mask']

# Label of texts
LABELS = train[['winner_model_a', 'winner_model_b', 'winner_tie']].values

print(f'INPUT_IDS shape: {INPUT_IDS.shape}, ATTENTION_MASKS shape: {ATTENTION_MASKS.shape}')
print(f'LABELS shape: {LABELS.shape}')

INPUT_IDS shape: (57458, 1024), ATTENTION_MASKS shape: (57458, 1024)
LABELS shape: (57458, 3)


In [41]:
def train_dataset(batch_size):
    N_SAMPLES = LABELS.shape[0]
    IDXS = np.arange(N_SAMPLES - (N_SAMPLES % batch_size))
    while True:
        np.random.shuffle(IDXS)

        for idxs in IDXS.reshape(-1, batch_size):
            input_ids = torch.tensor(INPUT_IDS[idxs]).to(device)
            attention_mask = torch.tensor(ATTENTION_MASKS[idxs]).to(device)
            labels = torch.tensor(LABELS[idxs]).to(device)

            yield input_ids, attention_mask, labels


TRAIN_DATASET = train_dataset(CFG.BATCH_SIZE)

In [None]:
base_model = LlamaForSequenceClassification.from_pretrained(
    CFG.MODEL_NAME,
    num_label=CFG.NUM_LABELS,
    torch_dtype=torch.bfloat16
)

base_model.config.pretraining_tp = 1

base_model.config.pad_token_id = tokenizer.pad_token_id

In [None]:
lora_config = LoraConfig(
    r=CFG.LORA_RANK,
    lora_alpha=CFG.LORA_ALPHA,
    lora_dropout=CFG.DROPOUT,
    bias='none',
    inference_mode=False,
    task_type=TaskType.SEQ_CLS,
    target_modules=CFG.LORA_MODULES
)

In [None]:
model = get_peft_model(base_model, lora_config)
model.print_trainable_parameters()

In [None]:
MODEL_LAYERS_ROWS = []
TRAINABLE_PARAMS = []
N_TRAINABLE_PARAMS = 0

for name, param in model.named_parameters():
    n_parameters = int(torch.prod(torch.tensor(param.shape)))

    if param.requires_grad:
        MODEL_LAYERS_ROWS.append({
            'param': n_parameters,
            'name': name,
            'dtype': param.data.dtype,
        })

        TRAINABLE_PARAMS.append({'param': param})

        N_TRAINABLE_PARAMS += n_parameters
display(pd.DataFrame(MODEL_LAYERS_ROWS))

print(f"""
===============================
N_TRAINABLE_PARAMS: {N_TRAINABLE_PARAMS:,}
N_TRAINABLE_LAYERS: {len(TRAINABLE_PARAMS)}
===============================
""")

In [None]:
N_SAMPLES = len(train)
STEPS_PER_EPOCH = N_SAMPLES // CFG.BATCH_SIZE

OPTIMIZER = torch.optim.AdamW(model.parameters(), lr=CFG.LR_MAX)

# Cosine Learning Rate With Warmup
lr_scheduler = transformers.get_cosine_schedule_with_warmup(
    optimizer=OPTIMIZER,
    num_warmup_steps=CFG.NUM_WARMUP_STEPS,
    num_training_steps=STEPS_PER_EPOCH * CFG.NUM_EPOCH
)
print(f'BATCH_SIZE: {CFG.BATCH_SIZE}, N_SAMPLES: {N_SAMPLES}, STEPS_PER_EPOCH: {STEPS_PER_EPOCH}')

In [None]:
for state in OPTIMIZER.state.values():
    for k, v in state.items():
        if isinstance(v, torch.Tensor) and state[k].dtype is not torch.float32:
            state[k] = v.to(dtype=torch.float32)

input_ids, attention_mask, labels = next(TRAIN_DATASET)

print(f'input_ids shape: {input_ids.shape}, dtype: {input_ids.dtype}')
print(f'attention_mask shape: {attention_mask.shape}, dtype: {attention_mask.dtype}')
print(f'labels shape: {labels.shape}, dtype: {labels.dtype}')

In [None]:
%%time
# Dummy Prediction
with torch.no_grad():
    outputs = model(input_ids=input_ids, attention_mask=attention_mask)
print(f'logits: {outputs.logits}, dtype: {outputs.logits.dtype}')

In [None]:
model.train()

# Loss function, Cross Entropy
LOSS_FN = torch.nn.CrossEntropyLoss().to(device=device, dtype=torch.float32)
st = time()
warnings.filterwarnings("error")
METRICS = {
    "loss": [],
    'accuracy': {'y_true': [], 'y_pred': []}
}

print(f'CFG.NUM_EPOCHS={CFG.NUM_EPOCH}, STEPS_PER_EPOCH={STEPS_PER_EPOCH}')

for epoch in tqdm(range(CFG.NUM_EPOCH)):
    ste = time()
    for step in range(STEPS_PER_EPOCH):
        # Zero out Gradient
        OPTIMIZER.zero_grad()

        input_ids, attention_mask, labels = next(TRAIN_DATASET)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

        logits = outputs.logits.to(dtype=torch.float32)

        loss = LOSS_FN(logits, labels.to(dtype=torch.float32))
        loss.backward()

        OPTIMIZER.step()

        lr_scheduler.step()

        METRICS['loss'].append(float(loss))
        METRICS['accuracy']['y_true'] += labels.squeeze().tolist()
        METRICS['accuracy']['y_pred'] += torch.argmax(F.softmax(logits, dim=-1), dim=-1).cpu().tolist()

        if (step + 1) % 200 == 0:
            metrics = 'µ_loss: {:.3f}'.format(np.mean(METRICS['loss']))
            metrics += ', step_loss: {:.3f}'.format(METRICS['loss'][-1])
            metrics += ', µ_auc: {:.3f}'.format(
                accuracy_score(torch.argmax(torch.tensor(METRICS['accuracy']['y_true']), dim=-1),
                               METRICS['accuracy']['y_pred']))
            lr = OPTIMIZER.param_groups[0]['lr']
            print(f'{epoch + 1:02}/{CFG.NUM_EPOCH:02} | {step + 1:04}/{STEPS_PER_EPOCH} lr: {lr:.2E}, {metrics}',
                  end='')
            print(f'\nSteps per epoch: {step + 1} complete | Time elapsed: {time() - st}')
    print(f'\nEpoch {epoch + 1} Completed | Total time for epoch: {time() - ste} ')

    torch.save(
        {k: v.cpu() for k, v in model.named_parameters() if v.requires_grad},
        f'model_llama_3_cp_{epoch + 1}_v1.pth'
    )

    torch.save(
        OPTIMIZER.state_dict(),
        f'optimizer_llama_3_cp_{epoch + 1}_v1.pth'
    )
