In [None]:
# Install the required version of Hugging Face Hub
!pip install -U huggingface-hub==0.27.1

import wandb
from kaggle_secrets import UserSecretsClient

# Initialize Kaggle Secrets
user_secrets = UserSecretsClient()

# Retrieve API tokens from Kaggle Secrets
hf_token = user_secrets.get_secret("HF_TOKEN")
wb_token = user_secrets.get_secret("WANDB_TOKEN")

# Log in to Weights & Biases (W&B)
wandb.login(key=wb_token)

# Log in to Hugging Face
from huggingface_hub import login
login(token=hf_token)


In [None]:
# Install required dependencies (if needed)
# !pip install -U huggingface-hub==0.27.1 transformers==4.46.1 datasets==3.1.0 torch==2.5.1+cu121 bitsandbytes==0.45.0

import os
import json
from datasets import load_dataset
import torch

# Set the working directory
os.chdir('/kaggle/working/')

# Remove any existing LLaMA-Factory directory and clone a fresh copy
os.system('rm -rf LLaMA-Factory')
os.system('git clone --depth 1 https://github.com/hiyouga/LLaMA-Factory.git')
os.chdir('LLaMA-Factory')

# Install required dependencies
!pip install -q torch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1
!pip uninstall -q -y jax
!pip install -q -e .[torch,bitsandbytes,liger-kernel]

# Verify if CUDA is available
try:
    assert torch.cuda.is_available()
except AssertionError:
    print("Please set up a GPU before using LLaMA Factory")


# Hyperparameter Configuration

In [None]:
# Training stage handed to LLaMA-Factory: Kahneman-Tversky Optimization (KTO)
stage = "kto"

# How many training samples to prepare
data_size = 40_000

# Pretrained checkpoint that is fine-tuned
base_model = "Llama-3.2-1B-Instruct"

# Hugging Face Datasets repository holding the data
dataset_name = "Muadil/kto_labeled_openai_summary"

# Samples per training batch
batch_size = 8

# Passes over the training data
epoch_size = 1

In [None]:
def _kto_dataset_entry(file_name):
    """Build a `dataset_info.json` entry for a sharegpt-formatted KTO file.

    Args:
        file_name: Name of the JSON data file, relative to the `data/` directory.

    Returns:
        dict: The entry body (file name, formatting, column and tag mappings).
    """
    return {
        "file_name": file_name,
        "formatting": "sharegpt",  # Format used for the data
        "columns": {
            "messages": "messages",  # Field holding the conversation messages
            "kto_tag": "label",  # Field holding the KTO label
        },
        "tags": {
            "role_tag": "role",  # Key of the speaker role inside a message
            "content_tag": "content",  # Key of the text inside a message
            "user_tag": "user",  # Role value used for user turns
            "assistant_tag": "assistant",  # Role value used for assistant turns
        },
    }


# Load the dataset from Hugging Face
dataset = load_dataset(dataset_name)

# Keep the first 1.25 * data_size rows (capped at the dataset length) so that
# the 80 % training share of the split below holds about data_size rows.
subset_size = min(int(data_size * 1.25), len(dataset["train"]))
resized_dataset = dataset["train"].select(range(subset_size))

# Split into training (80 %) and evaluation (20 %) sets, reproducibly
train_test_split = resized_dataset.train_test_split(test_size=0.2, seed=42)

# Extract the "prompt" column of each split.
# list() keeps json.dump working even if the installed `datasets` version
# returns a lazy column object instead of a plain list.
# NOTE(review): only "prompt" is written, while the dataset_info entries below
# expect "messages" and "label" in every record - presumably each "prompt"
# value is a dict with those keys; confirm against the dataset.
train_dataset = list(train_test_split["train"]["prompt"])
eval_dataset = list(train_test_split["test"]["prompt"])

train_file_name = f"{stage}_dataset_{data_size}.json"
eval_file_name = f"{stage}_eval_dataset_{data_size}.json"
train_output_file = f"data/{train_file_name}"
eval_output_file = f"data/{eval_file_name}"

# Save the training split
with open(train_output_file, "w", encoding="utf-8") as f:
    json.dump(train_dataset, f, indent=2, ensure_ascii=False)

# Save the evaluation split
with open(eval_output_file, "w", encoding="utf-8") as f:
    json.dump(eval_dataset, f, indent=2, ensure_ascii=False)

# Read the dataset registry so the two new files can be added to it
with open('data/dataset_info.json', 'r', encoding='utf-8') as f:
    dataset_info = json.load(f)

# Registry entry for the training data
new_data = {f"{stage}_dataset_{data_size}": _kto_dataset_entry(train_file_name)}

# Registry entry for the evaluation data; it must reference the evaluation
# file, otherwise evaluation would silently run on the training data.
new_eval_data = {f"{stage}_eval_dataset_{data_size}": _kto_dataset_entry(eval_file_name)}

# Merge the new entries into the existing registry
dataset_info.update(new_data)
dataset_info.update(new_eval_data)

# Write the updated registry back to disk
with open('data/dataset_info.json', 'w', encoding='utf-8') as f:
    json.dump(dataset_info, f, indent=2, ensure_ascii=False)

# Report what was written and where
print("The new data was successfully added and the file was saved.")
print(f"{len(train_dataset)} training samples were saved to {train_output_file}.")
print(f"{len(eval_dataset)} evaluation samples were saved to {eval_output_file}.")


In [None]:

import json
args = dict(
  quantization_bit=4, #nf4 to do quantization, default is None so it does not quantize.
  quantization_method="bitsandbytes", # we enter in which format to quantize. bitsandbytes, hqq, eetq, etc.
  stage=stage,
  # kto_chosen_weight = 1, # default value
  # pref_beta = 0.1, # default value
  kto_rejected_weight = 1.33,
  do_train=True,
  model_name_or_path=f"meta-llama/{base_model}", # use bnb-4bit-quantized Llama-3-8B-Instruct model
  dataset=f"{stage}_dataset_{data_size}",             # use alpaca and identity datasets
  # eval_dataset = f"{stage}_eval_dataset_{data_size}",
  template="llama3",                     # use llama3 prompt template
  finetuning_type="lora",                   # use LoRA adapters to save memory
  lora_target="all",                     # attach LoRA adapters to all linear layers
  output_dir="llama3_lora",                  # the path to save LoRA adapters
  per_device_train_batch_size=batch_size,               # the batch size
  gradient_accumulation_steps=4,               # the gradient accumulation steps
  lr_scheduler_type="cosine",                 # use cosine learning rate scheduler
  logging_steps=10,                      # log every 10 steps
  warmup_ratio=0.1,                      # use warmup scheduler
  save_steps=1000,                      # save checkpoint every 1000 steps
  learning_rate=5e-5,                     # the learning rate
  num_train_epochs=epoch_size,                    # the epochs of training
  max_samples=500,                      # use 500 examples in each dataset
  max_grad_norm=1.0,                     # clip gradient norm to 1.0
  loraplus_lr_ratio=16.0,                   # use LoRA+ algorithm with lambda=16.0
  fp16=True,                         # use float16 mixed precision training
  enable_liger_kernel=True,                   # use liger kernel for efficient training
  # report_to="wandb",
  run_name=f"Muadil/{base_model}_sum_{int(data_size/1000)}k_{batch_size}_{epoch_size}ep",
  # load_best_model_at_end=True,  # Ensure the best model is loaded at the end
  # metric_for_best_model="eval_loss",  # Metric to monitor for the best model
  greater_is_better=False,
  # evaluation_strategy="steps",
)

json.dump(args, open("train_llama3.json", "w", encoding="utf-8"), indent=2)
# Start the training process
!llamafactory-cli train train_llama3.json



In [None]:
# Saving and exporting the model
export_args = dict(
    model_name_or_path=f"meta-llama/{base_model}",  # Path to the original base model
    adapter_name_or_path="llama3_lora",  # Path to the saved LoRA adapters
    template="llama3",  # Template used for the model (keeps the same format)
    finetuning_type="lora",  # Specifies the type of fine-tuning (using LoRA)
    export_dir="llama3_lora_merged",  # Directory where the merged model will be saved
    export_size=5,  # Size of the model file in GB
    export_device="cpu",  # The device where the model will be exported (in this case, CPU)
    export_hub_model_id="username/repository",  # Hugging Face model repository ID for uploading
)

# Exporting the model by saving the configuration to a JSON file
json.dump(export_args, open("merge_llama3.json", "w", encoding="utf-8"), indent=2)

# Using the LLaMA Factory CLI tool to export the model with the provided configuration
!llamafactory-cli export merge_llama3.json
