# Installation

In [None]:
%%capture
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
###
#from google.colab import drive
#drive.mount('/content/drive')
###

if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl==0.15.2 triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets>=3.4.1" huggingface_hub hf_transfer
    !pip install --no-deps unsloth

In [None]:
# Authenticate against the Hugging Face Hub so the later push/upload cells can
# use the stored token. The import lives in this cell because the only other
# import of `notebook_login` is in a much later cell, which would leave this
# call failing with NameError on a top-to-bottom run.
from huggingface_hub import notebook_login

notebook_login()

# Unsloth

In [None]:
from unsloth import FastLanguageModel
import torch

# ---- Loading options -------------------------------------------------------
# Sequence length used both when loading the model and when tokenizing data.
max_seq_length = 1024  # previously 2048
# Auto-detection placeholder; kept for reference but NOT passed to the loader.
dtype = None
# Load the weights 4-bit quantized to reduce memory usage.
load_in_4bit = True
# Precision actually requested from the loader below.
model_dtype = torch.float16

# Reference list of pre-quantized 4-bit checkpoints published by Unsloth
# (more at https://huggingface.co/unsloth). Only the model named in the
# from_pretrained call below is actually loaded.
fourbit_models = [
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-405B-bnb-4bit",
    "unsloth/Mistral-Nemo-Base-2407-bnb-4bit",
    "unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit",
    "unsloth/mistral-7b-v0.3-bnb-4bit",
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/Phi-3.5-mini-instruct",
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/gemma-2-9b-bnb-4bit",
    "unsloth/gemma-2-27b-bnb-4bit",
]

# Base checkpoint to fine-tune: the instruct variant of Llama 3.1 8B.
# For gated repositories a `token="hf_..."` argument can be added here.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    max_seq_length=max_seq_length,
    dtype=model_dtype,
    load_in_4bit=load_in_4bit,
)

# LoRA

In [None]:
# Wrap the loaded model with LoRA adapters.
# Adapters are requested for the attention projections, the MLP projections
# and also for the input embeddings / output head.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
    "embed_tokens", "lm_head",
]

model = FastLanguageModel.get_peft_model(
    model,
    r=16,                                  # LoRA rank (any value > 0; 8-128 are typical)
    target_modules=lora_target_modules,
    lora_alpha=16,
    lora_dropout=0,                        # 0 is the optimized setting
    bias="none",                           # "none" is the optimized setting
    use_gradient_checkpointing="unsloth",  # True or "unsloth"
    random_state=3407,
    max_seq_length=max_seq_length,
    use_rslora=False,                      # rank-stabilized LoRA disabled
    loftq_config=None,                     # LoftQ disabled
)

# Data Prep

In [None]:
from datasets import load_dataset, DatasetDict
import json
from functools import partial
def format_prompt(example, tokenizer):
    """Render one raw record as a single chat-formatted training string.

    Args:
        example: Mapping with the keys ``'user_review'`` and
            ``'target_json_output'``; ``'system_prompt'`` is optional.
        tokenizer: Object exposing ``apply_chat_template(messages,
            tokenize=..., add_generation_prompt=...)``.

    Returns:
        ``{"text": <formatted conversation>}`` where the assistant turn is the
        target serialized as JSON inside a fenced ``json`` code block.
    """
    default_system_prompt = "You are a helpful assistant."

    try:
        target_json_str = json.dumps(example['target_json_output'], ensure_ascii=False)
    except TypeError as e:
        print(f"Error converting target_json_output to string: {e}")
        # Add more context if 'id' is part of your example from Label Studio
        # print(f"Problematic example ID (if available): {example.get('id')}")
        target_json_str = "{}"  # Fallback to empty JSON string

    # `or` (rather than a .get() default) also covers a key that is present
    # but None/empty, e.g. when a JSONL row omits the field and the loader
    # fills it with null. A None message content would break the template.
    system_prompt = example.get('system_prompt') or default_system_prompt

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": example['user_review']},
        # The assistant turn is the training target.
        {"role": "assistant", "content": f"```json\n{target_json_str}\n```"},
    ]
    # tokenize=False makes apply_chat_template return the formatted string.
    formatted_text = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=False
    )
    return {"text": formatted_text}


def _tokenize_function(examples, tokenizer):
    output = tokenizer(
        examples["text"],
        truncation=True,
        max_length=max_seq_length,
        padding="max_length",
        return_attention_mask=True,
        add_special_tokens=False # fix double tokens at beginning?
    )
    tokenized_inputs = {
        'input_ids': output['input_ids'],
        'attention_mask': output['attention_mask'],
    }
    tokenized_inputs["labels"] = output["input_ids"].copy()
    return tokenized_inputs



def create_training_dataset(jsonl_path, tokenizer, test_size = 0.2, seed=42):
    """Load a JSONL file and turn it into a tokenized ``DatasetDict``.

    Steps: make sure the tokenizer has a chat template and a pad token, load
    the JSONL, render every record with ``format_prompt``, tokenize with
    ``_tokenize_function`` and finally split into train/test.

    Args:
        jsonl_path: Path handed to ``load_dataset("json", data_files=...)``.
        tokenizer: Tokenizer to use. It is modified in place when it has no
            chat template and/or no pad token.
        test_size: Share (or count) of samples for the test split. ``0``
            disables splitting; every sample then stays in ``'train'``.
        seed: Seed for the train/test shuffle.

    Returns:
        A ``DatasetDict`` with a ``'train'`` split and, when ``test_size`` is
        non-zero, a ``'test'`` split. ``None`` if the loaded data has no
        ``'train'`` split.
    """
    if tokenizer.chat_template is None:
        print("WARNING: tokenizer.chat_template is not set. Manually applying Llama 3.1 template.")
        tokenizer.chat_template = (
            "{% set loop_messages = messages %}"
            "{% for message in loop_messages %}"
                "{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}"
                "{% if loop.index0 == 0 %}"
                    "{{ '<|begin_of_text|>' + content }}"
                "{% else %}"
                    "{{ content }}"
                "{% endif %}"
            "{% endfor %}"
        )

    # Tokenization pads to max_length, so a pad token is needed whether or not
    # the template had to be set manually.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        print(f"Tokenizer pad_token set to eos_token: {tokenizer.pad_token}")

    print(f"Loading data from: {jsonl_path}")
    raw_dataset = load_dataset("json", data_files=jsonl_path)
    if 'train' not in raw_dataset:
        print("Error: dataset does not contain 'train' split. check jsonl structure")
        return None
    print(f"Dataset loaded with {len(raw_dataset['train'])} samples.")

    # Only drop raw columns that actually exist: 'system_prompt' is optional
    # for format_prompt, and naming an absent column in remove_columns fails.
    raw_columns = ('system_prompt', 'user_review', 'target_json_output')
    present_columns = raw_dataset['train'].column_names
    columns_to_drop = [name for name in raw_columns if name in present_columns]

    format_func_with_tokenizer = partial(format_prompt, tokenizer=tokenizer)

    print("Applying Llama 3.1 chat template formatting ...")
    formatted_dataset = raw_dataset.map(
        format_func_with_tokenizer,
        remove_columns=columns_to_drop,
        num_proc=4,  # Use multiple processes for faster mapping
    )

    tokenize_func_with_tokenizer = partial(_tokenize_function, tokenizer=tokenizer)

    print("Tokenizing the formatted dataset ...")
    tokenized_dataset = formatted_dataset.map(
        tokenize_func_with_tokenizer,
        batched=True,
        remove_columns=["text"],
        num_proc=4,
    )

    # train_test_split rejects a test size of 0, so "no hold-out set" is
    # handled here by returning everything as the training split.
    if test_size == 0:
        print("test_size is 0: keeping all samples in the 'train' split.")
        return DatasetDict({'train': tokenized_dataset['train']})

    print(f"Splitting dataset into train ({1-test_size:.0%}) and test ({test_size:.0%})...")
    split_dataset = tokenized_dataset['train'].train_test_split(test_size=test_size, seed=seed)

    return DatasetDict({
        'train': split_dataset['train'],
        'test': split_dataset['test']
    })


# Build the tokenized dataset from the local JSONL file. A test size of 0.0
# is requested, i.e. no samples are meant to be held out.
train_filepath = './training_data.jsonl'
dataset = create_training_dataset(train_filepath, tokenizer, test_size=0.0)

# Sanity check: show the splits and decode the first training sample back to
# text to verify what will actually be fed to the model.
if dataset:
    print("\nDataset successfully created and split:")
    print(dataset)
    print("\nFirst training example:")
    first_example = dataset['train'][0]
    decoded_text_unnested = tokenizer.decode(first_example['input_ids'])
    print(decoded_text_unnested)

# Config Model Trainer

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported
import torch

# The mixed-precision flags have to agree with the dtype the model was loaded
# in (`model_dtype` from the model-loading cell). Enabling bf16 merely because
# the GPU supports it clashes with a model explicitly loaded as float16, so
# bf16 is only used when the model dtype is bfloat16 or was auto-detected.
use_bf16 = is_bfloat16_supported() and model_dtype in (None, torch.bfloat16)

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset['train'],
    #eval_dataset = dataset['test'],
    # NOTE(review): the dataset built above is already tokenized and no longer
    # has a "text" column; confirm this setting is simply ignored in that case.
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = None,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        num_train_epochs = 10, # Epoch-based training (3-5 suggested for ~1k samples)
        max_steps = -1,        # -1 leaves the run length to num_train_epochs
        learning_rate = 2e-4,
        fp16 = not use_bf16,
        bf16 = use_bf16,
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        # NOTE(review): this path is on Google Drive only if the drive mount in
        # the installation cell is enabled; otherwise it is a plain local dir.
        output_dir = "/content/drive/MyDrive/my_llama_finetune_checkpoints",
        save_strategy = "steps",
        save_steps = 50,
        save_total_limit = 2,
        report_to="none",
    ),
)

# Train the model

In [None]:
# Run the fine-tuning loop. The return value is kept because a later cell
# writes it to trainer_stats.json.
trainer_stats = trainer.train()

# Save to HF (multiple options)

In [None]:
modelname = "GameReview-llama3.1-8b-v1"
hf_username = "MrMike42"
hf_username_model = f"{hf_username}/{modelname}"

# Each option below is disabled with `if False:`; flip exactly one to True.

# Option 1: merge, convert to GGUF, then push.
# Very slow, needs lots of disk space, memory and GPU,
# but completely finishes the process in one go.
if False:
    print(f"Starting GGUF export and push to: {hf_username_model}")
    model.push_to_hub_gguf(hf_username_model, tokenizer, quantization_method = "q4_k_m")
    print("done")


# Option 2: push ONLY the LoRA adapters. Fast, but merging with the base model
# and converting to GGUF is still needed afterwards. Use this to quickly save
# work. The adapters go to their own "-adapters" repository so they do not
# collide with the GGUF model repository.
if False:
    adapter_model_name = f"{modelname}-adapters"
    adapter_repo_id = f"{hf_username}/{adapter_model_name}"
    print(f"Pushing ONLY LoRA adapters to Hugging Face Hub: {adapter_repo_id}")
    # Pushed through the model/tokenizer: Trainer.push_to_hub takes a commit
    # message as its first argument, not a repository id.
    model.push_to_hub(adapter_repo_id, token=True)
    tokenizer.push_to_hub(adapter_repo_id, token=True)
    print("done")


# Option 3: push a 16-bit model from a directory on disk. It still needs to be
# converted to GGUF. This is the case where the model is no longer in memory
# (built the model on Kaggle, then ran out of disk trying to push to HF).
if False:
    from unsloth import FastLanguageModel
    import torch

    # Assuming your merged model is in this directory
    local_model_directory = "/kaggle/working/MrMike42/GameReview-llama3.1-8b-v2-GGUF"
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Attempting to load model onto: {device}")

    # from_pretrained returns (model, tokenizer); no second load is needed to
    # obtain the tokenizer.
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = local_model_directory,
        max_seq_length = 1024, # Use your original max_seq_length
        dtype = torch.float16,
        load_in_4bit = False,  # Push the full 16-bit model
        device_map = device,
    )

    repo_id = hf_username_model
    model.push_to_hub(repo_id, token=True) # token=True uses the logged-in token
    tokenizer.push_to_hub(repo_id, token=True)
    print(f"Model successfully pushed to: https://huggingface.co/{repo_id}")


# Option 4: push the merged 16-bit model straight from memory. It still needs
# to be converted to GGUF. This is the case where the model is still loaded.
if False:
    model.push_to_hub_merged(hf_username_model, tokenizer, save_method = "merged_16bit", token=True)
    print(f"Model successfully pushed to: https://huggingface.co/{hf_username_model}")




# Save the trainer stats

In [None]:
import json  # imported here so this cell does not depend on the data-prep cell

# The value returned by trainer.train() is expected to be a named tuple;
# dumping it directly would write a bare JSON array without field names, so it
# is converted to a dict first. Anything without `_asdict` is written as-is.
stats_payload = trainer_stats._asdict() if hasattr(trainer_stats, "_asdict") else trainer_stats

with open("trainer_stats.json", "w", encoding="utf-8") as f:
    json.dump(stats_payload, f, indent=4)

# Save the finetuned model & push to HF

In [None]:
# Export the fine-tuned model as a q4_k_m GGUF and push it to the Hub.
# Target repository is "<hf_username>/<modelname>".
hf_username = "MrMike42"
modelname = "GameReview-llama3.1-8b-v1"
hf_username_model = f"{hf_username}/{modelname}"

# Alternatives that were tried instead of the direct push below:
#   model.save_pretrained_gguf(modelname, tokenizer, quantization_method="q4_k_m")
#   trainer.push_to_hub("MrMike42/GameReview-llama3.1-8b-instruct-adapters")
model.push_to_hub_gguf(
    hf_username_model,
    tokenizer,
    quantization_method="q4_k_m",
)

In [None]:
# Upload an already-exported model folder straight from disk, without loading
# the model.

from huggingface_hub import HfApi, notebook_login, create_repo
import os

# IMPORTANT: adjust both values to your actual directory and to your own
# username / desired repository name.
local_model_directory = "/kaggle/working/MrMike42/GameReview-llama3.1-8b-v2-GGUF"
repo_id = "MrMike42/GameReview-llama3.1-8b-v2-direct-upload"

api = HfApi()

# Step 1: make sure the target repository exists. exist_ok=True means an
# already existing repository is not treated as an error.
try:
    create_repo(repo_id=repo_id, private=False, exist_ok=True, repo_type="model")
except Exception as exc:
    # Only reported, not re-raised. An authentication problem here usually
    # means notebook_login() has to be re-run or the token is expired/invalid.
    print(f"Error creating/checking repository: {exc}")
else:
    print(f"Repository '{repo_id}' created or already exists.")

# Step 2: push every file in the folder as a single commit.
print(f"Uploading files from '{local_model_directory}' to '{repo_id}' without loading model into RAM...")
api.upload_folder(
    repo_id=repo_id,
    repo_type="model",
    folder_path=local_model_directory,
    commit_message="Upload merged 16-bit Llama 3.1 8B model (direct file upload)",
)

print(f"Model files successfully uploaded to: https://huggingface.co/{repo_id}")


# Run the model (optional)

#DOWNLOAD VIA OLLAMA

In [None]:
#ollama run hf.co/{username}/{repository}

Yes. If I have a bigger dataset, I would lower the learning rate to maximize the chance of not settling in a local minimum. If you have a small dataset, you want to do more epochs on it with a big global batch size to limit overfitting. If you use sample packing, remember to bump up the learning rate; otherwise you will effectively go over, let's say, 5x as many samples with the same lr and number of steps, so the effective learning rate per sample will be lower. This is especially important if you use sample_packing and have short samples of 100-200 tokens.

Even when I am not following best practices, for example by using a global batch size of 1 and a high learning rate, my results were usually fine, so I don't think hyperparameters are very important for finetuning unless you majorly screw them up. The dataset is much more important.

Dataset for DPO/ORPO should be at least 1000 samples and for SFT I would say over 5000 samples. Watch out for any biases in DPO/ORPO dataset that you may not want to have in final model - one good example is unalignment/toxic-dpo-0.1 - it contains a lot of numbered lists. A model trained on it, if you have a high enough learning rate/epoch count, will love to output numbered lists even in informal chats.

It's good to periodically check the tech you're currently using and switch to newer, better solutions. Unsloth, ORPO, GaLore, rslora, loftq+ - every few weeks a new thing comes out that can improve your finetune quality basically for free if you just incorporate it in the training.


5000 pairs should be enough for most tasks and I would say 1 or 2 epochs should be enough to improve on a base model.

If you have a small dataset (under 5k for SFT) and you don't want to overfit, I think you should go with a big batch size, a high number of epochs and a medium learning rate. So let's say 500 samples, 64 batch size, 30 epochs, 0.00005 lr (lr is very model dependent).

If you instead go for 500 samples, 4 batch size, 1 epoch, 0.0002 lr, I think you are more likely to undershoot or overfit. It can still work, but I think there is more of a chance it won't. Each step has a much higher learning rate and fewer samples included, so you're more likely to train a model that does not converge to the global minimum.

