# Installation

In [None]:
# Log this notebook session in to the Hugging Face Hub. The push calls in the
# final cell pass token=True, which is commented there as using the logged-in token.
from huggingface_hub import notebook_login
notebook_login() 

In [None]:
#THIS IS FOR KAGGLE
# Disabled by default (guard is False). When enabled it removes the installed
# torch/torchvision/torchaudio packages, reinstalls them together with xformers
# from the cu121 PyTorch wheel index, and then installs unsloth.
# The order of the commands matters: the old torch stack is removed first.
if False:
    !pip install pip3-autoremove
    !pip-autoremove torch torchvision torchaudio -y
    !pip install torch torchvision torchaudio xformers --index-url https://download.pytorch.org/whl/cu121
    !pip install unsloth

    print("---------Cell Done")

In [None]:
#THIS IS FOR COLAB


%%capture
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'


if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl==0.15.2 triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets>=3.4.1" huggingface_hub hf_transfer
    !pip install --no-deps unsloth

# Unsloth

In [None]:
from unsloth import FastLanguageModel
import torch

# ---- Loading configuration ----
# Maximum sequence length used for loading, tokenization and training (was 2048).
max_seq_length = 1024
# None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+.
# Kept for reference only: the load call below passes model_dtype instead.
dtype = None
# Use 4bit quantization to reduce memory usage. Can be False.
load_in_4bit = True
# Explicit dtype actually handed to from_pretrained.
model_dtype = torch.float16

# Catalogue of 4bit pre quantized models (4x faster downloading + no OOMs).
# Informational only; the model that is loaded is named in the call below.
# More models at https://huggingface.co/unsloth
fourbit_models = [
    # Llama-3.1 (15 trillion tokens model, 2x faster)
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-405B-bnb-4bit",
    # Mistral Nemo 12b (2x faster)
    "unsloth/Mistral-Nemo-Base-2407-bnb-4bit",
    "unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit",
    # Mistral v3 (2x faster)
    "unsloth/mistral-7b-v0.3-bnb-4bit",
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    # Phi-3.5 / Phi-3 (2x faster)
    "unsloth/Phi-3.5-mini-instruct",
    "unsloth/Phi-3-medium-4k-instruct",
    # Gemma 2 (2x faster)
    "unsloth/gemma-2-9b-bnb-4bit",
    "unsloth/gemma-2-27b-bnb-4bit",
]

# Load the base model and its tokenizer.
# Alternative base checkpoint: "unsloth/Meta-Llama-3.1-8B".
# Pass token = "hf_..." when using gated models like meta-llama/Llama-2-7b-hf.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = model_dtype,
    load_in_4bit = load_in_4bit,
)

print("---------Cell Done")


# LoRA

In [None]:
# Wrap the loaded model with LoRA adapters.
model = FastLanguageModel.get_peft_model(
    model,
    # LoRA rank: choose any number > 0. Suggested 8, 16, 32, 64, 128.
    r = 16,
    # Attention projections, MLP projections, plus the embedding and output head.
    target_modules = [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
        "embed_tokens", "lm_head",
    ],
    lora_alpha = 16,
    # Supports any value, but 0 is optimized.
    lora_dropout = 0,
    # Supports any value, but "none" is optimized.
    bias = "none",
    # True or "unsloth" for very long context.
    # "unsloth" uses 30% less VRAM, fits 2x larger batch sizes.
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,
    max_seq_length = max_seq_length,
    # Rank stabilized LoRA is supported but not used here.
    use_rslora = False,
    # LoftQ is supported but not used here.
    loftq_config = None,
)

print("---------Cell Done")

# Data Prep

In [None]:
from datasets import load_dataset, DatasetDict
import json
from functools import partial
def format_prompt(example, tokenizer):
    """Render one raw record as a single chat-formatted training string.

    Args:
        example: Mapping with 'user_review' and 'target_json_output'; an
            optional 'system_prompt' overrides the default system message.
        tokenizer: Object exposing ``apply_chat_template``.

    Returns:
        ``{"text": <rendered conversation>}`` where the assistant turn is the
        target JSON wrapped in a fenced ``json`` code block.
    """
    # Serialise the target; if it is not JSON-serialisable, report it and fall
    # back to an empty JSON object instead of aborting the mapping.
    try:
        answer_json = json.dumps(example['target_json_output'], ensure_ascii=False)
    except TypeError as e:
        print(f"Error converting target_json_output to string: {e}")
        answer_json = "{}"

    system_turn = {
        "role": "system",
        "content": example.get('system_prompt', "You are a helpful assistant."),
    }
    user_turn = {"role": "user", "content": example['user_review']}
    # The assistant turn is the training target.
    assistant_turn = {"role": "assistant", "content": f"```json\n{answer_json}\n```"}

    # tokenize=False makes apply_chat_template return the formatted string.
    rendered = tokenizer.apply_chat_template(
        [system_turn, user_turn, assistant_turn],
        tokenize=False,
        add_generation_prompt=False,
    )
    return {"text": rendered}


def _tokenize_function(examples, tokenizer):
    output = tokenizer(
        examples["text"],
        truncation=True,
        max_length=max_seq_length,
        padding="max_length",
        return_attention_mask=True,
        add_special_tokens=False # fix double tokens at beginning?
    )
    tokenized_inputs = {
        'input_ids': output['input_ids'],
        'attention_mask': output['attention_mask'],
    }
    tokenized_inputs["labels"] = output["input_ids"].copy()
    return tokenized_inputs



def create_training_dataset(jsonl_path, tokenizer, test_size = 0.2, seed=42):
    """Load a JSONL file, chat-format and tokenize it, then split train/test.

    Side effects on ``tokenizer``: a Llama 3.1 style chat template is installed
    when none is set, and ``pad_token`` is set to ``eos_token`` when missing.

    Args:
        jsonl_path: Path passed to ``load_dataset("json", data_files=...)``.
        tokenizer: Tokenizer used for chat templating and tokenization.
        test_size: Fraction (or count) forwarded to ``train_test_split``.
        seed: Seed forwarded to ``train_test_split``.

    Returns:
        A ``DatasetDict`` with 'train' and 'test' splits, or ``None`` when the
        loaded data has no 'train' split.
    """
    if tokenizer.chat_template is None:
        print("WARNING: tokenizer.chat_template is not set. Manually applying Llama 3.1 template.")
        tokenizer.chat_template = (
            "{% set loop_messages = messages %}"
            "{% for message in loop_messages %}"
                "{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}"
                "{% if loop.index0 == 0 %}"
                    "{{ '<|begin_of_text|>' + content }}"
                "{% else %}"
                    "{{ content }}"
                "{% endif %}"
            "{% endfor %}"
        )

    # Tokenization pads to a fixed length, so a pad token is needed whichever
    # chat-template path was taken above.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        print(f"Tokenizer pad_token set to eos_token: {tokenizer.pad_token}")

    print(f"Loading data from: {jsonl_path}")
    raw_dataset = load_dataset("json", data_files=jsonl_path)
    if 'train' not in raw_dataset:
        print("Error: dataset does not contain 'train' split. check jsonl structure")
        return None
    print(f"Dataset loaded with {len(raw_dataset['train'])} samples.")

    # format_prompt treats 'system_prompt' as optional, so only drop the source
    # columns that are actually present; asking map() to remove a column that
    # does not exist raises an error.
    source_columns = ('system_prompt', 'user_review', 'target_json_output')
    available_columns = raw_dataset['train'].column_names
    columns_to_drop = [name for name in source_columns if name in available_columns]

    format_func_with_tokenizer = partial(format_prompt, tokenizer=tokenizer)

    print("Applying Llama 3.1 chat template formatting ...")
    formatted_dataset = raw_dataset.map(
        format_func_with_tokenizer,
        remove_columns=columns_to_drop,
        num_proc=4 # Use multiple processes for faster mapping
    )

    tokenize_func_with_tokenizer = partial(_tokenize_function, tokenizer=tokenizer)

    print("Tokenizing the formatted dataset ...")
    tokenized_dataset = formatted_dataset.map(
        tokenize_func_with_tokenizer,
        batched=True,
        remove_columns=["text"],
        num_proc=4
    )

    print(f"Splitting dataset into train ({1-test_size:.0%}) and test ({test_size:.0%})...")
    split_dataset = tokenized_dataset['train'].train_test_split(test_size=test_size, seed=seed)

    return DatasetDict({
        'train': split_dataset['train'],
        'test': split_dataset['test']
    })


# Location of the training JSONL on Kaggle; 2% of the samples are held out as
# the test split.
train_filepath = '/kaggle/input/gamereviewtraining8/synthetic_testdata.jsonl'
dataset = create_training_dataset(train_filepath, tokenizer, 0.02)

# create_training_dataset returns None when the loaded data has no 'train'
# split. Stop here with a clear message instead of reporting success and
# failing later with an opaque TypeError on dataset['train'].
if dataset is None:
    raise RuntimeError(f"Could not build a training dataset from: {train_filepath}")
print("\nDataset successfully created")

# Set to True to inspect the splits and decode the first training example.
show_dataset_debug = False
if show_dataset_debug:
    print("\nDataset successfully created and split:")
    print(dataset)

    print(f"DEBUG: Length of training dataset: {len(dataset['train'])}")
    print(f"DEBUG: Length of test dataset: {len(dataset['test'])}")

    print("\nFeatures of the training dataset:")
    print(dataset['train'].features)
    print("\nFirst training example:")
    # Decode to verify the actual text that will be fed to the model
    decoded_text_unnested = tokenizer.decode(dataset['train'][0]['input_ids'])
    print(decoded_text_unnested)


print("---------Cell Done")

# Config Model Trainer

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported
import torch

# The mixed-precision flags must agree with the dtype the model was loaded in.
# The load cell pins model_dtype explicitly, so derive the flags from it rather
# than from hardware capability alone: a float16 model combined with bf16=True
# is a mismatch on bf16-capable GPUs. Hardware detection is only used when the
# dtype was left to auto-detection (None).
if model_dtype is None:
    use_bf16 = is_bfloat16_supported()
else:
    use_bf16 = model_dtype == torch.bfloat16

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset['train'],
    #eval_dataset = dataset['test'],
    # NOTE(review): the dataset built earlier is already tokenized and its
    # "text" column was removed; confirm this field is still needed.
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = None,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 20,
        num_train_epochs = 1, # (3-5 for 1k samples); used because max_steps is disabled
        max_steps = -1,       # -1 disables the step cap so num_train_epochs applies (e.g. 130 to cap)
        learning_rate = 2e-4,
        fp16 = not use_bf16,
        bf16 = use_bf16,
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        # Checkpoint every 50 steps and keep only the 2 most recent checkpoints.
        save_strategy = "steps",
        save_steps = 50,
        save_total_limit = 2,
        report_to="none",
    ),
)

print("---------Cell Done")

# Train the model

In [None]:
# Run fine-tuning with the trainer configured in the previous cell and keep the
# value returned by train() in trainer_stats for later inspection.
trainer_stats = trainer.train()

print("---------Cell Done")

# Save to HF (multiple options)

In [None]:
modelname = "YOUR_MODEL_NAME"
hf_username = "YOUR_HF_USERNAME"
hf_username_model = f"{hf_username}/{modelname}"

# Export switches: enable the path you need (normally exactly one).
PUSH_GGUF = False                # merge + convert to GGUF + push (slow, heavy)
PUSH_LORA_ADAPTERS = False       # push only the LoRA adapter weights (fast)
PUSH_MERGED_FROM_DISK = False    # reload a saved merged model from disk, then push
PUSH_MERGED_FROM_MEMORY = True   # merge the in-memory model to 16bit and push


# Merge then convert to GGUF then PUSH
# Very slow, needs lots of disk space, memory and GPU
# Completely finishes the process (no further conversion needed)
if PUSH_GGUF:
    print(f"Starting GGUF export and push to: {hf_username_model}")
    model.push_to_hub_gguf(hf_username_model, tokenizer, quantization_method = "q4_k_m")
    print("done")


# push lora - FAST, but merging with base model and converting to GGUF still needed
# use this to quickly save work
if PUSH_LORA_ADAPTERS:
    # Use a separate repository for the ADAPTERS to distinguish it from the
    # merged / GGUF model repository.
    adapter_model_name = f"{modelname}-adapters"
    adapter_repo_id = f"{hf_username}/{adapter_model_name}"
    print(f"Pushing ONLY LoRA adapters to Hugging Face Hub: {adapter_repo_id}")
    # Push through the PEFT model and tokenizer with an explicit repo id.
    # Trainer.push_to_hub takes a commit message as its first argument, not a
    # repository id, so it would not have targeted this repository.
    model.push_to_hub(adapter_repo_id, token=True)
    tokenizer.push_to_hub(adapter_repo_id, token=True)
    print("done")


# push 16bit model from hosted directory - still needs to convert to GGUF
# this is the case where the model is no longer in memory
# (BUILT MODEL ON KAGGLE THEN RAN OUT OF DISK TRYING TO PUSH TO HF)
if PUSH_MERGED_FROM_DISK:
    # Imported here so this branch also works in a fresh kernel where the
    # earlier cells have not been run.
    from unsloth import FastLanguageModel
    import torch
    import os

    # Assuming your merged model is in this directory
    local_model_directory = "/kaggle/working/"
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Attempting to load model onto: {device}")

    # from_pretrained returns a (model, tokenizer) tuple, so both objects are
    # obtained from this single load.
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = local_model_directory,
        max_seq_length = 1024, # Use your original max_seq_length
        dtype = torch.float16, # The merged model was saved as 16bit
        load_in_4bit = False,  # Push the full 16-bit model
        device_map = device,
    )

    repo_id = hf_username_model
    model.push_to_hub(repo_id, token=True) # token=True uses the logged-in token
    tokenizer.push_to_hub(repo_id, token=True)
    print(f"Model successfully pushed to: https://huggingface.co/{repo_id}")


# push 16bit model from memory - still needs to convert to GGUF
# this is the case where the model is still in memory
if PUSH_MERGED_FROM_MEMORY:
    # "merged_16bit" is the save_method spelling documented by Unsloth.
    model.push_to_hub_merged(hf_username_model, tokenizer, save_method = "merged_16bit", token=True)
    print(f"Model successfully pushed to: https://huggingface.co/{hf_username_model}")


print("---------Cell Done")