# Installation

In [None]:
# %%capture
import os
import warnings

!pip install --upgrade pip

!pip uninstall unsloth torch torchvision torchaudio xformers -y
!pip cache purge

!pip install torch==2.4.* torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124

# !pip install --no-deps bitsandbytes accelerate peft trl==0.15.2 triton cut_cross_entropy unsloth_zoo
# !pip install --no-deps xformers==0.0.28.post3

!pip install sentencepiece protobuf "datasets>=3.4.1" huggingface_hub hf_transfer wandb

#!pip install unsloth
#!pip install --upgrade --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git
!pip install "unsloth[cu124-ampere-torch240] @ git+https://github.com/unslothai/unsloth.git"



# Configuration

In [None]:
import getpass
import os

import wandb

# Name reported in the status messages of the export cells.
MODEL_NAME = "somosnlp-hackathon-2025/mistral-small-tortuga-galapagos"  # Change to your desired model name

# Set your tokens here.
# Later cells read os.environ['HF_TOKEN'] directly, so make sure it exists now
# instead of failing with a bare KeyError in the middle of the run. The token is
# requested interactively and never written into the notebook.
if not os.environ.get("HF_TOKEN"):
    os.environ["HF_TOKEN"] = getpass.getpass("Hugging Face token (HF_TOKEN): ")

# Login to Wandb
wandb.login()

# MODEL SETUP

In [None]:
from unsloth import FastLanguageModel
import torch

# Loading options; later cells reuse these names when reloading the adapters.
max_seq_length = 4096  # Choose any! We auto support RoPE Scaling internally!
dtype = None           # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True    # Use 4bit quantization to reduce memory usage. Can be False.

# Fetch the pre-quantized base checkpoint together with its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/mistral-7b-instruct-v0.3-bnb-4bit",  # Choose ANY! eg teknium/OpenHermes-2.5-Mistral-7B
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    token=os.environ['HF_TOKEN'],
)


# LORA CONFIGURATION

We now add LoRA adapters so we only need to update 1 to 10% of all parameters!

In [None]:
# Wrap the loaded base model with LoRA adapters on the attention and MLP projections.
model = FastLanguageModel.get_peft_model(
    model,
    r=32,  # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    lora_alpha=32,
    lora_dropout=0,  # Supports any, but = 0 is optimized
    bias="none",     # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing="unsloth",  # True or "unsloth" for very long context
    random_state=3407,
    use_rslora=False,   # We support rank stabilized LoRA
    loftq_config=None,  # And LoftQ
)


# DATA PREPARATION

In [None]:
import json
from unsloth import standardize_sharegpt
from unsloth.chat_templates import get_chat_template

# Switch the tokenizer to the ChatML template; the mapping names the message
# keys ("role", "content") and the role labels ("user", "assistant") in use.
tokenizer = get_chat_template(
    tokenizer,
    chat_template="chatml",  # Supports zephyr, chatml, mistral, llama, alpaca, vicuna, vicuna_old, unsloth
    mapping={
        "role": "role",
        "content": "content",
        "user": "user",
        "assistant": "assistant",
    },  # chatml style
    map_eos_token=True,  # Maps <|im_end|> to </s> instead
)

def formatting_prompts_func(examples):
    """Render every conversation of a batch into a chat-template string.

    Args:
        examples: Batch dict (as passed by ``Dataset.map(batched=True)``) whose
            ``"messages"`` entry is a list of conversations. A conversation is
            either a list of message dicts or a JSON string encoding one.

    Returns:
        ``{"text": [...]}`` holding exactly one string per input conversation.
        A conversation that is not a list of exactly two messages, or that
        fails to parse or format, yields ``""``. This keeps the output aligned
        with the input batch, which ``Dataset.map`` requires unless the input
        columns are removed in the same call. Callers should drop empty
        strings before training.
    """
    texts = []

    for index, convo in enumerate(examples["messages"]):
        text = ""  # placeholder kept when the conversation is skipped
        try:
            # Parse the JSON string to get the actual conversation list
            if isinstance(convo, str):
                parsed_convo = standardize_sharegpt(json.loads(convo))
            else:
                parsed_convo = convo

            # Only single-exchange conversations (two messages) are formatted;
            # longer or shorter ones are skipped.
            if isinstance(parsed_convo, list) and len(parsed_convo) == 2:
                text = tokenizer.apply_chat_template(
                    parsed_convo, tokenize=False, add_generation_prompt=False
                )
        except Exception as e:
            # Best effort: report the position of the bad row within the batch
            # and keep going instead of aborting the whole map.
            print(f"Error processing conversation {index}: {e}")
        texts.append(text)

    return {"text": texts}
pass


# from datasets import load_dataset
# dataset = load_dataset("somosnlp-hackathon-2025/Patrimonio-Gastronomico-Colombiano-Ecuatoriano", split = "train")
# dataset = dataset.map(formatting_prompts_func, batched = True,)

# Load one specific parquet file of the dataset
from datasets import load_dataset

print("Loading dataset...")
dataset = load_dataset(
    "somosnlp-hackathon-2025/Patrimonio-Gastronomico-Colombiano-Ecuatoriano",
    data_files="data/train-somosnpl-recetas-zero-v2.parquet",
    split = "train"
)

# Keep only the examples that are not tagged as multi-turn
dataset = dataset.filter(lambda example: example['metadata.type'] != 'multi_turn')

print(f"Dataset loaded with {len(dataset)} examples")
print("Sample data structure:")
print(dataset[5])

# Apply formatting. The original columns are removed in the same call: a batched
# map may only return a different number of rows than it received when the
# input columns are dropped, and only the "text" column is needed afterwards.
dataset = dataset.map(
    formatting_prompts_func,
    batched = True,
    remove_columns = dataset.column_names,
)

# Drop rows that ended up without any usable text
dataset = dataset.filter(lambda example: example["text"] is not None and len(example["text"].strip()) > 0)

# Let's see how the ChatML format works by printing one formatted element
print("\nFormatted conversation example:")
print(dataset[5])

# Split dataset (90% train, 10% validation)
dataset_split = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = dataset_split["train"]
eval_dataset = dataset_split["test"]

# TRAINING CONFIGURATION

>>> Now let's use Huggingface TRL's SFTTrainer! More docs here: TRL SFT docs. We do 60 steps to speed things up, but you can set num_train_epochs=1 for a full run, and turn off max_steps=None. We also support TRL's DPOTrainer!

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Initialize Wandb run
wandb.init(
    project="somosnpl",
    name="mistral-7b-tortuga-galapagos",
    config={
        "model": "mistral-7b-instruct",
        "dataset": "Patrimonio-Gastronomico-Colombiano-Ecuatoriano",
        "technique": "QLoRA",
        "max_seq_length": max_seq_length,
        # Must mirror the r / lora_alpha values passed to
        # FastLanguageModel.get_peft_model (both 32); keep them in sync so the
        # logged run config describes the adapter that is actually trained.
        "lora_r": 32,
        "lora_alpha": 32,
    }
)

# Mixed precision: bf16 when the GPU supports it, fp16 otherwise.
use_bf16 = is_bfloat16_supported()

training_args = TrainingArguments(
    # Batching / schedule
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,
    warmup_steps=100,
    num_train_epochs=2,  # full epochs; use max_steps = 60 instead for a quick run
    learning_rate=2e-4,
    lr_scheduler_type="cosine",  # linear/cosine

    # Precision / optimizer
    fp16=not use_bf16,
    bf16=use_bf16,
    optim="adamw_torch_fused",  # adamw_8bit / adamw_torch_fused
    weight_decay=0.01,
    max_grad_norm=1.0,

    # Bookkeeping
    seed=42,
    output_dir="outputs",
    report_to="wandb",  # Use this for WandB etc
    run_name="mistral-7b-tortuga-galapagos",

    # Logging
    logging_steps=25,
    logging_first_step=True,

    # Evaluation
    eval_strategy="steps",
    eval_steps=100,

    # Saving (step based; alternatively save_strategy = "epoch" with save_total_limit = 2)
    save_strategy="steps",
    save_steps=500,
    save_total_limit=3,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,

    # Performance
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},
    ddp_find_unused_parameters=False,

    # Data loading
    # dataloader_num_workers=4,
    # remove_unused_columns=False,
)

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=8,
    packing=True,  # Can make training 5x faster for short sequences.
    args=training_args,
)

### MEMORY STATS (PRE-TRAINING)

In [None]:
# Record the GPU's total memory and the amount already reserved before
# training, both in GiB; the post-training stats cell compares against these.
gpu_stats = torch.cuda.get_device_properties(0)
max_memory = round(gpu_stats.total_memory / 1024 ** 3, 3)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 ** 3, 3)

print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

# TRAINING

In [None]:
# Announce the start of the run, then launch fine-tuning. The value returned by
# trainer.train() is kept because the stats cell below reads
# trainer_stats.metrics['train_runtime'] from it.
print("Señores y señoras arranca el entrenamiento\nStart training...")
trainer_stats = trainer.train()

# MEMORY STATS (POST-TRAINING)

In [None]:
# @title Show final memory and time stats
# Training duration as reported by the trainer, in seconds and minutes.
train_seconds = trainer_stats.metrics['train_runtime']
train_minutes = round(train_seconds / 60, 2)

# Peak reserved GPU memory (GiB) overall, and the part added since the
# pre-training snapshot, each also expressed as a share of total GPU memory.
used_memory = round(torch.cuda.max_memory_reserved() / 1024 ** 3, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)

print(f"{train_seconds} seconds used for training.")
print(f"{train_minutes} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

# Log final stats to Wandb
final_stats = {
    "training_time_minutes": train_minutes,
    "peak_memory_gb": used_memory,
    "training_memory_gb": used_memory_for_lora,
    "memory_usage_percent": used_percentage,
}
wandb.log(final_stats)


# INFERENCE TEST
Let's run the model! Since we're using ChatML, use apply_chat_template with add_generation_prompt set to True for inference.

In [1]:
print("\n" + "=" * 50, "TESTING INFERENCE", "=" * 50, sep="\n")

from unsloth.chat_templates import get_chat_template

# Apply the ChatML template to the tokenizer for this inference run.
tokenizer = get_chat_template(
    tokenizer,
    chat_template="chatml",  # Supports zephyr, chatml, mistral, llama, alpaca, vicuna, vicuna_old, unsloth
    mapping={
        "role": "role",
        "content": "content",
        "user": "user",
        "assistant": "assistant",
    },  # chatml style
    map_eos_token=True,  # Maps <|im_end|> to </s> instead
)

FastLanguageModel.for_inference(model)  # Enable native 2x faster inference

# Test with a food-related question in Spanish
messages = [
    {
        "role": "user",
        "content": "¿Cuáles son los ingredientes principales del encebollado ecuatoriano?",
    },
]

# Tokenize the prompt and move it to the GPU.
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,  # Must add for generation
    return_tensors="pt",
).to("cuda")

# Stream the answer token by token instead of waiting for the full output.
from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)
print("Model response:")
_ = model.generate(
    input_ids=inputs,
    streamer=text_streamer,
    max_new_tokens=256,
    use_cache=True,
)

# Non-streaming alternative (waits for the whole generation):
# outputs = model.generate(input_ids = inputs, max_new_tokens = 64, use_cache = True)
# tokenizer.batch_decode(outputs)


TESTING INFERENCE


ModuleNotFoundError: No module named 'unsloth'

# SAVE TO HUGGING FACE - fine-tuning model

To save the final model as LoRA adapters, either use Huggingface's push_to_hub for an online save or save_pretrained for a local save.

[NOTE] This ONLY saves the LoRA adapters, and not the full model. To save to 16bit or GGUF, scroll down!

In [None]:
print("\n" + "=" * 50, "SAVING MODELS", "=" * 50, sep="\n")

# Write the model and the tokenizer into the same local directory.
for artifact in (model, tokenizer):
    artifact.save_pretrained("lora_model")  # Local saving



# Test the local lora model
Now if you want to load the LoRA adapters we just saved for inference, set False to True:

In [None]:
# Flip to True to reload the adapters saved in "lora_model" before testing.
if False:
    from unsloth import FastLanguageModel
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name="lora_model",  # YOUR MODEL YOU USED FOR TRAINING
        max_seq_length=max_seq_length,
        dtype=dtype,
        load_in_4bit=load_in_4bit,
    )
    FastLanguageModel.for_inference(model)  # Enable native 2x faster inference

messages = [
    {
        "role": "user",
        "content": "¿Cuáles son los ingredientes principales del Motepillo?",
    },
]

# Tokenize the prompt and move it to the GPU.
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,  # Must add for generation
    return_tensors="pt",
).to("cuda")

# Stream the generated answer as it is produced.
from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)
_ = model.generate(
    input_ids=inputs,
    streamer=text_streamer,
    max_new_tokens=4096,
    use_cache=True,
)

You can also use Hugging Face's AutoPeftModelForCausalLM. Only use this if you do not have unsloth installed. It can be hopelessly slow, since 4bit model downloading is not supported, and Unsloth's inference is 2x faster.

In [None]:
# Fallback loader for environments without Unsloth; disabled by default.
if False:
    # I highly do NOT suggest - use Unsloth if possible
    # PEFT exports this class as AutoPeftModelForCausalLM; the previously used
    # name AutoModelForPeftCausalLM does not exist and fails at import time.
    from peft import AutoPeftModelForCausalLM
    from transformers import AutoTokenizer

    model = AutoPeftModelForCausalLM.from_pretrained(
        "lora_model",  # YOUR MODEL YOU USED FOR TRAINING
        load_in_4bit=load_in_4bit,
    )
    tokenizer = AutoTokenizer.from_pretrained("lora_model")

# Saving to float16 for VLLM

We also support saving to float16 directly. Select merged_16bit for float16 or merged_4bit for int4. We also allow lora adapters as a fallback. Use push_to_hub_merged to upload to your Hugging Face account! You can go to https://huggingface.co/settings/tokens for your personal tokens.

In [None]:
# Every export below is an opt-in toggle: change an `if False:` to `if True:`
# to run it. Status messages are printed only by exports that actually ran and
# name the repository that was really written, so the cell output never claims
# a save that did not happen.
merged_16bit_repo = "somosnlp-hackathon-2025/mistral-7B-ec-es-recetas"
merged_4bit_repo = "somosnlp-hackathon-2025/mistral-7B-ec-es-recetas-4bit"
lora_repo = "somosnlp-hackathon-2025/mistral-7B-ec-es-recetas-LoRA"

# Merge to 16bit
if False:
    print("Saving merged model in float16...")
    model.save_pretrained_merged(merged_16bit_repo, tokenizer, save_method = "merged_16bit",)
    print(f"✅ Float16 model saved to: {merged_16bit_repo}")
if False:
    model.push_to_hub_merged(merged_16bit_repo, tokenizer, save_method = "merged_16bit", token = os.environ['HF_TOKEN'])
    print(f"✅ Float16 model pushed to: {merged_16bit_repo}")

# Merge to 4bit
if False:
    model.save_pretrained_merged(merged_4bit_repo, tokenizer, save_method = "merged_4bit",)
    print(f"✅ 4bit model saved to: {merged_4bit_repo}")
if False:
    model.push_to_hub_merged(merged_4bit_repo, tokenizer, save_method = "merged_4bit", token = os.environ['HF_TOKEN'])
    print(f"✅ 4bit model pushed to: {merged_4bit_repo}")

# Just LoRA adapters
if False:
    print("Saving LoRA adapters...")
    model.save_pretrained_merged(lora_repo, tokenizer, save_method = "lora",)
    print(f"✅ LoRA adapters saved to: {lora_repo}")
if False:
    model.push_to_hub_merged(lora_repo, tokenizer, save_method = "lora", token = os.environ['HF_TOKEN'])
    print(f"✅ LoRA adapters pushed to: {lora_repo}")

# GGUF / llama.cpp Conversion

To save to GGUF / llama.cpp, we support it natively now! We clone llama.cpp and we default save it to q8_0. We allow all methods like q4_k_m. Use save_pretrained_gguf for local saving and push_to_hub_gguf for uploading to HF.

Some supported quant methods (full list on our Wiki page):

    q8_0 - Fast conversion. High resource use, but generally acceptable.
    q4_k_m - Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q4_K.
    q5_k_m - Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q5_K.


In [None]:
# All GGUF exports target the same repository / local directory name.
gguf_repo = "somosnlp-hackathon-2025/mistral-7B-ec-es-recetas-GGUF"

# Save to 16bit GGUF (enabled)
if True:
    model.save_pretrained_gguf(gguf_repo, tokenizer, quantization_method="f16")
if True:
    model.push_to_hub_gguf(
        gguf_repo,
        tokenizer,
        quantization_method="f16",
        token=os.environ['HF_TOKEN'],
    )

# Save to 8bit Q8_0 (no quantization_method given, so the library default applies)
if False:
    model.save_pretrained_gguf(gguf_repo, tokenizer)
if False:
    model.push_to_hub_gguf(gguf_repo, tokenizer, token=os.environ['HF_TOKEN'])

# Save to q4_k_m GGUF
if False:
    model.save_pretrained_gguf(gguf_repo, tokenizer, quantization_method="q4_k_m")
if False:
    model.push_to_hub_gguf(
        gguf_repo,
        tokenizer,
        quantization_method="q4_k_m",
        token=os.environ['HF_TOKEN'],
    )