# Installation

In [None]:
%%capture
# Environment setup for the whole notebook. %%capture suppresses this cell's (verbose) pip output.
import os
import warnings

# Start clean: upgrade pip, uninstall existing builds of the packages pinned below, and empty pip's cache.
!pip install --upgrade pip
!pip uninstall unsloth xformers torch torchvision torchaudio -y
!pip cache purge

# PyTorch stack, pulled from the CUDA 12.4 wheel index.
!pip install torch==2.4.1 torchvision==0.19.1 torchaudio==2.4.1 --index-url https://download.pytorch.org/whl/cu124

# Tokenizer, dataset, Hub and experiment-tracking dependencies.
!pip install sentencepiece protobuf "datasets>=3.4.1" huggingface_hub hf_transfer wandb

# Pinned transformers and xformers releases.
!pip install transformers==4.51.3 xformers==0.0.28.post1

# NOTE(review): this extra is named "torch240" while torch 2.4.1 is pinned above — confirm it is the intended extra.
!pip install unsloth[cu124-ampere-torch240]



# Configuration

In [None]:
%%capture
# Hub identifiers used by the rest of the notebook: source dataset, base model,
# run/model name, and the repository the fine-tuned model is pushed to.
# NOTE(review): no token is assigned in this cell. Later cells read
# os.environ['HF_TOKEN'], so it must already be set in the environment.
DATASET_REPO = "somosnlp-hackathon-2025/gastronomia-hispana-dpo"
MODEL_BASE = "unsloth/Qwen3-8B-unsloth-bnb-4bit"
MODEL_NAME = "Qwen3-8B-gastronomia-hispana-dpo"
MODEL_REPO = "somosnlp-hackathon-2025/Qwen3-8B-gastronomia-hispana-dpo"

# Login to Wandb
# NOTE(review): %%capture suppresses this cell's output — confirm the wandb login prompt is still usable.
import wandb
wandb.login()

# DPO Trainer patch

In [None]:
# %%capture
# One must patch the DPO Trainer first!
# PatchDPOTrainer is called here, before any trainer is imported or built in the cells below.
# import unsloth
from unsloth import PatchDPOTrainer

PatchDPOTrainer()

# Disabled compatibility shim that would define transformers' ALL_PARALLEL_STYLES when it is missing or None:
# from transformers import modeling_utils
# if not hasattr(modeling_utils, "ALL_PARALLEL_STYLES") or modeling_utils.ALL_PARALLEL_STYLES is None:
#     modeling_utils.ALL_PARALLEL_STYLES = ["tp", "none","colwise",'rowwise']

# MODEL SETUP

In [None]:
from unsloth import FastLanguageModel
import torch

# Loading options. These three knobs are the single place to change how the
# base model is loaded; the call below reads all of them.
max_seq_length = 2500 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# Load the base model and its tokenizer.
# HF_TOKEN must be present in the environment (a missing token raises KeyError here,
# before any expensive work is done).
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = MODEL_BASE, # Choose ANY! eg teknium/OpenHermes-2.5-Mistral-7B
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit, # previously a literal True, which silently ignored the knob above
    load_in_8bit = False,
    token = os.environ['HF_TOKEN'],
)


# DATA PREPARATION

In [None]:
import json
from unsloth import standardize_sharegpt
from unsloth.chat_templates import get_chat_template

# Configure tokenizer with ChatML template
# The returned tokenizer replaces the one loaded with the model; formatting_dpo_prompts_func
# below renders prompts through it via tokenizer.apply_chat_template.
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "chatml", # Supports zephyr, chatml, mistral, llama, alpaca, vicuna, vicuna_old, unsloth
    mapping = {"role" : "role", "content" : "content", "user" : "user", "assistant" : "assistant"}, # chatml style
    map_eos_token = True, # Maps <|im_end|> to </s> instead
)

def formatting_dpo_prompts_func(examples):
    """
    Convert a batch of paired conversations into the standard DPO layout.

    ``examples["chosen"]`` and ``examples["rejected"]`` are read as parallel
    lists of conversations (sequences of ``{"role", "content"}`` messages).
    For each pair, every message of the chosen conversation except the last is
    rendered with the module-level ``tokenizer`` chat template (generation
    prompt included) to form the prompt; the content of the last message of
    each conversation becomes the chosen / rejected text.

    A pair that is empty, has fewer than two messages, does not end with an
    assistant message, or raises while being processed yields three empty
    strings, so the output lists always stay aligned with the input batch.

    Returns:
        dict with the keys "prompt", "chosen" and "rejected", each a list of
        strings holding one entry per input pair.
    """
    blank_row = ("", "", "")

    def _convert(index, chosen_conv, rejected_conv):
        # Guard clauses: each rejected pair is reported and replaced by blanks.
        if not chosen_conv or not rejected_conv:
            print(f"Warning: Empty conversation at index {index}")
            return blank_row

        if len(chosen_conv) < 2 or len(rejected_conv) < 2:
            print(f"Warning: Too short conversation at index {index}")
            return blank_row

        # The prompt is built from the chosen conversation only
        # (everything before its final message).
        prompt_messages = chosen_conv[:-1]

        ends_with_assistant = (
            chosen_conv[-1].get("role") == "assistant"
            and rejected_conv[-1].get("role") == "assistant"
        )
        if not ends_with_assistant:
            print(f"Warning: Last message is not from assistant at index {index}")
            return blank_row

        chosen_reply = chosen_conv[-1]["content"]
        rejected_reply = rejected_conv[-1]["content"]

        rendered_prompt = tokenizer.apply_chat_template(
            prompt_messages,
            tokenize=False,
            add_generation_prompt=True,
        )
        return (rendered_prompt, chosen_reply, rejected_reply)

    chosen_batch = examples["chosen"]
    rejected_batch = examples["rejected"]

    print(f"Processing {len(chosen_batch)} examples")

    rows = []
    for index, (chosen_conv, rejected_conv) in enumerate(zip(chosen_batch, rejected_batch)):
        try:
            row = _convert(index, chosen_conv, rejected_conv)
        except Exception as e:
            # Best effort: report the offending pair and keep the batch aligned.
            print(f"Error processing conversation {index}: {e}")
            print(f"Chosen conv structure: {chosen_conv}")
            print(f"Rejected conv structure: {rejected_conv}")
            row = blank_row
        rows.append(row)

    prompts = [row[0] for row in rows]
    chosen_texts = [row[1] for row in rows]
    rejected_texts = [row[2] for row in rows]

    # Non-empty entries are counted as successfully processed.
    print(f"\nSuccessfully processed:")
    print(f"- Prompts: {sum(1 for text in prompts if text)}")
    print(f"- Chosen: {sum(1 for text in chosen_texts if text)}")
    print(f"- Rejected: {sum(1 for text in rejected_texts if text)}")

    return {
        "prompt": prompts,
        "chosen": chosen_texts,
        "rejected": rejected_texts,
    }

# Load the dataset
from datasets import load_dataset

print("Loading dataset...")
dataset = load_dataset(
    DATASET_REPO,
    split="train"
)

print(f"Dataset loaded with {len(dataset)} examples")
print("Sample data structure:")
print("Keys:", dataset.column_names)
print("\nSample chosen conversation:")
print(dataset[0]["chosen"])

# Formatting approach:

# DPO formatting (for preference training)
print("\n=== DPO FORMATTING ===")
dpo_dataset = dataset.map(formatting_dpo_prompts_func, batched=True)
# Remove original columns (everything except the three DPO fields)
columns_to_remove = [col for col in dataset.column_names if col not in ["prompt", "chosen", "rejected"]]
dpo_dataset = dpo_dataset.remove_columns(columns_to_remove)

# formatting_dpo_prompts_func emits empty-string placeholders for pairs it could
# not convert (to keep batch lengths aligned). A successfully rendered prompt is
# never empty, so an empty prompt identifies exactly those placeholders; drop
# them so they are not used as training or evaluation examples.
num_formatted = len(dpo_dataset)
dpo_dataset = dpo_dataset.filter(lambda row: row["prompt"] != "")
num_dropped = num_formatted - len(dpo_dataset)
if num_dropped:
    print(f"Dropped {num_dropped} malformed example(s) before training")
if len(dpo_dataset) == 0:
    raise ValueError("No valid DPO examples remain after formatting; check the dataset structure")

print("DPO formatted example:")
print("Prompt:", dpo_dataset[0]["prompt"][:200] + "...")
print("Chosen:", dpo_dataset[0]["chosen"][:200] + "...")
print("Rejected:", dpo_dataset[0]["rejected"][:200] + "...")

# Split datasets: 95% train / 5% eval, with a fixed seed for reproducibility
dpo_split = dpo_dataset.train_test_split(test_size=0.05, seed=42)

print(f"\nDataset splits created:")
print(f"DPO - Train: {len(dpo_split['train'])}, Eval: {len(dpo_split['test'])}")



# LORA CONFIGURATION

We now add LoRA adapters so we only need to update 1 to 10% of all parameters!

In [None]:
# Wrap the loaded model with LoRA adapters (this cell does not load a tokenizer).
# The rank/alpha values below are also written, as literals, into the W&B run config further down;
# keep the two in sync when changing them.
model = FastLanguageModel.get_peft_model(
    model,
    r = 64, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    # Attention and MLP projections, plus the output head ("lm_head").
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",
                      "lm_head"],
    lora_alpha = 64,
    lora_dropout = 0, # Currently only supports dropout = 0
    bias = "none",    # Currently only supports bias = "none"
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = True,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

# DPO TRAINING CONFIGURATION

Now let's use Hugging Face TRL's DPOTrainer! More docs here: TRL DPO docs. We train for 1 epoch on the 95% training split and evaluate on the held-out 5%.

In [None]:
# One must patch the DPO Trainer first!
# NOTE(review): PatchDPOTrainer() is also called in the "DPO Trainer patch" cell near the top
# of the notebook; this second call repeats it right before TRL is imported.
from unsloth import PatchDPOTrainer

PatchDPOTrainer()

In [None]:
from transformers import TrainingArguments
from trl import DPOTrainer, DPOConfig
from unsloth import is_bfloat16_supported

# Hyperparameters that are both logged to W&B and passed to the trainer.
# They are defined once so the logged run config cannot drift from what is
# actually trained (the logged values used to be 5e-7 / 1.5 / 0.5 while the
# trainer ran with 2e-7 / 1 / 0.8).
dpo_beta = 0.8
dpo_learning_rate = 2e-7
dpo_num_train_epochs = 1

# Initialize Wandb run
wandb.init(
    project="somosnpl",
    name=MODEL_NAME,
    config={
        "model": MODEL_BASE,
        "dataset": DATASET_REPO,
        "technique": "DPO",
        "max_length": max_seq_length,
        # Must match the r / lora_alpha passed to get_peft_model in the LoRA cell.
        "lora_r": 64,
        "lora_alpha": 64,
        "learning_rate": dpo_learning_rate,
        "num_epochs": dpo_num_train_epochs,
        "beta": dpo_beta,
    }
)

trainer = DPOTrainer(
    model = model,
    ref_model = None,  # no separate reference model is passed
    tokenizer = tokenizer,
    beta = dpo_beta,
    train_dataset=dpo_split['train'],
    eval_dataset=dpo_split['test'],
    max_length = max_seq_length,  # same limit the model was loaded with
    max_prompt_length = 350,
    dataset_num_proc = 4,
    # resume_from_checkpoint="outputs/checkpoint-200",
    args = DPOConfig(
        per_device_train_batch_size = 4,
        per_device_eval_batch_size = 4,
        gradient_accumulation_steps = 8,  # 4 x 8 = 32 examples per optimizer step per device
        warmup_ratio = 0.03,
        num_train_epochs = dpo_num_train_epochs,
        learning_rate = dpo_learning_rate,
        # Use bf16 where the GPU supports it, otherwise fall back to fp16.
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        optim = "adamw_8bit", # adamw_8bit / adamw_torch_fused
        weight_decay = 0.1,
        max_grad_norm=0.5,
        lr_scheduler_type = "cosine", # linear/cosine
        seed = 42,
        output_dir = "outputs",
        logging_dir = "outputs/logs",
        report_to = "wandb", # Use this for WandB etc
        run_name = MODEL_NAME,

        # Logging
        logging_steps = 20,
        logging_first_step=True,

        # Evaluation
        eval_strategy = "steps",
        eval_steps=50,

        # Saving (save_steps is a multiple of eval_steps, so every checkpoint has an eval_loss)
        save_strategy="steps",
        save_steps=100,
        save_total_limit=3,
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,

        # Performance
        gradient_checkpointing=True,
        gradient_checkpointing_kwargs={"use_reentrant": False},
        ddp_find_unused_parameters=False,

        # Data loading
        dataloader_num_workers=2,
        dataloader_pin_memory=True,
        remove_unused_columns=True,
        dataloader_drop_last = True,  # Consistent batch sizes
        # False keeps the full evaluation outputs, not just the loss.
        # NOTE(review): the previous comment said "Reduce eval memory usage", which
        # would correspond to True — confirm which behaviour is wanted.
        prediction_loss_only = False,

    ),
)


### MEMORY STATS (PRE-TRAINING)

In [None]:
# Baseline taken before training: total memory of GPU 0 and the peak memory
# reserved so far, both expressed in GiB (2**30 bytes) and rounded to 3 decimals.
# start_gpu_memory and max_memory are reused by the post-training stats cell.
gpu_stats = torch.cuda.get_device_properties(0)
max_memory = round(gpu_stats.total_memory / 2**30, 3)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 2**30, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

# TRAINING

In [None]:
# Run DPO training. The returned object is kept because the post-training stats
# cell reads trainer_stats.metrics['train_runtime'] from it.
print("Start training...")
trainer_stats = trainer.train()

In [None]:
# latest_checkpoint='./outputs/checkpoint-210'
# trainer_stats = trainer.train(resume_from_checkpoint=latest_checkpoint)

# MEMORY STATS (POST-TRAINING)

In [None]:
# @title Show final memory and time stats
# Training duration as reported by the trainer, in seconds and in minutes.
train_runtime_seconds = trainer_stats.metrics['train_runtime']
train_runtime_minutes = round(train_runtime_seconds / 60, 2)

# Peak reserved GPU memory (GiB), the part attributable to training
# (peak minus the pre-training baseline), and both as a share of total GPU memory.
used_memory = round(torch.cuda.max_memory_reserved() / 2**30, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)

report_lines = (
    f"{train_runtime_seconds} seconds used for training.",
    f"{train_runtime_minutes} minutes used for training.",
    f"Peak reserved memory = {used_memory} GB.",
    f"Peak reserved memory for training = {used_memory_for_lora} GB.",
    f"Peak reserved memory % of max memory = {used_percentage} %.",
    f"Peak reserved memory for training % of max memory = {lora_percentage} %.",
)
for report_line in report_lines:
    print(report_line)

# Log final stats to Wandb
final_stats = {
    "training_time_minutes": train_runtime_minutes,
    "peak_memory_gb": used_memory,
    "training_memory_gb": used_memory_for_lora,
    "memory_usage_percent": used_percentage,
}
wandb.log(final_stats)


# INFERENCE TEST
Let's run the model! Since we're using ChatML, use apply_chat_template with add_generation_prompt set to True for inference.

In [None]:
# # First, apply the compatibility fix that was in your original notebook
# from transformers import modeling_utils
# if not hasattr(modeling_utils, "ALL_PARALLEL_STYLES") or modeling_utils.ALL_PARALLEL_STYLES is None:
#     modeling_utils.ALL_PARALLEL_STYLES = ["tp", "none", "colwise", "rowwise"]


print("\n" + "="*50)
print("TESTING INFERENCE")
print("="*50)

from unsloth.chat_templates import get_chat_template

# Re-apply the ChatML template so this cell also works on a tokenizer that has
# not been through the data-preparation cell.
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "chatml", # Supports zephyr, chatml, mistral, llama, alpaca, vicuna, vicuna_old, unsloth
    mapping = {"role" : "role", "content" : "content", "user" : "user", "assistant" : "assistant"}, # chatml style
    map_eos_token = True, # Maps <|im_end|> to </s> instead
)

FastLanguageModel.for_inference(model) # Enable native 2x faster inference

# Test with a food-related question in Spanish.
# The prompt ends with Qwen3's "/no_think" soft switch. The previous literal
# "\no-think" was parsed by Python as a newline followed by "o-think", so the
# switch never reached the model.
# NOTE(review): if the training prompts were written with the old suffix, confirm
# which form the fine-tuned model should see.
messages = [
    # {"role": "system","content": "Eres un maestro culinario especializado en técnicas de cocción internacionales, con expertise en tiempos, temperaturas y métodos tradicionales de diversas culturas gastronómicas. responde siempre en español /no_think"},
    {"role": "user", "content": "¿Podrías explicarme paso a paso cómo preparar encebollado ecuatorianos? /no_think"},
]

inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True, # Must add for generation
    return_tensors = "pt",
).to("cuda")

# Use a TextStreamer for continuous inference
# so you can see the generation token by token
from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)
print("Model response:")
_ = model.generate(input_ids = inputs, streamer = text_streamer, max_new_tokens = 2048, use_cache = True)

# waiting the whole time!
# outputs = model.generate(input_ids = inputs, max_new_tokens = 64, use_cache = True)
# tokenizer.batch_decode(outputs)

# SAVE TO HUGGING FACE - fine-tuning model

To save the final model as LoRA adapters, either use Huggingface's push_to_hub for an online save or save_pretrained for a local save.

[NOTE] This ONLY saves the LoRA adapters, and not the full model. To save to 16bit or GGUF, scroll down!

In [None]:
print("\n" + "="*50)
print("SAVING MODELS")
print("="*50)

# Save the model and tokenizer locally
# Both are written to the same "lora_model" directory, which the next cells load
# from ("./lora_model").
model.save_pretrained("lora_model")  # Local saving
tokenizer.save_pretrained("lora_model")



# Test the local lora model
The cell below reloads the LoRA adapters we just saved and runs inference with them. To reuse the model that is already in memory instead, change `if True` to `if False`:

In [None]:
# Loading options for the reload below (same values as in the MODEL SETUP cell).
max_seq_length = 2500 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# When enabled, `model` and `tokenizer` are replaced by the versions reloaded from
# the local "./lora_model" directory; when disabled, the in-memory ones are used.
if True:
    from unsloth import FastLanguageModel
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = "./lora_model", # YOUR MODEL YOU USED FOR TRAINING
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
    )
    FastLanguageModel.for_inference(model) # Enable native 2x faster inference

# Single-turn test question.
messages = [
    {"role": "user", "content": "¿Cuáles son los ingredientes principales del Motepillo?"},
]

# Render the chat template to token ids and move them to the GPU.
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True, # Must add for generation
    return_tensors = "pt",
).to("cuda")

# Stream the generated tokens to the output as they are produced.
from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)
_ = model.generate(input_ids = inputs, streamer = text_streamer, max_new_tokens = 4096, use_cache = True)

You can also use Hugging Face PEFT's AutoPeftModelForCausalLM. Only use this if you do not have unsloth installed. It can be hopelessly slow, since 4bit model downloading is not supported, and Unsloth's inference is 2x faster.

In [None]:
# Optional fallback: load the saved adapters with plain PEFT instead of Unsloth.
# Disabled by default; flip the condition to True to use it.
if False:
    # I highly do NOT suggest - use Unsloth if possible
    # The PEFT auto class is named AutoPeftModelForCausalLM; the previous import
    # of "AutoModelForPeftCausalLM" would raise ImportError as soon as this
    # branch was enabled.
    from peft import AutoPeftModelForCausalLM
    from transformers import AutoTokenizer

    model = AutoPeftModelForCausalLM.from_pretrained(
        "lora_model",  # YOUR MODEL YOU USED FOR TRAINING
        load_in_4bit=load_in_4bit,
    )
    tokenizer = AutoTokenizer.from_pretrained("lora_model")

# Saving to float16 for VLLM

We also support saving to float16 directly. Select merged_16bit for float16 or merged_4bit for int4. We also allow lora adapters as a fallback. Use push_to_hub_merged to upload to your Hugging Face account! You can go to https://huggingface.co/settings/tokens for your personal tokens.

In [None]:
## Solution 1: Comprehensive DTensor Patch

# Add this code before trying to save the merged model
def patch_dtensor_comprehensive():
    """Work around missing-``DTensor`` errors when saving models with transformers.

    The patch does three things:

    1. Makes sure ``torch.distributed.tensor.DTensor`` resolves, installing a
       placeholder class only when no such attribute exists (a real ``DTensor``
       is never overwritten).
    2. Wraps ``pytorch_utils.id_tensor_storage`` so that an ``ImportError`` or
       ``NameError`` falls back to ``tensor.data_ptr()`` (or ``id(tensor)``).
    3. Wraps ``PreTrainedModel.save_pretrained`` so that ``modeling_utils.DTensor``
       is defined for the duration of the call; afterwards only that single
       attribute is removed again, the rest of the module is left untouched.

    Calling the function repeatedly is safe: wrappers that are already installed
    are not wrapped a second time. Any failure is reported on stdout (with a
    traceback) instead of being raised. Returns None.
    """
    try:
        import torch
        from transformers import pytorch_utils, modeling_utils
        import transformers
        import types

        # Create a dummy DTensor class
        class DummyDTensor:
            def __init__(self, *args, **kwargs):
                pass
            def to_local(self):
                return self

        # Ensure torch.distributed.tensor module exists
        if not hasattr(torch.distributed, 'tensor'):
            torch.distributed.tensor = types.ModuleType('tensor')

        # Install the placeholder only when torch does not provide a DTensor.
        if not hasattr(torch.distributed.tensor, 'DTensor'):
            torch.distributed.tensor.DTensor = DummyDTensor
        dtensor_cls = torch.distributed.tensor.DTensor

        # Patch id_tensor_storage function (once)
        if not getattr(pytorch_utils.id_tensor_storage, '_dtensor_patched', False):
            original_id_tensor_storage = pytorch_utils.id_tensor_storage

            def patched_id_tensor_storage(tensor):
                try:
                    return original_id_tensor_storage(tensor)
                except (ImportError, NameError):
                    if hasattr(tensor, 'data_ptr'):
                        return tensor.data_ptr()
                    else:
                        return id(tensor)

            patched_id_tensor_storage._dtensor_patched = True
            pytorch_utils.id_tensor_storage = patched_id_tensor_storage

        # Patch the save_pretrained method to handle DTensor references (once)
        if hasattr(modeling_utils, 'PreTrainedModel'):
            original_save_pretrained = modeling_utils.PreTrainedModel.save_pretrained

            if not getattr(original_save_pretrained, '_dtensor_patched', False):
                def patched_save_pretrained(self, *args, **kwargs):
                    # Provide DTensor only if the module lacks it, and undo exactly
                    # that afterwards. Clearing and restoring the whole module
                    # __dict__ would discard any other state set during the save.
                    injected = not hasattr(modeling_utils, 'DTensor')
                    if injected:
                        modeling_utils.DTensor = dtensor_cls

                    try:
                        return original_save_pretrained(self, *args, **kwargs)
                    finally:
                        if injected:
                            modeling_utils.__dict__.pop('DTensor', None)

                patched_save_pretrained._dtensor_patched = True
                modeling_utils.PreTrainedModel.save_pretrained = patched_save_pretrained

        # Also expose DTensor on the top-level package for code that looks it up there
        if not hasattr(transformers, 'DTensor'):
            transformers.DTensor = dtensor_cls

        print("✅ Comprehensive DTensor patch applied successfully")

    except Exception as e:
        print(f"⚠️ Comprehensive patching failed: {e}")
        import traceback
        traceback.print_exc()

# Apply the comprehensive patch
# patch_dtensor_comprehensive()

# Each branch saves a merged/exported copy locally and then pushes it to the Hub
# under a repository derived from MODEL_REPO. Toggle the conditions to choose
# which variants are produced. The success messages name the format and the
# repository that was actually written.

# Merge to 16bit
if False:
    print("Saving merged model in float16...")
    model.save_pretrained_merged(f"{MODEL_REPO}", tokenizer, save_method = "merged_16bit",)
    model.push_to_hub_merged(f"{MODEL_REPO}", tokenizer, save_method = "merged_16bit", token = os.environ['HF_TOKEN'])
    print(f"✅ Float16 model saved to: {MODEL_REPO}")

# Merge to 4bit
if True:
    print("Saving merged model in 4bit...")
    model.save_pretrained_merged(f"{MODEL_REPO}-4bit", tokenizer, save_method = "merged_4bit_forced",)
    model.push_to_hub_merged(f"{MODEL_REPO}-4bit", tokenizer, save_method = "merged_4bit_forced", token = os.environ['HF_TOKEN'])
    print(f"✅ 4bit model saved to: {MODEL_REPO}-4bit")

# Just LoRA adapters
if False:
    print("Saving LoRA adapters...")
    model.save_pretrained_merged(f"{MODEL_REPO}-LoRA", tokenizer, save_method = "lora",)
    model.push_to_hub_merged(f"{MODEL_REPO}-LoRA", tokenizer, save_method = "lora", token = os.environ['HF_TOKEN'])
    print(f"✅ LoRA adapters saved to: {MODEL_REPO}-LoRA")

# GGUF / llama.cpp Conversion

To save to GGUF / llama.cpp, we support it natively now! We clone llama.cpp and we default save it to q8_0. We allow all methods like q4_k_m. Use save_pretrained_gguf for local saving and push_to_hub_gguf for uploading to HF.

Some supported quant methods (full list on our Wiki page):

    q8_0 - Fast conversion. High resource use, but generally acceptable.
    q4_k_m - Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q4_K.
    q5_k_m - Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q5_K.


In [None]:
# Every GGUF variant is exported to the "-GGUF" sibling of MODEL_REPO. The
# disabled q8_0 / q4_k_m branches previously pointed at a repository from a
# different project ("mistral-7B-ec-es-recetas-GGUF").

# Save to 16bit GGUF
if True: model.save_pretrained_gguf(f"{MODEL_REPO}-GGUF", tokenizer, quantization_method = "f16")
if True: model.push_to_hub_gguf(f"{MODEL_REPO}-GGUF", tokenizer, quantization_method = "f16", token = os.environ['HF_TOKEN'])

# Save to 8bit Q8_0 (no quantization_method is passed, so the library default is used)
if False: model.save_pretrained_gguf(f"{MODEL_REPO}-GGUF", tokenizer,)
if False: model.push_to_hub_gguf(f"{MODEL_REPO}-GGUF", tokenizer, token = os.environ['HF_TOKEN'])

# Save to q4_k_m GGUF
if False: model.save_pretrained_gguf(f"{MODEL_REPO}-GGUF", tokenizer, quantization_method = "q4_k_m")
if False: model.push_to_hub_gguf(f"{MODEL_REPO}-GGUF", tokenizer, quantization_method = "q4_k_m", token = os.environ['HF_TOKEN'])

In [None]:
import os
import torch
import shutil
from unsloth import FastLanguageModel
from unsloth.chat_templates import get_chat_template

# Configuration (adjust these to match your setup)
# MODEL_BASE and MODEL_REPO repeat the values from the Configuration cell at the top of
# the notebook; LORA_PATH is the directory the adapters were saved to ("lora_model").
# NOTE(review): MODEL_BASE is not referenced again in this cell.
MODEL_BASE = "unsloth/Qwen3-8B-unsloth-bnb-4bit"
MODEL_REPO = "somosnlp-hackathon-2025/Qwen3-8B-gastronomia-hispana-dpo"
LORA_PATH = "./lora_model"

def check_disk_space(path="/data"):
    """Print disk usage for the filesystem containing ``path`` and return the free space.

    Args:
        path: Any path on the filesystem to inspect. Defaults to "/data", the
            location this function always checked before the parameter existed.

    Returns:
        Free space in whole GiB (integer, rounded down).

    Raises:
        OSError: if ``path`` does not exist (raised by ``shutil.disk_usage``).
    """
    bytes_per_gb = 1024**3
    total, used, free = shutil.disk_usage(path)
    print(f"💾 Disk Space - Total: {total//bytes_per_gb}GB, Used: {used//bytes_per_gb}GB, Free: {free//bytes_per_gb}GB")
    return free // bytes_per_gb  # Return free space in GB

def comprehensive_dtensor_patch():
    """Fix DTensor issues for model saving.

    The patch does three things:

    1. Makes sure ``torch.distributed.tensor.DTensor`` resolves, installing a
       placeholder class only when no such attribute exists (a real ``DTensor``
       is never overwritten).
    2. Wraps ``pytorch_utils.id_tensor_storage`` so that an ``ImportError`` or
       ``NameError`` falls back to ``tensor.data_ptr()`` (or ``id(tensor)``).
    3. Wraps ``PreTrainedModel.save_pretrained`` so that ``modeling_utils.DTensor``
       is defined for the duration of the call; afterwards only that single
       attribute is removed again, the rest of the module is left untouched.

    Calling the function repeatedly is safe: wrappers that are already installed
    are not wrapped a second time. Any failure is reported on stdout instead of
    being raised. Returns None.
    """
    try:
        import torch
        from transformers import pytorch_utils, modeling_utils
        import transformers
        import types

        # Create dummy DTensor class
        class DummyDTensor:
            def __init__(self, *args, **kwargs):
                pass
            def to_local(self):
                return self

        # Ensure torch.distributed.tensor module exists
        if not hasattr(torch.distributed, 'tensor'):
            torch.distributed.tensor = types.ModuleType('tensor')

        # Install the placeholder only when torch does not provide a DTensor.
        if not hasattr(torch.distributed.tensor, 'DTensor'):
            torch.distributed.tensor.DTensor = DummyDTensor
        dtensor_cls = torch.distributed.tensor.DTensor

        # Patch id_tensor_storage function (once)
        if not getattr(pytorch_utils.id_tensor_storage, '_dtensor_patched', False):
            original_id_tensor_storage = pytorch_utils.id_tensor_storage

            def patched_id_tensor_storage(tensor):
                try:
                    return original_id_tensor_storage(tensor)
                except (ImportError, NameError):
                    if hasattr(tensor, 'data_ptr'):
                        return tensor.data_ptr()
                    else:
                        return id(tensor)

            patched_id_tensor_storage._dtensor_patched = True
            pytorch_utils.id_tensor_storage = patched_id_tensor_storage

        # Patch save_pretrained method (once)
        if hasattr(modeling_utils, 'PreTrainedModel'):
            original_save_pretrained = modeling_utils.PreTrainedModel.save_pretrained

            if not getattr(original_save_pretrained, '_dtensor_patched', False):
                def patched_save_pretrained(self, *args, **kwargs):
                    # Provide DTensor only if the module lacks it, and undo exactly
                    # that afterwards. Clearing and restoring the whole module
                    # __dict__ would discard any other state set during the save.
                    injected = not hasattr(modeling_utils, 'DTensor')
                    if injected:
                        modeling_utils.DTensor = dtensor_cls

                    try:
                        return original_save_pretrained(self, *args, **kwargs)
                    finally:
                        if injected:
                            modeling_utils.__dict__.pop('DTensor', None)

                patched_save_pretrained._dtensor_patched = True
                modeling_utils.PreTrainedModel.save_pretrained = patched_save_pretrained

        # Also expose DTensor on the top-level package for code that looks it up there
        if not hasattr(transformers, 'DTensor'):
            transformers.DTensor = dtensor_cls

        print("✅ DTensor patch applied successfully")

    except Exception as e:
        print(f"⚠️ DTensor patching failed: {e}")


# Standalone flow: reload the saved LoRA model from LORA_PATH, re-apply the ChatML
# template, and push a 16-bit merged model to MODEL_REPO on the Hub.
# NOTE(review): comprehensive_dtensor_patch() is defined above but is not called
# anywhere in this cell — confirm whether it is still needed.
print("🎯 BFloat-16 Model Creation with Space Optimization")
print("=" * 60)
    

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=LORA_PATH,
    max_seq_length=2500,
    dtype=None,  # Auto-detect
    load_in_4bit=True,  # Load in 4bit to save memory
)

# Configure tokenizer
tokenizer = get_chat_template(
    tokenizer,
    chat_template="chatml",
    mapping={"role": "role", "content": "content", "user": "user", "assistant": "assistant"},
    map_eos_token=True,
)

print("✅ Model loaded successfully")
# check_disk_space()

print("\n💾 Creating BFloat-16 merged model...")

# Option 1: Save locally first (if you have space)
# NOTE(review): this disabled block reads `free_space`, which is not assigned anywhere
# in this cell; it would need `free_space = check_disk_space()` before being re-enabled.
# local_save = True
# if local_save and free_space >= 18:
#     print("Saving locally first...")
#     model.save_pretrained_merged(
#         f"{MODEL_REPO}-local", 
#         tokenizer, 
#         save_method="merged_16bit"
#     )
#     print("✅ Local BFloat-16 model saved")
#     check_disk_space()

# Option 2: Push directly to HuggingFace (recommended for space saving)
# The token is read with os.environ.get, so a missing HF_TOKEN is passed on as None
# rather than raising KeyError here.
print("\n🚀 Uploading to HuggingFace...")
model.push_to_hub_merged(
    MODEL_REPO, 
    tokenizer, 
    save_method="merged_16bit", 
    token=os.environ.get('HF_TOKEN')
)