In [None]:
import torch
import gc
from pathlib import Path
from datetime import datetime
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    BitsAndBytesConfig,
    EarlyStoppingCallback
)
from datasets import load_dataset
from trl import SFTTrainer
from peft import LoraConfig, prepare_model_for_kbit_training


class Config:
    """Static settings for the Phi-3-Mini LoRA fine-tuning run.

    All values are plain class attributes; the module-level ``config``
    instance below is what the rest of the script reads.
    """

    # Base checkpoint used for both the tokenizer and the model.
    MODEL_NAME: str = "microsoft/Phi-3-mini-4k-instruct"

    # Dataset files (JSON lines) and the directory receiving checkpoints/logs.
    TRAIN_FILE: str = "./hybrid_training_data/train.jsonl"
    VAL_FILE: str = "./hybrid_training_data/validation.jsonl"
    OUTPUT_DIR: str = "./phi3_hybrid_model"

    # Optimisation schedule.
    NUM_EPOCHS: int = 3
    BATCH_SIZE: int = 2  # per-device size, used for training and evaluation
    GRAD_ACCUM_STEPS: int = 8  # effective batch = BATCH_SIZE * GRAD_ACCUM_STEPS
    MAX_SEQ_LENGTH: int = 512
    LEARNING_RATE: float = 3e-5

    # LoRA adapter hyper-parameters (rank, scaling factor, dropout).
    LORA_R: int = 8
    LORA_ALPHA: int = 16
    LORA_DROPOUT: float = 0.1

    # Evaluations without improvement tolerated before training stops.
    EARLY_STOPPING_PATIENCE: int = 2

    # NOTE(review): not referenced anywhere in the visible code.
    GPU_MEMORY: str = "4.5GB"


config = Config()

def cleanup():
    """Run the garbage collector and release cached CUDA memory.

    When a CUDA device is available, also waits for outstanding GPU work
    to finish before returning.
    """
    gc.collect()
    torch.cuda.empty_cache()

    if not torch.cuda.is_available():
        return
    torch.cuda.synchronize()

def setup_model_and_tokenizer():
    """Load the tokenizer and the 4-bit quantized base model.

    Both are loaded from ``config.MODEL_NAME`` with remote code enabled.
    The model is quantized to NF4 with double quantization and bfloat16
    compute, and its KV cache is disabled.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    print("Cargando Phi-3-Mini...")

    checkpoint = config.MODEL_NAME

    tokenizer = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)
    # NOTE(review): padding reuses the EOS token; confirm the data collator's
    # pad masking does not also hide genuine EOS tokens from the loss.
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"

    # 4-bit NF4 weights, double quantization, bfloat16 compute.
    quantization = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=True,
    )

    model = AutoModelForCausalLM.from_pretrained(
        checkpoint,
        quantization_config=quantization,
        device_map="auto",
        torch_dtype=torch.bfloat16,
        trust_remote_code=True,
    )
    model.config.use_cache = False

    print(" Modelo cargado")
    allocated_gb = torch.cuda.memory_allocated(0) / 1024**3
    print(f"VRAM usada: {allocated_gb:.2f}GB")

    return model, tokenizer

def prepare_lora(model):
    """Prepare a quantized model for k-bit training and attach LoRA adapters.

    Args:
        model: The quantized causal LM returned by
            ``setup_model_and_tokenizer``.

    Returns:
        The PEFT-wrapped model with LoRA adapters on the attention and MLP
        projection layers.
    """
    print("\nPreparando LoRA...")

    model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)

    lora_config = LoraConfig(
        r=config.LORA_R,
        lora_alpha=config.LORA_ALPHA,
        # Phi-3 fuses its projections: attention uses a single `qkv_proj`
        # and the MLP a single `gate_up_proj`. The LLaMA-style names
        # (q_proj/k_proj/v_proj/gate_proj/up_proj) match no module in this
        # architecture, so with them only o_proj and down_proj were adapted.
        # Revisit this list if config.MODEL_NAME changes architecture.
        target_modules=["qkv_proj", "o_proj", "gate_up_proj", "down_proj"],
        lora_dropout=config.LORA_DROPOUT,
        bias="none",
        task_type="CAUSAL_LM"
    )

    from peft import get_peft_model
    model = get_peft_model(model, lora_config)

    print(" LoRA configurado")
    model.print_trainable_parameters()

    return model

def _render_conversation(turns):
    """Render one conversation (a list of role/content dicts) as a prompt string.

    Only the last message seen for each of the ``system``, ``user`` and
    ``assistant`` roles is kept; a role that never appears renders as an
    empty string, and messages with any other role are ignored.
    """
    latest = {"system": "", "user": "", "assistant": ""}
    for turn in turns:
        role = turn['role']
        if role in latest:
            latest[role] = turn['content']

    return (
        f"<|system|>\n{latest['system']}<|end|>\n"
        f"<|user|>\n{latest['user']}<|end|>\n"
        f"<|assistant|>\n{latest['assistant']}<|end|>"
    )


def formatting_func(example):
    """Convert dataset rows into training strings.

    Accepts either a single example (``example['messages']`` is a list of
    message dicts) or a batch (``example['messages']`` is a list of such
    lists).

    Args:
        example: Mapping with a ``'messages'`` entry.

    Returns:
        A list of formatted strings: one per conversation for a batch, a
        one-element list for a single example, and an empty list when
        ``messages`` is empty.
    """
    messages = example['messages']

    # An empty batch previously raised IndexError on messages[0].
    if not messages:
        return []

    if isinstance(messages[0], list):
        return [_render_conversation(conversation) for conversation in messages]

    return [_render_conversation(messages)]

def main():
    """Run the full fine-tuning pipeline.

    Steps: free memory, verify that both dataset files exist, load the
    datasets, load the quantized model with LoRA adapters, build the
    ``SFTTrainer`` with early stopping, train, and save the adapter and
    tokenizer under ``<OUTPUT_DIR>/final_adapter``.

    A ``KeyboardInterrupt`` during training saves an emergency checkpoint
    under ``<OUTPUT_DIR>/emergency_checkpoint``. Memory cleanup runs in
    all cases once training has been attempted.
    """
    print("="*70)
    print("FINE-TUNING HÍBRIDO PHI-3-MINI")
    print("="*70)

    cleanup()

    # Both files are loaded below, so verify both up front instead of
    # failing later inside load_dataset with a less helpful error.
    missing_files = [
        path for path in (config.TRAIN_FILE, config.VAL_FILE)
        if not Path(path).exists()
    ]
    if missing_files:
        for path in missing_files:
            print(f" No se encuentra: {path}")
        print("Ejecuta primero generate_hybrid_dataset.py")
        return

    print("\nCargando datasets...")
    # A single JSON data file is exposed by `datasets` under the 'train'
    # split, which is why the validation file also uses split='train'.
    train_dataset = load_dataset('json', data_files=config.TRAIN_FILE, split='train')
    val_dataset = load_dataset('json', data_files=config.VAL_FILE, split='train')

    print(f" Train: {len(train_dataset)} ejemplos")
    print(f" Val: {len(val_dataset)} ejemplos")

    model, tokenizer = setup_model_and_tokenizer()
    model = prepare_lora(model)

    print("\nConfigurando entrenamiento...")

    training_args = TrainingArguments(
        output_dir=config.OUTPUT_DIR,
        num_train_epochs=config.NUM_EPOCHS,
        per_device_train_batch_size=config.BATCH_SIZE,
        per_device_eval_batch_size=config.BATCH_SIZE,
        gradient_accumulation_steps=config.GRAD_ACCUM_STEPS,
        gradient_checkpointing=True,
        optim="paged_adamw_8bit",
        learning_rate=config.LEARNING_RATE,
        weight_decay=0.01,
        max_grad_norm=0.3,
        bf16=True,
        warmup_ratio=0.03,
        lr_scheduler_type="cosine",

        logging_steps=10,
        logging_dir=f"{config.OUTPUT_DIR}/logs",
        logging_first_step=True,

        # Save and evaluate on the same cadence so the best checkpoint
        # can be restored at the end of training.
        # NOTE(review): with short runs only a couple of evaluations occur,
        # so early stopping may never get the chance to trigger.
        save_strategy="steps",
        save_steps=100,
        save_total_limit=2,

        eval_strategy="steps",
        eval_steps=100,

        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,

        dataloader_num_workers=0,
        dataloader_pin_memory=False,
        remove_unused_columns=True,

        seed=42,
        report_to="none"
    )

    print(" Training args configurados")

    print("\nCreando SFTTrainer...")

    trainer = SFTTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        formatting_func=formatting_func,
        max_seq_length=config.MAX_SEQ_LENGTH,
        tokenizer=tokenizer,
        callbacks=[
            EarlyStoppingCallback(
                early_stopping_patience=config.EARLY_STOPPING_PATIENCE,
                early_stopping_threshold=0.001
            )
        ]
    )

    print(" Trainer creado")
    print(f"VRAM total: {torch.cuda.memory_allocated(0) / 1024**3:.2f}GB\n")

    print("="*70)
    print("INICIANDO ENTRENAMIENTO")
    print("="*70)
    print(f"Épocas: {config.NUM_EPOCHS}")
    print(f"Batch efectivo: {config.BATCH_SIZE * config.GRAD_ACCUM_STEPS}")
    print(f"Learning rate: {config.LEARNING_RATE}")
    print(f"Early stopping patience: {config.EARLY_STOPPING_PATIENCE}")
    print("="*70 + "\n")

    try:
        trainer.train()

        print("\n" + "="*70)
        print("ENTRENAMIENTO COMPLETADO")
        print("="*70)

        output_path = Path(config.OUTPUT_DIR) / "final_adapter"
        trainer.model.save_pretrained(output_path)
        tokenizer.save_pretrained(output_path)

        print(f"\n Modelo guardado en: {output_path}")
        print(f"VRAM final: {torch.cuda.memory_allocated(0) / 1024**3:.2f}GB")

        # Report the epochs actually trained: early stopping can end the
        # run before config.NUM_EPOCHS is reached.
        completed_epochs = trainer.state.epoch or 0.0

        print(f"\n Estadísticas:")
        print(f"   Épocas completadas: {completed_epochs:.2f} de {config.NUM_EPOCHS}")
        print(f"   Ejemplos train: {len(train_dataset)}")
        print(f"   Ejemplos val: {len(val_dataset)}")

        print(f"\n💡 SIGUIENTE PASO:")
        print(f"   Ejecuta inference_hybrid.py para probar el sistema completo")

    except KeyboardInterrupt:
        print("\n Entrenamiento interrumpido")
        print("Guardando checkpoint de emergencia...")
        trainer.save_model(f"{config.OUTPUT_DIR}/emergency_checkpoint")
        print(" Checkpoint guardado")

    finally:
        cleanup()
        print("\n Limpieza completada")

# Start the fine-tuning pipeline only when this module is executed directly.
if __name__ == "__main__":
    main()


FINE-TUNING HÍBRIDO PHI-3-MINI

Cargando datasets...
✅ Train: 1109 ejemplos
✅ Val: 196 ejemplos
Cargando Phi-3-Mini...


`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

✅ Modelo cargado
VRAM usada: 2.11GB

Preparando LoRA...
✅ LoRA configurado
trainable params: 4,456,448 || all params: 3,825,536,000 || trainable%: 0.1165

Configurando entrenamiento...
✅ Training args configurados

Creando SFTTrainer...



Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/196 [00:00<?, ? examples/s]

  super().__init__(


✅ Trainer creado
VRAM total: 2.49GB

INICIANDO ENTRENAMIENTO
Épocas: 3
Batch efectivo: 16
Learning rate: 3e-05
Early stopping patience: 2



  0%|          | 0/207 [00:00<?, ?it/s]

You are not running the flash-attention implementation, expect numerical differences.


{'loss': 18.4013, 'grad_norm': 3.1231167316436768, 'learning_rate': 4.2857142857142855e-06, 'epoch': 0.01}
{'loss': 18.909, 'grad_norm': 4.985228061676025, 'learning_rate': 2.9983348124429553e-05, 'epoch': 0.14}
{'loss': 18.1195, 'grad_norm': 5.14055871963501, 'learning_rate': 2.9688342159326487e-05, 'epoch': 0.29}
{'loss': 16.8247, 'grad_norm': 5.305100440979004, 'learning_rate': 2.903166046244801e-05, 'epoch': 0.43}
{'loss': 15.5561, 'grad_norm': 5.1938018798828125, 'learning_rate': 2.8029472716572872e-05, 'epoch': 0.58}
{'loss': 14.3555, 'grad_norm': 6.346128463745117, 'learning_rate': 2.6706456110074946e-05, 'epoch': 0.72}
{'loss': 13.2789, 'grad_norm': 6.525103569030762, 'learning_rate': 2.50951877026466e-05, 'epoch': 0.86}
{'loss': 11.7022, 'grad_norm': 7.763555526733398, 'learning_rate': 2.3235342269971978e-05, 'epoch': 1.01}
{'loss': 10.2984, 'grad_norm': 11.638710975646973, 'learning_rate': 2.1172715379076635e-05, 'epoch': 1.15}
{'loss': 9.4233, 'grad_norm': 6.23150110244751, 

  0%|          | 0/98 [00:00<?, ?it/s]

{'eval_loss': 1.0598777532577515, 'eval_runtime': 89.3787, 'eval_samples_per_second': 2.193, 'eval_steps_per_second': 1.096, 'epoch': 1.44}




{'loss': 8.3283, 'grad_norm': 3.3616693019866943, 'learning_rate': 1.4293403239355362e-05, 'epoch': 1.59}
{'loss': 8.0329, 'grad_norm': 3.8594794273376465, 'learning_rate': 1.1958190569652318e-05, 'epoch': 1.73}
{'loss': 7.8571, 'grad_norm': 3.680961847305298, 'learning_rate': 9.697877343311145e-06, 'epoch': 1.87}
{'loss': 7.6637, 'grad_norm': 3.1424615383148193, 'learning_rate': 7.568119973513886e-06, 'epoch': 2.02}
{'loss': 7.2101, 'grad_norm': 3.3247694969177246, 'learning_rate': 5.621360154964428e-06, 'epoch': 2.16}
{'loss': 7.467, 'grad_norm': 3.82478404045105, 'learning_rate': 3.905533575320855e-06, 'epoch': 2.31}
{'loss': 7.1354, 'grad_norm': 3.0702810287475586, 'learning_rate': 2.4628895794759493e-06, 'epoch': 2.45}
{'loss': 7.1796, 'grad_norm': 3.184539556503296, 'learning_rate': 1.3289508504683206e-06, 'epoch': 2.59}
{'loss': 7.1519, 'grad_norm': 3.030181646347046, 'learning_rate': 5.316387231330288e-07, 'epoch': 2.74}
{'loss': 7.0042, 'grad_norm': 3.0833263397216797, 'learni

  0%|          | 0/98 [00:00<?, ?it/s]

{'eval_loss': 0.8897932171821594, 'eval_runtime': 90.0382, 'eval_samples_per_second': 2.177, 'eval_steps_per_second': 1.088, 'epoch': 2.88}
{'train_runtime': 4493.5151, 'train_samples_per_second': 0.74, 'train_steps_per_second': 0.046, 'train_loss': 10.473551542862602, 'epoch': 2.98}

ENTRENAMIENTO COMPLETADO

✅ Modelo guardado en: phi3_hybrid_model\final_adapter
VRAM final: 2.52GB

📊 Estadísticas:
   Épocas completadas: 3
   Ejemplos train: 1109
   Ejemplos val: 196

💡 SIGUIENTE PASO:
   Ejecuta inference_hybrid.py para probar el sistema completo

✅ Limpieza completada
