# Fine-tuning Maestro-REMI con Condicionamiento VA (Run Limpio)

**Objetivo**: Entrenamiento completo desde cero para obtener logs completos y métricas limpias.  
**Resistente a desconexiones**: los checkpoints se guardan en Google Drive y el tiempo de ejecución (runtime) se acumula entre sesiones.

**Modelo base**: `Natooz/Maestro-REMI-bpe20k`  
**Dataset**: Lakh Piano subset con etiquetas VA heurísticas  
**Output en Drive**: `TFM/finetune_checkpoints/` + `training_log_history.json`

### Instrucciones si Colab se desconecta:
1. Reconectar y ejecutar las Celdas 1, 2 y 3 (montar Drive, copiar el bundle, instalar dependencias)
2. Ejecutar la Celda 4 → **detecta automáticamente el último checkpoint y reanuda** (el runtime total se sigue acumulando)

In [None]:
# Cell 1: Mount Google Drive at /content/drive.
# The later cells read the fine-tuning bundle from, and write checkpoints to,
# paths under /content/drive/MyDrive/TFM, so this must run first.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Cell 2: Copy the finetune_bundle archive from Drive and unpack it.
# Work from /content (the Colab VM's local directory, not Drive).
%cd /content
!cp "/content/drive/MyDrive/TFM/finetune_bundle.tar.gz" .
!tar -xzf finetune_bundle.tar.gz
# Later cells use relative paths (train_maestro_finetune.py, data/finetune_dataset,
# requirements.txt), so the bundle directory becomes the working directory.
%cd /content/finetune_bundle
# Sanity check: list the bundle contents and the dataset directory.
!ls -la
!ls -la data/finetune_dataset

In [None]:
# Cell 3: Install dependencies from the requirements.txt in the current
# directory (Cell 2 leaves the working directory at /content/finetune_bundle).
!pip install -r requirements.txt

In [None]:
# Cell 4: Fine-tuning with checkpoints on Drive (resilient to disconnects)
# - output_dir points to Drive, so checkpoints persist if Colab drops
# - Auto-detects a previous checkpoint and resumes from it
# - Runtime accumulates across sessions in cumulative_runtime.json
# - save_steps=1500 (~1 per epoch) to avoid freezes caused by Drive I/O
import json
import os
import re
import subprocess
import sys

DRIVE_CKPT = "/content/drive/MyDrive/TFM/finetune_checkpoints"
os.makedirs(DRIVE_CKPT, exist_ok=True)

# Build the training command
cmd = [
    sys.executable, "train_maestro_finetune.py",
    "--model_name", "Natooz/Maestro-REMI-bpe20k",
    "--dataset_dir", "data/finetune_dataset",
    "--output_dir", DRIVE_CKPT,
    "--num_train_epochs", "5",
    "--per_device_train_batch_size", "4",
    "--per_device_eval_batch_size", "4",
    "--gradient_accumulation_steps", "2",
    "--learning_rate", "5e-5",
    "--warmup_ratio", "0.05",
    "--logging_steps", "25",
    "--eval_steps", "500",
    "--save_steps", "1500",
    "--save_total_limit", "2",
    "--fp16",
    "--gradient_checkpointing",
    "--seed", "42",
]

# Auto-detect a previous checkpoint to resume from.
# Checkpoints are ordered by their numeric step: a plain string sort would
# rank "checkpoint-9000" after "checkpoint-10500" and resume from a stale
# checkpoint as soon as step counts differ in digit length.
ckpt_pattern = re.compile(r"checkpoint-(\d+)")
ckpt_steps = {}
for entry in os.listdir(DRIVE_CKPT):
    match = ckpt_pattern.fullmatch(entry)
    # Only real checkpoint directories count; stray files are ignored.
    if match and os.path.isdir(os.path.join(DRIVE_CKPT, entry)):
        ckpt_steps[entry] = int(match.group(1))
checkpoints = sorted(ckpt_steps, key=ckpt_steps.get)

if checkpoints:
    last_ckpt = os.path.join(DRIVE_CKPT, checkpoints[-1])
    cmd += ["--resume_from_checkpoint", last_ckpt]
    # Show the runtime accumulated by previous sessions
    rt_path = os.path.join(DRIVE_CKPT, "cumulative_runtime.json")
    prev_min = 0
    if os.path.exists(rt_path):
        try:
            with open(rt_path) as f:
                prev_min = json.load(f).get("total_runtime_sec", 0) / 60
        except (OSError, ValueError):
            # The value is display-only; an unreadable or truncated file
            # (e.g. after a disconnect mid-write) must not block the resume.
            prev_min = 0
    print(f"{'='*60}")
    print(f"REANUDANDO desde: {checkpoints[-1]}")
    print(f"Runtime acumulado previo: {prev_min:.1f} min")
    print(f"{'='*60}\n")
else:
    print(f"{'='*60}")
    print("INICIANDO entrenamiento DESDE CERO")
    print(f"{'='*60}\n")

# Run the trainer, streaming its output line by line (stderr merged into
# stdout). The context manager closes the pipe once the process ends.
with subprocess.Popen(
    cmd,
    stdout=subprocess.PIPE,
    stderr=subprocess.STDOUT,
    text=True,
    bufsize=1,
) as process:
    for line in process.stdout:
        print(line, end="")
    process.wait()

if process.returncode != 0:
    print(f"\nERROR: Proceso terminó con código {process.returncode}")
else:
    print("\nEntrenamiento completado exitosamente")

In [None]:
# Cell 5: Export the final model from the Drive checkpoints
import json
import os
import re
import shutil

DRIVE_CKPT = "/content/drive/MyDrive/TFM/finetune_checkpoints"
export_dir = "/content/drive/MyDrive/TFM/finetune_results_clean"
os.makedirs(export_dir, exist_ok=True)

# Copy the final model
final_dir = os.path.join(DRIVE_CKPT, "final")
if os.path.exists(final_dir):
    shutil.copytree(final_dir, os.path.join(export_dir, "final"), dirs_exist_ok=True)
    print(f"Modelo final copiado a {export_dir}/final")
else:
    # Fall back to the most recent checkpoint. Order by numeric step: a plain
    # string sort would rank "checkpoint-9000" after "checkpoint-10500".
    ckpt_pattern = re.compile(r"checkpoint-(\d+)")
    ckpt_steps = {}
    # A missing checkpoint directory is treated the same as an empty one.
    for entry in (os.listdir(DRIVE_CKPT) if os.path.isdir(DRIVE_CKPT) else []):
        match = ckpt_pattern.fullmatch(entry)
        if match and os.path.isdir(os.path.join(DRIVE_CKPT, entry)):
            ckpt_steps[entry] = int(match.group(1))
    checkpoints = sorted(ckpt_steps, key=ckpt_steps.get)
    if checkpoints:
        last = os.path.join(DRIVE_CKPT, checkpoints[-1])
        shutil.copytree(last, os.path.join(export_dir, "final"), dirs_exist_ok=True)
        print(f"Último checkpoint ({checkpoints[-1]}) copiado como modelo final")
    else:
        # Make the "nothing exported" case visible instead of silent.
        print(f"No se encontró modelo final ni checkpoints en {DRIVE_CKPT}")

# Copy the training logs: look in the checkpoint root first, then in final/.
# A non-existent final/ simply yields paths that fail the exists() check.
for fname in ["training_summary.json", "training_log_history.json", "cumulative_runtime.json"]:
    for search_dir in [DRIVE_CKPT, final_dir]:
        src = os.path.join(search_dir, fname)
        if os.path.exists(src):
            shutil.copy2(src, export_dir)
            print(f"{fname} copiado")
            break
    else:
        print(f"{fname} no encontrado")

# List what actually ended up in the export directory.
print(f"\nArchivos exportados en {export_dir}:")
for entry in sorted(os.listdir(export_dir)):
    print(f"  {entry}")

In [None]:
# Cell 6: Verify the training results
import json, os

DRIVE_CKPT = "/content/drive/MyDrive/TFM/finetune_checkpoints"

rule = "=" * 60


def _locate(fname):
    """Return the first existing path to fname in DRIVE_CKPT or DRIVE_CKPT/final, else None."""
    for base in (DRIVE_CKPT, os.path.join(DRIVE_CKPT, "final")):
        candidate = os.path.join(base, fname)
        if os.path.exists(candidate):
            return candidate
    return None


# Training summary: dump every key/value pair
summary_path = _locate("training_summary.json")
if summary_path is None:
    print("training_summary.json no encontrado")
else:
    with open(summary_path) as fh:
        summary = json.load(fh)
    print(rule)
    print("RESUMEN DEL ENTRENAMIENTO")
    print(rule)
    for key, value in summary.items():
        print(f"  {key}: {value}")

# Accumulated runtime (only looked up in the checkpoint root)
rt_path = os.path.join(DRIVE_CKPT, "cumulative_runtime.json")
if os.path.exists(rt_path):
    with open(rt_path) as fh:
        rt = json.load(fh)
    total_sec = rt.get('total_runtime_sec', 0)
    session_sec = rt.get('session_runtime_sec', 0)
    previous_sec = rt.get('previous_runtime_sec', 0)
    print(f"\n{rule}")
    print("RUNTIME ACUMULADO")
    print(rule)
    print(f"  Total: {total_sec/60:.2f} min ({total_sec/3600:.2f} h)")
    print(f"  Última sesión: {session_sec/60:.2f} min")
    print(f"  Sesiones previas: {previous_sec/60:.2f} min")

# Log history: split entries into train-loss and eval-loss records
log_path = _locate("training_log_history.json")
if log_path is None:
    print("training_log_history.json no encontrado")
else:
    with open(log_path) as fh:
        logs = json.load(fh)
    train_entries, eval_entries = [], []
    for record in logs:
        if 'eval_loss' in record:
            eval_entries.append(record)
        elif 'loss' in record:
            train_entries.append(record)
    print(f"\n{rule}")
    print(f"LOG HISTORY: {len(logs)} entradas")
    print(rule)
    print(f"  Train loss entries: {len(train_entries)}")
    print(f"  Eval loss entries: {len(eval_entries)}")
    if train_entries:
        first_train, last_train = train_entries[0], train_entries[-1]
        print(f"  Steps: {first_train.get('step','?')} -> {last_train.get('step','?')}")
        print(f"  Loss: {first_train.get('loss','?')} -> {last_train.get('loss','?')}")
    if eval_entries:
        first_eval, last_eval = eval_entries[0], eval_entries[-1]
        print(f"  Eval loss: {first_eval.get('eval_loss','?')} -> {last_eval.get('eval_loss','?')}")

# Checkpoints on Drive: directories with their entry count, files with size in MB
print(f"\n{rule}")
print("CHECKPOINTS EN DRIVE")
print(rule)
if os.path.exists(DRIVE_CKPT):
    names = os.listdir(DRIVE_CKPT)
    names.sort()
    for name in names:
        path = os.path.join(DRIVE_CKPT, name)
        if not os.path.isdir(path):
            size_mb = os.path.getsize(path) / 1e6
            print(f"  [file] {name} ({size_mb:.1f} MB)")
            continue
        n_files = len(os.listdir(path))
        print(f"  [dir] {name} ({n_files} archivos)")

In [None]:
# Cell 7: Inspect the execution environment (GPU, RAM, CPU, disk)
import os
import subprocess

import psutil

print("=" * 60)
print("ENTORNO DE EJECUCIÓN - GOOGLE COLAB")
print("=" * 60)

# GPU as reported by the driver tool
print("\n--- GPU ---")
try:
    gpu_info = subprocess.check_output(["nvidia-smi"], text=True)
    print(gpu_info)
except (OSError, subprocess.CalledProcessError):
    # nvidia-smi is missing (OSError) or exited with an error status.
    print("No se detectó GPU (usando CPU)")

# GPU as seen by PyTorch.
# has_cuda is recorded here so the tier detection at the end of the cell
# never touches `torch` when the import failed (that used to be a NameError).
has_cuda = False
print("--- PyTorch GPU ---")
try:
    import torch
except ImportError:
    print("  PyTorch no instalado aún")
else:
    has_cuda = torch.cuda.is_available()
    print(f"  CUDA disponible: {has_cuda}")
    if has_cuda:
        props = torch.cuda.get_device_properties(0)
        print(f"  Dispositivo: {torch.cuda.get_device_name(0)}")
        print(f"  VRAM total: {props.total_memory / 1e9:.2f} GB")
        print(f"  VRAM reservada: {torch.cuda.memory_reserved(0) / 1e9:.2f} GB")
        print(f"  VRAM asignada: {torch.cuda.memory_allocated(0) / 1e9:.2f} GB")
        print(f"  Compute capability: {props.major}.{props.minor}")
    print(f"  PyTorch version: {torch.__version__}")

# RAM
print("\n--- RAM ---")
ram = psutil.virtual_memory()
print(f"  Total: {ram.total / 1e9:.2f} GB")
print(f"  Disponible: {ram.available / 1e9:.2f} GB")
print(f"  Usada: {ram.used / 1e9:.2f} GB ({ram.percent}%)")

# CPU
print("\n--- CPU ---")
print(f"  Cores físicos: {psutil.cpu_count(logical=False)}")
print(f"  Cores lógicos: {psutil.cpu_count(logical=True)}")
try:
    # Read the file directly instead of spawning `cat`.
    with open("/proc/cpuinfo") as f:
        for line in f:
            if "model name" in line:
                # Everything after the first ':' is the model string.
                print(f"  Modelo: {line.partition(':')[2].strip()}")
                break
except OSError:
    # Best effort: the CPU model line is optional if the file is unreadable.
    pass

# Disk
print("\n--- Disco ---")
disk = psutil.disk_usage("/content" if os.path.exists("/content") else "/")
print(f"  Total: {disk.total / 1e9:.2f} GB")
print(f"  Usado: {disk.used / 1e9:.2f} GB")
print(f"  Libre: {disk.free / 1e9:.2f} GB ({100 - disk.percent:.1f}%)")

# Runtime type
print("\n--- Runtime Colab ---")
try:
    with open("/proc/driver/nvidia/version") as f:
        print(f"  Driver NVIDIA: {f.readline().strip()}")
except OSError:
    # No NVIDIA driver file (CPU runtime): nothing to report.
    pass

# Heuristic guess of the Colab tier from RAM size and GPU model
print(f"  RAM total: {'> 12 GB (Pro/Pro+)' if ram.total > 13e9 else '<= 12 GB (Free)'}")
if has_cuda:
    vram_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
    gpu_name = torch.cuda.get_device_name(0)
    if "A100" in gpu_name:
        print(f"  GPU: {gpu_name} - Colab Pro+ tier")
    elif "V100" in gpu_name or vram_gb > 15:
        print(f"  GPU: {gpu_name} ({vram_gb:.0f} GB) - Colab Pro tier")
    else:
        print(f"  GPU: {gpu_name} ({vram_gb:.0f} GB) - Free/Pro tier")

print("\n" + "=" * 60)