In [1]:
import os
import io
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import transforms, models
from torch.utils.data import DataLoader, ConcatDataset
from PIL import Image
from pyspark.sql import SparkSession
from pyspark.ml.torch.distributor import TorchDistributor

In [2]:
class DeepResNet18(nn.Module):
    def __init__(self):
        super().__init__()

        base = models.resnet18(weights=None)
        in_features = base.fc.in_features

        base.fc = nn.Sequential(
            nn.Linear(in_features, 512),
            nn.ReLU(),
            nn.BatchNorm1d(512),

            nn.Linear(512, 256),
            nn.ReLU(),
            nn.BatchNorm1d(256),

            nn.Linear(256, 256),
            nn.ReLU(),

            nn.Linear(256, 128),
            nn.ReLU(),
            nn.BatchNorm1d(128),

            nn.Linear(128, 128),
            nn.ReLU(),

            nn.Linear(128, 64),
            nn.ReLU(),

            nn.Linear(64, 32),
            nn.ReLU(),

            nn.Linear(32, 2)
        )

        # Se deben copiar las capas igual que en train_fn
        self.conv1 = base.conv1
        self.bn1 = base.bn1
        self.relu = base.relu
        self.maxpool = base.maxpool

        self.layer1 = base.layer1
        self.layer2 = base.layer2
        self.layer3 = base.layer3
        self.layer4 = base.layer4

        self.avgpool = base.avgpool
        self.fc = base.fc

    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        x = self.avgpool(x)
        x = torch.flatten(x, 1)
        x = self.fc(x)
        return x

In [3]:
spark = (
    SparkSession.builder
        .appName("BrainTumor-ResNet18-Distributed-HDFS")
        .master("spark://100.108.67.1:7077")
        .config("spark.executor.instances", "2")

        # GPU CONFIG
        .config("spark.executor.resource.gpu.amount", "1")
        .config("spark.task.resource.gpu.amount", "1")
        .config("spark.executor.resource.gpu.discoveryScript", "/usr/local/bin/get-gpus.sh")

        # Networking
        .config("spark.executorEnv.NCCL_SOCKET_IFNAME", "tailscale0")
        .config("spark.executorEnv.GLOO_SOCKET_IFNAME", "tailscale0")

        .getOrCreate()
)

print("‚úì Sesi√≥n de Spark creada exitosamente")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/11/26 03:32:24 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


‚úì Sesi√≥n de Spark creada exitosamente


In [4]:
def train_fn(
    epochs=3,
    lr=1e-4,
    batch_size=32,
    optimizer_name="adam"
):

    import torch
    import torch.nn as nn
    import torch.optim as optim
    import io
    import os
    import pyarrow.fs as fs
    from torchvision import transforms, models
    from torch.utils.data import DataLoader, ConcatDataset
    from PIL import Image

    print("=== ENTRENAMIENTO DISTRIBUIDO INICIADO ===", flush=True)

    rank = int(os.environ["RANK"])

    # Shards balanceados
    train_shards = [
        f"hdfs://namenode:9000/data/brain_balanced/shard_bal_{i:04d}.pt"
        for i in range(8)
    ]

    if rank == 0:
        my_shards = train_shards[0:8:2]
    else:
        my_shards = train_shards[1:8:2]

    class ShardDataset(torch.utils.data.Dataset):
        def __init__(self, path, transform):
            fs_conn = fs.HadoopFileSystem("namenode", 9000)
            with fs_conn.open_input_file(path) as f:
                data = f.read()
            shard = torch.load(io.BytesIO(data))
            self.images = shard["images"]
            self.labels = shard["labels"].long()
            self.transform = transform

        def __len__(self):
            return len(self.labels)

        def __getitem__(self, i):
            img = Image.fromarray(self.images[i].numpy())
            return self.transform(img), self.labels[i]

    tf = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225]
        )
    ])

    datasets = [ShardDataset(s, tf) for s in my_shards]
    full = ConcatDataset(datasets)

    dataloader = DataLoader(
        full,
        batch_size=batch_size,
        shuffle=True,
        num_workers=2,
        pin_memory=True
    )

    device = "cuda" if torch.cuda.is_available() else "cpu"

    # =======================
    # MODELO CON 7 CAPAS EXTRA
    # =======================
    base = models.resnet18(weights=None)

    in_features = base.fc.in_features

    # Reemplazamos 'fc' por un head profundo
    base.fc = nn.Sequential(
        nn.Linear(in_features, 512),
        nn.ReLU(),
        nn.BatchNorm1d(512),

        nn.Linear(512, 256),
        nn.ReLU(),
        nn.BatchNorm1d(256),

        nn.Linear(256, 256),
        nn.ReLU(),

        nn.Linear(256, 128),
        nn.ReLU(),
        nn.BatchNorm1d(128),

        nn.Linear(128, 128),
        nn.ReLU(),

        nn.Linear(128, 64),
        nn.ReLU(),

        nn.Linear(64, 32),
        nn.ReLU(),

        nn.Linear(32, 2) # SALIDA FINAL
    )

    model = base.to(device)

    criterion = nn.CrossEntropyLoss()

    # OPTIMIZADOR DIN√ÅMICO
    opt_name = optimizer_name.lower()
    if opt_name == "adam":
        optimizer = optim.Adam(model.parameters(), lr=lr)
    elif opt_name == "sgd":
        optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9)
    else:
        raise ValueError(f"Optimizador no soportado: {optimizer_name}")

    scaler = torch.amp.GradScaler('cuda')

    # ENTRENAMIENTO
    for epoch in range(epochs):
        total_loss = 0
        steps = 0

        for imgs, labels in dataloader:
            imgs = imgs.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()

            with torch.cuda.amp.autocast():
                outputs = model(imgs)
                loss = criterion(outputs, labels)

            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

            total_loss += loss.item()
            steps += 1

        print(f"[Worker {rank}] √âpoca {epoch+1}/{epochs} P√©rdida={total_loss/steps:.4f}")

    # DEVOLVER MODELO SOLO EN RANK 0
    if rank == 0:
        buf = io.BytesIO()
        torch.save(model.state_dict(), buf)
        buf.seek(0)
        return buf.getvalue()

    return None

In [5]:
# ====================================================================
# CONFIGURACI√ìN DE EXPERIMENTOS M√öLTIPLES
# ====================================================================

import mlflow
import dagshub
import shutil
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Inicializar Dagshub + MLflow (UNA SOLA VEZ al inicio)
dagshub.init(
    repo_owner='picantitoDev',
    repo_name='percepcion-proyecto',
    mlflow=True
)

mlflow.set_tracking_uri("https://dagshub.com/picantitoDev/percepcion-proyecto.mlflow")
mlflow.set_experiment("Proyecto Percepcion")

<Experiment: artifact_location='mlflow-artifacts:/7f8fca52276a46fd914d2197509806ad', creation_time=1764140493088, experiment_id='1', last_update_time=1764140493088, lifecycle_stage='active', name='Proyecto Percepcion', tags={}>

In [6]:
# Definir los experimentos
experiments = [
    {"epochs": 5, "lr": 0.01, "batch_size": 32, "optimizer": "adam"},
    {"epochs": 5, "lr": 0.001, "batch_size": 32, "optimizer": "adam"},
    {"epochs": 5, "lr": 0.0001, "batch_size": 32, "optimizer": "adam"},
    
    {"epochs": 10, "lr": 0.01, "batch_size": 32, "optimizer": "adam"},
    {"epochs": 10, "lr": 0.001, "batch_size": 32, "optimizer": "adam"},
    {"epochs": 10, "lr": 0.0001, "batch_size": 32, "optimizer": "adam"},
    
    {"epochs": 15, "lr": 0.01, "batch_size": 32, "optimizer": "sgd"},
    {"epochs": 15, "lr": 0.001, "batch_size": 32, "optimizer": "sgd"},
    {"epochs": 15, "lr": 0.0001, "batch_size": 32, "optimizer": "sgd"},
]

In [7]:
# Test shards (constante para todos los experimentos)
test_shards = [
    "hdfs://namenode:9000/data/brain_balanced/shard_bal_0008.pt",
    "hdfs://namenode:9000/data/brain_balanced/shard_bal_0009.pt"
]

# Transformaciones para test (constante)
tf = transforms.Compose([
    transforms.Resize((224,224)),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485,0.456,0.406],
        std=[0.229,0.224,0.225]
    )
])

def load_pt_with_spark(hdfs_path):
    content = spark.sparkContext.binaryFiles(hdfs_path).take(1)[0][1]
    return torch.load(io.BytesIO(content))

In [8]:
best_f1 = -1
best_state_dict = None
best_config = None

for idx, cfg in enumerate(experiments, 1):

    print("\n" + "="*70)
    print(f" EXPERIMENTO {idx}/{len(experiments)}")
    print(f" Config: {cfg}")
    print("="*70)

    # ENTRENAMIENTO
    model_bytes = TorchDistributor(
        num_processes=2,
        local_mode=False,
        use_gpu=True
    ).run(
        train_fn,
        cfg["epochs"],
        cfg["lr"],
        cfg["batch_size"],
        cfg["optimizer"]
    )

    # RECONSTRUIR MODELO CORRECTO
    state_dict = torch.load(io.BytesIO(model_bytes), map_location="cpu")
    model = DeepResNet18()
    model.load_state_dict(state_dict)
    model.eval()

    # PREPARAR TEST
    X, Y = [], []
    for p in test_shards:
        shard = load_pt_with_spark(p)
        for i in range(len(shard["labels"])):
            img = Image.fromarray(shard["images"][i].numpy())
            X.append(tf(img).unsqueeze(0))
            Y.append(int(shard["labels"][i]))

    criterion = nn.CrossEntropyLoss()
    preds = []
    total_loss = 0

    with torch.no_grad():
        for img, y in zip(X, Y):
            out = model(img)
            loss = criterion(out, torch.tensor([y]))
            total_loss += loss.item()
            preds.append(out.argmax(1).item())

    avg_loss = total_loss / len(Y)
    acc = accuracy_score(Y, preds)
    prec = precision_score(Y, preds)
    rec = recall_score(Y, preds)
    f1 = f1_score(Y, preds)

    print(f" Resultados Exp {idx}:")
    print(f"  Loss:      {avg_loss:.4f}")
    print(f"  Accuracy:  {acc:.4f}")
    print(f"  Precision: {prec:.4f}")
    print(f"  Recall:    {rec:.4f}")
    print(f"  F1:        {f1:.4f}")

    # MLflow aislado
    try:
        with mlflow.start_run(run_name=f"exp_{idx}_{cfg['optimizer']}"):
            mlflow.log_params(cfg)
            mlflow.log_metric("test_loss", avg_loss)
            mlflow.log_metric("accuracy", acc)
            mlflow.log_metric("precision", prec)
            mlflow.log_metric("recall", rec)
            mlflow.log_metric("f1_score", f1)
        print(" ‚úì Registrado en MLflow")
    except:
        print(" ‚ö† MLflow fall√≥ pero el entrenamiento contin√∫a")

    # MEJOR MODELO
    if f1 > best_f1:
        best_f1 = f1
        best_state_dict = state_dict
        best_config = cfg.copy()
        print(f" ‚òÖ Nuevo mejor modelo | F1={best_f1:.4f}")


print("\n‚úì ENTRENAMIENTO COMPLETO")
print("Mejor F1 =", best_f1)


 EXPERIMENTO 1/9
 Config: {'epochs': 5, 'lr': 0.01, 'batch_size': 32, 'optimizer': 'adam'}


Started distributed training with 2 executor processes
=== ENTRENAMIENTO DISTRIBUIDO INICIADO ===                          (0 + 2) / 2]
=== ENTRENAMIENTO DISTRIBUIDO INICIADO ===
2025-11-26 03:32:51,935 WARN  [main] util.NativeCodeLoader (NativeCodeLoader.java:<clinit>(60)) - Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
2025-11-26 03:32:51,521 WARN  [main] util.NativeCodeLoader (NativeCodeLoader.java:<clinit>(60)) - Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
[Worker 1] √âpoca 1/5 P√©rdida=0.6239
[Worker 1] √âpoca 2/5 P√©rdida=0.5846
[Worker 1] √âpoca 3/5 P√©rdida=0.5556
[Worker 0] √âpoca 1/5 P√©rdida=0.5934
[Worker 1] √âpoca 4/5 P√©rdida=0.5247
[Worker 0] √âpoca 2/5 P√©rdida=0.5483
[Worker 1] √âpoca 5/5 P√©rdida=0.5619
[Worker 0] √âpoca 3/5 P√©rdida=0.5191
[Worker 0] √âpoca 4/5 P√©rdida=0.4753
[Worker 0] √âpoca 5/5 P√©rdida=0.4480
Finished distributed training with 2 exec

 Resultados Exp 1:
  Loss:      0.4216
  Accuracy:  0.7990
  Precision: 0.7447
  Recall:    0.9100
  F1:        0.8191
üèÉ View run exp_1_adam at: https://dagshub.com/picantitoDev/percepcion-proyecto.mlflow/#/experiments/1/runs/7dd3202c9cea4b75b6d85c7bf927d69a
üß™ View experiment at: https://dagshub.com/picantitoDev/percepcion-proyecto.mlflow/#/experiments/1
 ‚úì Registrado en MLflow
 ‚òÖ Nuevo mejor modelo | F1=0.8191

 EXPERIMENTO 2/9
 Config: {'epochs': 5, 'lr': 0.001, 'batch_size': 32, 'optimizer': 'adam'}


Started distributed training with 2 executor processes
=== ENTRENAMIENTO DISTRIBUIDO INICIADO ===                          (0 + 2) / 2]
=== ENTRENAMIENTO DISTRIBUIDO INICIADO ===
2025-11-26 03:34:54,762 WARN  [main] util.NativeCodeLoader (NativeCodeLoader.java:<clinit>(60)) - Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
2025-11-26 03:34:58,024 WARN  [main] util.NativeCodeLoader (NativeCodeLoader.java:<clinit>(60)) - Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
[Worker 0] √âpoca 1/5 P√©rdida=0.5838
[Worker 0] √âpoca 2/5 P√©rdida=0.4558
[Worker 1] √âpoca 1/5 P√©rdida=0.5917
[Worker 0] √âpoca 3/5 P√©rdida=0.3955
[Worker 1] √âpoca 2/5 P√©rdida=0.4597
[Worker 0] √âpoca 4/5 P√©rdida=0.2941
[Worker 1] √âpoca 3/5 P√©rdida=0.3773
[Worker 1] √âpoca 4/5 P√©rdida=0.2879
[Worker 0] √âpoca 5/5 P√©rdida=0.3140
[Worker 1] √âpoca 5/5 P√©rdida=0.2725                                 (0 + 2) /

 Resultados Exp 2:
  Loss:      0.4318
  Accuracy:  0.8700
  Precision: 0.9302
  Recall:    0.8000
  F1:        0.8602
üèÉ View run exp_2_adam at: https://dagshub.com/picantitoDev/percepcion-proyecto.mlflow/#/experiments/1/runs/6407a49996264a46877459157c467eda
üß™ View experiment at: https://dagshub.com/picantitoDev/percepcion-proyecto.mlflow/#/experiments/1
 ‚úì Registrado en MLflow
 ‚òÖ Nuevo mejor modelo | F1=0.8602

 EXPERIMENTO 3/9
 Config: {'epochs': 5, 'lr': 0.0001, 'batch_size': 32, 'optimizer': 'adam'}


Started distributed training with 2 executor processes
=== ENTRENAMIENTO DISTRIBUIDO INICIADO ===                          (0 + 2) / 2]
=== ENTRENAMIENTO DISTRIBUIDO INICIADO ===
2025-11-26 03:37:15,651 WARN  [main] util.NativeCodeLoader (NativeCodeLoader.java:<clinit>(60)) - Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
2025-11-26 03:37:18,740 WARN  [main] util.NativeCodeLoader (NativeCodeLoader.java:<clinit>(60)) - Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
[Worker 1] √âpoca 1/5 P√©rdida=0.6585
[Worker 1] √âpoca 2/5 P√©rdida=0.4574
[Worker 1] √âpoca 3/5 P√©rdida=0.2497
[Worker 1] √âpoca 4/5 P√©rdida=0.0949
[Worker 1] √âpoca 5/5 P√©rdida=0.0955
[Worker 0] √âpoca 1/5 P√©rdida=0.6658
[Worker 0] √âpoca 2/5 P√©rdida=0.5051
[Worker 0] √âpoca 3/5 P√©rdida=0.3030
[Worker 0] √âpoca 4/5 P√©rdida=0.1625
[Worker 0] √âpoca 5/5 P√©rdida=0.1118
Finished distributed training with 2 exec

 Resultados Exp 3:
  Loss:      0.4177
  Accuracy:  0.8550
  Precision: 0.7877
  Recall:    0.9720
  F1:        0.8702
üèÉ View run exp_3_adam at: https://dagshub.com/picantitoDev/percepcion-proyecto.mlflow/#/experiments/1/runs/1d387edae41e40e1aa8b5a00f7656bfe
üß™ View experiment at: https://dagshub.com/picantitoDev/percepcion-proyecto.mlflow/#/experiments/1
 ‚úì Registrado en MLflow
 ‚òÖ Nuevo mejor modelo | F1=0.8702

 EXPERIMENTO 4/9
 Config: {'epochs': 10, 'lr': 0.01, 'batch_size': 32, 'optimizer': 'adam'}


Started distributed training with 2 executor processes
=== ENTRENAMIENTO DISTRIBUIDO INICIADO ===                          (0 + 2) / 2]
=== ENTRENAMIENTO DISTRIBUIDO INICIADO ===
2025-11-26 03:39:57,076 WARN  [main] util.NativeCodeLoader (NativeCodeLoader.java:<clinit>(60)) - Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
2025-11-26 03:39:58,213 WARN  [main] util.NativeCodeLoader (NativeCodeLoader.java:<clinit>(60)) - Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
[Worker 0] √âpoca 1/10 P√©rdida=0.6271
[Worker 0] √âpoca 2/10 P√©rdida=0.5997
[Worker 0] √âpoca 3/10 P√©rdida=0.6381
[Worker 0] √âpoca 4/10 P√©rdida=0.5745
[Worker 1] √âpoca 1/10 P√©rdida=0.6470
[Worker 0] √âpoca 5/10 P√©rdida=0.5941
[Worker 1] √âpoca 2/10 P√©rdida=0.5514
[Worker 0] √âpoca 6/10 P√©rdida=0.5106
[Worker 1] √âpoca 3/10 P√©rdida=0.5630                                (0 + 2) / 2]
[Worker 0] √âpoca 7/10 P√©

 Resultados Exp 4:
  Loss:      0.6448
  Accuracy:  0.6710
  Precision: 0.6535
  Recall:    0.7280
  F1:        0.6887
üèÉ View run exp_4_adam at: https://dagshub.com/picantitoDev/percepcion-proyecto.mlflow/#/experiments/1/runs/75bf2cdd6cd2420ebafb83e8d7c6c776
üß™ View experiment at: https://dagshub.com/picantitoDev/percepcion-proyecto.mlflow/#/experiments/1
 ‚úì Registrado en MLflow

 EXPERIMENTO 5/9
 Config: {'epochs': 10, 'lr': 0.001, 'batch_size': 32, 'optimizer': 'adam'}


Started distributed training with 2 executor processes
=== ENTRENAMIENTO DISTRIBUIDO INICIADO ===                          (0 + 2) / 2]
=== ENTRENAMIENTO DISTRIBUIDO INICIADO ===
2025-11-26 03:42:33,769 WARN  [main] util.NativeCodeLoader (NativeCodeLoader.java:<clinit>(60)) - Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
2025-11-26 03:42:36,385 WARN  [main] util.NativeCodeLoader (NativeCodeLoader.java:<clinit>(60)) - Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
[Worker 0] √âpoca 1/10 P√©rdida=0.5370
[Worker 0] √âpoca 2/10 P√©rdida=0.4148
[Worker 0] √âpoca 3/10 P√©rdida=0.3081
[Worker 0] √âpoca 4/10 P√©rdida=0.2340
[Worker 0] √âpoca 5/10 P√©rdida=0.2737
[Worker 0] √âpoca 6/10 P√©rdida=0.1834
[Worker 1] √âpoca 1/10 P√©rdida=0.6050
[Worker 0] √âpoca 7/10 P√©rdida=0.1520
[Worker 1] √âpoca 2/10 P√©rdida=0.4964
[Worker 0] √âpoca 8/10 P√©rdida=0.1399
[Worker 0] √âpoca 9/10 P√©rdida

 Resultados Exp 5:
  Loss:      0.1636
  Accuracy:  0.9440
  Precision: 0.9933
  Recall:    0.8940
  F1:        0.9411
üèÉ View run exp_5_adam at: https://dagshub.com/picantitoDev/percepcion-proyecto.mlflow/#/experiments/1/runs/3daae63c2650486cb9a885c78b58b563
üß™ View experiment at: https://dagshub.com/picantitoDev/percepcion-proyecto.mlflow/#/experiments/1
 ‚úì Registrado en MLflow
 ‚òÖ Nuevo mejor modelo | F1=0.9411

 EXPERIMENTO 6/9
 Config: {'epochs': 10, 'lr': 0.0001, 'batch_size': 32, 'optimizer': 'adam'}


Started distributed training with 2 executor processes
=== ENTRENAMIENTO DISTRIBUIDO INICIADO ===                          (0 + 2) / 2]
=== ENTRENAMIENTO DISTRIBUIDO INICIADO ===
2025-11-26 03:45:02,488 WARN  [main] util.NativeCodeLoader (NativeCodeLoader.java:<clinit>(60)) - Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
2025-11-26 03:45:05,701 WARN  [main] util.NativeCodeLoader (NativeCodeLoader.java:<clinit>(60)) - Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
[Worker 1] √âpoca 1/10 P√©rdida=0.6648
[Worker 1] √âpoca 2/10 P√©rdida=0.4780
[Worker 1] √âpoca 3/10 P√©rdida=0.2525
[Worker 1] √âpoca 4/10 P√©rdida=0.1320
[Worker 1] √âpoca 5/10 P√©rdida=0.1034
[Worker 0] √âpoca 1/10 P√©rdida=0.6536
[Worker 1] √âpoca 6/10 P√©rdida=0.0667
[Worker 0] √âpoca 2/10 P√©rdida=0.4724
[Worker 1] √âpoca 7/10 P√©rdida=0.0765
[Worker 0] √âpoca 3/10 P√©rdida=0.2571                                

 Resultados Exp 6:
  Loss:      0.1402
  Accuracy:  0.9480
  Precision: 0.9516
  Recall:    0.9440
  F1:        0.9478
üèÉ View run exp_6_adam at: https://dagshub.com/picantitoDev/percepcion-proyecto.mlflow/#/experiments/1/runs/b41289702dde45999d7d7f7c4e2187c8
üß™ View experiment at: https://dagshub.com/picantitoDev/percepcion-proyecto.mlflow/#/experiments/1
 ‚úì Registrado en MLflow
 ‚òÖ Nuevo mejor modelo | F1=0.9478

 EXPERIMENTO 7/9
 Config: {'epochs': 15, 'lr': 0.01, 'batch_size': 32, 'optimizer': 'sgd'}


Started distributed training with 2 executor processes
=== ENTRENAMIENTO DISTRIBUIDO INICIADO ===                          (0 + 2) / 2]
=== ENTRENAMIENTO DISTRIBUIDO INICIADO ===
2025-11-26 03:47:38,171 WARN  [main] util.NativeCodeLoader (NativeCodeLoader.java:<clinit>(60)) - Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
2025-11-26 03:47:38,710 WARN  [main] util.NativeCodeLoader (NativeCodeLoader.java:<clinit>(60)) - Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
[Worker 1] √âpoca 1/15 P√©rdida=0.6621
[Worker 1] √âpoca 2/15 P√©rdida=0.4339
[Worker 1] √âpoca 3/15 P√©rdida=0.3146
[Worker 1] √âpoca 4/15 P√©rdida=0.2073
[Worker 1] √âpoca 5/15 P√©rdida=0.1437
[Worker 1] √âpoca 6/15 P√©rdida=0.1315                                (0 + 2) / 2]
[Worker 0] √âpoca 1/15 P√©rdida=0.6424
[Worker 1] √âpoca 7/15 P√©rdida=0.1020
[Worker 0] √âpoca 2/15 P√©rdida=0.4123
[Worker 1] √âpoca 8/15 P√©

 Resultados Exp 7:
  Loss:      0.1086
  Accuracy:  0.9600
  Precision: 0.9389
  Recall:    0.9840
  F1:        0.9609
üèÉ View run exp_7_sgd at: https://dagshub.com/picantitoDev/percepcion-proyecto.mlflow/#/experiments/1/runs/a1156fb4923847f29e5955d85da65e16
üß™ View experiment at: https://dagshub.com/picantitoDev/percepcion-proyecto.mlflow/#/experiments/1
 ‚úì Registrado en MLflow
 ‚òÖ Nuevo mejor modelo | F1=0.9609

 EXPERIMENTO 8/9
 Config: {'epochs': 15, 'lr': 0.001, 'batch_size': 32, 'optimizer': 'sgd'}


Started distributed training with 2 executor processes
=== ENTRENAMIENTO DISTRIBUIDO INICIADO ===                          (0 + 2) / 2]
=== ENTRENAMIENTO DISTRIBUIDO INICIADO ===
2025-11-26 03:51:38,091 WARN  [main] util.NativeCodeLoader (NativeCodeLoader.java:<clinit>(60)) - Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
2025-11-26 03:51:39,719 WARN  [main] util.NativeCodeLoader (NativeCodeLoader.java:<clinit>(60)) - Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
2025-11-26 03:52:40,527 WARN  [main] impl.BlockReaderFactory (BlockReaderFactory.java:getRemoteBlockReaderFromTcp(772)) - I/O error constructing remote block reader for block BP-736167345-127.0.1.1-1763348595625:blk_1073746858_6034
org.apache.hadoop.net.ConnectTimeoutException: 60000 millis timeout while waiting for channel to be ready for connect. ch : java.nio.channels.SocketChannel[connection-pending remote=/100.10

 Resultados Exp 8:
  Loss:      0.2022
  Accuracy:  0.9300
  Precision: 0.8826
  Recall:    0.9920
  F1:        0.9341
üèÉ View run exp_8_sgd at: https://dagshub.com/picantitoDev/percepcion-proyecto.mlflow/#/experiments/1/runs/c67824f8b17e49969efda4543bc17a7f
üß™ View experiment at: https://dagshub.com/picantitoDev/percepcion-proyecto.mlflow/#/experiments/1
 ‚úì Registrado en MLflow

 EXPERIMENTO 9/9
 Config: {'epochs': 15, 'lr': 0.0001, 'batch_size': 32, 'optimizer': 'sgd'}


Started distributed training with 2 executor processes
=== ENTRENAMIENTO DISTRIBUIDO INICIADO ===                          (0 + 2) / 2]
=== ENTRENAMIENTO DISTRIBUIDO INICIADO ===
2025-11-26 03:56:59,260 WARN  [main] util.NativeCodeLoader (NativeCodeLoader.java:<clinit>(60)) - Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
2025-11-26 03:57:00,224 WARN  [main] util.NativeCodeLoader (NativeCodeLoader.java:<clinit>(60)) - Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
2025-11-26 03:58:01,091 WARN  [main] impl.BlockReaderFactory (BlockReaderFactory.java:getRemoteBlockReaderFromTcp(772)) - I/O error constructing remote block reader for block BP-736167345-127.0.1.1-1763348595625:blk_1073746857_6033
org.apache.hadoop.net.ConnectTimeoutException: 60000 millis timeout while waiting for channel to be ready for connect. ch : java.nio.channels.SocketChannel[connection-pending remote=/100.10

 Resultados Exp 9:
  Loss:      0.6567
  Accuracy:  0.7030
  Precision: 0.6473
  Recall:    0.8920
  F1:        0.7502
üèÉ View run exp_9_sgd at: https://dagshub.com/picantitoDev/percepcion-proyecto.mlflow/#/experiments/1/runs/864af8bd2f3741e9b085b93abdba4fab
üß™ View experiment at: https://dagshub.com/picantitoDev/percepcion-proyecto.mlflow/#/experiments/1
 ‚úì Registrado en MLflow

‚úì ENTRENAMIENTO COMPLETO
Mejor F1 = 0.9609375


In [11]:
print("\n" + "="*70)
print(" RECONSTRUYENDO EL MEJOR MODELO ")
print("="*70)

best_model = DeepResNet18()
best_model.load_state_dict(best_state_dict)

if os.path.exists("best_model"):
    shutil.rmtree("best_model")

mlflow.pytorch.save_model(best_model, "best_model")

# Guardar aqu√≠ el run_id ANTES de cerrar el run
run_id = None

try:
    with mlflow.start_run(run_name="best_model") as run:
        run_id = run.info.run_id      # <<< AQUI ES LA CLAVE
        mlflow.log_params(best_config)
        mlflow.log_metric("best_f1", best_f1)
        mlflow.log_artifacts("best_model")
    print("‚úì MODELO SUBIDO A MLFLOW")
except Exception as e:
    print("‚ö† Fall√≥ MLflow pero modelo local generado:", e)

print("run_id generado:", run_id)


 RECONSTRUYENDO EL MEJOR MODELO 
üèÉ View run best_model at: https://dagshub.com/picantitoDev/percepcion-proyecto.mlflow/#/experiments/1/runs/e165bb5b82e54095bd38ec12b3efbadc
üß™ View experiment at: https://dagshub.com/picantitoDev/percepcion-proyecto.mlflow/#/experiments/1
‚úì MODELO SUBIDO A MLFLOW
run_id generado: e165bb5b82e54095bd38ec12b3efbadc


In [14]:
assert run_id is not None

model_uri = f"runs:/{run_id}/best_model"
model_name = "ResnetPercepcion"

from mlflow import MlflowClient
client = MlflowClient()

# Crear el modelo si no existe
try:
    client.create_registered_model(model_name)
    print(f"Modelo creado: {model_name}")
except:
    print(f"Modelo '{model_name}' ya existe")

# Crear versi√≥n
version = client.create_model_version(
    name=model_name,
    source=model_uri,
    run_id=run_id
)

print("‚úì Modelo registrado!")
print("Nombre:", model_name)
print("Versi√≥n:", version.version)

Modelo 'ResnetPercepcion' ya existe


2025/11/26 04:18:31 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: ResnetPercepcion, version 1


‚úì Modelo registrado!
Nombre: ResnetPercepcion
Versi√≥n: 1


In [15]:
from mlflow import MlflowClient

client = MlflowClient()
client.transition_model_version_stage(
    name="ResnetPercepcion",
    version=1,
    stage="Production",
    archive_existing_versions=True
)

print("Modelo promovido a producci√≥n")

  client.transition_model_version_stage(


Modelo promovido a producci√≥n


In [10]:
spark.stop()