# Deep Learning with MLflow

In [10]:
import mlflow

# The set_experiment API creates a new experiment if it doesn't exist.
mlflow.set_experiment("Deep Learning Experiment")
mlflow.set_tracking_uri("http://localhost:5002")

# IMPORTANT: Enable system metrics monitoring
mlflow.config.enable_system_metrics_logging()
mlflow.config.set_system_metrics_sampling_interval(1)

2025/11/17 12:38:08 INFO mlflow.tracking.fluent: Experiment with name 'Deep Learning Experiment' does not exist. Creating a new experiment.


## Prepare the dataset

In [11]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

# Define device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load and prepare data
transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))])
train_dataset = datasets.FashionMNIST("data", train=True, download=True, transform=transform)
test_dataset = datasets.FashionMNIST("data", train=False, transform=transform)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=1000)

In [19]:
device

device(type='cuda')

In [12]:
train_dataset

Dataset FashionMNIST
    Number of datapoints: 60000
    Root location: data
    Split: Train
    StandardTransform
Transform: Compose(
               ToTensor()
               Normalize(mean=(0.1307,), std=(0.3081,))
           )

## Define the model and optimizer

In [None]:
import torch.nn as nn

class NeuralNetwork(nn.Module):
    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(28 * 28, 512),   # Input layer ‚Üí Hidden 1
            nn.ReLU(),                 # Activation
            nn.Linear(512, 512),       # Hidden 1 ‚Üí Hidden 2
            nn.ReLU(),                 # Activation
            nn.Linear(512, 10),        # Hidden 2 ‚Üí Output layer (10 classes)
        )

    def forward(self, x):
        x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits


model = NeuralNetwork().to(device)

In [14]:
# Training parameters
params = {
    "epochs": 5,
    "learning_rate": 1e-3,
    "batch_size": 64,
    "optimizer": "SGD",
    "model_type": "MLP",
    "hidden_units": [512, 512],
}

# Define optimizer and loss function
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=params["learning_rate"])

In [18]:
with mlflow.start_run(run_name="study") as run:
    # Log training parameters
    mlflow.log_params(params)

    for epoch in range(params["epochs"]):
        model.train()
        train_loss, correct, total = 0, 0, 0

        for batch_idx, (data, target) in enumerate(train_loader):
            data, target = data.to(device), target.to(device)

            # Forward pass
            optimizer.zero_grad()
            output = model(data)
            loss = loss_fn(output, target)

            # Backward pass
            loss.backward()
            optimizer.step()

            # Calculate metrics
            train_loss += loss.item()
            _, predicted = output.max(1)
            total += target.size(0)
            correct += predicted.eq(target).sum().item()

            # Log batch metrics (every 100 batches)
            if batch_idx % 100 == 0:
                batch_loss = train_loss / (batch_idx + 1)
                batch_acc = 100.0 * correct / total
                mlflow.log_metrics(
                    {"batch_loss": batch_loss, "batch_accuracy": batch_acc},
                    step=epoch * len(train_loader) + batch_idx,
                )

        # Calculate epoch metrics
        epoch_loss = train_loss / len(train_loader)
        epoch_acc = 100.0 * correct / total

        # Validation
        model.eval()
        val_loss, val_correct, val_total = 0, 0, 0
        with torch.no_grad():
            for data, target in test_loader:
                data, target = data.to(device), target.to(device)
                output = model(data)
                loss = loss_fn(output, target)

                val_loss += loss.item()
                _, predicted = output.max(1)
                val_total += target.size(0)
                val_correct += predicted.eq(target).sum().item()

        # Calculate and log epoch validation metrics
        val_loss = val_loss / len(test_loader)
        val_acc = 100.0 * val_correct / val_total

        # Log epoch metrics
        mlflow.log_metrics(
            {
                "train_loss": epoch_loss,
                "train_accuracy": epoch_acc,
                "val_loss": val_loss,
                "val_accuracy": val_acc,
            },
            step=epoch,
        )
        # Log checkpoint at the end of each epoch
        mlflow.pytorch.log_model(model, name=f"checkpoint_{epoch}")

        print(
            f"Epoch {epoch+1}/{params['epochs']}, "
            f"Train Loss: {epoch_loss:.4f}, Train Acc: {epoch_acc:.2f}%, "
            f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.2f}%"
        )

    # Log the final trained model
    model_info = mlflow.pytorch.log_model(model, name="final_model")

2025/11/17 15:10:09 INFO mlflow.system_metrics.system_metrics_monitor: Skip logging GPU metrics. Set logger level to DEBUG for more details.
2025/11/17 15:10:09 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.


Epoch 1/5, Train Loss: 1.7445, Train Acc: 54.47%, Val Loss: 1.2563, Val Acc: 63.98%




Epoch 2/5, Train Loss: 1.0328, Train Acc: 67.38%, Val Loss: 0.9026, Val Acc: 68.96%




Epoch 3/5, Train Loss: 0.8226, Train Acc: 72.21%, Val Loss: 0.7785, Val Acc: 73.32%




Epoch 4/5, Train Loss: 0.7260, Train Acc: 75.77%, Val Loss: 0.7045, Val Acc: 76.00%




Epoch 5/5, Train Loss: 0.6624, Train Acc: 77.94%, Val Loss: 0.6515, Val Acc: 77.65%


2025/11/17 15:11:09 INFO mlflow.system_metrics.system_metrics_monitor: Stopping system metrics monitoring...
2025/11/17 15:11:09 INFO mlflow.system_metrics.system_metrics_monitor: Successfully terminated system metrics monitoring!


üèÉ View run study at: http://localhost:5002/#/experiments/716386441762713030/runs/7acdafcf0f4745f89f968c09559ab097
üß™ View experiment at: http://localhost:5002/#/experiments/716386441762713030


In [None]:
# Load the final model
model = mlflow.pytorch.load_model("runs:/7acdafcf0f4745f89f968c09559ab097/final_model")
# or load a checkpoint
# model = mlflow.pytorch.load_model("runs:/<run_id>/checkpoint_<epoch>")
model.to(device)
model.eval()

# Resume the previous run to log test metrics
with mlflow.start_run(run_id=run.info.run_id) as run:
    # Evaluate the model on the test set
    test_loss, test_correct, test_total = 0, 0, 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
        output = model(data)
        loss = loss_fn(output, target)

        test_loss += loss.item()
        _, predicted = output.max(1)
        test_total += target.size(0)
        test_correct += predicted.eq(target).sum().item()

    # Calculate and log final test metrics
    test_loss = test_loss / len(test_loader)
    test_acc = 100.0 * test_correct / test_total

    mlflow.log_metrics({"test_loss": test_loss, "test_accuracy": test_acc})
    print(f"Final Test Accuracy: {test_acc:.2f}%")

  from .autonotebook import tqdm as notebook_tqdm
Downloading artifacts:   0%|          | 0/1 [04:16<?, ?it/s]
Downloading artifacts: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 6/6 [00:03<00:00,  1.53it/s] 
2025/11/17 15:22:45 INFO mlflow.system_metrics.system_metrics_monitor: Skip logging GPU metrics. Set logger level to DEBUG for more details.
2025/11/17 15:22:45 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.
2025/11/17 15:22:46 INFO mlflow.system_metrics.system_metrics_monitor: Stopping system metrics monitoring...
2025/11/17 15:22:46 INFO mlflow.system_metrics.system_metrics_monitor: Successfully terminated system metrics monitoring!


Final Test Accuracy: 77.70%
üèÉ View run study at: http://localhost:5002/#/experiments/716386441762713030/runs/7acdafcf0f4745f89f968c09559ab097
üß™ View experiment at: http://localhost:5002/#/experiments/716386441762713030
