In [1]:
# create dummy dataset
import pandas as pd
import numpy as np 
import torch
from torch import Generator
from torch.utils.data import DataLoader,Dataset, dataloader,random_split

from dataclasses import dataclass

import lightning.pytorch as pl

import dagshub
import mlflow

import matplotlib.pyplot as plt

import tempfile
import os


# hp tunning library
from ray import tune
from ray.tune.schedulers import ASHAScheduler
from ray.tune.logger.aim import AimLoggerCallback

from aim import Run
from aim.pytorch import track_params_dists

In [4]:
!wget https://raw.githubusercontent.com/npradaschnor/Pima-Indians-Diabetes-Dataset/master/diabetes.csv

--2026-01-23 04:01:10--  https://raw.githubusercontent.com/npradaschnor/Pima-Indians-Diabetes-Dataset/master/diabetes.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.108.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 23105 (23K) [text/plain]
Saving to: ‚Äòdiabetes.csv.6‚Äô


2026-01-23 04:01:10 (152 MB/s) - ‚Äòdiabetes.csv.6‚Äô saved [23105/23105]



In [5]:

url = "https://raw.githubusercontent.com/npradaschnor/Pima-Indians-Diabetes-Dataset/master/diabetes.csv"

# Read the data
df = pd.read_csv(url)
df.head()


@dataclass
class DiabeticDataset(Dataset):
    X:torch.Tensor
    y:torch.Tensor

    def __len__(self):
        return len(self.y)

    def __getitem__(self,idx):
        return self.X[idx],self.y[idx]

# normalization
class Normalization_dataset(Dataset):
    def __init__(self, base_dataset, mean, std):
        self.base_dataset = base_dataset
        self.mean = mean
        self.std = std

        # üî• preserve indices if base_dataset is a Subset
        if hasattr(base_dataset, "indices"):
            self.indices = base_dataset.indices

    def __len__(self):
        return len(self.base_dataset)

    def __getitem__(self, idx):
        X, y = self.base_dataset[idx]
        X = (X - self.mean) / (self.std + 1e-8)
        return X, y

class DiabeticDataModule(pl.LightningDataModule):
    def __init__(
        self,
        df,
        batch_size=16,
        train_ratio=0.7,
        val_ratio=0.15,
        seed=40
    ):
        super().__init__()
        self.df = df
        self.batch_size = batch_size
        self.train_ratio = train_ratio
        self.val_ratio = val_ratio
        self.seed = seed

    def setup(self, stage=None):
        X = self.df.drop(columns="Outcome", axis=1).values
        y = self.df["Outcome"].values

        # convert into tensors
        X = torch.tensor(X, dtype=torch.float32)
        y = torch.tensor(y, dtype=torch.long)

        full_dataset = DiabeticDataset(X, y)

        n_total = len(full_dataset)
        n_train = int(self.train_ratio * n_total)
        n_val   = int(self.val_ratio * n_total)
        n_test  = n_total - n_train - n_val

        generator = torch.Generator().manual_seed(self.seed)

        self.train_ds, self.val_ds, self.test_ds = random_split(
            full_dataset,
            [n_train, n_val, n_test],
            generator=generator
        )

    def train_dataloader(self):
        return DataLoader(
            self.train_ds,
            batch_size=self.batch_size,
            shuffle=True,
            pin_memory=True,
            drop_last=False
        )

    def val_dataloader(self):
        return DataLoader(
            self.val_ds,
            batch_size=self.batch_size,
            shuffle=False,
            pin_memory=True,
            drop_last=False
        )

    def test_dataloader(self):
        return DataLoader(
            self.test_ds,
            batch_size=self.batch_size,
            shuffle=False,
            pin_memory=True,
            drop_last=False
        )

    # ---------- Normalization (fit on train only) ----------
    def normalize_datasets(self):
        X_all = []

        for X, y in self.train_dataloader():
            X_all.append(X.cpu())

        X_all = torch.cat(X_all, dim=0)

        mean = X_all.mean(dim=0)
        std  = X_all.std(dim=0)

        # Wrap datasets
        self.train_ds = Normalization_dataset(self.train_ds, mean, std)
        self.val_ds   = Normalization_dataset(self.val_ds,   mean, std)
        self.test_ds  = Normalization_dataset(self.test_ds,  mean, std)

        return mean, std




In [3]:
dm = DiabeticDataModule(df=df, seed=36)
dm.setup()

mean, std = dm.normalize_datasets()

train_loader = dm.train_dataloader()
test_loader  = dm.test_dataloader()
valid_loader=dm.val_dataloader()

print("Mean:", mean)
print("Std:", std)




Mean: tensor([  3.8827, 120.9534,  69.1899,  19.9590,  77.6369,  31.8946,   0.4689,
         33.2737])
Std: tensor([  3.3960,  32.1912,  19.6791,  16.0622, 112.1099,   7.9516,   0.3178,
         11.5753])


In [4]:
# verify the dataset
type(dm.train_ds[0][1])
X,y=next(iter(train_loader))
X.shape,y.shape

(torch.Size([16, 8]), torch.Size([16]))

In [5]:
# Collect train data
X_train_list = []
y_train_list = []

for x, y in train_loader.dataset:
    X_train_list.append(x)
    y_train_list.append(y)

X_train = torch.stack(X_train_list, dim=0)  # (N_train, num_features)
y_train = torch.tensor(y_train_list)         # (N_train,)

# Collect test data
X_test_list = []
y_test_list = []

for x, y in test_loader.dataset:
    X_test_list.append(x)
    y_test_list.append(y)

X_test = torch.stack(X_test_list, dim=0)   # (N_test, num_features)
y_test = torch.tensor(y_test_list)    

from pathlib import Path
save_dir=Path.cwd().parent/'data'/'splits'
save_dir.mkdir(parents=True,exist_ok=True)

# File path
save_path = save_dir / "diabetes_normalized.pt"

torch.save({
    "X_train": X_train,
    "y_train": y_train,
    "X_test": X_test,
    "y_test": y_test
}, save_path)

In [9]:
# save split data inot csv and store in dvc
train_indices=dm.train_ds.indices
test_indices=dm.test_ds.indices
import pathlib
from pathlib import Path
data_dir=Path.cwd().parent/'data'
# Create 'splits' folder inside 'data' directory
splits_dir = data_dir / 'splits'

splits_dir.mkdir(parents=True, exist_ok=True)

df.iloc[train_indices].to_csv(splits_dir / 'train.csv', index=False)
df.iloc[test_indices].to_csv(splits_dir / 'test.csv', index=False)

# basic algo: logistic_regression

In [6]:
# basic algo: logistic Algorithm

import torch
import torch.nn as nn

class LogisticRgressionModel(nn.Module):
    def __init__(self, featur_dim):
        super().__init__()
        self.linear=nn.Linear(featur_dim,1)   # single output either 0 or 1
    
    def forward(self,x):
        return self.linear(x)

# setup model , loss and optimizer
device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')

featur_dim=8

model=LogisticRgressionModel(featur_dim=featur_dim)

lr=0.001

optimizer=torch.optim.Adam(model.parameters(),lr=lr)

criterion=nn.BCEWithLogitsLoss()


def train_one_epoch(model, loader, optimizer, criterion, device):
    model.train()
    total_loss = 0
    correct=0
    total=0

    for X, y in loader:
        X = X.to(device)
        y = y.float().unsqueeze(1).to(device)  # (batch, 1)

        optimizer.zero_grad()
        logits = model(X)
        loss = criterion(logits, y)

        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        # accuracy
        probs=torch.sigmoid(logits)
        predicts=(probs>0.5).long()
        correct += (predicts == y.long()).sum().item()
        total += y.size(0)


    return total_loss / len(loader),correct/total

def evaluate(model,loader,device):
    model.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for X, y in loader:
            X = X.to(device)
            y = y.to(device)

            logits = model(X)
            probs = torch.sigmoid(logits)
            preds = (probs > 0.5).long().squeeze(1)

            correct += (preds == y).sum().item()
            total += y.size(0)

    return correct / total


In [7]:
import dagshub
dagshub.init(repo_owner='manikantmnnit', repo_name='diabetes_project', mlflow=True)


mlflow.set_tracking_uri('https://dagshub.com/manikantmnnit/diabetes_project.mlflow')

num_epochs = 50
mlflow.set_experiment("diabetes_logistic_regression")
with mlflow.start_run(run_name='log_reg_baseline'):
    mlflow.log_param("model", "logistic_regression")
    mlflow.log_param("optimizer", "Adam")
    mlflow.log_param("learning_rate", lr)
    mlflow.log_param('Batch_size',num_epochs)

    for epoch in range(num_epochs):
        train_loss,train_acc = train_one_epoch(model, train_loader, optimizer, criterion, device)
        test_acc   = evaluate(model, test_loader, device)

        # ---- Log metrics per epoch ----
        mlflow.log_metric("train_log_loss", train_loss, step=epoch)
        mlflow.log_metric("train_accuracy", train_acc, step=epoch)
        mlflow.log_metric("test_accuracy", test_acc, step=epoch)

        if (epoch + 1) % 5 == 0:
            print(
                f"Epoch [{epoch+1}/{num_epochs}] | "
                f"Loss: {train_loss:.4f} | "
                f"Train Acc: {train_acc:.4f} | "
                f"Test Acc: {test_acc:.4f}"
            )
    
    # log model
    mlflow.pytorch.log_model(model,artifact_path='model')



Epoch [5/50] | Loss: 0.6972 | Train Acc: 0.5587 | Test Acc: 0.5259
Epoch [10/50] | Loss: 0.6339 | Train Acc: 0.6760 | Test Acc: 0.6552
Epoch [15/50] | Loss: 0.5906 | Train Acc: 0.7300 | Test Acc: 0.7069
Epoch [20/50] | Loss: 0.5659 | Train Acc: 0.7486 | Test Acc: 0.7069
Epoch [25/50] | Loss: 0.5422 | Train Acc: 0.7449 | Test Acc: 0.7241
Epoch [30/50] | Loss: 0.5240 | Train Acc: 0.7467 | Test Acc: 0.7414
Epoch [35/50] | Loss: 0.5127 | Train Acc: 0.7393 | Test Acc: 0.7586
Epoch [40/50] | Loss: 0.5072 | Train Acc: 0.7561 | Test Acc: 0.7672
Epoch [45/50] | Loss: 0.5015 | Train Acc: 0.7561 | Test Acc: 0.7672




Epoch [50/50] | Loss: 0.4938 | Train Acc: 0.7598 | Test Acc: 0.7672




üèÉ View run log_reg_baseline at: https://dagshub.com/manikantmnnit/diabetes_project.mlflow/#/experiments/0/runs/2d183cc7bfdc4a84b8ff0bb1ed7cde4e
üß™ View experiment at: https://dagshub.com/manikantmnnit/diabetes_project.mlflow/#/experiments/0


# Using Ray Tune for HP tunning

In [11]:
from ray import tune
import torch
import torch.nn as nn
import os
import tempfile


def train_using_tune(config,  model_cls, feature_dim,df):

    device = config["device"]
    batch_size = int(config["batch_size"])
    lr = config["lr"]

    # üîπ Build DataModule INSIDE trial
    dm = DiabeticDataModule(
        df=df,
        batch_size=batch_size
    )
    dm.setup()
    dm.normalize_datasets()

    train_loader = dm.train_dataloader()
    valid_loader = dm.val_dataloader()
    test_loader  = dm.test_dataloader()

    # Build model
    model = model_cls(feature_dim).to(device)

    # üîπ optimizer
    if config["optimizer"] == "adam":
        optimizer = torch.optim.Adam(
            model.parameters(),
            lr=config["lr"],
            weight_decay=config["weight_decay"]
        )
    else:
        optimizer = torch.optim.SGD(
            model.parameters(),
            lr=config["lr"],
            momentum=0.9,
            weight_decay=config["weight_decay"]
        )
    criterion = nn.BCEWithLogitsLoss()

    # ----- Restore checkpoint if exists -----
    checkpoint = tune.get_checkpoint()
    if checkpoint:
        with checkpoint.as_directory() as ckpt_dir:
            model_state, optimizer_state = torch.load(
                os.path.join(ckpt_dir, "checkpoint.pt"),
                map_location=device
            )
            model.load_state_dict(model_state)
            optimizer.load_state_dict(optimizer_state)

    # ----- Training loop -----
    max_epochs = config["max_num_epochs"]
    for epoch in range(max_epochs):

        # ===== Train =====
        model.train()
        total_train_loss = 0.0
        correct_train = 0
        total_train = 0

        for X, y in train_loader:
            X = X.to(device)
            y = y.float().unsqueeze(1).to(device)

            optimizer.zero_grad()
            logits = model(X) # forward
            loss = criterion(logits, y)

            loss.backward()  # back propagation
            optimizer.step()

            total_train_loss += loss.item()

            probs = torch.sigmoid(logits)
            preds = (probs > 0.5).long()
            correct_train += (preds == y.long()).sum().item()
            total_train += y.size(0)

        train_loss = total_train_loss / len(train_loader)
        train_acc = correct_train / total_train

        # ===== Validation =====
        model.eval()
        total_val_loss = 0.0
        correct_val = 0
        total_val = 0

        with torch.no_grad():
            for X, y in valid_loader:
                X = X.to(device)
                y = y.float().unsqueeze(1).to(device)

                logits = model(X)
                loss = criterion(logits, y)

                total_val_loss += loss.item()

                probs = torch.sigmoid(logits)
                preds = (probs > 0.5).long()
                correct_val += (preds == y.long()).sum().item()
                total_val += y.size(0)

        val_loss = total_val_loss / len(valid_loader)
        val_acc = correct_val / total_val

        # ===== Save checkpoint + report =====
        with tempfile.TemporaryDirectory() as temp_checkpoint_dir:
            ckpt_path = os.path.join(temp_checkpoint_dir, "checkpoint.pt")
            torch.save((model.state_dict(), optimizer.state_dict()), ckpt_path)

            checkpoint = tune.Checkpoint.from_directory(temp_checkpoint_dir)

            tune.report(
                {
                    "train_loss": train_loss,
                    "train_accuracy": train_acc,
                    "val_loss": val_loss,
                    "val_accuracy": val_acc,
                },
                checkpoint=checkpoint
            )


# obtain best accurac based on best model uisng checkpoint
def test_best_model(best_result, model_cls, feature_dim, df):

    device = best_result.config["device"]

    # üîπ Rebuild DataModule
    dm = DiabeticDataModule(df=df, batch_size=best_result.config["batch_size"])
    dm.setup()
    dm.normalize_datasets()

    test_loader = dm.test_dataloader()

    # üîπ Build model
    best_trained_model = model_cls(feature_dim).to(device)

    # ----- Load best checkpoint -----
    checkpoint = best_result.checkpoint
    with checkpoint.as_directory() as ckpt_dir:
        checkpoint_path = os.path.join(ckpt_dir, "checkpoint.pt")
        model_state, _ = torch.load(checkpoint_path, map_location=device)

    best_trained_model.load_state_dict(model_state)
    best_trained_model.eval()

    # ----- Test loop -----
    correct_test = 0
    total_test = 0

    with torch.no_grad():
        for X, y in test_loader:
            X = X.to(device)
            y = y.float().unsqueeze(1).to(device)

            logits = best_trained_model(X)
            probs = torch.sigmoid(logits)
            preds = (probs > 0.5).long()

            correct_test += (preds == y.long()).sum().item()
            total_test += y.size(0)

    print(f"‚úÖ Best trial test accuracy: {correct_test / total_test:.4f}")


# train the model, find best performing one and load the trained netqork from checkpoint file

def main(config,  model_cls,df, feature_dim, gpus_per_trial=1):

    
    scheduler = ASHAScheduler(
        time_attr="training_iteration",
        max_t=config["max_num_epochs"],
        grace_period=1,
        reduction_factor=2
    )

    tuner = tune.Tuner(
        tune.with_resources(
            tune.with_parameters(
                train_using_tune,
                model_cls=model_cls,
                feature_dim=feature_dim,
                df=df
            ),
            resources={"cpu": 4, "gpu": gpus_per_trial}
        ),
        tune_config=tune.TuneConfig(
            metric="val_loss",
            mode="min",
            scheduler=scheduler,
            num_samples=config["num_trials"],
        ),
        param_space=config,
    )

    results = tuner.fit()

    best_result = results.get_best_result("val_loss", "min")

    print(f"üèÜ Best trial config: {best_result.config}")
    print(f"üèÜ Best trial final validation loss: {best_result.metrics['val_loss']}")
    print(f"üèÜ Best trial final validation accuracy: {best_result.metrics['val_accuracy']}")

    test_best_model(best_result, model_cls, feature_dim, df)





In [12]:
config =  {
    "lr": tune.loguniform(1e-4, 1e-1),
    "batch_size": tune.choice([16, 32, 64]),
    "optimizer": tune.choice(["adam", "sgd"]),
    "weight_decay": tune.loguniform(1e-6, 1e-2),
    "max_num_epochs": 30,
    "device": "cuda" if torch.cuda.is_available() else "cpu",
    "num_trials": 15,
}

main(
    config=config,
    df=df,
    
    model_cls=LogisticRgressionModel,
    feature_dim=8,
    gpus_per_trial=1 if torch.cuda.is_available() else 0
)

[36m(train_using_tune pid=36212)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/teamspace/studios/this_studio/ray_results/train_using_tune_2026-01-28_14-06-05/train_using_tune_7c987_00000_0_batch_size=32,lr=0.0148,optimizer=sgd,weight_decay=0.0086_2026-01-28_14-06-05/checkpoint_000000)
[36m(train_using_tune pid=36212)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/teamspace/studios/this_studio/ray_results/train_using_tune_2026-01-28_14-06-05/train_using_tune_7c987_00000_0_batch_size=32,lr=0.0148,optimizer=sgd,weight_decay=0.0086_2026-01-28_14-06-05/checkpoint_000001)
[36m(train_using_tune pid=36212)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/teamspace/studios/this_studio/ray_results/train_using_tune_2026-01-28_14-06-05/train_using_tune_7c987_00000_0_batch_size=32,lr=0.0148,optimizer=sgd,weight_decay=0.0086_2026-01-28_14-06-05/checkpoint_000002)
[36m(train_using_tune pid=36212)[0m Checkpoint suc

üèÜ Best trial config: {'lr': 0.014751169373953247, 'batch_size': 32, 'optimizer': 'sgd', 'weight_decay': 0.008580864328817632, 'max_num_epochs': 30, 'device': 'cuda', 'num_trials': 15}
üèÜ Best trial final validation loss: 0.5601386427879333
üèÜ Best trial final validation accuracy: 0.7217391304347827
‚úÖ Best trial test accuracy: 0.7241


[36m(train_using_tune pid=48224)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/teamspace/studios/this_studio/ray_results/train_using_tune_2026-01-28_14-06-05/train_using_tune_7c987_00014_14_batch_size=64,lr=0.0408,optimizer=adam,weight_decay=0.0000_2026-01-28_14-06-05/checkpoint_000029)


# Ray Tune + MLflowLoggerCallback

In [13]:
from ray import tune
from ray.tune.schedulers import ASHAScheduler
from ray.air.integrations.mlflow import MLflowLoggerCallback


import dagshub
dagshub.init(repo_owner='manikantmnnit', repo_name='diabetes_project', mlflow=True)

MLFLOW_TRACKING_URI='https://dagshub.com/manikantmnnit/diabetes_project.mlflow'


EXPERIMENT_NAME = "diabetes_ray_tune"
mlflow.set_experiment(EXPERIMENT_NAME)

def main(config, model_cls, df, feature_dim, gpus_per_trial=0):

    scheduler = ASHAScheduler(
        time_attr="training_iteration",
        max_t=config["max_num_epochs"],
        grace_period=1,
        reduction_factor=2
    )

    tuner = tune.Tuner(
        tune.with_resources(
            tune.with_parameters(
                train_using_tune,
                model_cls=model_cls,
                feature_dim=feature_dim,
                df=df
            ),
            resources={"cpu": 4, "gpu": gpus_per_trial}
        ),
        tune_config=tune.TuneConfig(
            metric="val_loss",
            mode="min",
            scheduler=scheduler,
            num_samples=config["num_trials"],
        ),
        run_config=tune.RunConfig(
            name="ray_tune_diabetes",
            callbacks=[
                MLflowLoggerCallback(
                    tracking_uri=MLFLOW_TRACKING_URI,
                    experiment_name=EXPERIMENT_NAME,
                    save_artifact=True
                )
            ],
        ),
        param_space=config,
    )

    results = tuner.fit()

    best_result = results.get_best_result("val_loss", "min")

    print("üèÜ Best config:", best_result.config)
    print("üèÜ Best val_loss:", best_result.metrics["val_loss"])
    print("üèÜ Best val_accuracy:", best_result.metrics["val_accuracy"])

    test_best_model(best_result, model_cls, feature_dim, df)


In [14]:
config = {
    "lr": tune.loguniform(1e-4, 1e-1),
    "batch_size": tune.choice([16, 32, 64]),
    "optimizer": tune.choice(["adam", "sgd"]),
    "weight_decay": tune.loguniform(1e-6, 1e-2),
    "max_num_epochs": 30,
    "num_trials": 10,
    "device": "cuda" if torch.cuda.is_available() else "cpu",
}

main(
    config=config,
    model_cls=LogisticRgressionModel,
    df=df,
    feature_dim=8,
    gpus_per_trial=1 if torch.cuda.is_available() else 0
)



[36m(train_using_tune pid=52287)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/teamspace/studios/this_studio/ray_results/ray_tune_diabetes/train_using_tune_0cde6_00000_0_batch_size=16,lr=0.0012,optimizer=adam,weight_decay=0.0000_2026-01-28_14-10-07/checkpoint_000000)
[36m(train_using_tune pid=52287)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/teamspace/studios/this_studio/ray_results/ray_tune_diabetes/train_using_tune_0cde6_00000_0_batch_size=16,lr=0.0012,optimizer=adam,weight_decay=0.0000_2026-01-28_14-10-07/checkpoint_000001)
[36m(train_using_tune pid=52287)[0m [2026-01-28 14:10:43,795 E 52287 52331] core_worker_process.cc:842: Failed to establish connection to the metrics exporter agent. Metrics will not be exported. Exporter agent status: RpcError: Running out of retries to initialize the metrics agent. rpc_code: 14
[36m(train_using_tune pid=52287)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, p

üèÉ View run train_using_tune_0cde6_00000 at: https://dagshub.com/manikantmnnit/diabetes_project.mlflow/#/experiments/2/runs/fb473f7a821546afbc214a06b0a14f26
üß™ View experiment at: https://dagshub.com/manikantmnnit/diabetes_project.mlflow/#/experiments/2


[36m(train_using_tune pid=78885)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/teamspace/studios/this_studio/ray_results/ray_tune_diabetes/train_using_tune_0cde6_00001_1_batch_size=64,lr=0.0062,optimizer=adam,weight_decay=0.0001_2026-01-28_14-10-07/checkpoint_000000)
[36m(train_using_tune pid=78885)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/teamspace/studios/this_studio/ray_results/ray_tune_diabetes/train_using_tune_0cde6_00001_1_batch_size=64,lr=0.0062,optimizer=adam,weight_decay=0.0001_2026-01-28_14-10-07/checkpoint_000001)
[36m(train_using_tune pid=78885)[0m [2026-01-28 14:20:34,112 E 78885 78920] core_worker_process.cc:842: Failed to establish connection to the metrics exporter agent. Metrics will not be exported. Exporter agent status: RpcError: Running out of retries to initialize the metrics agent. rpc_code: 14
[36m(train_using_tune pid=78885)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, p

üèÉ View run train_using_tune_0cde6_00001 at: https://dagshub.com/manikantmnnit/diabetes_project.mlflow/#/experiments/2/runs/4d8e0125cff040fabc2446e7e5fdfb39
üß™ View experiment at: https://dagshub.com/manikantmnnit/diabetes_project.mlflow/#/experiments/2


[36m(train_using_tune pid=115401)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/teamspace/studios/this_studio/ray_results/ray_tune_diabetes/train_using_tune_0cde6_00002_2_batch_size=32,lr=0.0611,optimizer=adam,weight_decay=0.0000_2026-01-28_14-10-07/checkpoint_000000)
[36m(train_using_tune pid=115401)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/teamspace/studios/this_studio/ray_results/ray_tune_diabetes/train_using_tune_0cde6_00002_2_batch_size=32,lr=0.0611,optimizer=adam,weight_decay=0.0000_2026-01-28_14-10-07/checkpoint_000001)
[36m(train_using_tune pid=115401)[0m [2026-01-28 14:30:24,163 E 115401 115430] core_worker_process.cc:842: Failed to establish connection to the metrics exporter agent. Metrics will not be exported. Exporter agent status: RpcError: Running out of retries to initialize the metrics agent. rpc_code: 14
[36m(train_using_tune pid=115401)[0m Checkpoint successfully created at: Checkpoint(filesystem=lo

üèÉ View run train_using_tune_0cde6_00002 at: https://dagshub.com/manikantmnnit/diabetes_project.mlflow/#/experiments/2/runs/bf3a82fa61e54d9989f61d789cf59dfc
üß™ View experiment at: https://dagshub.com/manikantmnnit/diabetes_project.mlflow/#/experiments/2


[36m(train_using_tune pid=125514)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/teamspace/studios/this_studio/ray_results/ray_tune_diabetes/train_using_tune_0cde6_00003_3_batch_size=64,lr=0.0154,optimizer=sgd,weight_decay=0.0001_2026-01-28_14-10-07/checkpoint_000000)
[36m(train_using_tune pid=125514)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/teamspace/studios/this_studio/ray_results/ray_tune_diabetes/train_using_tune_0cde6_00003_3_batch_size=64,lr=0.0154,optimizer=sgd,weight_decay=0.0001_2026-01-28_14-10-07/checkpoint_000001)
[36m(train_using_tune pid=125514)[0m [2026-01-28 14:40:14,169 E 125514 125901] core_worker_process.cc:842: Failed to establish connection to the metrics exporter agent. Metrics will not be exported. Exporter agent status: RpcError: Running out of retries to initialize the metrics agent. rpc_code: 14
[36m(train_using_tune pid=125514)[0m Checkpoint successfully created at: Checkpoint(filesystem=loca

üèÉ View run train_using_tune_0cde6_00003 at: https://dagshub.com/manikantmnnit/diabetes_project.mlflow/#/experiments/2/runs/8d44a62fd5bb4061ab2e4c1f8b0e3d8d
üß™ View experiment at: https://dagshub.com/manikantmnnit/diabetes_project.mlflow/#/experiments/2


[36m(train_using_tune pid=143391)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/teamspace/studios/this_studio/ray_results/ray_tune_diabetes/train_using_tune_0cde6_00004_4_batch_size=16,lr=0.0007,optimizer=adam,weight_decay=0.0001_2026-01-28_14-10-07/checkpoint_000000)
[36m(train_using_tune pid=143391)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/teamspace/studios/this_studio/ray_results/ray_tune_diabetes/train_using_tune_0cde6_00004_4_batch_size=16,lr=0.0007,optimizer=adam,weight_decay=0.0001_2026-01-28_14-10-07/checkpoint_000001)
[36m(train_using_tune pid=143391)[0m [2026-01-28 14:50:04,298 E 143391 143731] core_worker_process.cc:842: Failed to establish connection to the metrics exporter agent. Metrics will not be exported. Exporter agent status: RpcError: Running out of retries to initialize the metrics agent. rpc_code: 14
[36m(train_using_tune pid=143391)[0m Checkpoint successfully created at: Checkpoint(filesystem=lo

üèÉ View run train_using_tune_0cde6_00004 at: https://dagshub.com/manikantmnnit/diabetes_project.mlflow/#/experiments/2/runs/7008fbdc25ba475e9af71207179b21cc
üß™ View experiment at: https://dagshub.com/manikantmnnit/diabetes_project.mlflow/#/experiments/2


[36m(train_using_tune pid=147145)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/teamspace/studios/this_studio/ray_results/ray_tune_diabetes/train_using_tune_0cde6_00005_5_batch_size=64,lr=0.0105,optimizer=adam,weight_decay=0.0000_2026-01-28_14-10-07/checkpoint_000000)
[36m(train_using_tune pid=147145)[0m [2026-01-28 14:51:40,295 E 147145 147168] core_worker_process.cc:842: Failed to establish connection to the metrics exporter agent. Metrics will not be exported. Exporter agent status: RpcError: Running out of retries to initialize the metrics agent. rpc_code: 14


üèÉ View run train_using_tune_0cde6_00005 at: https://dagshub.com/manikantmnnit/diabetes_project.mlflow/#/experiments/2/runs/c628b8d3bb194a8aae74c3466015713a
üß™ View experiment at: https://dagshub.com/manikantmnnit/diabetes_project.mlflow/#/experiments/2


[36m(train_using_tune pid=148660)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/teamspace/studios/this_studio/ray_results/ray_tune_diabetes/train_using_tune_0cde6_00006_6_batch_size=16,lr=0.0074,optimizer=adam,weight_decay=0.0000_2026-01-28_14-10-07/checkpoint_000000)
[36m(train_using_tune pid=148660)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/teamspace/studios/this_studio/ray_results/ray_tune_diabetes/train_using_tune_0cde6_00006_6_batch_size=16,lr=0.0074,optimizer=adam,weight_decay=0.0000_2026-01-28_14-10-07/checkpoint_000001)
[36m(train_using_tune pid=148660)[0m [2026-01-28 14:52:19,433 E 148660 148945] core_worker_process.cc:842: Failed to establish connection to the metrics exporter agent. Metrics will not be exported. Exporter agent status: RpcError: Running out of retries to initialize the metrics agent. rpc_code: 14
[36m(train_using_tune pid=148660)[0m Checkpoint successfully created at: Checkpoint(filesystem=lo

üèÉ View run train_using_tune_0cde6_00006 at: https://dagshub.com/manikantmnnit/diabetes_project.mlflow/#/experiments/2/runs/e306ad32723b432ab3e27ee74ea7a23c
üß™ View experiment at: https://dagshub.com/manikantmnnit/diabetes_project.mlflow/#/experiments/2


[36m(train_using_tune pid=159545)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/teamspace/studios/this_studio/ray_results/ray_tune_diabetes/train_using_tune_0cde6_00007_7_batch_size=32,lr=0.0002,optimizer=adam,weight_decay=0.0001_2026-01-28_14-10-07/checkpoint_000000)
[36m(train_using_tune pid=159545)[0m [2026-01-28 14:55:10,792 E 159545 159577] core_worker_process.cc:842: Failed to establish connection to the metrics exporter agent. Metrics will not be exported. Exporter agent status: RpcError: Running out of retries to initialize the metrics agent. rpc_code: 14


üèÉ View run train_using_tune_0cde6_00007 at: https://dagshub.com/manikantmnnit/diabetes_project.mlflow/#/experiments/2/runs/9e98675923fc425792d0c24dfc1d46ac
üß™ View experiment at: https://dagshub.com/manikantmnnit/diabetes_project.mlflow/#/experiments/2


[36m(train_using_tune pid=162052)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/teamspace/studios/this_studio/ray_results/ray_tune_diabetes/train_using_tune_0cde6_00008_8_batch_size=16,lr=0.0004,optimizer=sgd,weight_decay=0.0001_2026-01-28_14-10-07/checkpoint_000000)
[36m(train_using_tune pid=162052)[0m [2026-01-28 14:55:50,116 E 162052 162176] core_worker_process.cc:842: Failed to establish connection to the metrics exporter agent. Metrics will not be exported. Exporter agent status: RpcError: Running out of retries to initialize the metrics agent. rpc_code: 14


üèÉ View run train_using_tune_0cde6_00008 at: https://dagshub.com/manikantmnnit/diabetes_project.mlflow/#/experiments/2/runs/b6507f105c6445fc996cbb546ea0d271
üß™ View experiment at: https://dagshub.com/manikantmnnit/diabetes_project.mlflow/#/experiments/2


[36m(train_using_tune pid=164396)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/teamspace/studios/this_studio/ray_results/ray_tune_diabetes/train_using_tune_0cde6_00009_9_batch_size=64,lr=0.0004,optimizer=sgd,weight_decay=0.0000_2026-01-28_14-10-07/checkpoint_000000)
[36m(train_using_tune pid=164396)[0m [2026-01-28 14:56:29,384 E 164396 164776] core_worker_process.cc:842: Failed to establish connection to the metrics exporter agent. Metrics will not be exported. Exporter agent status: RpcError: Running out of retries to initialize the metrics agent. rpc_code: 14


üèÉ View run train_using_tune_0cde6_00009 at: https://dagshub.com/manikantmnnit/diabetes_project.mlflow/#/experiments/2/runs/cc3c23f32fea49869ef4b51048e099a7
üß™ View experiment at: https://dagshub.com/manikantmnnit/diabetes_project.mlflow/#/experiments/2


2026-01-28 14:56:31,582	INFO tune.py:1009 -- Wrote the latest version of all result files and experiment state to '/teamspace/studios/this_studio/ray_results/ray_tune_diabetes' in 0.0130s.
2026-01-28 14:56:31,594	INFO tune.py:1041 -- Total run time: 2783.92 seconds (2783.77 seconds for the tuning loop).


üèÜ Best config: {'lr': 0.015413931192308999, 'batch_size': 64, 'optimizer': 'sgd', 'weight_decay': 0.00013154690619449782, 'max_num_epochs': 30, 'num_trials': 10, 'device': 'cuda'}
üèÜ Best val_loss: 0.5458181798458099
üèÜ Best val_accuracy: 0.7217391304347827
‚úÖ Best trial test accuracy: 0.7155


# ray tune + AIM

In [None]:
# -----------------------------
# 1Ô∏è‚É£ Imports
# -----------------------------
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, random_split
import pandas as pd
import tempfile
import os

from dataclasses import dataclass

from ray import tune
from ray.tune.schedulers import ASHAScheduler
from ray.tune import CLIReporter
from ray.tune import Checkpoint

from aim import Run
from aim.pytorch import track_params_dists

# -----------------------------
# 2Ô∏è‚É£ Dataset & DataModule
# -----------------------------
url = "https://raw.githubusercontent.com/npradaschnor/Pima-Indians-Diabetes-Dataset/master/diabetes.csv"
df = pd.read_csv(url)

@dataclass
class DiabeticDataset(Dataset):
    X: torch.Tensor
    y: torch.Tensor

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

class Normalization_dataset(Dataset):
    def __init__(self, base_dataset, mean, std):
        self.base_dataset = base_dataset
        self.mean = mean
        self.std = std

    def __len__(self):
        return len(self.base_dataset)

    def __getitem__(self, idx):
        X, y = self.base_dataset[idx]
        X = (X - self.mean) / (self.std + 1e-8)
        return X, y

class DiabeticDataModule:
    """Minimal Lightning-style datamodule"""
    def __init__(self, df, batch_size=32, train_ratio=0.7, val_ratio=0.15, seed=42):
        self.df = df
        self.batch_size = batch_size
        self.train_ratio = train_ratio
        self.val_ratio = val_ratio
        self.seed = seed

    def setup(self):
        X = torch.tensor(self.df.drop(columns="Outcome").values, dtype=torch.float32)
        y = torch.tensor(self.df["Outcome"].values, dtype=torch.long)
        dataset = DiabeticDataset(X, y)

        n_total = len(dataset)
        n_train = int(self.train_ratio * n_total)
        n_val   = int(self.val_ratio * n_total)
        n_test  = n_total - n_train - n_val

        generator = torch.Generator().manual_seed(self.seed)
        self.train_ds, self.val_ds, self.test_ds = random_split(
            dataset, [n_train, n_val, n_test], generator=generator
        )

    def normalize_datasets(self):
        X_all = torch.cat([X for X, _ in DataLoader(self.train_ds, batch_size=self.batch_size)], dim=0)
        mean = X_all.mean(dim=0)
        std  = X_all.std(dim=0)

        self.train_ds = Normalization_dataset(self.train_ds, mean, std)
        self.val_ds   = Normalization_dataset(self.val_ds, mean, std)
        self.test_ds  = Normalization_dataset(self.test_ds, mean, std)

    def train_dataloader(self):
        return DataLoader(self.train_ds, batch_size=self.batch_size, shuffle=True)

    def val_dataloader(self):
        return DataLoader(self.val_ds, batch_size=self.batch_size, shuffle=False)

    def test_dataloader(self):
        return DataLoader(self.test_ds, batch_size=self.batch_size, shuffle=False)

# -----------------------------
# 3Ô∏è‚É£ Model
# -----------------------------
class LogisticRgressionModel(nn.Module):
    def __init__(self, feature_dim):
        super().__init__()
        self.linear = nn.Linear(feature_dim, 1)

    def forward(self, x):
        return self.linear(x)

# -----------------------------
# 4Ô∏è‚É£ Ray Tune training function with Aim logging
# -----------------------------
def train_using_tune(config, model_cls, feature_dim, df):
    # 1Ô∏è‚É£ Aim run per trial
    run = Run(experiment="diabetes_ray_tune")
    run["lr"] = config["lr"]
    run["batch_size"] = int(config["batch_size"])
    run["optimizer"] = config["optimizer"]
    run["weight_decay"] = config["weight_decay"]
    run["max_num_epochs"] = config["max_num_epochs"]

    device = config["device"]
    batch_size = int(config["batch_size"])
    lr = config["lr"]

    # 2Ô∏è‚É£ DataModule
    dm = DiabeticDataModule(df=df, batch_size=batch_size)
    dm.setup()
    dm.normalize_datasets()

    train_loader = dm.train_dataloader()
    valid_loader = dm.val_dataloader()

    # 3Ô∏è‚É£ Model
    model = model_cls(feature_dim).to(device)

    # 4Ô∏è‚É£ Optimizer
    if config["optimizer"] == "adam":
        optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=config["weight_decay"])
    else:
        optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.9, weight_decay=config["weight_decay"])

    criterion = nn.BCEWithLogitsLoss()

    # 5Ô∏è‚É£ Restore checkpoint if exists
    checkpoint = tune.get_checkpoint()
    if checkpoint:
        with checkpoint.as_directory() as ckpt_dir:
            model_state, optimizer_state = torch.load(os.path.join(ckpt_dir, "checkpoint.pt"), map_location=device)
            model.load_state_dict(model_state)
            optimizer.load_state_dict(optimizer_state)

    # 6Ô∏è‚É£ Training loop
    for epoch in range(config["max_num_epochs"]):
        model.train()
        total_train_loss, correct_train, total_train = 0, 0, 0

        for X, y in train_loader:
            X, y = X.to(device), y.float().unsqueeze(1).to(device)
            optimizer.zero_grad()
            logits = model(X)
            loss = criterion(logits, y)
            loss.backward()
            optimizer.step()

            total_train_loss += loss.item()
            preds = (torch.sigmoid(logits) > 0.5).long()
            correct_train += (preds == y.long()).sum().item()
            total_train += y.size(0)

        train_loss = total_train_loss / len(train_loader)
        train_acc  = correct_train / total_train

        # Validation
        model.eval()
        total_val_loss, correct_val, total_val = 0, 0, 0
        with torch.no_grad():
            for X, y in valid_loader:
                X, y = X.to(device), y.float().unsqueeze(1).to(device)
                logits = model(X)
                loss = criterion(logits, y)
                total_val_loss += loss.item()
                preds = (torch.sigmoid(logits) > 0.5).long()
                correct_val += (preds == y.long()).sum().item()
                total_val += y.size(0)

        val_loss = total_val_loss / len(valid_loader)
        val_acc  = correct_val / total_val

        # Aim logging
        run.track(train_loss, name="train_loss", step=epoch)
        run.track(train_acc,  name="train_accuracy", step=epoch)
        run.track(val_loss,   name="val_loss", step=epoch)
        run.track(val_acc,    name="val_accuracy", step=epoch)
        track_params_dists(model, run=run)

        # Ray Tune checkpoint
        with tempfile.TemporaryDirectory() as temp_ckpt_dir:
            ckpt_path = os.path.join(temp_ckpt_dir, "checkpoint.pt")
            torch.save((model.state_dict(), optimizer.state_dict()), ckpt_path)
            checkpoint = tune.Checkpoint.from_directory(temp_ckpt_dir)
            tune.report(
                        {
                            "train_loss": train_loss,
                            "train_accuracy": train_acc,
                            "val_loss": val_loss,
                            "val_accuracy": val_acc
                        },
                        checkpoint=checkpoint
                    )


    run.close()


# -----------------------------
# 5Ô∏è‚É£ Set config & run tuner
# -----------------------------
config = {
    "lr": tune.loguniform(1e-4, 1e-1),
    "batch_size": tune.choice([16, 32, 64]),
    "optimizer": tune.choice(["adam", "sgd"]),
    "weight_decay": tune.loguniform(1e-5, 1e-2),
    "max_num_epochs": 30,
    "num_trials": 5,
    "device": "cuda" if torch.cuda.is_available() else "cpu"
}

# Assign model & input dimension
model_cls = LogisticRgressionModel
feature_dim = 8

# Determine GPUs for Ray
gpus_per_trial = 1 if torch.cuda.is_available() else 0

# Scheduler
scheduler = ASHAScheduler(
    time_attr="training_iteration",
    max_t=config["max_num_epochs"],
    grace_period=1,
    reduction_factor=2
)

# CLIReporter for console
reporter = CLIReporter(metric_columns=["train_loss","val_loss","train_accuracy","val_accuracy"])

# Run the tuner
tuner = tune.Tuner(
    tune.with_resources(
        tune.with_parameters(train_using_tune,
                             model_cls=model_cls,
                             feature_dim=feature_dim,
                             df=df),
        resources={"cpu": 4, "gpu": gpus_per_trial}
    ),
    tune_config=tune.TuneConfig(
        metric="val_loss",
        mode="min",
        scheduler=scheduler,
        num_samples=config["num_trials"],
    ),
    run_config=tune.RunConfig(
        name="ray_tune_diabetes",
    ),
    param_space=config,
)
results = tuner.fit()


0,1
Current time:,2026-01-28 15:04:58
Running for:,00:01:22.09
Memory:,5.8/15.4 GiB

Trial name,status,loc,batch_size,lr,optimizer,weight_decay,iter,total time (s),train_loss,train_accuracy,val_loss
train_using_tune_854db_00000,TERMINATED,10.192.11.12:179598,32,0.00893382,sgd,2.20597e-05,30,5.47891,0.471682,0.776536,0.485459
train_using_tune_854db_00001,TERMINATED,10.192.11.12:180890,32,0.00132497,sgd,0.00616456,1,3.73397,0.765672,0.387337,0.763798
train_using_tune_854db_00002,TERMINATED,10.192.11.12:182159,64,0.000265434,adam,0.00290359,1,3.34358,0.792582,0.383613,0.800777
train_using_tune_854db_00003,TERMINATED,10.192.11.12:183042,64,0.0171474,adam,0.0015771,4,3.32032,0.49755,0.763501,0.540856
train_using_tune_854db_00004,TERMINATED,10.192.11.12:183216,16,0.00259731,adam,3.9543e-05,2,3.32809,0.676661,0.579143,0.648518


[36m(train_using_tune pid=179598)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/teamspace/studios/this_studio/ray_results/ray_tune_diabetes/train_using_tune_854db_00000_0_batch_size=32,lr=0.0089,optimizer=sgd,weight_decay=0.0000_2026-01-28_15-03-36/checkpoint_000000)
[36m(train_using_tune pid=179598)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/teamspace/studios/this_studio/ray_results/ray_tune_diabetes/train_using_tune_854db_00000_0_batch_size=32,lr=0.0089,optimizer=sgd,weight_decay=0.0000_2026-01-28_15-03-36/checkpoint_000001)
[36m(train_using_tune pid=179598)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/teamspace/studios/this_studio/ray_results/ray_tune_diabetes/train_using_tune_854db_00000_0_batch_size=32,lr=0.0089,optimizer=sgd,weight_decay=0.0000_2026-01-28_15-03-36/checkpoint_000002)
[36m(train_using_tune pid=179598)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, pa

[36m(train_using_tune pid=183216)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/teamspace/studios/this_studio/ray_results/ray_tune_diabetes/train_using_tune_854db_00004_4_batch_size=16,lr=0.0026,optimizer=adam,weight_decay=0.0000_2026-01-28_15-03-36/checkpoint_000001)


In [16]:
best_result = tuner.get_results().get_best_result("val_loss", "min")
best_result

Result(
  metrics={'train_loss': 0.47168173509485584, 'train_accuracy': 0.776536312849162, 'val_loss': 0.4854586720466614, 'val_accuracy': 0.7391304347826086},
  path='/teamspace/studios/this_studio/ray_results/ray_tune_diabetes/train_using_tune_854db_00000_0_batch_size=32,lr=0.0089,optimizer=sgd,weight_decay=0.0000_2026-01-28_15-03-36',
  filesystem='local',
  checkpoint=Checkpoint(filesystem=local, path=/teamspace/studios/this_studio/ray_results/ray_tune_diabetes/train_using_tune_854db_00000_0_batch_size=32,lr=0.0089,optimizer=sgd,weight_decay=0.0000_2026-01-28_15-03-36/checkpoint_000029)
)

In [32]:
results

ResultGrid<[
  Result(
    metrics={'train_loss': 0.4783506831702064, 'train_accuracy': 0.776536312849162, 'val_loss': 0.5121450014412403, 'val_accuracy': 0.7478260869565218},
    path='/teamspace/studios/this_studio/ray_results/ray_tune_diabetes/train_using_tune_c5dfd_00000_0_batch_size=16,lr=0.0091,optimizer=sgd,weight_decay=0.0011_2026-01-26_16-47-59',
    filesystem='local',
    checkpoint=Checkpoint(filesystem=local, path=/teamspace/studios/this_studio/ray_results/ray_tune_diabetes/train_using_tune_c5dfd_00000_0_batch_size=16,lr=0.0091,optimizer=sgd,weight_decay=0.0011_2026-01-26_16-47-59/checkpoint_000029)
  ),
  Result(
    metrics={'train_loss': 0.6856310402645784, 'train_accuracy': 0.5716945996275605, 'val_loss': 0.7359677702188492, 'val_accuracy': 0.5565217391304348},
    path='/teamspace/studios/this_studio/ray_results/ray_tune_diabetes/train_using_tune_c5dfd_00001_1_batch_size=32,lr=0.0018,optimizer=sgd,weight_decay=0.0000_2026-01-26_16-47-59',
    filesystem='local',
    c

In [None]:
import torch
import torch.nn as nn
import tempfile
import os
from ray import tune
from ray.tune.schedulers import ASHAScheduler
from aim import Run
from aim.pytorch import track_params_dists

# ------------------ Training function for Ray Tune ------------------
def train_using_tune(config, model_cls, feature_dim, df):
    """
    Train a single Ray Tune trial and log metrics to Aim.
    """
    # ----- Setup Aim run (1 per trial) -----
    run = Run(experiment="diabetes_ray_tune")

    # Log hyperparameters
    run["lr"] = config["lr"]
    run["batch_size"] = int(config["batch_size"])
    run["optimizer"] = config["optimizer"]
    run["weight_decay"] = config["weight_decay"]
    run["max_num_epochs"] = config["max_num_epochs"]

    device = config["device"]
    batch_size = int(config["batch_size"])
    lr = config["lr"]

    # ----- Prepare DataModule -----
    dm = DiabeticDataModule(df=df, batch_size=batch_size)
    dm.setup()
    dm.normalize_datasets()

    train_loader = dm.train_dataloader()
    val_loader = dm.val_dataloader()
    test_loader = dm.test_dataloader()

    # ----- Build model -----
    model = model_cls(feature_dim).to(device)

    # ----- Optimizer -----
    if config["optimizer"].lower() == "adam":
        optimizer = torch.optim.Adam(
            model.parameters(), lr=lr, weight_decay=config["weight_decay"]
        )
    else:
        optimizer = torch.optim.SGD(
            model.parameters(), lr=lr, momentum=0.9, weight_decay=config["weight_decay"]
        )

    criterion = nn.BCEWithLogitsLoss()

    # ----- Restore checkpoint if exists -----
    checkpoint = tune.get_checkpoint()
    if checkpoint:
        with checkpoint.as_directory() as ckpt_dir:
            model_state, optimizer_state = torch.load(
                os.path.join(ckpt_dir, "checkpoint.pt"), map_location=device
            )
            model.load_state_dict(model_state)
            optimizer.load_state_dict(optimizer_state)

    # ----- Training Loop -----
    for epoch in range(config["max_num_epochs"]):
        # ------- TRAIN -------
        model.train()
        total_train_loss, correct_train, total_train = 0.0, 0, 0

        for X, y in train_loader:
            X, y = X.to(device), y.float().unsqueeze(1).to(device)

            optimizer.zero_grad()
            logits = model(X)
            loss = criterion(logits, y)
            loss.backward()
            optimizer.step()

            total_train_loss += loss.item()
            preds = (torch.sigmoid(logits) > 0.5).long()
            correct_train += (preds == y.long()).sum().item()
            total_train += y.size(0)

        train_loss = total_train_loss / len(train_loader)
        train_acc = correct_train / total_train

        # ------- VALIDATION -------
        model.eval()
        total_val_loss, correct_val, total_val = 0.0, 0, 0

        with torch.no_grad():
            for X, y in val_loader:
                X, y = X.to(device), y.float().unsqueeze(1).to(device)

                logits = model(X)
                loss = criterion(logits, y)
                total_val_loss += loss.item()

                preds = (torch.sigmoid(logits) > 0.5).long()
                correct_val += (preds == y.long()).sum().item()
                total_val += y.size(0)

        val_loss = total_val_loss / len(val_loader)
        val_acc = correct_val / total_val

        # ------- Aim logging -------
        run.track(train_loss, name="train_loss", step=epoch)
        run.track(train_acc, name="train_accuracy", step=epoch)
        run.track(val_loss, name="val_loss", step=epoch)
        run.track(val_acc, name="val_accuracy", step=epoch)

        # Track parameter distributions (weights & biases)
        track_params_dists(model, run=run)

        # ------- Save checkpoint & report to Ray Tune -------
        with tempfile.TemporaryDirectory() as tmp_dir:
            ckpt_path = os.path.join(tmp_dir, "checkpoint.pt")
            torch.save((model.state_dict(), optimizer.state_dict()), ckpt_path)
            tune_checkpoint = tune.Checkpoint.from_directory(tmp_dir)
            tune.report(
                metrics={
                    "train_loss": train_loss,
                    "train_accuracy": train_acc,
                    "val_loss": val_loss,
                    "val_accuracy": val_acc
                },
                checkpoint=tune_checkpoint
            )

    run.close()  # üîπ important

# ------------------ Test best trial ------------------
def test_best_model(best_result, model_cls, feature_dim, df):
    device = best_result.config["device"]
    model = model_cls(feature_dim).to(device)

    checkpoint = best_result.checkpoint
    with checkpoint.as_directory() as ckpt_dir:
        model_state, _ = torch.load(os.path.join(ckpt_dir, "checkpoint.pt"), map_location=device)
    model.load_state_dict(model_state)
    model.eval()

    # Prepare test loader
    dm = DiabeticDataModule(df=df, batch_size=best_result.config["batch_size"])
    dm.setup()
    dm.normalize_datasets()
    test_loader = dm.test_dataloader()

    correct, total = 0, 0
    with torch.no_grad():
        for X, y in test_loader:
            X, y = X.to(device), y.float().unsqueeze(1).to(device)
            preds = (torch.sigmoid(model(X)) > 0.5).long()
            correct += (preds == y.long()).sum().item()
            total += y.size(0)

    print(f"‚úÖ Best trial test set accuracy: {correct / total:.4f}")

# ------------------ Main Ray Tune + Aim ------------------
def run_ray_tune_aim(config, model_cls, feature_dim, df, gpus_per_trial=1):
    scheduler = ASHAScheduler(
        time_attr="training_iteration",
        max_t=config["max_num_epochs"],
        grace_period=1,
        reduction_factor=2
    )

    tuner = tune.Tuner(
        tune.with_resources(
            tune.with_parameters(
                train_using_tune,
                model_cls=model_cls,
                feature_dim=feature_dim,
                df=df
            ),
            resources={"cpu": 4, "gpu": gpus_per_trial}
        ),
        tune_config=tune.TuneConfig(
            metric="val_loss",
            mode="min",
            scheduler=scheduler,
            num_samples=config["num_trials"],
        ),
        param_space=config,
    )

    results = tuner.fit()
    best_result = results.get_best_result("val_loss", "min")

    print("üèÜ Best trial config:", best_result.config)
    print("üèÜ Best val_loss:", best_result.metrics["val_loss"])
    print("üèÜ Best val_accuracy:", best_result.metrics["val_accuracy"])

    test_best_model(best_result, model_cls, feature_dim, df)

    config = {
    "lr": tune.loguniform(1e-4, 1e-1),
    "batch_size": tune.choice([16, 32, 64]),
    "optimizer": tune.choice(["adam", "sgd"]),
    "weight_decay": tune.loguniform(1e-5, 1e-2),
    "max_num_epochs": 40,
    "num_trials": 10,
    "device": "cuda" if torch.cuda.is_available() else "cpu",
}

model_cls = LogisticRgressionModel
feature_dim = 8
gpus_per_trial = 1 if torch.cuda.is_available() else 0

run_ray_tune_aim(config, model_cls, feature_dim, df, gpus_per_trial)



[36m(train_using_tune pid=187326)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/teamspace/studios/this_studio/ray_results/train_using_tune_2026-01-28_15-09-37/train_using_tune_5cb3d_00000_0_batch_size=64,lr=0.0173,optimizer=sgd,weight_decay=0.0000_2026-01-28_15-09-37/checkpoint_000000)
[36m(train_using_tune pid=187326)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/teamspace/studios/this_studio/ray_results/train_using_tune_2026-01-28_15-09-37/train_using_tune_5cb3d_00000_0_batch_size=64,lr=0.0173,optimizer=sgd,weight_decay=0.0000_2026-01-28_15-09-37/checkpoint_000001)
[36m(train_using_tune pid=187326)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/teamspace/studios/this_studio/ray_results/train_using_tune_2026-01-28_15-09-37/train_using_tune_5cb3d_00000_0_batch_size=64,lr=0.0173,optimizer=sgd,weight_decay=0.0000_2026-01-28_15-09-37/checkpoint_000002)
[36m(train_using_tune pid=187326)[0m Checkpoint

üèÜ Best trial config: {'lr': 0.017300500849656873, 'batch_size': 64, 'optimizer': 'sgd', 'weight_decay': 2.4802550795418668e-05, 'max_num_epochs': 30, 'num_trials': 5, 'device': 'cuda'}
üèÜ Best val_loss: 0.48939231038093567
üèÜ Best val_accuracy: 0.7391304347826086
‚úÖ Best trial test set accuracy: 0.8017


[36m(train_using_tune pid=189072)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/teamspace/studios/this_studio/ray_results/train_using_tune_2026-01-28_15-09-37/train_using_tune_5cb3d_00004_4_batch_size=32,lr=0.0039,optimizer=adam,weight_decay=0.0000_2026-01-28_15-09-37/checkpoint_000000)


In [18]:
best_result = tuner.get_results().get_best_result("val_loss", "min")
best_result

Result(
  metrics={'train_loss': 0.47168173509485584, 'train_accuracy': 0.776536312849162, 'val_loss': 0.4854586720466614, 'val_accuracy': 0.7391304347826086},
  path='/teamspace/studios/this_studio/ray_results/ray_tune_diabetes/train_using_tune_854db_00000_0_batch_size=32,lr=0.0089,optimizer=sgd,weight_decay=0.0000_2026-01-28_15-03-36',
  filesystem='local',
  checkpoint=Checkpoint(filesystem=local, path=/teamspace/studios/this_studio/ray_results/ray_tune_diabetes/train_using_tune_854db_00000_0_batch_size=32,lr=0.0089,optimizer=sgd,weight_decay=0.0000_2026-01-28_15-03-36/checkpoint_000029)
)