In [3]:
# create dummy dataset
import pandas as pd
import numpy as np 
import torch
from torch import Generator
from torch.utils.data import DataLoader,Dataset, dataloader,random_split

from dataclasses import dataclass

import lightning.pytorch as pl

import dagshub
import mlflow

import matplotlib.pyplot as plt

import tempfile
import os


# hp tunning library
from ray import tune
from ray.tune.schedulers import ASHAScheduler

In [4]:
!wget https://raw.githubusercontent.com/npradaschnor/Pima-Indians-Diabetes-Dataset/master/diabetes.csv

--2026-01-23 04:01:10--  https://raw.githubusercontent.com/npradaschnor/Pima-Indians-Diabetes-Dataset/master/diabetes.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.108.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 23105 (23K) [text/plain]
Saving to: ‚Äòdiabetes.csv.6‚Äô


2026-01-23 04:01:10 (152 MB/s) - ‚Äòdiabetes.csv.6‚Äô saved [23105/23105]



In [23]:
from threading import stack_size


url = "https://raw.githubusercontent.com/npradaschnor/Pima-Indians-Diabetes-Dataset/master/diabetes.csv"

# Read the data
df = pd.read_csv(url)
df.head()


@dataclass
class DiabeticDataset(Dataset):
    X:torch.Tensor
    y:torch.Tensor

    def __len__(self):
        return len(self.y)

    def __getitem__(self,idx):
        return self.X[idx],self.y[idx]

# normalization
class Normalization_dataset(Dataset):
    def __init__(self, base_dataset, mean, std):
        self.base_dataset = base_dataset
        self.mean = mean
        self.std = std

        # üî• preserve indices if base_dataset is a Subset
        if hasattr(base_dataset, "indices"):
            self.indices = base_dataset.indices

    def __len__(self):
        return len(self.base_dataset)

    def __getitem__(self, idx):
        X, y = self.base_dataset[idx]
        X = (X - self.mean) / (self.std + 1e-8)
        return X, y

class DiabeticDataModule(pl.LightningDataModule):
    def __init__(
        self,
        df,
        batch_size=16,
        train_ratio=0.7,
        val_ratio=0.15,
        seed=40
    ):
        super().__init__()
        self.df = df
        self.batch_size = batch_size
        self.train_ratio = train_ratio
        self.val_ratio = val_ratio
        self.seed = seed

    def setup(self, stage=None):
        X = self.df.drop(columns="Outcome", axis=1).values
        y = self.df["Outcome"].values

        # convert into tensors
        X = torch.tensor(X, dtype=torch.float32)
        y = torch.tensor(y, dtype=torch.long)

        full_dataset = DiabeticDataset(X, y)

        n_total = len(full_dataset)
        n_train = int(self.train_ratio * n_total)
        n_val   = int(self.val_ratio * n_total)
        n_test  = n_total - n_train - n_val

        generator = torch.Generator().manual_seed(self.seed)

        self.train_ds, self.val_ds, self.test_ds = random_split(
            full_dataset,
            [n_train, n_val, n_test],
            generator=generator
        )

    def train_dataloader(self):
        return DataLoader(
            self.train_ds,
            batch_size=self.batch_size,
            shuffle=True,
            pin_memory=True,
            drop_last=False
        )

    def val_dataloader(self):
        return DataLoader(
            self.val_ds,
            batch_size=self.batch_size,
            shuffle=False,
            pin_memory=True,
            drop_last=False
        )

    def test_dataloader(self):
        return DataLoader(
            self.test_ds,
            batch_size=self.batch_size,
            shuffle=False,
            pin_memory=True,
            drop_last=False
        )

    # ---------- Normalization (fit on train only) ----------
    def normalize_datasets(self):
        X_all = []

        for X, y in self.train_dataloader():
            X_all.append(X.cpu())

        X_all = torch.cat(X_all, dim=0)

        mean = X_all.mean(dim=0)
        std  = X_all.std(dim=0)

        # Wrap datasets
        self.train_ds = Normalization_dataset(self.train_ds, mean, std)
        self.val_ds   = Normalization_dataset(self.val_ds,   mean, std)
        self.test_ds  = Normalization_dataset(self.test_ds,  mean, std)

        return mean, std




In [6]:
dm = DiabeticDataModule(df=df, seed=36)
dm.setup()

mean, std = dm.normalize_datasets()

train_loader = dm.train_dataloader()
test_loader  = dm.test_dataloader()
valid_loader=dm.val_dataloader()

print("Mean:", mean)
print("Std:", std)




Mean: tensor([  3.8827, 120.9534,  69.1899,  19.9590,  77.6369,  31.8946,   0.4689,
         33.2737])
Std: tensor([  3.3960,  32.1912,  19.6791,  16.0622, 112.1099,   7.9516,   0.3178,
         11.5753])




In [7]:
# verify the dataset
type(dm.train_ds[0][1])
X,y=next(iter(train_loader))
X.shape,y.shape

(torch.Size([16, 8]), torch.Size([16]))

In [8]:
# Collect train data
X_train_list = []
y_train_list = []

for x, y in train_loader.dataset:
    X_train_list.append(x)
    y_train_list.append(y)

X_train = torch.stack(X_train_list, dim=0)  # (N_train, num_features)
y_train = torch.tensor(y_train_list)         # (N_train,)

# Collect test data
X_test_list = []
y_test_list = []

for x, y in test_loader.dataset:
    X_test_list.append(x)
    y_test_list.append(y)

X_test = torch.stack(X_test_list, dim=0)   # (N_test, num_features)
y_test = torch.tensor(y_test_list)    

from pathlib import Path
save_dir=Path.cwd().parent/'data'/'splits'
save_dir.mkdir(parents=True,exist_ok=True)

# File path
save_path = save_dir / "diabetes_normalized.pt"

torch.save({
    "X_train": X_train,
    "y_train": y_train,
    "X_test": X_test,
    "y_test": y_test
}, save_path)

In [9]:
# save split data inot csv and store in dvc
train_indices=dm.train_ds.indices
test_indices=dm.test_ds.indices
import pathlib
from pathlib import Path
data_dir=Path.cwd().parent/'data'
# Create 'splits' folder inside 'data' directory
splits_dir = data_dir / 'splits'

splits_dir.mkdir(parents=True, exist_ok=True)

df.iloc[train_indices].to_csv(splits_dir / 'train.csv', index=False)
df.iloc[test_indices].to_csv(splits_dir / 'test.csv', index=False)

# basic algo: logistic_regression

In [10]:
# basic algo: logistic Algorithm

import torch
import torch.nn as nn

class LogisticRgressionModel(nn.Module):
    def __init__(self, featur_dim):
        super().__init__()
        self.linear=nn.Linear(featur_dim,1)   # single output either 0 or 1
    
    def forward(self,x):
        return self.linear(x)

# setup model , loss and optimizer
device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')

featur_dim=8

model=LogisticRgressionModel(featur_dim=featur_dim)

lr=0.001

optimizer=torch.optim.Adam(model.parameters(),lr=lr)

criterion=nn.BCEWithLogitsLoss()


def train_one_epoch(model, loader, optimizer, criterion, device):
    model.train()
    total_loss = 0
    correct=0
    total=0

    for X, y in loader:
        X = X.to(device)
        y = y.float().unsqueeze(1).to(device)  # (batch, 1)

        optimizer.zero_grad()
        logits = model(X)
        loss = criterion(logits, y)

        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        # accuracy
        probs=torch.sigmoid(logits)
        predicts=(probs>0.5).long()
        correct += (predicts == y.long()).sum().item()
        total += y.size(0)


    return total_loss / len(loader),correct/total

def evaluate(model,loader,device):
    model.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for X, y in loader:
            X = X.to(device)
            y = y.to(device)

            logits = model(X)
            probs = torch.sigmoid(logits)
            preds = (probs > 0.5).long().squeeze(1)

            correct += (preds == y).sum().item()
            total += y.size(0)

    return correct / total


In [11]:
import dagshub
dagshub.init(repo_owner='manikantmnnit', repo_name='diabetes_project', mlflow=True)


mlflow.set_tracking_uri('https://dagshub.com/manikantmnnit/diabetes_project.mlflow')

num_epochs = 50
mlflow.set_experiment("diabetes_logistic_regression")
with mlflow.start_run(run_name='log_reg_baseline'):
    mlflow.log_param("model", "logistic_regression")
    mlflow.log_param("optimizer", "Adam")
    mlflow.log_param("learning_rate", lr)
    mlflow.log_param('Batch_size',num_epochs)

    for epoch in range(num_epochs):
        train_loss,train_acc = train_one_epoch(model, train_loader, optimizer, criterion, device)
        test_acc   = evaluate(model, test_loader, device)

        # ---- Log metrics per epoch ----
        mlflow.log_metric("train_log_loss", train_loss, step=epoch)
        mlflow.log_metric("train_accuracy", train_acc, step=epoch)
        mlflow.log_metric("test_accuracy", test_acc, step=epoch)

        if (epoch + 1) % 5 == 0:
            print(
                f"Epoch [{epoch+1}/{num_epochs}] | "
                f"Loss: {train_loss:.4f} | "
                f"Train Acc: {train_acc:.4f} | "
                f"Test Acc: {test_acc:.4f}"
            )
    
    # log model
    mlflow.pytorch.log_model(model,artifact_path='model')



Epoch [5/50] | Loss: 0.6694 | Train Acc: 0.5996 | Test Acc: 0.5517
Epoch [10/50] | Loss: 0.5999 | Train Acc: 0.7244 | Test Acc: 0.6810
Epoch [15/50] | Loss: 0.5635 | Train Acc: 0.7505 | Test Acc: 0.7328
Epoch [20/50] | Loss: 0.5381 | Train Acc: 0.7486 | Test Acc: 0.7586
Epoch [25/50] | Loss: 0.5234 | Train Acc: 0.7523 | Test Acc: 0.7586
Epoch [30/50] | Loss: 0.5093 | Train Acc: 0.7542 | Test Acc: 0.7414
Epoch [35/50] | Loss: 0.4999 | Train Acc: 0.7542 | Test Acc: 0.7759
Epoch [40/50] | Loss: 0.4951 | Train Acc: 0.7598 | Test Acc: 0.7759
Epoch [45/50] | Loss: 0.4919 | Train Acc: 0.7635 | Test Acc: 0.7759




Epoch [50/50] | Loss: 0.4879 | Train Acc: 0.7728 | Test Acc: 0.7845




üèÉ View run log_reg_baseline at: https://dagshub.com/manikantmnnit/diabetes_project.mlflow/#/experiments/0/runs/3a992a4b3c6c4589bac8351bceb9cd37
üß™ View experiment at: https://dagshub.com/manikantmnnit/diabetes_project.mlflow/#/experiments/0


# Using Ray Tune for HP tunning

In [12]:
import ray
from ray import tune

In [31]:
from ray import tune
import torch
import torch.nn as nn
import os
import tempfile


def train_using_tune(config, train_loader, valid_loader, model_cls, feature_dim,df):

    device = config["device"]
    batch_size = int(config["batch_size"])
    lr = config["lr"]

    # üîπ Build DataModule INSIDE trial
    dm = DiabeticDataModule(
        df=df,
        batch_size=batch_size
    )
    dm.setup()
    dm.normalize_datasets()

    train_loader = dm.train_dataloader()
    valid_loader = dm.val_dataloader()
    test_loader  = dm.test_dataloader()

    # Build model
    model = model_cls(feature_dim).to(device)

    # üîπ optimizer
    if config["optimizer"] == "adam":
        optimizer = torch.optim.Adam(
            model.parameters(),
            lr=config["lr"],
            weight_decay=config["weight_decay"]
        )
    else:
        optimizer = torch.optim.SGD(
            model.parameters(),
            lr=config["lr"],
            momentum=0.9,
            weight_decay=config["weight_decay"]
        )
    criterion = nn.BCEWithLogitsLoss()

    # ----- Restore checkpoint if exists -----
    checkpoint = tune.get_checkpoint()
    if checkpoint:
        with checkpoint.as_directory() as ckpt_dir:
            model_state, optimizer_state = torch.load(
                os.path.join(ckpt_dir, "checkpoint.pt"),
                map_location=device
            )
            model.load_state_dict(model_state)
            optimizer.load_state_dict(optimizer_state)

    # ----- Training loop -----
    max_epochs = config["max_num_epochs"]
    for epoch in range(max_epochs):

        # ===== Train =====
        model.train()
        total_train_loss = 0.0
        correct_train = 0
        total_train = 0

        for X, y in train_loader:
            X = X.to(device)
            y = y.float().unsqueeze(1).to(device)

            optimizer.zero_grad()
            logits = model(X) # forward
            loss = criterion(logits, y)

            loss.backward()  # back propagation
            optimizer.step()

            total_train_loss += loss.item()

            probs = torch.sigmoid(logits)
            preds = (probs > 0.5).long()
            correct_train += (preds == y.long()).sum().item()
            total_train += y.size(0)

        train_loss = total_train_loss / len(train_loader)
        train_acc = correct_train / total_train

        # ===== Validation =====
        model.eval()
        total_val_loss = 0.0
        correct_val = 0
        total_val = 0

        with torch.no_grad():
            for X, y in valid_loader:
                X = X.to(device)
                y = y.float().unsqueeze(1).to(device)

                logits = model(X)
                loss = criterion(logits, y)

                total_val_loss += loss.item()

                probs = torch.sigmoid(logits)
                preds = (probs > 0.5).long()
                correct_val += (preds == y.long()).sum().item()
                total_val += y.size(0)

        val_loss = total_val_loss / len(valid_loader)
        val_acc = correct_val / total_val

        # ===== Save checkpoint + report =====
        with tempfile.TemporaryDirectory() as temp_checkpoint_dir:
            ckpt_path = os.path.join(temp_checkpoint_dir, "checkpoint.pt")
            torch.save((model.state_dict(), optimizer.state_dict()), ckpt_path)

            checkpoint = tune.Checkpoint.from_directory(temp_checkpoint_dir)

            tune.report(
                {
                    "train_loss": train_loss,
                    "train_accuracy": train_acc,
                    "val_loss": val_loss,
                    "val_accuracy": val_acc,
                },
                checkpoint=checkpoint
            )


# obtain best accurac based on best model uisng checkpoint
def test_best_model(best_result, test_loader, model_cls, feature_dim):

    device = best_result.config["device"]

    # Build model
    best_trained_model = model_cls(feature_dim).to(device)

    # ----- Load best checkpoint -----
    checkpoint = best_result.checkpoint
    with checkpoint.as_directory() as ckpt_dir:
        checkpoint_path = os.path.join(ckpt_dir, "checkpoint.pt")
        model_state, _ = torch.load(checkpoint_path, map_location=device)

    best_trained_model.load_state_dict(model_state)
    best_trained_model.eval()

    # ----- Test loop -----
    correct_test = 0
    total_test = 0

    with torch.no_grad():
        for X, y in test_loader:
            X = X.to(device)
            y = y.float().unsqueeze(1).to(device)

            logits = best_trained_model(X)
            probs = torch.sigmoid(logits)
            preds = (probs > 0.5).long()

            correct_test += (preds == y.long()).sum().item()
            total_test += y.size(0)

    print(f"‚úÖ Best trial test set accuracy: {correct_test / total_test:.4f}")


# train the model, find best performing one and load the trained netqork from checkpoint file

def main(config, train_loader, valid_loader, test_loader, model_cls,df, feature_dim, gpus_per_trial=1):

    
    scheduler = ASHAScheduler(
        time_attr="training_iteration",
        max_t=config["max_num_epochs"],
        grace_period=1,
        reduction_factor=2
    )

    tuner = tune.Tuner(
        tune.with_resources(
            tune.with_parameters(
                train_using_tune,
                train_loader=train_loader,
                valid_loader=valid_loader,
                model_cls=model_cls,
                feature_dim=feature_dim,
                df=df
            ),
            resources={"cpu": 4, "gpu": gpus_per_trial}
        ),
        tune_config=tune.TuneConfig(
            metric="val_loss",
            mode="min",
            scheduler=scheduler,
            num_samples=config["num_trials"],
        ),
        param_space=config,
    )

    results = tuner.fit()

    best_result = results.get_best_result("val_loss", "min")

    print(f"üèÜ Best trial config: {best_result.config}")
    print(f"üèÜ Best trial final validation loss: {best_result.metrics['val_loss']}")
    print(f"üèÜ Best trial final validation accuracy: {best_result.metrics['val_accuracy']}")

    test_best_model(best_result, test_loader, model_cls, feature_dim)




In [32]:
config =  {
    "lr": tune.loguniform(1e-4, 1e-1),
    "batch_size": tune.choice([16, 32, 64]),
    "optimizer": tune.choice(["adam", "sgd"]),
    "weight_decay": tune.loguniform(1e-6, 1e-2),
    "max_num_epochs": 30,
    "device": "cuda" if torch.cuda.is_available() else "cpu",
    "num_trials": 15,
}

main(
    config=config,
    df=df,
    train_loader=train_loader,
    valid_loader=valid_loader,
    test_loader=test_loader,
    model_cls=LogisticRgressionModel,
    feature_dim=8,
    gpus_per_trial=1 if torch.cuda.is_available() else 0
)

[36m(train_using_tune pid=150524)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/teamspace/studios/this_studio/ray_results/train_using_tune_2026-01-23_04-49-27/train_using_tune_e5eb4_00000_0_batch_size=16,lr=0.0002,optimizer=sgd,weight_decay=0.0024_2026-01-23_04-49-27/checkpoint_000000)
[36m(train_using_tune pid=150524)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/teamspace/studios/this_studio/ray_results/train_using_tune_2026-01-23_04-49-27/train_using_tune_e5eb4_00000_0_batch_size=16,lr=0.0002,optimizer=sgd,weight_decay=0.0024_2026-01-23_04-49-27/checkpoint_000001)
[36m(train_using_tune pid=150524)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/teamspace/studios/this_studio/ray_results/train_using_tune_2026-01-23_04-49-27/train_using_tune_e5eb4_00000_0_batch_size=16,lr=0.0002,optimizer=sgd,weight_decay=0.0024_2026-01-23_04-49-27/checkpoint_000002)
[36m(train_using_tune pid=150524)[0m Checkpoint

üèÜ Best trial config: {'lr': 0.09245450318626276, 'batch_size': 64, 'optimizer': 'adam', 'weight_decay': 3.834196588142649e-06, 'max_num_epochs': 30, 'device': 'cpu', 'num_trials': 15}
üèÜ Best trial final validation loss: 0.5341602861881256
üèÜ Best trial final validation accuracy: 0.7217391304347827
‚úÖ Best trial test set accuracy: 0.8103


[36m(train_using_tune pid=159275)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/teamspace/studios/this_studio/ray_results/train_using_tune_2026-01-23_04-49-27/train_using_tune_e5eb4_00014_14_batch_size=16,lr=0.0222,optimizer=adam,weight_decay=0.0000_2026-01-23_04-49-28/checkpoint_000001)[32m [repeated 2x across cluster][0m


In [18]:
best_result


NameError: name 'best_result' is not defined