In [22]:
# Imports: data handling, PyTorch / Lightning, and experiment tracking (DagsHub + MLflow)
import pathlib
from dataclasses import dataclass
from pathlib import Path

import dagshub
import lightning.pytorch as pl
import mlflow
import numpy as np
import pandas as pd
import torch
from torch import Generator
from torch.utils.data import DataLoader, Dataset, dataloader, random_split


In [2]:
!wget https://raw.githubusercontent.com/npradaschnor/Pima-Indians-Diabetes-Dataset/master/diabetes.csv

--2026-01-20 14:03:31--  https://raw.githubusercontent.com/npradaschnor/Pima-Indians-Diabetes-Dataset/master/diabetes.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.111.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 23105 (23K) [text/plain]
Saving to: ‘diabetes.csv.1’


2026-01-20 14:03:31 (85.1 MB/s) - ‘diabetes.csv.1’ saved [23105/23105]



In [3]:
# NOTE(review): `stack_size` is not referenced anywhere in the visible cells;
# this looks like an accidental editor auto-import. Confirm before removing.
from threading import stack_size


# Raw CSV of the Pima Indians Diabetes dataset hosted on GitHub.
url = "https://raw.githubusercontent.com/npradaschnor/Pima-Indians-Diabetes-Dataset/master/diabetes.csv"

# Read the data straight from the URL into a DataFrame (needs network access).
df = pd.read_csv(url)
# NOTE(review): more code follows in this cell, so this is not the cell's last
# expression and its result is not displayed.
df.head()


@dataclass(eq=False)
class DiabeticDataset(Dataset):
    """Map-style dataset pairing a feature tensor with a label tensor.

    ``eq=False`` keeps the identity-based ``__eq__`` / ``__hash__`` inherited
    from ``object``. The field-wise ``__eq__`` that ``@dataclass`` generates by
    default compares the tensors with ``==`` (an element-wise result whose
    truth value is ambiguous for more than one element) and also sets
    ``__hash__`` to ``None``, which makes instances unhashable.

    Attributes:
        X: Feature tensor, indexed along its first dimension.
        y: Label tensor; its length defines the dataset length.
    """

    X: torch.Tensor
    y: torch.Tensor

    def __len__(self):
        """Return the number of samples (the length of ``y``)."""
        return len(self.y)

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair stored at position ``idx``."""
        return self.X[idx], self.y[idx]

# normalization
class Normalization_dataset(Dataset):
    """Dataset wrapper that standardises the features of a base dataset.

    Samples are fetched lazily from ``base_dataset``. The features of each
    sample are shifted by ``mean`` and divided by ``std`` plus a small epsilon,
    so a zero standard deviation cannot cause a division by zero. Labels are
    passed through unchanged.
    """

    def __init__(self, base_dataset, mean, std):
        self.base_dataset = base_dataset
        self.mean = mean
        self.std = std

        # Re-expose the wrapped dataset's ``indices`` (present e.g. on a
        # Subset) so callers can still look up the original row positions.
        try:
            self.indices = base_dataset.indices
        except AttributeError:
            pass

    def __len__(self):
        return len(self.base_dataset)

    def __getitem__(self, idx):
        features, label = self.base_dataset[idx]
        standardised = (features - self.mean) / (self.std + 1e-8)
        return standardised, label

class DiabeticDataModule(pl.LightningDataModule):
    """LightningDataModule that splits a diabetes DataFrame into train/test sets.

    Args:
        df: DataFrame with an ``Outcome`` label column; every other column is
            used as a feature.
        batch_size: Batch size used by both dataloaders.
        train_ratio: Fraction of rows assigned to the training split.
        seed: Seed for the generator that drives the random split.
    """

    def __init__(
        self,
        df,
        batch_size=16,
        train_ratio=0.75,
        seed=40
    ):
        super().__init__()
        self.df = df
        self.batch_size = batch_size
        self.train_ratio = train_ratio
        self.seed = seed

    def setup(self, stage=None):
        """Build tensors from ``self.df`` and create the seeded train/test split."""
        # Use the frame handed to the constructor. Reading a notebook-level
        # global `df` here would silently ignore the `df` argument.
        features = self.df.drop(columns='Outcome').values
        labels = self.df['Outcome'].values

        # convert into tensor
        features = torch.tensor(features, dtype=torch.float32)
        labels = torch.tensor(labels, dtype=torch.long)

        full_dataset = DiabeticDataset(features, labels)

        train_size = int(self.train_ratio * len(full_dataset))
        test_size = len(full_dataset) - train_size
        generator = torch.Generator().manual_seed(self.seed)

        self.train_ds, self.test_ds = random_split(
            full_dataset,
            [train_size, test_size],
            generator=generator
        )

    def train_dataloader(self):
        """Return a shuffled loader over the current training dataset."""
        return DataLoader(
            self.train_ds,
            batch_size=self.batch_size,
            shuffle=True,
            pin_memory=True
        )

    def test_dataloader(self):
        """Return an ordered (non-shuffled) loader over the current test dataset."""
        return DataLoader(
            self.test_ds,
            batch_size=self.batch_size,
            shuffle=False,
            pin_memory=True
        )

    def normalize_datasets(self):
        """Standardise both splits with statistics from the training split.

        Replaces ``self.train_ds`` and ``self.test_ds`` with
        ``Normalization_dataset`` wrappers, so dataloaders created afterwards
        yield normalised features. Call ``setup()`` first.

        Returns:
            ``(mean, std)``: per-feature tensors computed on the training features.
        """
        # Already wrapped (method called twice): hand back the stored statistics
        # instead of stacking a second normalisation on top of the first.
        if isinstance(self.train_ds, Normalization_dataset):
            return self.train_ds.mean, self.train_ds.std

        # A plain, unshuffled pass is enough for statistics: it keeps the
        # result independent of shuffle order and avoids pinning memory for
        # data that is only reduced on the CPU.
        stats_loader = DataLoader(
            self.train_ds,
            batch_size=self.batch_size,
            shuffle=False
        )
        X_all = torch.cat([X.cpu() for X, _ in stats_loader], dim=0)

        mean = X_all.mean(dim=0)
        std = X_all.std(dim=0)

        # Wrap datasets; the test split reuses the training statistics.
        self.train_ds = Normalization_dataset(self.train_ds, mean, std)
        self.test_ds = Normalization_dataset(self.test_ds, mean, std)

        return mean, std




In [4]:
# Build the data module with an explicit split seed and create the train/test split.
dm = DiabeticDataModule(df=df, seed=36)
dm.setup()

# Must run before the loaders are created: this replaces dm.train_ds / dm.test_ds
# with normalised wrappers, and each loader below is built from whichever
# dataset is current at that moment.
mean, std = dm.normalize_datasets()

train_loader = dm.train_dataloader()
test_loader  = dm.test_dataloader()

# Per-feature statistics computed on the training split.
print("Mean:", mean)
print("Std:", std)




Mean: tensor([  3.8264, 121.0104,  69.0521,  20.2465,  78.5729,  31.9856,   0.4695,
         33.1042])
Std: tensor([  3.4117,  32.4606,  19.7235,  16.0761, 112.1771,   7.9008,   0.3158,
         11.5159])




In [5]:
def collect_split(dataset):
    """Materialise a dataset of ``(features, label)`` pairs as two tensors.

    Args:
        dataset: Iterable dataset yielding ``(features, label)`` pairs.

    Returns:
        ``(X, y)``: the feature tensors stacked along a new first dimension,
        and the labels gathered into a single tensor in the same order.
    """
    features, labels = [], []
    for sample_features, sample_label in dataset:
        features.append(sample_features)
        labels.append(sample_label)
    return torch.stack(features, dim=0), torch.tensor(labels)


# Collect the normalised train / test data (one helper instead of two copied loops)
X_train, y_train = collect_split(train_loader.dataset)  # (N_train, num_features), (N_train,)
X_test, y_test = collect_split(test_loader.dataset)     # (N_test, num_features), (N_test,)

# Output directory: <parent of the working directory>/data/splits
save_dir = Path.cwd().parent / 'data' / 'splits'
save_dir.mkdir(parents=True, exist_ok=True)

# File path
save_path = save_dir / "diabetes_normalized.pt"

torch.save({
    "X_train": X_train,
    "y_train": y_train,
    "X_test": X_test,
    "y_test": y_test
}, save_path)

In [6]:
# Save the split rows of `df` as CSV files so they can be stored in DVC.
# The indices are row positions into `df` chosen by the random split.
train_indices = dm.train_ds.indices
test_indices = dm.test_ds.indices

# Sanity-check the split before writing anything to disk.
assert set(train_indices).isdisjoint(test_indices), "train/test splits overlap"
assert len(train_indices) + len(test_indices) == len(df), "split does not cover every row"

data_dir = Path.cwd().parent / 'data'
# Create 'splits' folder inside 'data' directory
splits_dir = data_dir / 'splits'
splits_dir.mkdir(parents=True, exist_ok=True)

df.iloc[train_indices].to_csv(splits_dir / 'train.csv', index=False)
df.iloc[test_indices].to_csv(splits_dir / 'test.csv', index=False)

In [31]:
# Baseline model: logistic regression

import torch
import torch.nn as nn

class Logistic_RgressionModel(nn.Module):
    """Logistic-regression classifier: one linear layer that emits a raw logit.

    No sigmoid is applied here; the single output per sample is an
    unnormalised score.
    """

    def __init__(self, featur_dim):
        super().__init__()
        # One output unit per sample (binary target: 0 or 1).
        self.linear = nn.Linear(in_features=featur_dim, out_features=1)

    def forward(self, x):
        logits = self.linear(x)
        return logits

# setup model, loss and optimizer
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

featur_dim = 8  # input width of the linear layer; must match the number of feature columns

# Move the model to `device` before building the optimizer: the training and
# evaluation loops send every batch to `device`, so leaving the parameters on
# the CPU fails with a device-mismatch error as soon as CUDA is available.
model = Logistic_RgressionModel(featur_dim=featur_dim).to(device)

lr = 0.001

optimizer = torch.optim.Adam(model.parameters(), lr=lr)

# Operates on raw logits (the model's forward applies no sigmoid).
criterion = nn.BCEWithLogitsLoss()


def train_one_epoch(model, loader, optimizer, criterion, device):
    """Run one optimisation pass over ``loader``.

    Args:
        model: Module mapping a feature batch to one logit per sample.
        loader: Iterable of ``(features, labels)`` batches that supports ``len()``.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss called as ``criterion(logits, targets)`` with float
            targets of shape ``(batch, 1)``.
        device: Device every batch is moved to.

    Returns:
        ``(mean_batch_loss, accuracy)``: the per-batch losses averaged over the
        number of batches, and the fraction of samples whose thresholded
        prediction (sigmoid > 0.5) matched the label.
    """
    model.train()
    batch_losses = []
    n_correct = 0
    n_seen = 0

    for features, labels in loader:
        features = features.to(device)
        targets = labels.float().unsqueeze(1).to(device)  # (batch, 1)

        optimizer.zero_grad()
        logits = model(features)
        batch_loss = criterion(logits, targets)
        batch_loss.backward()
        optimizer.step()

        batch_losses.append(batch_loss.item())

        # Accuracy bookkeeping, based on the logits computed before this step.
        predicted = (torch.sigmoid(logits) > 0.5).long()
        n_correct += (predicted == targets.long()).sum().item()
        n_seen += targets.size(0)

    return sum(batch_losses) / len(loader), n_correct / n_seen

def evaluate(model, loader, device):
    """Compute classification accuracy of ``model`` over ``loader``.

    The model is switched to eval mode and run without gradient tracking. A
    sample counts as positive when the sigmoid of its logit exceeds 0.5.

    Args:
        model: Module producing logits of shape ``(batch, 1)``.
        loader: Iterable of ``(features, labels)`` batches with integer labels.
        device: Device every batch is moved to.

    Returns:
        Fraction of samples whose predicted class equals the label.
    """
    model.eval()
    hits, seen = 0, 0

    with torch.no_grad():
        for features, labels in loader:
            labels = labels.to(device)
            scores = torch.sigmoid(model(features.to(device)))
            # Drop the trailing singleton dimension so predictions line up with the labels.
            predicted = (scores > 0.5).long().squeeze(1)

            hits += (predicted == labels).sum().item()
            seen += labels.size(0)

    return hits / seen



#







In [34]:
# Route MLflow tracking to the DagsHub-hosted server for this repository
# (`dagshub` and `mlflow` are imported in the notebook's first cell).
dagshub.init(repo_owner='manikantmnnit', repo_name='diabetes_project', mlflow=True)


mlflow.set_tracking_uri('https://dagshub.com/manikantmnnit/diabetes_project.mlflow')

num_epochs = 50
mlflow.set_experiment("diabetes_logistic_regression")
with mlflow.start_run(run_name='log_reg_baseline'):
    mlflow.log_param("model", "logistic_regression")
    mlflow.log_param("optimizer", "Adam")
    mlflow.log_param("learning_rate", lr)
    # 'Batch_size' used to receive num_epochs. Log the loader's real batch
    # size there and record the epoch count under its own key.
    mlflow.log_param('Batch_size', train_loader.batch_size)
    mlflow.log_param('num_epochs', num_epochs)

    for epoch in range(num_epochs):
        train_loss, train_acc = train_one_epoch(model, train_loader, optimizer, criterion, device)
        test_acc = evaluate(model, test_loader, device)

        # ---- Log metrics per epoch ----
        mlflow.log_metric("train_log_loss", train_loss, step=epoch)
        mlflow.log_metric("train_accuracy", train_acc, step=epoch)
        mlflow.log_metric("test_accuracy", test_acc, step=epoch)

        # Print a progress line every fifth epoch to keep the cell output short.
        if (epoch + 1) % 5 == 0:
            print(
                f"Epoch [{epoch+1}/{num_epochs}] | "
                f"Loss: {train_loss:.4f} | "
                f"Train Acc: {train_acc:.4f} | "
                f"Test Acc: {test_acc:.4f}"
            )

    # log model
    mlflow.pytorch.log_model(model, artifact_path='model')



Epoch [5/50] | Loss: 0.4762 | Train Acc: 0.7795 | Test Acc: 0.7865
Epoch [10/50] | Loss: 0.4762 | Train Acc: 0.7778 | Test Acc: 0.7865
Epoch [15/50] | Loss: 0.4762 | Train Acc: 0.7795 | Test Acc: 0.7865
Epoch [20/50] | Loss: 0.4761 | Train Acc: 0.7795 | Test Acc: 0.7865
Epoch [25/50] | Loss: 0.4761 | Train Acc: 0.7795 | Test Acc: 0.7865
Epoch [30/50] | Loss: 0.4762 | Train Acc: 0.7778 | Test Acc: 0.7865
Epoch [35/50] | Loss: 0.4761 | Train Acc: 0.7795 | Test Acc: 0.7865
Epoch [40/50] | Loss: 0.4762 | Train Acc: 0.7795 | Test Acc: 0.7917
Epoch [45/50] | Loss: 0.4761 | Train Acc: 0.7795 | Test Acc: 0.7917




Epoch [50/50] | Loss: 0.4761 | Train Acc: 0.7778 | Test Acc: 0.7917




🏃 View run log_reg_baseline at: https://dagshub.com/manikantmnnit/diabetes_project.mlflow/#/experiments/0/runs/58a978e2af6a4981b542105ade116416
🧪 View experiment at: https://dagshub.com/manikantmnnit/diabetes_project.mlflow/#/experiments/0
