In [1]:
# Imports for data handling (pandas/numpy), tensors (torch) and the Lightning data pipeline
import pandas as pd
import numpy as np 
import torch
from torch import Generator
from torch.utils.data import DataLoader,Dataset, dataloader,random_split

from dataclasses import dataclass

import lightning.pytorch as pl


In [39]:
# Download the Pima Indians Diabetes CSV into the current working directory (saved as diabetes.csv).
# NOTE(review): the visible cells load the data straight from the URL via pd.read_csv,
# so this local copy appears unused — confirm before relying on it.
!wget https://raw.githubusercontent.com/npradaschnor/Pima-Indians-Diabetes-Dataset/master/diabetes.csv

--2026-01-18 14:13:05--  https://raw.githubusercontent.com/npradaschnor/Pima-Indians-Diabetes-Dataset/master/diabetes.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 23105 (23K) [text/plain]
Saving to: ‘diabetes.csv’


2026-01-18 14:13:05 (100 MB/s) - ‘diabetes.csv’ saved [23105/23105]



In [33]:
# Raw-file location of the Pima Indians Diabetes CSV on GitHub.
url = "https://raw.githubusercontent.com/npradaschnor/Pima-Indians-Diabetes-Dataset/master/diabetes.csv"

# Read the data over HTTP into a DataFrame (needs network access on every run).
df = pd.read_csv(url)
# Preview the first rows; this only renders when it is the last expression of its cell.
df.head()


# eq=False: the default dataclass __eq__ compares the tensor fields element-wise,
# which raises "Boolean value of Tensor ... is ambiguous" and also sets
# __hash__ to None. Identity-based equality/hash is what a Dataset needs.
@dataclass(eq=False)
class DiabeticDataset(Dataset):
    """Map-style dataset pairing a feature tensor with a label tensor.

    Attributes:
        X: Feature tensor; indexing along the first dimension yields one sample.
        y: Label tensor; its length defines the dataset length.

    Raises:
        ValueError: If ``X`` and ``y`` do not have the same number of rows.
    """
    X:torch.Tensor
    y:torch.Tensor

    def __post_init__(self):
        # Fail early on misaligned inputs instead of raising an IndexError
        # (or silently dropping rows) in the middle of an epoch.
        if len(self.X) != len(self.y):
            raise ValueError(
                f"X and y must have the same length, got {len(self.X)} and {len(self.y)}"
            )

    def __len__(self):
        """Return the number of samples (length of ``y``)."""
        return len(self.y)

    def __getitem__(self,idx):
        """Return the ``(features, label)`` pair stored at position ``idx``."""
        return self.X[idx],self.y[idx]


class DiabeticDataModule(pl.LightningDataModule):
    """Lightning data module that splits a diabetes DataFrame into train/test sets.

    Args:
        df: DataFrame holding the feature columns plus an ``Outcome`` label column.
        batch_size: Number of samples per batch for both dataloaders.
        train_ratio: Fraction of rows assigned to the training split; the
            remaining rows form the test split.
        seed: Seed for the generator that drives the random split.
    """

    def __init__(
        self,
        df,
        batch_size=16,
        train_ratio=0.75,
        seed=40
    ):
        super().__init__()
        self.df=df
        self.batch_size=batch_size
        self.train_ratio=train_ratio
        self.seed=seed

    def setup(self,stage=None):
        """Convert ``self.df`` to tensors and create the train/test subsets.

        ``stage`` is accepted to match the Lightning hook signature but is not
        used; the same seeded split is produced on every call.
        """
        # Use the frame handed to the constructor. Reading a notebook-global
        # `df` here would silently ignore the `df` argument.
        X=self.df.drop(columns='Outcome').values
        y=self.df['Outcome'].values

        # convert into tensor: features as float32, labels as int64 class ids
        X=torch.tensor(X,dtype=torch.float32)
        y=torch.tensor(y,dtype=torch.long)

        full_dataset=DiabeticDataset(X,y)

        # Truncate the train size; the test split absorbs the remainder so the
        # two sizes always sum to the dataset length, as random_split requires.
        train_size = int(self.train_ratio * len(full_dataset))
        test_size = len(full_dataset) - train_size
        generator=torch.Generator().manual_seed(self.seed)

        self.train_ds, self.test_ds = random_split(
            full_dataset,
            [train_size, test_size],
            generator=generator
        )

    def train_dataloader(self):
        """Return a shuffling DataLoader over the training split (needs ``setup()`` first)."""
        return DataLoader(
            self.train_ds,
            batch_size=self.batch_size,
            shuffle=True,
            pin_memory=True
        )

    def test_dataloader(self):
        """Return an in-order DataLoader over the test split (needs ``setup()`` first)."""
        return DataLoader(
            self.test_ds,
            batch_size=self.batch_size,
            shuffle=False,
            pin_memory=True
        )



# Build the data module from the loaded frame; seed=36 overrides the class default of 40.
dm=DiabeticDataModule(df=df,seed=36)

# setup() is invoked directly so the train/test splits exist before the loaders are requested.
dm.setup()

# NOTE(review): the names are spelled `train_loder` / `test_loder` (sic); kept as-is
# because cells outside this view may reference them.
train_loder=dm.train_dataloader()
test_loder=dm.test_dataloader()


tensor([[1.0000e+00, 1.1600e+02, 7.8000e+01, 2.9000e+01, 1.8000e+02, 3.6100e+01,
         4.9600e-01, 2.5000e+01],
        [1.0000e+00, 8.9000e+01, 6.6000e+01, 2.3000e+01, 9.4000e+01, 2.8100e+01,
         1.6700e-01, 2.1000e+01],
        [9.0000e+00, 1.0200e+02, 7.6000e+01, 3.7000e+01, 0.0000e+00, 3.2900e+01,
         6.6500e-01, 4.6000e+01],
        [7.0000e+00, 1.2500e+02, 8.6000e+01, 0.0000e+00, 0.0000e+00, 3.7600e+01,
         3.0400e-01, 5.1000e+01],
        [1.0000e+00, 1.0800e+02, 8.8000e+01, 1.9000e+01, 0.0000e+00, 2.7100e+01,
         4.0000e-01, 2.4000e+01],
        [2.0000e+00, 9.2000e+01, 7.6000e+01, 2.0000e+01, 0.0000e+00, 2.4200e+01,
         1.6980e+00, 2.8000e+01],
        [1.0000e+00, 7.7000e+01, 5.6000e+01, 3.0000e+01, 5.6000e+01, 3.3300e+01,
         1.2510e+00, 2.4000e+01],
        [6.0000e+00, 1.0900e+02, 6.0000e+01, 2.7000e+01, 0.0000e+00, 2.5000e+01,
         2.0600e-01, 2.7000e+01],
        [0.0000e+00, 1.3100e+02, 8.8000e+01, 0.0000e+00, 0.0000e+00, 3.1600e+01,

