# Previous model issues
Previous model use batch gradient descent.

1. Inefficient in memory

2. Better coverages

Solution: use batches of data. means don't load entire data at a time. rather load data batch wise. example: 1000 rows. divide this data into 10 batches means each batch size is 100. so load data into mini-batch. it is very efficient.



use two loop. one for epoch and others for batches

In [63]:
# simple program

"""
batch_size = 32
epoch = 25

n_samples = len(X_train_tensor)


for epoch in range(epochs):
  # simple loop over the dataset in chunks of 'batch_size'
  for start_idx in range(0, n_samples, batch_size):
    X_batch = X_train_tensor[start_idx:end_idx]
    y_batch = y_train_tensor[start_idx:end_idx]


    # forward pass
    y_pred = model(X_batch)
    loss = loss_function(y_pred, y_batch.view(-1, 1))


    # update step
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

  print(f"Epoch: {epoch}, Loss: {loss.item()}")

"""

"""
but there are some issues with this above code, are:
1. No standard interface for data
2. No easy way to apply transformations
3. Shuffling and sampling
4. Batch management & Parallelization


Solution: Dataset and Dataloader class
"""

'\nbut there are some issues with this above code, are:\n1. No standard interface for data\n2. No easy way to apply transformations\n3. Shuffling and sampling\n4. Batch management & Parallelization\n\n\nSolution: Dataset and Dataloader class\n'

# How Dataset and DataLoader class works

Dataset: it basically load the one by one row and send to DataLoader. DataLoader then make batch if it meets the batch size. before Dataset is loading there have some working mechanism. it shuffle the total data index and make group of random index arr. then based on this array of indexes, Dataset is loading those random rows one by one from memory and put to DataLoader class. then DataLoader class is received one by one till batch size is meet. then this batch is going to training process.

# Example of Dataset and Dataloader class

In [64]:
from sklearn.datasets import make_classification
import torch

In [65]:
# Step 1: Create a synthetic classification datset using sklearn
X, y = make_classification(
    n_samples=10, # number of samples
    n_features=2, # number of features
    n_informative=2, # number of informative features
    n_redundant=0, # number of redundant features
    n_classes=2, # number of classes
    random_state=42 # for reproducibility
)


In [66]:
print(X)
X.shape

print(y.shape)

[[ 1.06833894 -0.97007347]
 [-1.14021544 -0.83879234]
 [-2.8953973   1.97686236]
 [-0.72063436 -0.96059253]
 [-1.96287438 -0.99225135]
 [-0.9382051  -0.54304815]
 [ 1.72725924 -1.18582677]
 [ 1.77736657  1.51157598]
 [ 1.89969252  0.83444483]
 [-0.58723065 -1.97171753]]
(10,)


In [67]:
# convert the data to pytorch tensor
X = torch.tensor(X, dtype=torch.float32)
y = torch.tensor(y, dtype=torch.long)

X

tensor([[ 1.0683, -0.9701],
        [-1.1402, -0.8388],
        [-2.8954,  1.9769],
        [-0.7206, -0.9606],
        [-1.9629, -0.9923],
        [-0.9382, -0.5430],
        [ 1.7273, -1.1858],
        [ 1.7774,  1.5116],
        [ 1.8997,  0.8344],
        [-0.5872, -1.9717]])

In [68]:
# import Dataset and Dataloader
from torch.utils.data import Dataset, DataLoader

In [69]:
class CustomDataset(Dataset):

  def __init__(self, features, labels):
    self.features = features
    self.labels = labels


  def __len__(self):
    return self.features.shape[0]


  def __getitem__(self, index):
    # we can apply transformation here
    """
    resize, white and black, data augmentation
    """
    return self.features[index], self.labels[index]


In [70]:
# make a object of custom dataset
dataset = CustomDataset(X, y)

# length
print(len(dataset))

# 1st row
print(dataset[0])

10
(tensor([ 1.0683, -0.9701]), tensor(1))


In [71]:
# dataloader object or iterator
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)

In [72]:
for batch_features, batch_labels in dataloader:
  print(batch_features)
  print(batch_labels)
  print("_",+50)

tensor([[ 1.7273, -1.1858],
        [-2.8954,  1.9769]])
tensor([1, 0])
_ 50
tensor([[-0.9382, -0.5430],
        [-1.1402, -0.8388]])
tensor([1, 0])
_ 50
tensor([[ 1.0683, -0.9701],
        [-0.5872, -1.9717]])
tensor([1, 0])
_ 50
tensor([[-0.7206, -0.9606],
        [ 1.7774,  1.5116]])
tensor([0, 1])
_ 50
tensor([[-1.9629, -0.9923],
        [ 1.8997,  0.8344]])
tensor([0, 1])
_ 50


# A note about samplers
In PyTorch, the smapler in the DataLoader determines the strategy for selecting samples from the dataset during data loading. It controls how indices of the dataset are drawn for eadh batch.

# A note about collate_fn
we know that, collate_fn make batch after getitem return row by row to dataloader. by default collate do it, but collate_fn gives us powerfull customization.

we can add *padding* using collate_fn

# Training on breast cancer dataset using mini-batch Gradient Descent

In [73]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn

In [74]:
df = pd.read_csv("https://raw.githubusercontent.com/gscdit/Breast-Cancer-Detection/refs/heads/master/data.csv")

print(df.shape)
df.head()

(569, 33)


Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,


In [75]:
df.drop(columns=['id', 'Unnamed: 32'], inplace=True)
df.head()

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


# Train Test Split

In [76]:
# import necessary libraries and methods
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

# train_test_split
X_train, X_test, y_train, y_test = train_test_split(df.iloc[:, 1:], df.iloc[:, 0], test_size=0.3)

# StandardScaler on dataset
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)
print(X_train)

# LabelEncoder on output
labelEncoder = LabelEncoder()
y_train = labelEncoder.fit_transform(y_train)
y_test = labelEncoder.fit_transform(y_test)
print(y_train)

[[ 3.11245398e-01 -1.42610963e+00  2.45824247e-01 ... -4.79005036e-01
  -6.77554896e-01 -8.79404964e-01]
 [-3.86167087e-01 -1.38868328e+00 -3.92770588e-01 ... -4.43782254e-01
  -5.57732703e-01 -6.02556278e-02]
 [ 1.21824723e-01 -1.24599533e+00  8.82542976e-02 ... -9.92641390e-02
  -4.92375143e-01 -3.04292720e-01]
 ...
 [-1.25233617e+00 -4.36738691e-02 -1.23924135e+00 ... -1.02155682e+00
   4.25742962e-01 -1.94668834e-01]
 [ 2.56715204e-01 -8.88105866e-01  2.37509210e-01 ...  3.69365153e-01
   4.91589257e-02 -2.10383963e-04]
 [-5.00967496e-01 -1.79344384e-01 -5.29137193e-01 ... -4.39095961e-01
   9.42867647e-02 -6.48588843e-01]]
[0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 1 0 1 0 0 0 1 1 0 0 0 0 1 0 0 0 1 0 0 0 1 0
 0 1 0 1 0 0 1 1 0 0 0 1 0 1 0 1 0 1 1 0 0 0 0 0 1 0 0 1 0 0 1 0 0 1 0 1 1
 0 0 1 0 1 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0 1 0 1 0 1 1 0 0 1 0 1 0 0 0 0 0 0
 1 1 0 1 0 0 0 1 0 0 0 1 0 0 1 1 1 0 1 0 0 1 0 0 1 0 1 1 0 0 1 1 1 1 0 0 0
 0 0 0 0 1 0 1 0 0 0 1 0 1 0 1 0 0 0 1 1 0 0 0 0 0 1 0 1 1 1 1 0

# convert numpy to tensor

In [77]:
X_train_tensor = torch.from_numpy(X_train.astype(np.float32))
X_test_tensor = torch.from_numpy(X_test.astype(np.float32))
y_train_tensor = torch.from_numpy(y_train.astype(np.float32))
y_test_tensor = torch.from_numpy(y_test.astype(np.float32))

print(X_train_tensor.shape)
print(y_train_tensor.shape)

torch.Size([398, 30])
torch.Size([398])


# CustomDataset class


In [78]:
from torch.utils.data import Dataset, DataLoader
class CustomDataset(Dataset):

  def __init__(self, features, labels):
    self.features = features
    self.labels = labels

  def __len__(self):
    return len(self.features)

  def __getitem__(self, index):
    return self.features[index], self.labels[index]


In [79]:
train_dataset = CustomDataset(X_train_tensor, y_train_tensor)
test_dataset = CustomDataset(X_test_tensor, y_test_tensor)
train_dataset[10]

(tensor([-0.3517, -0.2542, -0.3030, -0.4592,  1.7867,  1.2005, -0.4803,  0.1282,
         -0.3731,  1.4770,  0.1633, -0.0454,  0.2067, -0.1228, -0.5898, -0.0664,
         -0.6162,  0.1833, -0.2715, -0.1469, -0.2532, -0.2095, -0.2291, -0.3588,
          0.6023,  0.2890, -0.6887, -0.0584, -0.5297,  0.2923]),
 tensor(0.))

In [80]:
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=True)

# Defining the model

In [81]:
class MySimpleNN(nn.Module):
  def __init__(self, num_features):

    super().__init__()
    self.linear = nn.Linear(num_features, 1)
    self.sigmoid = nn.Sigmoid()

  def forward(self, features):
    out = self.linear(features)
    out = self.sigmoid(out)
    return out

# Important parameters

In [82]:
learning_rate = 0.1
epochs = 25

print(X_train_tensor.shape)

torch.Size([398, 30])


In [83]:
# create model
model = MySimpleNN(X_train_tensor.shape[1])

# define optimizer
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

# define loss funtion
loss_function = nn.BCELoss()

# Training pipeline

In [84]:
# define loop
for epoch in range(epochs):

  for batch_features, batch_labels in train_loader:

    # forward pass
    y_pred = model(batch_features)

    # loss calculation
    loss = loss_function(y_pred, batch_labels.view(-1, 1))

    # clear gradients
    optimizer.zero_grad()

    # backward pass
    loss.backward()

    # parameter update
    optimizer.step()

    # print loss in each epoch
    print(f"Epoch: {epoch}, Loss: {loss.item()}")


Epoch: 0, Loss: 0.7512136697769165
Epoch: 0, Loss: 0.5193164944648743
Epoch: 0, Loss: 0.5004804730415344
Epoch: 0, Loss: 0.36741310358047485
Epoch: 0, Loss: 0.3064599633216858
Epoch: 0, Loss: 0.3418619930744171
Epoch: 0, Loss: 0.3099042475223541
Epoch: 0, Loss: 0.2591361105442047
Epoch: 0, Loss: 0.27880534529685974
Epoch: 0, Loss: 0.41776397824287415
Epoch: 0, Loss: 0.32786431908607483
Epoch: 0, Loss: 0.2518511712551117
Epoch: 0, Loss: 0.1742391586303711
Epoch: 1, Loss: 0.2257770448923111
Epoch: 1, Loss: 0.26974520087242126
Epoch: 1, Loss: 0.2235575020313263
Epoch: 1, Loss: 0.24522151052951813
Epoch: 1, Loss: 0.1941608041524887
Epoch: 1, Loss: 0.22483980655670166
Epoch: 1, Loss: 0.20809413492679596
Epoch: 1, Loss: 0.16432833671569824
Epoch: 1, Loss: 0.14975605905056
Epoch: 1, Loss: 0.10980750620365143
Epoch: 1, Loss: 0.17955522239208221
Epoch: 1, Loss: 0.17432358860969543
Epoch: 1, Loss: 0.21691611409187317
Epoch: 2, Loss: 0.10551977902650833
Epoch: 2, Loss: 0.2182556688785553
Epoch: 2

# Evaluation Metrix

In [85]:
# Model evaluation using test_loader
model.eval() # Set the model to evaluation mode
accuracy_list = []

with torch.no_grad():
  for batch_features, batch_labels in test_loader:
    # forward pass
    y_pred = model(batch_features)
    y_pred = (y_pred > 0.8).float() # convert probabilities to binary prediction

    # calculate accuracy for the current batch
    batch_accuracy = (y_pred.view(-1) == batch_labels).float().mean().item()
    accuracy_list.append(batch_accuracy)


# calculate oeverall accuracy
overall_accuracy = sum(accuracy_list) / len(accuracy_list)
print(f"Accuracy: {overall_accuracy:.4f}")

Accuracy: 0.9427
