# PyTorch Training Pipeline Using `Dataset` and `DataLoader` in `nn.Module`

# Import Libraries

In [61]:
import pandas as pd
import numpy as np

In [62]:
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split

In [63]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

In [64]:
import kagglehub
import os

# Loading Dataset ...

In [65]:
path = kagglehub.dataset_download("ahmedesso/brest-cancer")
print("Path to dataset files:", path)

Using Colab cache for faster access to the 'brest-cancer' dataset.
Path to dataset files: /kaggle/input/brest-cancer


In [66]:
df = pd.read_csv('/root/.cache/kagglehub/datasets/ahmedesso/brest-cancer/versions/1/data.csv')
df.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,


# Data Analysis

In [67]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 33 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       569 non-null    int64  
 1   diagnosis                569 non-null    object 
 2   radius_mean              569 non-null    float64
 3   texture_mean             569 non-null    float64
 4   perimeter_mean           569 non-null    float64
 5   area_mean                569 non-null    float64
 6   smoothness_mean          569 non-null    float64
 7   compactness_mean         569 non-null    float64
 8   concavity_mean           569 non-null    float64
 9   concave points_mean      569 non-null    float64
 10  symmetry_mean            569 non-null    float64
 11  fractal_dimension_mean   569 non-null    float64
 12  radius_se                569 non-null    float64
 13  texture_se               569 non-null    float64
 14  perimeter_se             5

In [68]:
df.isnull().sum()

Unnamed: 0,0
id,0
diagnosis,0
radius_mean,0
texture_mean,0
perimeter_mean,0
area_mean,0
smoothness_mean,0
compactness_mean,0
concavity_mean,0
concave points_mean,0


In [69]:
df.duplicated().sum()

np.int64(0)

## Drop columns

In [70]:
df.drop(labels=['id', 'Unnamed: 32'], axis=1, inplace=True, errors='ignore')
df.isnull().sum().sum()

np.int64(0)

In [71]:
df.head()

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


# Train Test Split

In [72]:
X = df.drop('diagnosis', axis=1)
y = df['diagnosis']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=17)

X_train.shape, X_test.shape

((455, 30), (114, 30))

## Label Encoder

In [73]:
encoder = LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_test = encoder.transform(y_test)

## Scale Data

In [74]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

X_train_scaled.shape, X_test_scaled.shape

((455, 30), (114, 30))

In [75]:
X_train_scaled

array([[-6.15618727e-01,  1.29001280e+00, -5.99538702e-01, ...,
         8.78575670e-02, -5.02422353e-01, -7.49410125e-02],
       [-6.75751228e-02, -1.90713736e-04, -1.35161089e-01, ...,
        -4.99517635e-01, -8.99275604e-01, -4.30484541e-01],
       [-4.77890491e-01, -4.40915101e-01, -4.80938664e-01, ...,
        -1.38289475e-01,  2.09627101e-01, -3.98661201e-01],
       ...,
       [-1.11201424e+00, -1.60324004e+00, -1.09481914e+00, ...,
        -7.20200722e-01,  4.95792878e-02, -3.65191825e-01],
       [-3.54508947e-01, -7.58328001e-01, -3.43546367e-01, ...,
        -2.00517855e-01,  1.04906155e+00, -1.47915224e-01],
       [-4.92237183e-01, -9.59280053e-01, -5.57778125e-01, ...,
        -1.12149789e+00, -9.21378641e-03, -8.80400735e-01]])

# Convert Numpy to Tensor

In [76]:
# Features
X_train_tensor = torch.from_numpy(X_train_scaled).float()
X_test_tensor = torch.from_numpy(X_test_scaled).float()

# Label
y_train_tensor = torch.from_numpy(y_train).float()
y_test_tensor = torch.from_numpy(y_test).float()

In [77]:
X_train_tensor

tensor([[-6.1562e-01,  1.2900e+00, -5.9954e-01,  ...,  8.7858e-02,
         -5.0242e-01, -7.4941e-02],
        [-6.7575e-02, -1.9071e-04, -1.3516e-01,  ..., -4.9952e-01,
         -8.9928e-01, -4.3048e-01],
        [-4.7789e-01, -4.4092e-01, -4.8094e-01,  ..., -1.3829e-01,
          2.0963e-01, -3.9866e-01],
        ...,
        [-1.1120e+00, -1.6032e+00, -1.0948e+00,  ..., -7.2020e-01,
          4.9579e-02, -3.6519e-01],
        [-3.5451e-01, -7.5833e-01, -3.4355e-01,  ..., -2.0052e-01,
          1.0491e+00, -1.4792e-01],
        [-4.9224e-01, -9.5928e-01, -5.5778e-01,  ..., -1.1215e+00,
         -9.2138e-03, -8.8040e-01]])

# `Dataset` Class

In [78]:
class CustomDataset(Dataset):
  def __init__(self, features, labels):
    self.features = features
    self.labels = labels

  def __len__(self):
    return len(self.labels)

  def __getitem__(self, index):

    index_feature = self.features[index]
    index_label = self.labels[index]

    return index_feature, index_label

In [79]:
train_dataset = CustomDataset(features=X_train_tensor, labels=y_train_tensor)
test_dataset = CustomDataset(features=X_test_tensor, labels=y_test_tensor)

len(train_dataset), len(test_dataset)

(455, 114)

# `DataLoader` Class

In [80]:
train_dataloader = DataLoader(
    dataset = train_dataset,
    batch_size = 8,
    shuffle = True
  )

test_dataloader = DataLoader(
    dataset = test_dataset,
    batch_size = 8,
    shuffle = True
  )

# Custom Model

In [81]:
class MySimpleNN(nn.Module):

  def __init__(self, num_features):

    super().__init__()
    self.linear = nn.Linear(in_features=num_features, out_features=1)
    self.sigmoid = nn.Sigmoid()


  def forward(self, X):

    out = self.linear(X)
    out = self.sigmoid(out)

    return out

## Learning Rate and Epochs

In [82]:
epochs = 5
learning_rate = 0.01

## Model Creation

In [83]:
# Loss Fucntion
loss_function = nn.BCELoss()

# Create Model
model = MySimpleNN(X_train_tensor.shape[1])

# Optimizer
optimizer = torch.optim.SGD(params = model.parameters(), lr=learning_rate)

## Training Pipeline

In [84]:
# Loop for Epochs
for i in range(epochs):

  for batch_features, batch_labels in train_dataloader:

      # Calculate Y_pred
      y_pred = model(batch_features)

      # Calculate Loss
      loss = loss_function(y_pred, batch_labels.view(-1, 1))

      # Clear Gradient
      optimizer.zero_grad()

      # Backward Pass
      loss.backward()

      # Update Parameters
      optimizer.step()

  # Print Loss
  print(f'Epochs: {i+1}, Loss: {loss.item()}')

Epochs: 1, Loss: 0.35830897092819214
Epochs: 2, Loss: 0.4541926085948944
Epochs: 3, Loss: 0.18955421447753906
Epochs: 4, Loss: 0.07107676565647125
Epochs: 5, Loss: 0.25811532139778137


# Check Weights and Bias

In [85]:
model.linear.weight

Parameter containing:
tensor([[ 0.4104,  0.2089,  0.1063,  0.3686,  0.1489,  0.1373,  0.2261,  0.3149,
          0.0713, -0.2200,  0.0682, -0.1056,  0.0677,  0.3379, -0.0378, -0.0856,
          0.0840,  0.0082,  0.1262, -0.1142,  0.1716,  0.3321,  0.2488,  0.1199,
          0.2910,  0.0768,  0.0962,  0.3670,  0.1911,  0.2541]],
       requires_grad=True)

In [86]:
model.linear.bias

Parameter containing:
tensor([-0.2172], requires_grad=True)

# Evalation

In [87]:
# Model evaluation using test_loader
model.eval()  # Set the model to evaluation mode
accuracy_list = []

with torch.no_grad():
    for batch_features, batch_labels in test_dataloader:
        # Forward pass
        y_pred = model(batch_features)
        y_pred = (y_pred > 0.5).float()  # Convert probabilities to binary predictions

        # Calculate accuracy for the current batch
        batch_accuracy = (y_pred.view(-1) == batch_labels).float().mean().item()
        accuracy_list.append(batch_accuracy)

# Calculate overall accuracy
overall_accuracy = sum(accuracy_list) / len(accuracy_list)
print(f'Accuracy: {overall_accuracy:.4f}')

Accuracy: 0.9667
