In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# A simple training pipeline
This notebook is a demonstration of how to build a simple neural network from scratch and train it on a real life dataset and use the model to perform predictions

## Import tools

In [2]:
import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

## Load the dataset

In [3]:
df = pd.read_csv('https://raw.githubusercontent.com/gscdit/Breast-Cancer-Detection/refs/heads/master/data.csv')
df.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,


In [4]:
df.shape

(569, 33)

In [5]:
df.drop(columns=["id", "Unnamed: 32"], inplace=True)

In [6]:
# After dropping the `id` and `Unnamed: 32` columns
df.head()

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


## Train test split

In [7]:
X_train, X_test, y_train, y_test = train_test_split(df.iloc[:, 1:], df.iloc[:, 0], test_size=0.2)

## Scaling the data

In [8]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

## Label Encoding
Since the output comprises of values `M` and `B`, we need to label encode it since our model would not understand letters

In [9]:
encoder = LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_test = encoder.transform(y_test)

## Numpy arrays to PyTorch tensors

In [27]:
X_train_tensor = torch.from_numpy(X_train)
X_test_tensor = torch.from_numpy(X_test)
y_train_tensor = torch.from_numpy(y_train)
y_test_tensor = torch.from_numpy(y_test)

## Defining the model

### Important parameters
Define the `learning_rate` and the number of `epochs` to train your model with

In [28]:
learning_rate = 0.1
epochs = 25

In [29]:
class MySimpleNN():
    
    def __init__(self, X):
        self.X = X
        self.weights = torch.rand((X.shape[1], 1), dtype=torch.float64, requires_grad=True)
        self.bias = torch.rand(1, dtype=torch.float64, requires_grad=True)

    def forward(self, X):
        z = torch.matmul(X, self.weights) + self.bias
        y_pred = torch.sigmoid(z)
        return y_pred

    def loss(self, y_pred, y_train):
        # clamp to prevent log(0)
        EPSILON = 1e-7
        y_pred = torch.clamp(y_pred, EPSILON, 1 - EPSILON)
        loss = - (y_train * torch.log(y_pred) + (1 - y_train) * torch.log(y_pred)).mean()
        return loss

Training pipeline is defined as:
```
For loop from 1 to epochs:
    Forward pass
    Calculate loss
    Backward pass
    Update parameters
```

In [30]:
model = MySimpleNN(X_train_tensor)

In [31]:
for i in range(1, epochs + 1):

    # forward pass
    y_pred = model.forward(X_train_tensor)

    # calculate loss
    loss = model.loss(y_pred, y_train_tensor)

    # backward pass
    loss.backward()

    # update parameters
    with torch.no_grad():
        model.weights -= learning_rate * model.weights.grad
        model.bias -= learning_rate * model.bias.grad

    # zero gradients
    model.weights.grad.zero_()
    model.bias.grad.zero_()

    print(f"Epoch {i} / {epochs}: Loss: {loss.item()}")

Epoch 1 / 25: Loss: 3.025738525584375
Epoch 2 / 25: Loss: 2.7856547006638657
Epoch 3 / 25: Loss: 2.5489134023313205
Epoch 4 / 25: Loss: 2.3162605249766295
Epoch 5 / 25: Loss: 2.088636913312271
Epoch 6 / 25: Loss: 1.8672274232724744
Epoch 7 / 25: Loss: 1.6535146203218625
Epoch 8 / 25: Loss: 1.4493231192053968
Epoch 9 / 25: Loss: 1.2568242026078285
Epoch 10 / 25: Loss: 1.0784528216588718
Epoch 11 / 25: Loss: 0.9166848998357351
Epoch 12 / 25: Loss: 0.77365573517568
Epoch 13 / 25: Loss: 0.6506903322742491
Epoch 14 / 25: Loss: 0.5479341088358067
Epoch 15 / 25: Loss: 0.46428886136088837
Epoch 16 / 25: Loss: 0.39768405355499553
Epoch 17 / 25: Loss: 0.3455208029782115
Epoch 18 / 25: Loss: 0.30510207683663876
Epoch 19 / 25: Loss: 0.2739415274238381
Epoch 20 / 25: Loss: 0.24992327193456282
Epoch 21 / 25: Loss: 0.23133942666649085
Epoch 22 / 25: Loss: 0.21685488732416552
Epoch 23 / 25: Loss: 0.2054441526868665
Epoch 24 / 25: Loss: 0.19632759319846943
Epoch 25 / 25: Loss: 0.1889177251276414


## Evaluation

In [32]:
with torch.no_grad():
    y_pred_test = model.forward(X_test_tensor)
    y_pred_test = (y_pred_test > 0.9).float()
    accuracy = (y_pred_test == y_test_tensor).float().mean()
    print(f"Accuracy: {accuracy.item()}")

Accuracy: 0.5603262782096863


## Defining an nn.Module model

In [33]:
import torch.nn as nn

In [40]:
class NNModel(nn.Module):
    def __init__(self, num_features):
        super().__init__()
        self.linear = nn.Linear(num_features, 1)
        self.sigmoid = nn.Sigmoid()
    def forward(self, features):
        out = self.linear(features)
        out = self.sigmoid(out)
        return out

In [41]:
learning_rate = 0.1
epochs = 25

In [55]:
loss_function = nn.BCELoss()

In [58]:
model = NNModel(X_train_tensor.shape[1])
optimizer = torch.optim.SGD(model.parameters, lr=learning_rate)
for epoch in range(epochs):
    y_pred = model(X_train_tensor)
    loss = loss_function(y_pred, y_train_tensor.view(-1, 1))
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    print(f"Epoch: {epoch + 1}, Loss: {loss.item()}")
    

TypeError: 'method' object is not iterable

In [43]:
with torch.no_grad():
    y_pred_test = model.forward(X_test_tensor)
    y_pred_test = (y_pred_test > 0.9).float()
    accuracy = (y_pred_test == y_test_tensor).float().mean()
    print(f"Accuracy: {accuracy.item()}")

Accuracy: 0.6228070259094238
