In [1]:
import numpy as np
import torch
import torch.nn as nn
import pandas as pd
from sklearn.preprocessing import StandardScaler
from torch.utils.data import Dataset
import os

In [8]:
# Read the diabetes CSV into a pandas DataFrame. The bare name on the last
# line makes the notebook render the frame as this cell's output.
csv_path = "data/diabetes.csv"
data = pd.read_csv(csv_path)
data

Unnamed: 0,Number of times pregnant,Plasma glucose concentration,Diastolic blood pressure,Triceps skin fold thickness,2-Hour serum insulin,Body mass index,Age,Class
0,6,148,72,35,0,33.6,50,positive
1,1,85,66,29,0,26.6,31,negative
2,8,183,64,0,0,23.3,32,positive
3,1,89,66,23,94,28.1,21,negative
4,0,137,40,35,168,43.1,33,positive
...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,63,negative
764,2,122,70,27,0,36.8,27,negative
765,5,121,72,23,112,26.2,30,negative
766,1,126,60,0,0,30.1,47,positive


In [14]:
# Split the frame by column position:
#   x        -> every column except the last one, as a NumPy array
#   y_string -> the last column only, as a plain Python list
feature_columns = data.iloc[:, :-1]
label_column = data.iloc[:, -1]

x = feature_columns.values
y_string = list(label_column.values)


In [40]:
# Encode the string labels as numbers, since the network only works with
# numeric targets: "positive" -> 1, any other label -> 0
y_int = [1 if label == "positive" else 0 for label in y_string]

# Pack the encoded labels into a float64 NumPy array
y = np.array(y_int, dtype="float64")

$x' = \frac{x-\mu}{\sigma}$

In [39]:
# Standardize the features with scikit-learn's StandardScaler:
# first learn the scaling statistics from x, then apply them to x.
sc = StandardScaler()
x = sc.fit(x).transform(x)

In [41]:
# Turn the NumPy arrays into torch tensors
x = torch.tensor(x)
# Add a trailing axis to y so it is 2-D like x: one row per sample
# NOTE(review): torch.tensor keeps the NumPy dtype, so y is float64 here --
# confirm the training code casts to the dtype the model expects.
y = torch.unsqueeze(torch.tensor(y), dim=1)

# Show the resulting shapes (features first, then labels)
for tensor in (x, y):
    print(tensor.shape)

torch.Size([768, 7])
torch.Size([768, 1])


In [43]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset pairing each feature row with its label.

    The base class is referenced through its qualified path because this
    class deliberately keeps the name ``Dataset`` (callers construct it as
    ``Dataset(x, y)``), which rebinds the imported name. Writing
    ``class Dataset(Dataset)`` would subclass the previous definition of
    this very class whenever the cell is re-run.

    Args:
        x: Indexable collection of samples (e.g. a 2-D tensor, one row per
            sample).
        y: Indexable collection of labels with the same length as ``x``.

    Raises:
        ValueError: If ``x`` and ``y`` do not contain the same number of
            items.
    """

    def __init__(self, x, y):
        # Fail early on misaligned inputs instead of raising an IndexError
        # at some arbitrary point in the middle of an epoch.
        if len(x) != len(y):
            raise ValueError(
                f"x and y must have the same length, got {len(x)} and {len(y)}"
            )
        self.x = x
        self.y = y

    def __getitem__(self, index):
        """Return the ``(sample, label)`` pair stored at ``index``."""
        return self.x[index], self.y[index]

    def __len__(self):
        """Return the number of samples in the dataset."""
        return len(self.x)

In [45]:
# Wrap the feature and label tensors in the dataset class, then display
# how many samples it holds
dataset = Dataset(x=x, y=y)
len(dataset)

768

In [47]:
# Hand the dataset to a DataLoader, which serves it in shuffled
# mini-batches of 32 samples
train_loader = torch.utils.data.DataLoader(
    dataset,
    batch_size=32,
    shuffle=True,
)

In [55]:
# Looking at the data loader: report the number of batches, then the shapes
# of the first batch only.
# The loop uses its own names (batch_x / batch_y) on purpose: iterating with
# `for x, y in ...` would rebind the notebook-level x and y to a single
# batch, so any cell re-run afterwards would silently use one batch instead
# of the full data.
print(f"There is {len(train_loader)} batches in the dataset\n")
for batch_x, batch_y in train_loader:
    print("For one iteration (batch), there is:")
    print(f"Data: {batch_x.shape}")
    print(f"Labels: {batch_y.shape}")
    break  # one batch is enough to see the shapes

There is 24 batches in the dataset

For one iteration (batch), there is:
Data: torch.Size([32, 7])
Labels: torch.Size([32, 1])


Notice how x and y, originally of shape [768, 7] and [768, 1], have been split into 24 batches of shape [32, 7] and [32, 1].

In [57]:
# Building the model
class Model(nn.Module):
    """Fully connected network: four linear layers with tanh in between.

    Layer widths are input_features -> 5 -> 4 -> 3 -> output_features, and
    a sigmoid is applied to the final layer's output.
    """

    def __init__(self, input_features, output_features):
        super().__init__()
        # Linear layers, created in order from input to output
        self.fc1 = nn.Linear(in_features=input_features, out_features=5)
        self.fc2 = nn.Linear(in_features=5, out_features=4)
        self.fc3 = nn.Linear(in_features=4, out_features=3)
        self.fc4 = nn.Linear(in_features=3, out_features=output_features)
        # Activation modules shared by the forward pass
        self.sigmoid = nn.Sigmoid()
        self.tanh = nn.Tanh()

    def forward(self, x):
        """Run x through the layers and return the sigmoid of the last one."""
        hidden = self.tanh(self.fc1(x))
        hidden = self.tanh(self.fc2(hidden))
        hidden = self.tanh(self.fc3(hidden))
        return self.sigmoid(self.fc4(hidden))

In [59]:
# Create the network: 7 input features, 1 output
net = Model(input_features=7, output_features=1)

# Binary cross-entropy: the prediction and the target must have the same shape.
# reduction="mean" --> the losses are averaged over the observations of each minibatch
criterion = nn.BCELoss(reduction="mean")

# SGD with momentum (0.9) and a learning rate of 0.1
optimizer = torch.optim.SGD(params=net.parameters(), lr=0.1, momentum=0.9)