In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report


import dc_stat_think as dcst

## part 1 
1. Load the dataset using pandas and explore the various features of the dataset
2. Plot chart of the labels values
3. Create a heatmap of different features from the dataset.

In [2]:
df = pd.read_csv('diabetes.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [4]:
df.Outcome.unique()

array([1, 0])

## part 2
1. Build ANN models with 3 layers, 4 layers, and 5 layers, and note the accuracies.
2. Split the test set into 30%, 40%, and 10% and build the model.
3. Build a model with and without Dropout and BatchNorm and check the
difference inaccuracies.
4. Check the change inaccuracies for when the “NaN” in the data frame is
replaced by “0”, mean, and when completely removed.
5. Train the model with any 3 different batch sizes and check the
accuracies.
6. Train models for 10, 50, 100 , 200 epocs.


In [4]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [166]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=69)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


EPOCHS = 50
BATCH_SIZE = 64
LEARNING_RATE = 0.001

In [12]:
 
class BinaryClassification(nn.Module): 
    def __init__(self):
    
        super(BinaryClassification, self).__init__()
        # Number of input features is 12. 
        self.layer_1 = nn.Linear(8, 64) 
        self.layer_2 = nn.Linear(64, 64)
        self.layer_out = nn.Linear(64, 1)
        
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.1)
        self.batchnorm1 = nn.BatchNorm1d(64)
        self.batchnorm2 = nn.BatchNorm1d(64)
        
    def forward(self, inputs):
        x = self.relu(self.layer_1(inputs)) 
        x = self.batchnorm1(x)
        x = self.relu(self.layer_2(x))
        x = self.batchnorm2(x)
        x = self.dropout(x)
        x = self.layer_out(x)
    
        return x

In [13]:
device = torch.device('cpu')

In [14]:
model = BinaryClassification()
model.to(device)

print(model)

BinaryClassification(
  (layer_1): Linear(in_features=8, out_features=64, bias=True)
  (layer_2): Linear(in_features=64, out_features=64, bias=True)
  (layer_out): Linear(in_features=64, out_features=1, bias=True)
  (relu): ReLU()
  (dropout): Dropout(p=0.1, inplace=False)
  (batchnorm1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (batchnorm2): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)


In [15]:
df.iloc[:,:-1].astype('float32')

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6.0,148.0,72.0,35.0,0.0,33.599998,0.627,50.0
1,1.0,85.0,66.0,29.0,0.0,26.600000,0.351,31.0
2,8.0,183.0,64.0,0.0,0.0,23.299999,0.672,32.0
3,1.0,89.0,66.0,23.0,94.0,28.100000,0.167,21.0
4,0.0,137.0,40.0,35.0,168.0,43.099998,2.288,33.0
...,...,...,...,...,...,...,...,...
763,10.0,101.0,76.0,48.0,180.0,32.900002,0.171,63.0
764,2.0,122.0,70.0,27.0,0.0,36.799999,0.340,27.0
765,5.0,121.0,72.0,23.0,112.0,26.200001,0.245,30.0
766,1.0,126.0,60.0,0.0,0.0,30.100000,0.349,47.0


In [6]:
df.iloc[:,:-1].to_numpy().astype('float32')

array([[  6.   , 148.   ,  72.   , ...,  33.6  ,   0.627,  50.   ],
       [  1.   ,  85.   ,  66.   , ...,  26.6  ,   0.351,  31.   ],
       [  8.   , 183.   ,  64.   , ...,  23.3  ,   0.672,  32.   ],
       ...,
       [  5.   , 121.   ,  72.   , ...,  26.2  ,   0.245,  30.   ],
       [  1.   , 126.   ,  60.   , ...,  30.1  ,   0.349,  47.   ],
       [  1.   ,  93.   ,  70.   , ...,  30.4  ,   0.315,  23.   ]],
      dtype=float32)

In [16]:
class Dataset:
    def __init__(self, path, header = 'infer'):
        '''
        Reads a csv dataset with the assumption that the last column is a categorical label column.
        '''
        self.df = pd.read_csv(path, header = header)
        
        self.data = self.df.values[:, :-1]
        self.data = self.data.astype('float32')
        
        self.labels = self.df.values[:, -1]
        
        # If label is not a number, one-hot encode them
        if not np.issubdtype(self.labels.dtype, np.number):
            self.label_names = []
            for idx, name in enumerate(set(self.labels)):
                self.label_names.append(name)
                self.labels[self.labels == name] = idx
            self.labels = self.labels.astype('float32')
            
        self.labels = self.labels.reshape(-1, 1)
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        return (self.data[idx], self.labels[idx])
    
    def __repr__(self):
        return repr(self.df)
    
    def split_data(self, test_ratio = 0.3):
        '''
        Splits data into training and test sets.
        '''
        test_len = round(test_ratio * len(self.data))
        train_len = len(self.data) - test_len
        
        return random_split(self, [train_len, test_len])

In [18]:

EPOCHS = 50 
BATCH_SIZE = 64 
LEARNING_RATE = 0.001

In [19]:
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [20]:
def binary_acc(y_pred,y_test):
    y_pred_tag = torch.round(torch.sigmoid(y_pred))
    
    correct_results_sum = (y_pred_tag == y_test).sum().float()
    acc = correct_results_sum/y_test.shape[0]
    acc = torch.round(acc * 100)
    
    return acc

In [181]:
# scaler = StandardScaler()
# X_scaled = scaler.fit_transform(X)
# X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.30, random_state=45)

In [182]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [155]:
df.to_csv('no_columns.csv', header=False, index=False)

In [22]:
dataset = Dataset('no_columns.csv', header = None)

In [29]:
train_data, test_data = dataset.split_data(test_ratio = 0.3)


train_loader = DataLoader(train_data,
                          batch_size = 64
                         )
test_loader = DataLoader(test_data, batch_size = 64)

In [30]:
model.train()
for e in range(1,EPOCHS+1):
    epoch_loss = 0
    epoch_acc = 0
    for X_batch, y_batch in train_loader:
        x_batch, y_batch = X_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        
        y_pred = model(x_batch)
        
        loss = criterion(y_pred, y_batch.unsqueeze(1))
        acc = binary_acc(y_pred, y_batch.unsqueeze(1))
        
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    print(f'Epoch {e+0.03}: | Loss: {epoch_loss/len(train_loader):.5f} | Acc: {epoch_acc/len(train_loader):.3f}')
        

ValueError: Target size (torch.Size([64, 1, 1])) must be the same as input size (torch.Size([64, 1]))

## with orinial data 

In [183]:
dataset = Dataset('diabetes.csv', header = None)

ValueError: could not convert string to float: 'Pregnancies'