In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.
import kagglehub

# Download the Pima Indians Diabetes dataset through kagglehub and keep
# whatever dataset_download returns in a `_path` variable.
organizations_uciml_pima_indians_diabetes_database_path = kagglehub.dataset_download(
    'organizations/uciml/pima-indians-diabetes-database'
)

print('Data source import complete.')


# **Building a Machine Learning Model**
## **Pytorch to predict the outcome**

## 1. Importing Dependencies

In [None]:
# Libraries used
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import dataloader

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

## 2. Importing data

In [None]:
# Read the diabetes CSV from the Kaggle input directory into a DataFrame
DATA_CSV = '/kaggle/input/pima-indians-diabetes-database/diabetes.csv'
data = pd.read_csv(DATA_CSV)

In [None]:
# Preview the first 5 rows of the freshly loaded data
data.head()

In [None]:
# Report the dataset dimensions between two banner lines
banner = "#" * 40
print(banner)
print(f"The shape of the data is: {data.shape}\nWith {data.shape[0]} samples and {data.shape[1]} columns")
print(banner)

## 3. Splitting the data for machine learning purpose
- 500 samples will be training data
- 268 samples for testing data

In [None]:
# Work on a deep copy so the loaded frame itself is never touched
data_split = data.copy(deep=True)

# Positional split: the first 500 rows train the model, the rest test it
TRAIN_ROWS = 500
train = data_split.iloc[:TRAIN_ROWS]
test = data_split.iloc[TRAIN_ROWS:]

# Report both shapes between banner lines
banner = "#" * 40
print(banner)
print(f"The shape of the train: {train.shape}\nThe shape of the test: {test.shape}")
print(banner)

## 4. Exploratory Data Analysis (EDA)
- On the training data

In [None]:
# Preview the first rows of the training split
train.head()

In [None]:
# Number of null values, summed over every column of the training split
null_count = int(train.isnull().sum().sum())

# Only state the conclusion that the computed count actually supports,
# instead of always claiming the data is free of nulls.
if null_count == 0:
    null_verdict = "-> There are no null values in this train data."
else:
    null_verdict = f"-> There are {null_count} null values in this train data."

print('#'*40)
print(f"The number of null values is: {null_count}\n\n{null_verdict}")
print('#'*40)

In [None]:
# Checking for duplicates: count fully duplicated rows in the training split
duplicate_count = int(train.duplicated().sum())

# Only state the conclusion that the computed count actually supports,
# instead of always claiming the data is free of duplicates.
if duplicate_count == 0:
    duplicate_verdict = "-> There are no duplicate values in the train data"
else:
    duplicate_verdict = f"-> There are {duplicate_count} duplicate rows in the train data"

print('#'*50)
print(f"The number of duplicate values in the train data is: {duplicate_count}\n\n{duplicate_verdict}")
print('#'*50)

In [None]:
# Distribution of the data: number of training rows for each Outcome value

distribution = train['Outcome'].value_counts()

print(f"The distribution of the data:\n{distribution}")

In [None]:
# Plotting the distribution

labels = ["No Diabetes(0)", "With Diabetes(1)"]

# Wedge sizes are read from the training labels rather than hard-coded, so
# the chart stays correct if the split or the data changes. reindex pins the
# order to [0, 1] so every size lines up with its label above.
outcome_counts = train['Outcome'].value_counts().reindex([0, 1], fill_value=0)
sizes = outcome_counts.tolist()

# First two colours of Seaborn's pastel palette, one per class
colors = sns.color_palette("pastel")[:2]
# Pull the "With Diabetes" wedge slightly out of the pie
explode = [0, 0.05]

# Create pie chart
plt.figure(figsize=(6,6))
plt.pie(sizes, labels=labels, autopct='%1.1f%%', colors=colors, startangle=140, explode=explode, wedgeprops={'edgecolor': 'black'})
plt.title("Distribution of Outcome", fontsize=12)
plt.show()

print('The data is not distributed uniformly between the two groups')

In [None]:
# Checking for outliers: one horizontal boxplot per feature on a 4x2 grid
cols = train.columns.drop('Outcome').tolist()
fig, axes = plt.subplots(4, 2, figsize=(12,8))
axes = axes.flatten()
for idx, feature in enumerate(cols):
    # vert=0 lays the box out horizontally
    train[feature].plot.box(vert=0, ax=axes[idx], title=feature)
fig.suptitle("Boxplots of Features in the Dataset", fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()

banner = '#' * 40
print(banner)
print('Since my dataset is small, I will leave the outliers')
print(banner)

## 5. Splitting data into features and target

In [None]:
# Feature matrix: every training column except the label
X = train.drop(columns='Outcome')

# Label vector
y = train['Outcome']

In [None]:
# Sanity check: shapes of the feature frame X and the label Series y
X.shape, y.shape

In [None]:
# Balancing the data: resample the training features/labels with SMOTE
df_balance = train.copy()
smote = SMOTE(random_state=256, sampling_strategy='auto')
X_resampled, y_resampled = smote.fit_resample(X, y)

# Independent copies, so later edits never touch the resampled originals
X_balanced, y_balanced = X_resampled.copy(), y_resampled.copy()

# Balanced features and label side by side in one new DataFrame
df_balanced = X_balanced.assign(Outcome=y_balanced)


In [None]:
# Checking if the data is balanced
balanced_counts = df_balanced['Outcome'].value_counts()
print(balanced_counts)

# Balanced means every class has the same number of rows; report what the
# counts actually show instead of asserting success unconditionally.
is_balanced = balanced_counts.nunique() == 1

print('#'*30)
print('Now the data is balanced' if is_balanced else 'The data is still NOT balanced')
print('#'*30)

In [None]:
# Fit the standardiser on the training features, then apply it to them
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [None]:
# Conversion to tensors
# Build the training tensors from the SMOTE-balanced samples and standardise
# them with the scaler fitted on the training features. The tensors used to
# be built from the raw, unbalanced X / y, which ignored both the balancing
# and the scaling steps while the test set was being scaled.
X_balance_scaled = torch.tensor(scaler.transform(X_resampled), dtype=torch.float32)
# view(-1, 1) gives the labels one column, matching the model's single output
y_balance = torch.tensor(y_resampled.to_numpy(), dtype=torch.float32).view(-1, 1)


## 6. Model building

### 6.1. The model

In [None]:
# model

class DiabetesClassification(nn.Module):
    """Feed-forward binary classifier with two hidden ReLU layers.

    Args:
        in_feature: number of input features per sample.
        h1: width of the first hidden layer.
        h2: width of the second hidden layer.
        out_feature: number of output units.

    The forward pass ends in a sigmoid, so outputs lie in (0, 1).
    """

    def __init__(self, in_feature=8, h1=8, h2=8, out_feature=1):
        super().__init__()
        # Layers are created in the order fc1, fc2, out so that a seeded
        # initialisation produces the same weights as before.
        self.fc1 = nn.Linear(in_feature, h1)
        self.fc2 = nn.Linear(h1, h2)
        self.out = nn.Linear(h2, out_feature)
        self.sig = nn.Sigmoid()

    def forward(self, x):
        """Return the sigmoid-activated output for the input batch ``x``."""
        hidden = x
        for layer in (self.fc1, self.fc2):
            hidden = F.relu(layer(hidden))
        return self.sig(self.out(hidden))


In [None]:
# Seed PyTorch's RNG before the model is built, so its initial weights repeat
torch.manual_seed(256)

# Build the network with its default layer sizes
model = DiabetesClassification()

# Binary cross-entropy loss, optimised with Adam
LEARNING_RATE = 0.02
criterion = nn.BCELoss()
optimizer = optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

### 6.2 Training

In [None]:
epochs = 100
losses = []

# Put the network in training mode explicitly, so re-running this cell after
# the evaluation cell (which calls model.eval()) still trains in train mode.
model.train()

for epoch in range(epochs):
    # Forward pass on the whole training set. Calling the module itself,
    # rather than model.forward(...), goes through nn.Module.__call__ so any
    # registered hooks run.
    pred = model(X_balance_scaled)

    # Calculate the loss and record it as a plain Python float
    loss = criterion(pred, y_balance)
    losses.append(loss.item())

    # Printing training output on every 10th iteration
    if epoch % 10 == 0:
        print(f"Epoch: {epoch+1} Loss: {loss.item()} ")

    # Back propagation: clear old gradients, compute new ones, update weights
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

print('Training complete')

In [None]:
# Plotting the learning curve: training loss recorded at every epoch
curve_fig, curve_ax = plt.subplots()
curve_ax.plot(range(epochs), losses)
curve_ax.set_title('The performance')
curve_ax.set_xlabel('Epochs')
curve_ax.set_ylabel('Losses')
plt.show()

## 7. Evaluation

In [None]:
# Preview the first rows of the held-out test split
test.head()

In [None]:
# Splitting test data into features and label by column name, the same way
# the training split is separated, so the result does not depend on
# 'Outcome' being the last column.
val = test.copy()

X_val = val.drop(columns='Outcome')
# Double brackets keep y_val a single-column DataFrame, so y_val.values has
# one column per row, which is the shape of the model output.
y_val = val[['Outcome']].copy()

In [None]:
# Scaling the test data
# Reuse the statistics the scaler learned from the training features.
# Refitting on the test set (fit_transform) would scale train and test
# differently and leak test-set statistics into the evaluation.
test_scaled = scaler.transform(X_val)
X_test = torch.FloatTensor(test_scaled)
y_test = torch.FloatTensor(y_val.values)

In [None]:
# Sanity check: shapes of the test feature and label tensors
X_test.shape, y_test.shape

In [None]:
# Switch the model to evaluation mode and score the test set without
# tracking gradients.
model.eval()
with torch.no_grad():
    # NOTE: this rebinds y_val (until now the test label frame) to the
    # model's outputs; the accuracy cell below reads these outputs.
    y_val = model(X_test)
    # Same criterion as in training, now against the test labels
    loss = criterion(y_val, y_test)
print(f"The loss is: {loss}")

In [None]:
# The model's forward pass already ends in a sigmoid, so y_val holds
# probabilities; threshold them at 0.5 to get hard 0/1 predictions.
y_pred = (y_val >= 0.5).float()

# Number of test samples and how many of them were predicted correctly
total = y_test.shape[0]
correct = int((y_pred == y_test).sum())

accuracy = correct / total * 100
print(f"Correct Predictions: {correct}/{total}")
print(f"Accuracy: {accuracy:.2f}%")

