# Get the diabete dataset

In [1]:
import requests

In [2]:
dataset_file='diabetes.csv'
url = 'https://raw.githubusercontent.com/plotly/datasets/master/diabetes.csv'

response = requests.get(url)

if response.status_code == 200:
    with open(dataset_file, 'wb') as f:
        f.write(response.content)
    print(f'File downloaded successfully to: {dataset_file}')
else:
    print(f'Failed to download the file. Status code: {response.status_code}')

File downloaded successfully to: diabetes.csv


# Deep learning: MLP
Below code are copied from https://www.pluralsight.com/guides/machine-learning-neural-networks-scikit-learn

For detail explaination, please see that page

# Looking at the dataset

In [3]:
import pandas as pd  # Package for analysis, manipulation of Tabular data
import numpy as np 
import matplotlib.pyplot as plt
import sklearn
from sklearn.neural_network import MLPClassifier
from sklearn.neural_network import MLPRegressor

# Import necessary modules
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.metrics import r2_score

In [4]:
# Read the csv into a "DataFrame", aka Table
df_org = pd.read_csv(dataset_file) 
df_org

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


## Normalization

In [5]:
df = pd.DataFrame(columns = df_org.columns)
target_column = ['Outcome'] 
predictors = list(set(list(df_org.columns))-set(target_column))
df[predictors] = df_org[predictors]/df_org[predictors].max()
df[target_column] = df_org[target_column]
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Pregnancies,768.0,0.22618,0.19821,0.0,0.058824,0.176471,0.352941,1.0
Glucose,768.0,0.60751,0.160666,0.0,0.497487,0.58794,0.704774,1.0
BloodPressure,768.0,0.566438,0.158654,0.0,0.508197,0.590164,0.655738,1.0
SkinThickness,768.0,0.207439,0.161134,0.0,0.0,0.232323,0.323232,1.0
Insulin,768.0,0.094326,0.136222,0.0,0.0,0.036052,0.150414,1.0
BMI,768.0,0.47679,0.117499,0.0,0.406855,0.4769,0.545455,1.0
DiabetesPedigreeFunction,768.0,0.19499,0.136913,0.032231,0.100723,0.153926,0.258781,1.0
Age,768.0,0.410381,0.145188,0.259259,0.296296,0.358025,0.506173,1.0
Outcome,768.0,0.348958,0.476951,0.0,0.0,0.0,1.0,1.0


In [6]:
X = df[predictors].values
y = df[target_column].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=40)
y_train = y_train.ravel()
y_test = y_test.ravel()
print(X_train.shape); print(X_test.shape)

(537, 8)
(231, 8)


In [7]:
from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier(hidden_layer_sizes=(8,8,8), activation='relu', solver='adam', max_iter=500)
mlp.fit(X_train,y_train)

predict_train = mlp.predict(X_train)
predict_test = mlp.predict(X_test)

# Analysis

In [8]:
from sklearn.metrics import classification_report,confusion_matrix

print(confusion_matrix(y_train,predict_train))
print(classification_report(y_train,predict_train))

[[319  39]
 [ 75 104]]
              precision    recall  f1-score   support

           0       0.81      0.89      0.85       358
           1       0.73      0.58      0.65       179

    accuracy                           0.79       537
   macro avg       0.77      0.74      0.75       537
weighted avg       0.78      0.79      0.78       537



In [9]:
print(confusion_matrix(y_test,predict_test))
print(classification_report(y_test,predict_test))

[[124  18]
 [ 41  48]]
              precision    recall  f1-score   support

           0       0.75      0.87      0.81       142
           1       0.73      0.54      0.62        89

    accuracy                           0.74       231
   macro avg       0.74      0.71      0.71       231
weighted avg       0.74      0.74      0.74       231



In [10]:
print(confusion_matrix(y_test,predict_test))
print(classification_report(y_test,predict_test))

[[124  18]
 [ 41  48]]
              precision    recall  f1-score   support

           0       0.75      0.87      0.81       142
           1       0.73      0.54      0.62        89

    accuracy                           0.74       231
   macro avg       0.74      0.71      0.71       231
weighted avg       0.74      0.74      0.74       231



# Using linear regression

In [11]:
from sklearn.linear_model import LogisticRegression

In [12]:
# Create a logistic regression model
model = LogisticRegression()

# Fit the model
model.fit(X_train, y_train)

# Predict the test set results
predict_test = model.predict(X_test)

In [13]:
print(confusion_matrix(y_test,predict_test))
print(classification_report(y_test,predict_test))

[[129  13]
 [ 46  43]]
              precision    recall  f1-score   support

           0       0.74      0.91      0.81       142
           1       0.77      0.48      0.59        89

    accuracy                           0.74       231
   macro avg       0.75      0.70      0.70       231
weighted avg       0.75      0.74      0.73       231



**Note**: linear regression is give you similar accuracy and precision !

The main difference between Linear regression compare to MLP: 
    we can actually investigate further what factor have the most impact on the prediction

In [14]:
# Get the coefficients and feature names
coefficients = model.coef_[0]
feature_names = df.columns[:-1]  # Assuming the last column is the target variable

# Create a dictionary to store feature names and their corresponding coefficients
coefficients_dict = dict(zip(feature_names, coefficients))

# Sort the dictionary by absolute coefficient values
coefficients_dict_sorted = {k: v for k, v in sorted(coefficients_dict.items(), key=lambda item: abs(item[1]), reverse=True)}

# Print the sorted coefficients
print("Feature Impact:")
for feature, coef in coefficients_dict_sorted.items():
    print(f"{feature}: {coef:.3f}")


Feature Impact:
Pregnancies: 4.199
SkinThickness: 2.671
DiabetesPedigreeFunction: 1.860
BloodPressure: 1.472
BMI: 0.988
Age: -0.689
Insulin: 0.686
Glucose: -0.406


# Trim down input 

In [15]:
predictors=['Pregnancies','BMI','Insulin','BloodPressure']

In [16]:
X = df[predictors].values
y = df[target_column].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=40)
y_train = y_train.ravel()
y_test = y_test.ravel()
print(X_train.shape); print(X_test.shape)

mlp = MLPClassifier(hidden_layer_sizes=(8,8,8), activation='relu', solver='adam', max_iter=500)
mlp.fit(X_train,y_train)

predict_train = mlp.predict(X_train)
predict_test = mlp.predict(X_test)

print(confusion_matrix(y_test,predict_test))
print(classification_report(y_test,predict_test))

(537, 4)
(231, 4)
[[121  21]
 [ 61  28]]
              precision    recall  f1-score   support

           0       0.66      0.85      0.75       142
           1       0.57      0.31      0.41        89

    accuracy                           0.65       231
   macro avg       0.62      0.58      0.58       231
weighted avg       0.63      0.65      0.62       231



Yup, our accuracy and precission did go down but mostly still good because the linear model gave us the hint of 
what actually matter, what not

# More layer, more Neuron the better ?
We trained above with `(8,8,8)` : 3 layers with 8 neurons in each layer

We now try a big one `(20,30,50,30,20)`

In [17]:
mlp = MLPClassifier(hidden_layer_sizes=(20,30,50,30,20), activation='relu', solver='adam', max_iter=500)
mlp.fit(X_train,y_train)

predict_train = mlp.predict(X_train)
predict_test = mlp.predict(X_test)

print(confusion_matrix(y_test,predict_test))
print(classification_report(y_test,predict_test))

[[123  19]
 [ 64  25]]
              precision    recall  f1-score   support

           0       0.66      0.87      0.75       142
           1       0.57      0.28      0.38        89

    accuracy                           0.64       231
   macro avg       0.61      0.57      0.56       231
weighted avg       0.62      0.64      0.60       231



Nope: still the same performance but much slower to train ...