# Get the diabetes dataset

In [None]:
import requests

In [None]:
# Download the diabetes CSV from the plotly/datasets GitHub repository and
# store it next to the notebook under `dataset_file`.
dataset_file = 'diabetes.csv'
url = 'https://raw.githubusercontent.com/plotly/datasets/master/diabetes.csv'

# requests waits indefinitely by default; a timeout keeps the cell from
# hanging forever when the server does not answer.
response = requests.get(url, timeout=30)

if response.status_code == 200:
    # Write the raw bytes unchanged so the file matches what the server sent
    with open(dataset_file, 'wb') as f:
        f.write(response.content)
    print(f'File downloaded successfully to: {dataset_file}')
else:
    print(f'Failed to download the file. Status code: {response.status_code}')

# Deep learning: MLP
The code below is copied from https://www.pluralsight.com/guides/machine-learning-neural-networks-scikit-learn

For a detailed explanation, please see that page.

# Looking at the dataset

In [None]:
import pandas as pd  # Package for analysis, manipulation of Tabular data
import numpy as np 
import matplotlib.pyplot as plt
import sklearn
from sklearn.neural_network import MLPClassifier
from sklearn.neural_network import MLPRegressor

# Import necessary modules
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.metrics import r2_score

In [None]:
# Load the downloaded CSV file into a pandas DataFrame (a table of rows and columns)
df_org = pd.read_csv(filepath_or_buffer=dataset_file)
df_org

## Normalization

In [None]:
# Scale every predictor column by its maximum value; the target column is
# carried over unchanged.
target_column = ['Outcome']

# Keep the predictors in the same order as the columns of df_org. Building the
# list from a set difference gives an arbitrary order that can change between
# kernel sessions, which silently changes the column order of the feature matrix.
predictors = [column for column in df_org.columns if column not in target_column]

# Work on a copy so the raw frame df_org stays untouched
df = df_org.copy()
df[predictors] = df_org[predictors] / df_org[predictors].max()

# Summary statistics, one row per column
df.describe().transpose()

In [None]:
# Build the feature matrix and the target array from the normalized frame
X = df[predictors].values
y = df[target_column].values

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=40
)

# The target comes from a one-column frame, so flatten it to 1-D
y_train, y_test = y_train.ravel(), y_test.ravel()

for split in (X_train, X_test):
    print(split.shape)

In [None]:
# Train a multi-layer perceptron with three hidden layers of 8 neurons each.
# MLPClassifier is already imported in the imports cell at the top.
# random_state pins the weight initialisation and batch shuffling so the
# reported scores are the same on every "Restart & Run All".
mlp = MLPClassifier(
    hidden_layer_sizes=(8, 8, 8),
    activation='relu',
    solver='adam',
    max_iter=500,
    random_state=40,
)
mlp.fit(X_train, y_train)

# Predictions on both splits, used by the analysis cells below
predict_train = mlp.predict(X_train)
predict_test = mlp.predict(X_test)

# Analysis

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Training-set performance: confusion matrix first, then the classification report
train_confusion = confusion_matrix(y_train, predict_train)
train_report = classification_report(y_train, predict_train)
print(train_confusion)
print(train_report)

In [None]:
# Same evaluation on the held-out test set
print(confusion_matrix(y_true=y_test, y_pred=predict_test))
print(classification_report(y_true=y_test, y_pred=predict_test))

In [None]:
# NOTE(review): this cell repeats the test-set evaluation of the cell above;
# it is kept so the notebook's output stays the same, but it could be deleted.
for summary in (
    confusion_matrix(y_test, predict_test),
    classification_report(y_test, predict_test),
):
    print(summary)

# Using logistic regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Fit a logistic-regression classifier with scikit-learn's default settings;
# fit() returns the estimator itself, so construction and training are chained
model = LogisticRegression().fit(X_train, y_train)

# Class predictions for the held-out test rows
# (this overwrites the MLP's predict_test from the earlier cell)
predict_test = model.predict(X_test)

In [None]:
# Evaluate the logistic-regression predictions on the test set
test_confusion = confusion_matrix(y_test, predict_test)
print(test_confusion)
test_report = classification_report(y_test, predict_test)
print(test_report)

**Note**: logistic regression gives you similar accuracy and precision!

The main difference between logistic regression and the MLP:
    we can actually investigate further which factors have the most impact on the prediction

In [None]:
# Pair each logistic-regression coefficient with the feature it belongs to.
# The model was fitted on X = df[predictors].values, so the coefficients follow
# the order of `predictors`, which is not necessarily the column order of `df`.
coefficients = model.coef_[0]
feature_names = list(predictors)

# zip() would silently truncate on a length mismatch, e.g. when this cell is
# re-run after `predictors` has been reassigned further down the notebook.
if len(feature_names) != len(coefficients):
    raise ValueError(
        f"{len(feature_names)} feature names but {len(coefficients)} coefficients; "
        "re-run the cells that define `predictors` and fit `model`"
    )

# Map feature name -> coefficient
coefficients_dict = dict(zip(feature_names, coefficients))

# Order the features from the largest to the smallest absolute coefficient
coefficients_dict_sorted = dict(
    sorted(coefficients_dict.items(), key=lambda item: abs(item[1]), reverse=True)
)

# Print the sorted coefficients
print("Feature Impact:")
for feature, coef in coefficients_dict_sorted.items():
    print(f"{feature}: {coef:.3f}")


# Trim down input 

In [None]:
# Keep only a hand-picked subset of the input columns for the next experiment
predictors = [
    'Pregnancies',
    'BMI',
    'Insulin',
    'BloodPressure',
]

In [None]:
# Repeat the experiment using only the columns currently listed in `predictors`
X = df[predictors].values
y = df[target_column].values

# Same 70/30 split and seed as before, so the same rows land in each split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=40)
y_train = y_train.ravel()
y_test = y_test.ravel()
print(X_train.shape); print(X_test.shape)

# random_state pins the weight initialisation and batch shuffling so the
# reported scores are reproducible across runs.
mlp = MLPClassifier(
    hidden_layer_sizes=(8, 8, 8),
    activation='relu',
    solver='adam',
    max_iter=500,
    random_state=40,
)
mlp.fit(X_train, y_train)

predict_train = mlp.predict(X_train)
predict_test = mlp.predict(X_test)

# Test-set evaluation
print(confusion_matrix(y_test, predict_test))
print(classification_report(y_test, predict_test))

Yup, our accuracy and precision did go down, but they are still mostly good, because the linear model gave us a hint of
what actually matters and what does not.

# More layers, more neurons: the better?
We trained above with `(8,8,8)`: 3 layers with 8 neurons in each layer.

We now try a bigger one: `(20,30,50,30,20)`

In [None]:
# Train a larger network: five hidden layers of 20, 30, 50, 30 and 20 neurons.
# random_state pins the weight initialisation and batch shuffling, so the
# comparison with the smaller network does not depend on a lucky or unlucky run.
mlp = MLPClassifier(
    hidden_layer_sizes=(20, 30, 50, 30, 20),
    activation='relu',
    solver='adam',
    max_iter=500,
    random_state=40,
)
mlp.fit(X_train, y_train)

predict_train = mlp.predict(X_train)
predict_test = mlp.predict(X_test)

# Test-set evaluation
print(confusion_matrix(y_test, predict_test))
print(classification_report(y_test, predict_test))

Nope: still the same performance, but much slower to train...