<h1>CLASSIFYING DIABETES PATIENTS USING MLP AND RIDGE CLASSIFICATION<h2>

# importing libraries

In [1]:
import numpy as np
import pandas as pd
import plotly.offline as py
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.figure_factory as ff
import cufflinks as cf
cf.set_config_file(offline=True, sharing=False, theme='ggplot');

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score as ap
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import RidgeClassifier

# Functions to calculate and compare accuracy scores

In [3]:
def compare_models(models):
    training_acc = [acc_score(model, tr, tr['Outcome']) for model in models.values()]
    validation_acc = [np.mean(cross_val_score(model, tr, tr['Outcome'], scoring=acc_score, cv=5)) 
                       for model in models.values()]
    test_acc = [acc_score(model, te, te['Outcome']) for model in models.values()]
    names = list(models.keys())
    fig = go.Figure([
        go.Bar(x = names, y = training_acc, name="Training Accuracy"),
        go.Bar(x = names, y = validation_acc, name="CV Accuracy"),
        go.Bar(x = names, y = test_acc, name="Test Accuracy", opacity=.3)])
    fig.update_yaxes(title="ACCURACY COMPARISON")
    fig.write_image("comparisons.png")
    return fig

In [4]:
def phi(df):
    return df[['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']]

In [5]:
def acc_score(model, X, y):
    y_pred = model.predict(X)
    return ap(y_pred, y, normalize=True)

In [6]:
data = pd.read_csv('dataset_cleaned.csv')
tr, te = train_test_split(data, test_size=0.1, random_state=5)

In [7]:
quantitative_features = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']
ridge_model = Pipeline([
    ("SelectColumns", ColumnTransformer([
        ("keep", "passthrough", quantitative_features),
    ])),
    ("Imputation", SimpleImputer()),
    ("LinearModel", RidgeClassifier(alpha=10))
])
models = {}
alphas = np.linspace(0.5, 10, 30)
cv_values = []
train_values = []
test_values = []
for alpha in alphas:
    ridge_model.set_params(LinearModel__alpha=alpha)
    cv_values.append(np.mean(cross_val_score(ridge_model, tr, tr['Outcome'], scoring=acc_score, cv=5)))
    ridge_model.fit(tr, tr['Outcome'])
    train_values.append(acc_score(ridge_model, tr, tr['Outcome']))
    test_values.append(acc_score(ridge_model, te, te['Outcome']))

Adding the best model:

In [8]:
best_alpha = alphas[np.argmax(cv_values)]
ridge_model.set_params(LinearModel__alpha=best_alpha)
ridge_model.fit(tr, tr['Outcome'])
models["RidgeN(alpha_best)"] = ridge_model
y = ridge_model.predict(te);
print(ap(y, te['Outcome'], normalize=True))

0.8051948051948052


In [9]:
MLP_model = Pipeline([
    ("SelectColumns", ColumnTransformer([
        ("keep", "passthrough", quantitative_features),
    ])),
    ("Imputation", SimpleImputer()),
    ("LinearModel", MLPClassifier())
])
models["MLPClassifier"] = MLP_model
MLP_model.fit(tr, tr['Outcome'])
y = MLP_model.predict(te)
print(ap(y, te['Outcome'], normalize=True))
compare_models(models)

0.6493506493506493



Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.

