## EDA


In [None]:
import pandas as pd


# Read the raw dataset from disk and preview its first rows.
dataset_path = "diabetes_prediction_dataset.csv"
df = pd.read_csv(dataset_path)
df.head()

## Feature engineering


In [None]:
from sklearn.model_selection import train_test_split


# "diabetes" is the label column; every other column is used as a feature.
x = df.drop(columns=["diabetes"])
y = df["diabetes"]

# Hold out 20% of the rows for testing. Stratifying on y preserves the class
# proportions in both splits; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)

print(*(split.shape for split in (x_train, y_train, x_test, y_test)))

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer


# Partition the feature columns by dtype: numeric columns are scaled,
# everything else is one-hot encoded.
numerical_features = list(x_train.select_dtypes(include="number").columns)
print(f"There are {len(numerical_features)} numerical features: {numerical_features}\n")

string_features = list(x_train.select_dtypes(exclude="number").columns)
print(f"There are {len(string_features)} string features: {string_features}\n")

# Numeric columns: fill gaps with the column mean, then standardize.
numeric_steps = [
    ("impute", SimpleImputer(strategy="mean")),
    ("scale", StandardScaler()),
]
numeric_pipeline = Pipeline(steps=numeric_steps)

# Non-numeric columns: fill gaps with the most frequent value, then one-hot
# encode into a dense array. handle_unknown="ignore" keeps transform() from
# failing on categories that were not present when the encoder was fitted.
string_steps = [
    ("impute", SimpleImputer(strategy="most_frequent")),
    ("encode", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
]
string_pipeline = Pipeline(steps=string_steps)

# Combine both pipelines into a single pre-processing object: the
# ColumnTransformer sends each column group through its own pipeline and
# concatenates the results.
column_routes = [
    ("number", numeric_pipeline, numerical_features),
    ("string", string_pipeline, string_features),
]
full_processor = ColumnTransformer(transformers=column_routes)

In [None]:
# Fit the preprocessor on the training split only, then apply the already
# fitted transformer to the held-out split so no test statistics leak in.
# NOTE: x_train / x_test are rebound to the transformed arrays here, so this
# cell is meant to be run once per kernel session (re-run the split cell first
# if it has to be executed again).
x_train = full_processor.fit_transform(X=x_train)
x_test = full_processor.transform(X=x_test)

print(*(split.shape for split in (x_train, x_test)))

In [None]:
import pickle

# Persist the fitted preprocessor so the exact same transformation can be
# re-applied later without refitting.
with open("preprocessor.pkl", mode="wb") as preprocessor_file:
    pickle.dump(full_processor, preprocessor_file)

In [None]:
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter


# Balance the training set by randomly dropping majority-class rows
# (seeded so the same rows are kept on every run).
rus = RandomUnderSampler(random_state=42)
x_train, y_train = rus.fit_resample(x_train, y_train)

# Show the per-class row counts after resampling.
class_counts = Counter(y_train)
print(sorted(class_counts.items()))

## Model definition, training and evaluation (scikit-learn version)


In [None]:
from sklearn.neural_network import MLPClassifier

# Two hidden layers (64 then 16 units). early_stopping=True lets scikit-learn
# hold out part of the training data and stop when the validation score stops
# improving, so max_iter is only an upper bound.
mlp_settings = {
    "hidden_layer_sizes": (64, 16),
    "max_iter": 1000,
    "early_stopping": True,
    "random_state": 42,
}
mlp = MLPClassifier(**mlp_settings)
mlp

In [None]:
# Train on the preprocessed, undersampled training split.
mlp.fit(X=x_train, y=y_train)

In [None]:
from sklearn.metrics import classification_report

# Score the scikit-learn model on the untouched test split.
y_pred = mlp.predict(x_test)
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns

# Raw counts: rows are the true classes, columns the predicted classes.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
sns.heatmap(data=cm, annot=True, fmt="d")

In [None]:
# Persist the trained scikit-learn classifier next to the preprocessor.
with open("model.pkl", mode="wb") as model_file:
    pickle.dump(mlp, model_file)

## Model definition and training (Keras version)


In [None]:
import keras
from keras import layers


# Seed Python, NumPy and the Keras backend so that weight initialisation and
# batch shuffling are repeatable, in line with the random_state=42 used by the
# scikit-learn steps in this notebook.
keras.utils.set_random_seed(42)

# A simple MLP with the same layout as the scikit-learn model: 64 -> 16 -> 1.
# The input width is declared with an explicit keras.Input rather than the
# legacy `input_dim=` layer argument, which Keras 3 warns against.
n_features = x_train.shape[1]
model = keras.Sequential(
    [
        keras.Input(shape=(n_features,)),
        layers.Dense(64, activation="relu"),
        layers.Dense(16, activation="relu"),
        # Single sigmoid unit: outputs the probability of the positive class.
        layers.Dense(1, activation="sigmoid"),
    ]
)

model.summary()

In [None]:
# Binary cross-entropy pairs with the single sigmoid output unit; accuracy is
# tracked for monitoring only.
model.compile(
    optimizer="adam",
    metrics=["accuracy"],
    loss="binary_crossentropy",
)

In [None]:
import numpy as np
from keras.callbacks import EarlyStopping
from sklearn.utils.class_weight import compute_class_weight

# Stop once val_loss has failed to improve by at least 0.001 for 5 epochs in
# a row, and roll the model back to the weights of its best epoch.
early_stopping = EarlyStopping(
    monitor="val_loss",
    min_delta=0.001,
    patience=5,
    restore_best_weights=True,
)

# RandomUnderSampler returns the rows grouped by class, and Keras carves
# `validation_split` off the END of the arrays before it shuffles anything.
# Without an explicit shuffle the validation set would hold a single class and
# early stopping would track a meaningless val_loss. New names are used so the
# cell stays re-runnable and x_train / y_train are left untouched.
shuffle_rng = np.random.default_rng(42)
shuffled_idx = shuffle_rng.permutation(len(y_train))
x_train_shuffled = np.asarray(x_train)[shuffled_idx]
y_train_shuffled = np.asarray(y_train)[shuffled_idx]

# "balanced" class weights from scikit-learn. After the undersampling step
# they come out equal; they are kept so training stays balanced if that step
# is ever changed or removed. Keys are built from the labels actually present
# instead of assuming they are exactly 0 and 1.
classes = np.unique(y_train_shuffled)
weights = compute_class_weight("balanced", classes=classes, y=y_train_shuffled)
class_weights = {int(label): float(weight) for label, weight in zip(classes, weights)}

history = model.fit(
    x_train_shuffled,
    y_train_shuffled,
    epochs=50,
    batch_size=16,
    validation_split=0.2,
    callbacks=[early_stopping],
    class_weight=class_weights,
)

In [None]:
# import matplotlib.pyplot as plt


# def plot_history(history):
#     # Plot training & validation accuracy values
#     plt.plot(history.history["accuracy"])
#     plt.plot(history.history["val_accuracy"])
#     plt.title("Model accuracy")
#     plt.ylabel("Accuracy")
#     plt.xlabel("Epoch")
#     plt.legend(["Train", "Test"], loc="upper left")
#     plt.show()

#     # Plot training & validation loss values
#     plt.plot(history.history["loss"])
#     plt.plot(history.history["val_loss"])
#     plt.title("Model loss")
#     plt.ylabel("Loss")
#     plt.xlabel("Epoch")
#     plt.legend(["Train", "Test"], loc="upper left")
#     plt.show()


# plot_history(history)

In [None]:
# evaluate precision and recall and f1-score
from sklearn.metrics import classification_report

# The sigmoid output is the predicted probability of the positive class.
y_pred_proba = model.predict(x=x_test)

In [None]:
# A sample is labelled positive only when its predicted probability exceeds
# this cut-off (well above the usual 0.5).
positive_threshold = 0.95
y_pred = y_pred_proba > positive_threshold

report = classification_report(y_test, y_pred)
print(report)

In [None]:
# show confusion matrix plotted with seaborn
import seaborn as sns
from sklearn.metrics import confusion_matrix


# normalize="pred" scales each column (predicted class) to sum to 1, so the
# diagonal reads as per-class precision.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred, normalize="pred")
sns.heatmap(data=cm, annot=True)

In [None]:
# Persist the trained Keras model in the native .keras format.
model.save(filepath="model.keras")