In [1]:
import pandas as pd
from sklearn.feature_selection import mutual_info_classif
# Basic Import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
# Modelling
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
import warnings

In [2]:
df=pd.read_csv('data/diabetes.csv')

In [3]:
# Top five records
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


### Setting Up Validation Frame Work

In [30]:
# perfom train/validation/test using sklearn
from sklearn.model_selection import train_test_split, GridSearchCV

In [19]:
# Extract features (X) and target (y)
X = df.drop(columns='Outcome')

In [20]:
y = df['Outcome']

In [22]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [24]:
from sklearn.preprocessing import StandardScaler
# Apply StandardScaler to normalize the feature data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [25]:
X_train.shape, X_test.shape

((614, 8), (154, 8))

In [26]:
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [28]:
# Define the models and their hyperparameters
models = {
    "Logistic Regression": {
        "model": LogisticRegression(),
        "params": {
            "C": [0.1, 1, 10, 100],
            "solver": ["liblinear", "lbfgs"]
        }
    },
    "Decision Tree": {
        "model": DecisionTreeClassifier(),
        "params": {
            "max_depth": [None, 10, 20, 30],
            "min_samples_split": [2, 5, 10]
        }
    },
    "Random Forest": {
        "model": RandomForestClassifier(),
        "params": {
            "n_estimators": [10, 50, 100],
            "max_depth": [None, 10, 20],
            "min_samples_split": [2, 5]
        }
    },
    "Support Vector Machine": {
        "model": SVC(kernel='rbf'),
        "params": {
            "C": [0.1, 1, 10],
            "gamma": [0.001, 0.01, 0.1, 1]
        }
    },
    "K Neighbour": {
        "model": KNeighborsClassifier(),
        "params": {
            "n_neighbors": [3, 5, 7, 9],
            "weights": ["uniform", "distance"]
        }
    },
    "Gradient Boosting": {
        "model": GradientBoostingClassifier(),
        "params": {
            "n_estimators": [50, 100, 200],
            "learning_rate": [0.01, 0.1, 0.2],
            "max_depth": [3, 5, 7]
        }
    },
    "Naive Bayes": {
        "model": GaussianNB(),
        "params": {}
    },
    "Neural Networks": {
        "model": MLPClassifier(hidden_layer_sizes=(100,)),
        "params": {
            "hidden_layer_sizes": [(50,), (100,), (100, 50)],
            "activation": ["relu", "tanh"],
            "learning_rate": ["constant", "adaptive"]
        }
    }
}


In [31]:
# Perform hyperparameter tuning using GridSearchCV
best_models = {}
for name, model_info in models.items():
    clf = GridSearchCV(model_info["model"], model_info["params"], cv=5, n_jobs=-1, scoring='accuracy')
    clf.fit(X_train_scaled, y_train)
    best_models[name] = clf.best_estimator_
    print(f"{name}: Best Params: {clf.best_params_}, Best Score: {clf.best_score_:.4f}")

Logistic Regression: Best Params: {'C': 10, 'solver': 'liblinear'}, Best Score: 0.7655
Decision Tree: Best Params: {'max_depth': 20, 'min_samples_split': 5}, Best Score: 0.7264
Random Forest: Best Params: {'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 10}, Best Score: 0.7834
Support Vector Machine: Best Params: {'C': 1, 'gamma': 0.1}, Best Score: 0.7687
K Neighbour: Best Params: {'n_neighbors': 9, 'weights': 'distance'}, Best Score: 0.7444
Gradient Boosting: Best Params: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 50}, Best Score: 0.7785
Naive Bayes: Best Params: {}, Best Score: 0.7525
Neural Networks: Best Params: {'activation': 'tanh', 'hidden_layer_sizes': (50,), 'learning_rate': 'constant'}, Best Score: 0.7736




In [33]:
from sklearn.metrics import accuracy_score
# Evaluate the best models on the test set
for name, model in best_models.items():
    y_pred = model.predict(X_test_scaled)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{name}: Test Accuracy: {accuracy:.4f}")


Logistic Regression: Test Accuracy: 0.7532
Decision Tree: Test Accuracy: 0.7468
Random Forest: Test Accuracy: 0.7468
Support Vector Machine: Test Accuracy: 0.7338
K Neighbour: Test Accuracy: 0.6688
Gradient Boosting: Test Accuracy: 0.7662
Naive Bayes: Test Accuracy: 0.7662
Neural Networks: Test Accuracy: 0.7468
