In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.naive_bayes import GaussianNB
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, classification_report

In [92]:
# Read the raw dataset; the relative path is resolved against the kernel's
# current working directory.
diabetes_data = pd.read_csv(filepath_or_buffer='Diabetes.csv')

In [93]:
# Plain-text preview of the first five rows.
print(diabetes_data.head(n=5))

   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  


In [94]:
# (rows, columns) of the loaded frame; the label starts with a newline, so a
# blank line precedes the printed text.
dataset_shape = diabetes_data.shape
print("\nDataset shape:", dataset_shape)


Dataset shape: (768, 9)


In [95]:
# Count NaN/None entries per column.
# NOTE(review): the preview above shows zeros in columns such as Insulin and
# SkinThickness; this check does not count those. Confirm whether zeros are
# placeholders for missing measurements.
missing_per_column = diabetes_data.isna().sum()
print("Missing values:\n", missing_per_column)


Missing values:
 Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64


In [96]:
# The 'Outcome' column is the target; every remaining column is a predictor.
y = diabetes_data['Outcome']
X = diabetes_data.drop(columns='Outcome')

# Hold out 10% of the rows for testing. stratify=y keeps the class
# proportions the same in both parts, and the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.10,
    random_state=42,
    stratify=y,
)

print("Training Set Shape:", X_train.shape)
print("Testing Set Shape:", X_test.shape)

Training Set Shape: (691, 8)
Testing Set Shape: (77, 8)


**Simple Decision Tree**

In [97]:
# Baseline model: a decision tree with default hyperparameters and a fixed
# seed. fit() returns the estimator itself, so it can be chained here.
tree_default = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Class predictions for the test rows.
basic_preds = tree_default.predict(X_test)

def display_metrics(y_true, y_pred, model_title):
    """Print headline classification scores followed by the per-class report.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, in the same order as ``y_true``.
    model_title : str
        Name inserted into the heading line.

    Returns
    -------
    None
        All results are written to stdout.

    Notes
    -----
    Precision, recall and F1 are computed with scikit-learn's default
    arguments. Each headline score is printed with three decimals.
    """
    # Label text (including its padding and colon) paired with the scorer
    # that produces the value printed after it.
    headline_scores = (
        ("Accuracy      : ", accuracy_score),
        ("Precision     : ", precision_score),
        ("Recall        : ", recall_score),
        ("F1-Score      : ", f1_score),
    )

    print(f"\nResults for {model_title}:")
    for label, scorer in headline_scores:
        print(f"{label}{scorer(y_true, y_pred):.3f}")

    print("\nDetailed Report:\n")
    print(classification_report(y_true, y_pred, digits=3))

# Score the baseline tree on the test split.
display_metrics(
    y_true=y_test,
    y_pred=basic_preds,
    model_title="Basic Decision Tree",
)


Results for Basic Decision Tree:
Accuracy      : 0.740
Precision     : 0.640
Recall        : 0.593
F1-Score      : 0.615

Detailed Report:

              precision    recall  f1-score   support

           0      0.788     0.820     0.804        50
           1      0.640     0.593     0.615        27

    accuracy                          0.740        77
   macro avg      0.714     0.706     0.710        77
weighted avg      0.736     0.740     0.738        77



**Hyperparameter Tuning**

In [98]:
def tune_decision_tree(X, y, param_grid=None, cv=5, scoring='accuracy',
                       random_state=42):
    """Grid-search decision-tree hyperparameters with cross-validation.

    Parameters
    ----------
    X : array-like or DataFrame
        Training features.
    y : array-like
        Training labels.
    param_grid : dict, optional
        Mapping of ``DecisionTreeClassifier`` parameter names to the values
        to try. When omitted, a grid over ``max_depth``,
        ``min_samples_split`` and ``min_samples_leaf`` is used.
    cv : int or cross-validation splitter, default 5
        Passed straight to ``GridSearchCV``.
    scoring : str or callable, default 'accuracy'
        Metric used to rank parameter combinations.
    random_state : int, default 42
        Seed for the tree estimator so repeated searches give the same trees.

    Returns
    -------
    GridSearchCV
        The fitted search object. ``refit`` is left at its default, so
        ``best_estimator_`` is already trained on all of ``X``/``y`` with the
        best parameters found.
    """
    # Build the default grid per call so a caller can never mutate a shared
    # default dictionary.
    if param_grid is None:
        param_grid = {
            'max_depth': [3, 5, 7, 9, None],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4],
        }

    search = GridSearchCV(
        DecisionTreeClassifier(random_state=random_state),
        param_grid=param_grid,
        cv=cv,
        scoring=scoring,
        n_jobs=-1,  # run the candidate fits in parallel on all processors
    )

    search.fit(X, y)
    return search

grid_result = tune_decision_tree(X_train, y_train)

print("\nBest Hyperparameters Found:")
print(grid_result.best_params_)

# The search is created with GridSearchCV's default refit=True, so
# best_estimator_ has already been retrained on the full training data with
# the winning parameters. Calling fit() on it again would only repeat that
# work, so it is used as-is.
optimized_tree = grid_result.best_estimator_

optimized_preds = optimized_tree.predict(X_test)

display_metrics(y_test, optimized_preds, "Optimized Decision Tree")


Best Hyperparameters Found:
{'max_depth': 5, 'min_samples_leaf': 4, 'min_samples_split': 2}

Results for Optimized Decision Tree:
Accuracy      : 0.779
Precision     : 0.708
Recall        : 0.630
F1-Score      : 0.667

Detailed Report:

              precision    recall  f1-score   support

           0      0.811     0.860     0.835        50
           1      0.708     0.630     0.667        27

    accuracy                          0.779        77
   macro avg      0.760     0.745     0.751        77
weighted avg      0.775     0.779     0.776        77

