In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sqlalchemy import create_engine
from sklearn.model_selection import GridSearchCV

# Function to record performance metrics to a CSV
def log_results_to_csv(model_name, hyperparameters, accuracy, confusion_matrix, classification_report, importances, filename='f_model_eval_log.csv'):
    """Append one model-evaluation record as a single row of a CSV log.

    The header row is written only when the log file does not exist yet (or
    is empty); every later call appends a data row without a header.

    Parameters
    ----------
    model_name : str
        Label stored in the 'Model' column.
    hyperparameters : object
        Stored as ``str(hyperparameters)`` in the 'Hyperparameters' column.
    accuracy : float
        Stored unchanged in the 'Accuracy' column.
    confusion_matrix : object
        Stored as ``str(confusion_matrix)`` in the 'Confusion Matrix' column.
        (The parameter name shadows any same-named import inside this function only.)
    classification_report : object
        Stored as ``str(classification_report)`` in the 'Classification Report' column.
    importances : object
        Stored as ``str(importances)``; passing ``None`` is logged as the text 'None'.
    filename : str, optional
        Path of the CSV log. Defaults to 'f_model_eval_log.csv'.

    Returns
    -------
    None
    """
    import os  # local import so the function does not depend on another cell's imports

    results = {
        'Model': model_name,
        'Hyperparameters': str(hyperparameters),
        'Accuracy': accuracy,
        'Confusion Matrix': str(confusion_matrix),
        'Classification Report': str(classification_report),
        'Feature Importances': str(importances)
    }
    # Single-row frame so pandas takes care of CSV quoting/escaping
    results_df = pd.DataFrame([results])

    # Opening in append mode creates a missing file instead of raising
    # FileNotFoundError, so the header decision has to be made explicitly;
    # otherwise a fresh log would never receive its column names.
    needs_header = not os.path.isfile(filename) or os.path.getsize(filename) == 0
    results_df.to_csv(filename, mode='a', header=needs_header, index=False)

# Read the whole `diabetes` table from the local SQLite database
engine = create_engine('sqlite:///diabetesData.db')
diabetes_df = pd.read_sql_query('select * from diabetes', con=engine)

# Target vector is the "diabetes" column; all remaining columns are features
y = diabetes_df["diabetes"].to_numpy()
X = diabetes_df.drop(columns="diabetes")

# Hold out a test split (default proportions, fixed seed so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

# Standardise the features: the scaler is fitted on the training split only
# and the same fitted transform is then applied to both splits
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

Model 1. Random Forest

In [11]:
# --- Model 1: Initial Random Forest Classifier (Baseline Model) ---
# 500-tree forest with a fixed seed, trained on the scaled training split
rf_model = RandomForestClassifier(n_estimators=500, random_state=78).fit(X_train_scaled, y_train)

# Predict the held-out split and derive the evaluation artefacts
predictions = rf_model.predict(X_test_scaled)
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
cm = confusion_matrix(y_true=y_test, y_pred=predictions)
cm_df = pd.DataFrame(
    cm,
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)
classification_rep = classification_report(y_true=y_test, y_pred=predictions)

# Importance scores exposed by the fitted forest
importances = rf_model.feature_importances_

# Append the baseline run to the evaluation CSV
log_results_to_csv(
    model_name='RandomForestClassifier',
    hyperparameters={'n_estimators': 500},
    accuracy=accuracy,
    confusion_matrix=cm_df,
    classification_report=classification_rep,
    importances=importances,
)

Model 2. Logistic Regression

In [None]:
# --- Model 2: Logistic Regression ---
# Default-configured logistic regression (seeded) trained on the scaled training split
lr_model = LogisticRegression(random_state=78).fit(X_train_scaled, y_train)

# Predict the held-out split and derive the evaluation artefacts
predictions_lr = lr_model.predict(X_test_scaled)
accuracy_lr = accuracy_score(y_true=y_test, y_pred=predictions_lr)
cm_lr = confusion_matrix(y_true=y_test, y_pred=predictions_lr)
cm_df_lr = pd.DataFrame(
    cm_lr,
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)
classification_rep_lr = classification_report(y_true=y_test, y_pred=predictions_lr)

# Append the run to the evaluation CSV (no feature importances are logged for this model)
log_results_to_csv(
    model_name='LogisticRegression',
    hyperparameters={'solver': 'lbfgs'},
    accuracy=accuracy_lr,
    confusion_matrix=cm_df_lr,
    classification_report=classification_rep_lr,
    importances=None,
)

# --- Model 2.5: Optimized Logistic Regression ---

# Hyperparameter grid for LogisticRegression.
# The grid is split per solver because lbfgs does not support the l1 penalty:
# a single crossed grid schedules l1+lbfgs candidates whose fits fail on every
# fold (wasted work plus FitFailedWarning noise). The valid candidates are
# unchanged, so the selected model is the same as with the crossed grid.
param_grid_lr = [
    {'solver': ['liblinear'], 'penalty': ['l1', 'l2'], 'C': [0.01, 0.1, 1, 10]},
    {'solver': ['lbfgs'], 'penalty': ['l2'], 'C': [0.01, 0.1, 1, 10]},
]

# GridSearchCV for LogisticRegression: 5-fold CV, candidates ranked by accuracy
lr_grid_search = GridSearchCV(LogisticRegression(random_state=78), param_grid_lr, cv=5, n_jobs=-1, scoring='accuracy')
lr_grid_search.fit(X_train_scaled, y_train)

# Best model after GridSearchCV
lr_optimized = lr_grid_search.best_estimator_

# Evaluate the selected model on the held-out test split
predictions_lr_optimized = lr_optimized.predict(X_test_scaled)
accuracy_lr_optimized = accuracy_score(y_test, predictions_lr_optimized)
cm_lr_optimized = confusion_matrix(y_test, predictions_lr_optimized)
cm_df_lr_opt = pd.DataFrame(cm_lr_optimized, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])
classification_rep_lr_opt = classification_report(y_test, predictions_lr_optimized)

# Log the results of the optimized Logistic Regression model to csv,
# recording the winning hyperparameter combination
log_results_to_csv('LogisticRegression_Optimized', lr_grid_search.best_params_, accuracy_lr_optimized, cm_df_lr_opt, classification_rep_lr_opt, None)

Model 3. Decision Tree

In [None]:
# --- Model 3: Decision Tree Classifier ---
from sklearn.tree import DecisionTreeClassifier

# Default-configured tree (seeded) trained on the scaled training split
dt_model = DecisionTreeClassifier(random_state=78).fit(X_train_scaled, y_train)

# Predict the held-out split and derive the evaluation artefacts
predictions_dt = dt_model.predict(X_test_scaled)
accuracy_dt = accuracy_score(y_true=y_test, y_pred=predictions_dt)
cm_dt = confusion_matrix(y_true=y_test, y_pred=predictions_dt)
cm_df_dt = pd.DataFrame(
    cm_dt,
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)
classification_rep_dt = classification_report(y_true=y_test, y_pred=predictions_dt)

# Append the run to the evaluation CSV (no feature importances are logged here)
log_results_to_csv(
    model_name='DecisionTreeClassifier',
    hyperparameters={'max_depth': None},
    accuracy=accuracy_dt,
    confusion_matrix=cm_df_dt,
    classification_report=classification_rep_dt,
    importances=None,
)

# --- Model 3.5: Decision Tree Classifier Optimized ---

# Hyperparameter grid for DecisionTreeClassifier.
# `None` is not an accepted value for min_samples_split / min_samples_leaf
# (unlike max_depth and max_features), so candidates containing it fail on
# every fold. They are removed here; the valid candidates are unchanged, so
# the selected model is the same while the failing fits are no longer scheduled.
param_grid_dt = {
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': [None, 'sqrt', 'log2']
}

# GridSearchCV for DecisionTreeClassifier: 5-fold CV, candidates ranked by accuracy
dt_grid_search = GridSearchCV(DecisionTreeClassifier(random_state=78), param_grid_dt, cv=5, n_jobs=-1, scoring='accuracy')
dt_grid_search.fit(X_train_scaled, y_train)

# Best model after GridSearchCV
dt_optimized = dt_grid_search.best_estimator_

# Evaluate the selected model on the held-out test split
predictions_dt_optimized = dt_optimized.predict(X_test_scaled)
accuracy_dt_optimized = accuracy_score(y_test, predictions_dt_optimized)
cm_dt_optimized = confusion_matrix(y_test, predictions_dt_optimized)
cm_df_dt_opt = pd.DataFrame(cm_dt_optimized, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])
classification_rep_dt_opt = classification_report(y_test, predictions_dt_optimized)

# Log results for the optimized DecisionTreeClassifier model to csv,
# recording the winning hyperparameter combination
log_results_to_csv('DecisionTreeClassifier_Optimized', dt_grid_search.best_params_, accuracy_dt_optimized, cm_df_dt_opt, classification_rep_dt_opt, None)

Model 4. SVC/SVM Model

In [14]:
# --- Model 4: SVM Model ---
from sklearn.svm import SVC

# Default-configured support vector classifier (seeded) trained on the scaled training split
svm_model = SVC(random_state=78).fit(X_train_scaled, y_train)

# Predict the held-out split and derive the evaluation artefacts
predictions_svm = svm_model.predict(X_test_scaled)
accuracy_svm = accuracy_score(y_true=y_test, y_pred=predictions_svm)
cm_svm = confusion_matrix(y_true=y_test, y_pred=predictions_svm)
cm_df_svm = pd.DataFrame(
    cm_svm,
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)
classification_rep_svm = classification_report(y_true=y_test, y_pred=predictions_svm)

# Append the run to the evaluation CSV (no feature importances are logged here)
log_results_to_csv(
    model_name='SVC',
    hyperparameters={'kernel': 'rbf'},
    accuracy=accuracy_svm,
    confusion_matrix=cm_df_svm,
    classification_report=classification_rep_svm,
    importances=None,
)

Model 5. K-Nearest Neighbors

In [15]:
# --- Model 5: KNNeighbors Model ---
from sklearn.neighbors import KNeighborsClassifier

# 5-neighbour classifier trained on the scaled training split
knn_model = KNeighborsClassifier(n_neighbors=5).fit(X_train_scaled, y_train)

# Predict the held-out split and derive the evaluation artefacts
predictions_knn = knn_model.predict(X_test_scaled)
accuracy_knn = accuracy_score(y_true=y_test, y_pred=predictions_knn)
cm_knn = confusion_matrix(y_true=y_test, y_pred=predictions_knn)
cm_df_knn = pd.DataFrame(
    cm_knn,
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)
classification_rep_knn = classification_report(y_true=y_test, y_pred=predictions_knn)

# Append the run to the evaluation CSV (no feature importances are logged here)
log_results_to_csv(
    model_name='KNN',
    hyperparameters={'n_neighbors': 5},
    accuracy=accuracy_knn,
    confusion_matrix=cm_df_knn,
    classification_report=classification_rep_knn,
    importances=None,
)

Model 6. GaussianNB

In [16]:
# --- Model 6: Naive Bayes/GaussianNB Model ---
from sklearn.naive_bayes import GaussianNB

# Default-configured Gaussian naive Bayes trained on the scaled training split
nb_model = GaussianNB().fit(X_train_scaled, y_train)

# Predict the held-out split and derive the evaluation artefacts
predictions_nb = nb_model.predict(X_test_scaled)
accuracy_nb = accuracy_score(y_true=y_test, y_pred=predictions_nb)
cm_nb = confusion_matrix(y_true=y_test, y_pred=predictions_nb)
cm_df_nb = pd.DataFrame(
    cm_nb,
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)
classification_rep_nb = classification_report(y_true=y_test, y_pred=predictions_nb)

# Append the run to the evaluation CSV (empty hyperparameter dict, no importances)
log_results_to_csv(
    model_name='GaussianNB',
    hyperparameters={},
    accuracy=accuracy_nb,
    confusion_matrix=cm_df_nb,
    classification_report=classification_rep_nb,
    importances=None,
)

Model 7. Gradient Boosting Classifier

In [17]:
# --- Model 7: Gradient Boosting Model ---
from sklearn.ensemble import GradientBoostingClassifier

# Default-configured gradient boosting (seeded) trained on the scaled training split
gb_model = GradientBoostingClassifier(random_state=78).fit(X_train_scaled, y_train)

# Predict the held-out split and derive the evaluation artefacts
predictions_gb = gb_model.predict(X_test_scaled)
accuracy_gb = accuracy_score(y_true=y_test, y_pred=predictions_gb)
cm_gb = confusion_matrix(y_true=y_test, y_pred=predictions_gb)
cm_df_gb = pd.DataFrame(
    cm_gb,
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)
classification_rep_gb = classification_report(y_true=y_test, y_pred=predictions_gb)

# Append the run to the evaluation CSV (empty hyperparameter dict, no importances)
log_results_to_csv(
    model_name='GradientBoostingClassifier',
    hyperparameters={},
    accuracy=accuracy_gb,
    confusion_matrix=cm_df_gb,
    classification_report=classification_rep_gb,
    importances=None,
)

# --- Model 7.5: Optimized Gradient Boosting Model ---

# Candidate values explored for GradientBoostingClassifier
param_grid_gb = dict(
    n_estimators=[100, 200, 500],
    learning_rate=[0.01, 0.1, 0.5, 1],
    max_depth=[3, 5, 10],
    subsample=[0.8, 0.9, 1.0],
)

# Grid search with 5-fold cross-validation, candidates ranked by accuracy
gb_grid_search = GridSearchCV(
    estimator=GradientBoostingClassifier(random_state=78),
    param_grid=param_grid_gb,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
gb_grid_search.fit(X_train_scaled, y_train)

# Estimator the search exposes for its best-scoring candidate
gb_optimized = gb_grid_search.best_estimator_

# Evaluate the selected model on the held-out test split
predictions_gb_optimized = gb_optimized.predict(X_test_scaled)
accuracy_gb_optimized = accuracy_score(y_true=y_test, y_pred=predictions_gb_optimized)
cm_gb_optimized = confusion_matrix(y_true=y_test, y_pred=predictions_gb_optimized)
cm_df_gb_opt = pd.DataFrame(
    cm_gb_optimized,
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)
classification_rep_gb_opt = classification_report(y_true=y_test, y_pred=predictions_gb_optimized)

# Append the tuned run, with the winning hyperparameters, to the evaluation CSV
log_results_to_csv(
    model_name='GradientBoostingClassifier_Optimized',
    hyperparameters=gb_grid_search.best_params_,
    accuracy=accuracy_gb_optimized,
    confusion_matrix=cm_df_gb_opt,
    classification_report=classification_rep_gb_opt,
    importances=None,
)

Model 8. AdaBoost Classifier

In [None]:
# --- Model 8: AdaBoost Classifier ---
from sklearn.ensemble import AdaBoostClassifier

# Default-configured AdaBoost (seeded) trained on the scaled training split
ab_model = AdaBoostClassifier(random_state=78).fit(X_train_scaled, y_train)

# Predict the held-out split and derive the evaluation artefacts
predictions_ab = ab_model.predict(X_test_scaled)
accuracy_ab = accuracy_score(y_true=y_test, y_pred=predictions_ab)
cm_ab = confusion_matrix(y_true=y_test, y_pred=predictions_ab)
cm_df_ab = pd.DataFrame(
    cm_ab,
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)
classification_rep_ab = classification_report(y_true=y_test, y_pred=predictions_ab)

# Append the run to the evaluation CSV (empty hyperparameter dict, no importances)
log_results_to_csv(
    model_name='AdaBoostClassifier',
    hyperparameters={},
    accuracy=accuracy_ab,
    confusion_matrix=cm_df_ab,
    classification_report=classification_rep_ab,
    importances=None,
)

# --- Model 8.5: AdaBoost Classifier Optimized ---

# Candidate values explored for AdaBoostClassifier
param_grid_ab = dict(
    n_estimators=[50, 100, 200],
    learning_rate=[0.01, 0.1, 1],
)

# Grid search with 5-fold cross-validation, candidates ranked by accuracy
ab_grid_search = GridSearchCV(
    estimator=AdaBoostClassifier(random_state=78),
    param_grid=param_grid_ab,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
ab_grid_search.fit(X_train_scaled, y_train)

# Estimator the search exposes for its best-scoring candidate
ab_optimized = ab_grid_search.best_estimator_

# Evaluate the selected model on the held-out test split
predictions_ab_optimized = ab_optimized.predict(X_test_scaled)
accuracy_ab_optimized = accuracy_score(y_true=y_test, y_pred=predictions_ab_optimized)
cm_ab_optimized = confusion_matrix(y_true=y_test, y_pred=predictions_ab_optimized)
cm_df_ab_opt = pd.DataFrame(
    cm_ab_optimized,
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)
classification_rep_ab_opt = classification_report(y_true=y_test, y_pred=predictions_ab_optimized)

# Append the tuned run, with the winning hyperparameters, to the evaluation CSV
log_results_to_csv(
    model_name='AdaBoostClassifier_Optimized',
    hyperparameters=ab_grid_search.best_params_,
    accuracy=accuracy_ab_optimized,
    confusion_matrix=cm_df_ab_opt,
    classification_report=classification_rep_ab_opt,
    importances=None,
)

Model 9. XGBoost

In [19]:
# --- Model 9: XGBoost Model ---
import xgboost as xgb

# Default-configured XGBoost classifier (seeded) trained on the scaled training split
xgb_model = xgb.XGBClassifier(random_state=78)
xgb_model.fit(X_train_scaled, y_train)

# Predict the held-out split and derive the evaluation artefacts
predictions_xgb = xgb_model.predict(X_test_scaled)
accuracy_xgb = accuracy_score(y_true=y_test, y_pred=predictions_xgb)
cm_xgb = confusion_matrix(y_true=y_test, y_pred=predictions_xgb)
cm_df_xgb = pd.DataFrame(
    cm_xgb,
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)
classification_rep_xgb = classification_report(y_true=y_test, y_pred=predictions_xgb)

# Append the run to the evaluation CSV (empty hyperparameter dict, no importances)
log_results_to_csv(
    model_name='XGBoostClassifier',
    hyperparameters={},
    accuracy=accuracy_xgb,
    confusion_matrix=cm_df_xgb,
    classification_report=classification_rep_xgb,
    importances=None,
)

# --- Model 9.5: XGBoost Model Optimized ---

# Candidate values explored for XGBClassifier
param_grid_xgb = dict(
    n_estimators=[100, 200, 500],
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[3, 6, 10],
    subsample=[0.8, 0.9, 1.0],
)

# Grid search with 5-fold cross-validation, candidates ranked by accuracy
xgb_grid_search = GridSearchCV(
    estimator=xgb.XGBClassifier(random_state=78),
    param_grid=param_grid_xgb,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
xgb_grid_search.fit(X_train_scaled, y_train)

# Estimator the search exposes for its best-scoring candidate
xgb_optimized = xgb_grid_search.best_estimator_

# Evaluate the selected model on the held-out test split
predictions_xgb_optimized = xgb_optimized.predict(X_test_scaled)
accuracy_xgb_optimized = accuracy_score(y_true=y_test, y_pred=predictions_xgb_optimized)
cm_xgb_optimized = confusion_matrix(y_true=y_test, y_pred=predictions_xgb_optimized)
cm_df_xgb_opt = pd.DataFrame(
    cm_xgb_optimized,
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)
classification_rep_xgb_opt = classification_report(y_true=y_test, y_pred=predictions_xgb_optimized)

# Append the tuned run, with the winning hyperparameters, to the evaluation CSV
log_results_to_csv(
    model_name='XGBoostClassifier_Optimized',
    hyperparameters=xgb_grid_search.best_params_,
    accuracy=accuracy_xgb_optimized,
    confusion_matrix=cm_df_xgb_opt,
    classification_report=classification_rep_xgb_opt,
    importances=None,
)

Model 10. LightGBM

In [None]:
# --- Model 10: LightGBM Model ---
import lightgbm as lgb

# Default-configured LightGBM classifier (seeded) trained on the scaled training split
lgb_model = lgb.LGBMClassifier(random_state=78)
lgb_model.fit(X_train_scaled, y_train)

# Predict the held-out split and derive the evaluation artefacts
predictions_lgb = lgb_model.predict(X_test_scaled)
accuracy_lgb = accuracy_score(y_true=y_test, y_pred=predictions_lgb)
cm_lgb = confusion_matrix(y_true=y_test, y_pred=predictions_lgb)
cm_df_lgb = pd.DataFrame(
    cm_lgb,
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)
classification_rep_lgb = classification_report(y_true=y_test, y_pred=predictions_lgb)

# Append the run to the evaluation CSV (empty hyperparameter dict, no importances)
log_results_to_csv(
    model_name='LightGBMClassifier',
    hyperparameters={},
    accuracy=accuracy_lgb,
    confusion_matrix=cm_df_lgb,
    classification_report=classification_rep_lgb,
    importances=None,
)

# --- Model 10.5: Optimized LightGBM Model ---

# Candidate values explored for LGBMClassifier
param_grid_lgb = dict(
    n_estimators=[100, 200, 500],
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[3, 6, 10],
    num_leaves=[31, 50, 100],
)

# Grid search with 5-fold cross-validation, candidates ranked by accuracy
lgb_grid_search = GridSearchCV(
    estimator=lgb.LGBMClassifier(random_state=78),
    param_grid=param_grid_lgb,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
lgb_grid_search.fit(X_train_scaled, y_train)

# Estimator the search exposes for its best-scoring candidate
lgb_optimized = lgb_grid_search.best_estimator_

# Evaluate the selected model on the held-out test split
predictions_lgb_optimized = lgb_optimized.predict(X_test_scaled)
accuracy_lgb_optimized = accuracy_score(y_true=y_test, y_pred=predictions_lgb_optimized)
cm_lgb_optimized = confusion_matrix(y_true=y_test, y_pred=predictions_lgb_optimized)
cm_df_lgb_opt = pd.DataFrame(
    cm_lgb_optimized,
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)
classification_rep_lgb_opt = classification_report(y_true=y_test, y_pred=predictions_lgb_optimized)

# Append the tuned run, with the winning hyperparameters, to the evaluation CSV
log_results_to_csv(
    model_name='LightGBMClassifier_Optimized',
    hyperparameters=lgb_grid_search.best_params_,
    accuracy=accuracy_lgb_optimized,
    confusion_matrix=cm_df_lgb_opt,
    classification_report=classification_rep_lgb_opt,
    importances=None,
)