In [None]:
# Imports
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import FastICA
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report

from imblearn.over_sampling import SMOTE

from typing import List


# 0: Data Loading, Set Splitting Functions, Data Resampling

In [8]:
diabetes_df = pd.read_csv('data/diabetes.csv')


In [10]:
def get_train_test_diabetes(diabetes_df, test_size=0.2, random_state=42, stratify=True, resample=True) -> tuple:
    """
    Returns a tuple of (X_train, X_test, y_train, y_test) for the diabetes dataset.
    """
    

    
    # Split the data into features and target variable
    X = diabetes_df.drop(columns=['Outcome'])
    y = diabetes_df['Outcome']
    
    # Split the dataset into training and testing sets
    
    if stratify:
        return train_test_split(X, y, random_state=random_state, test_size=test_size, stratify=y)
    else:
        return train_test_split(X, y, random_state=random_state, test_size=0.2)
    
    

X_train, X_test, y_train, y_test = get_train_test_diabetes(diabetes_df)

# 1: Create Model and Pipeline Presets

In [11]:

def get_preprocessing_pipe(scaling = True, preprocessing = "PCA"):
    """
    Returns a preprocessing pipeline with optional scaling and PCA/ICA.
    """
    pipe = []
    if preprocessing == "PCA":
        pipe.append( ('PCA', PCA(n_components=2)) )
    if preprocessing == "ICA":
        pipe.append( 
                    ('ICA', FastICA(n_components=2))
                    )
    if scaling:
        pipe.insert(0, ('Scaler', StandardScaler()))
    return pipe

def get_knn_pipe(base_pipe : List = None):
    """Returns a pipeline for KNN classifier with optional scaling and preprocessing."""
    pipe = [
        ('KNN', KNeighborsClassifier(n_neighbors=5, algorithm='auto', n_jobs=-1))
    ]
    if base_pipe:
        pipe = base_pipe + pipe
    return Pipeline(pipe)
    
def get_naive_bayes_pipe(base_pipe : List = None):
    """Returns a pipeline for Naive Bayes classifier with optional scaling and preprocessing."""
    pipe = [
        ('clf', GaussianNB())
    ]
    if base_pipe:
        pipe = base_pipe + pipe
    return Pipeline(pipe)

def get_log_reg_pipe(base_pipe = None):
    """Returns a pipeline for Logistic Regression classifier with optional scaling and preprocessing."""
    pipe = [
        ('LogisticRegression', LogisticRegression(max_iter=1000))
    ]
    if base_pipe:
        pipe = base_pipe + pipe
    return Pipeline(pipe)

def get_decision_tree_pipe(base_pipe = None):
    """Returns a pipeline for Decision Tree classifier with optional scaling and preprocessing."""
    pipe = [
        ('DecisionTree', DecisionTreeClassifier())
    ]
    if base_pipe:
        pipe = base_pipe + pipe
    return Pipeline(pipe)

def get_svm_pipe(base_pipe = None):
    """Returns a pipeline for SVM classifier with optional scaling and preprocessing."""
    pipe = [
        ('SVM', SVC(kernel='linear', probability=True))
    ]
    if base_pipe:
        pipe = base_pipe + pipe
    return Pipeline(pipe)

# 2: Creating Grid Search Presets

In [40]:


from sklearn.model_selection import KFold, StratifiedKFold


def get_preprocessing_grid_search(pipe):
    """Returns a GridSearchCV *argument* for the preprocessing pipeline. Should be combined with another pipeline component."""
    # Define the parameter grid for PCA/ICA and scaling
    param_grid = {
        'Scaler': [StandardScaler(), None],
    }
    if 'PCA' in pipe.named_steps:
        param_grid['PCA__n_components'] = [2, 3, 4, 5]
        param_grid['PCA__random_state'] = [42]
    if 'ICA' in pipe.named_steps:
        param_grid['ICA__n_components'] = [2, 3, 4, 5]
        param_grid['ICA__random_state'] = [42]
    
    return param_grid

def get_knn_grid_search(pipe, base_grid = None, scoring='accuracy', cv=StratifiedKFold(5)):
    """Returns a GridSearchCV object for KNN classifier with a parameter grid."""
    # Define the parameter grid for KNN
    param_grid = {
        'KNN__n_neighbors': [3, 5, 7, 9],
        'KNN__weights': ['uniform', 'distance'],
        'KNN__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
    }
    
    if base_grid:
        # Merge the base grid with the KNN grid
        param_grid = {**base_grid, **param_grid}
    
    # Create a GridSearchCV object
    knn_grid_search = GridSearchCV(estimator=pipe,param_grid=param_grid,scoring=scoring,cv=cv,n_jobs=-1)
    
    return knn_grid_search

def get_naive_bayes_grid_search(pipe, base_grid = None, scoring='accuracy', cv=StratifiedKFold(5)):
    """Returns a GridSearchCV object for Naive Bayes classifiers with a parameter grid."""
    param_grid = [
    {
        'clf': [GaussianNB()],
        'clf__var_smoothing': [1e-9, 1e-8, 1e-7]
    },
    {
        'clf': [MultinomialNB()],
        'clf__alpha': [0.5, 1.0, 1.5]
    },
    {
        'clf': [BernoulliNB()],
        'clf__alpha': [0.5, 1.0],
        'clf__binarize': [0.0, 0.5]
    }
    ]
    
    if base_grid:
        # Merge the base grid with the Naive Bayes grid
        param_grid = {**base_grid, **param_grid}
    
    # Create a GridSearchCV object
    nb_grid_search = GridSearchCV(estimator=pipe,param_grid=param_grid,scoring=scoring,cv=cv,n_jobs=-1)
    
    return nb_grid_search

def get_log_reg_grid_search(pipe, base_grid = None, scoring='accuracy', cv=StratifiedKFold(5)):
    """Returns a GridSearchCV object for Logistic Regression with a parameter grid."""
    # Define the parameter grid for Logistic Regression
    param_grid = {
        'LogisticRegression__C': [0.01, 0.1, 1, 10, 100],
        'LogisticRegression__penalty': ['l2', None],
        'LogisticRegression__solver': ['lbfgs', 'liblinear']
    }
    
    if base_grid:
        # Merge the base grid with the Logistic Regression grid
        param_grid = {**base_grid, **param_grid}
    
    # Create a GridSearchCV object
    log_reg_grid_search = GridSearchCV(estimator=pipe,param_grid=param_grid,scoring=scoring,cv=cv,n_jobs=1
    )
    
    return log_reg_grid_search

def get_decision_tree_grid_search(pipe, base_grid = None, scoring='accuracy', cv=StratifiedKFold(5)):
    """Returns a GridSearchCV object for Decision Tree classifier with a parameter grid."""
    # Define the parameter grid for Decision Tree
    param_grid = {
        'DecisionTree__criterion': ['gini', 'entropy'],
        'DecisionTree__max_depth': [None, 5, 10, 15],
        'DecisionTree__min_samples_split': [2, 5, 10],
        'DecisionTree__min_samples_leaf': [1, 2, 4]
    }
    
    if base_grid:
        # Merge the base grid with the Decision Tree grid
        param_grid = {**base_grid, **param_grid}
    
    # Create a GridSearchCV object
    dt_grid_search = GridSearchCV(estimator=pipe,param_grid=param_grid,scoring=scoring,cv=cv,n_jobs=-1)
    
    return dt_grid_search

def get_svm_grid_search(pipe, base_grid = None, scoring='accuracy', cv=StratifiedKFold(5)):
    """Returns a GridSearchCV object for SVM classifier with a parameter grid."""
    # Define the parameter grid for SVM
    param_grid = {
        'SVM__C': [0.01, 0.1, 1, 10, 100],
        'SVM__kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
        'SVM__gamma': ['scale', 'auto']
    }
    
    if base_grid:
        # Merge the base grid with the SVM grid
        param_grid = {**base_grid, **param_grid}
    
    # Create a GridSearchCV object
    svm_grid_search = GridSearchCV(estimator=pipe,param_grid=param_grid,scoring=scoring,cv=cv,n_jobs=-1)
    
    return svm_grid_search

In [33]:
def grid_search_preset(pipe_source, grid_search_source):
    prep_pipe = get_preprocessing_pipe(scaling=True, preprocessing="PCA")
    pipe = pipe_source(prep_pipe)
    prep_grid = get_preprocessing_grid_search(pipe)
    return grid_search_source(pipe, prep_grid, scoring='f1')

# Testing (Unofficial)

In [14]:
knn = get_knn_pipe()
X_train, X_test, y_train, y_test = get_train_test_diabetes(diabetes_df)
knn.fit(X_train, y_train)
knn.score(X_test, y_test)

0.7012987012987013

# Model Fitting and Scoring

In [15]:
X_train, X_test, y_train, y_test = get_train_test_diabetes(diabetes_df, random_state=42)

## KNN

In [41]:
grid_search = grid_search_preset(get_knn_pipe, get_knn_grid_search)

#----[
grid_search.fit(X_train, y_train)
#----]

best_params = grid_search.best_params_
best_score = grid_search.best_score_

#Print classification report for the best model
print("KNN Report:")
y_pred = grid_search.predict(X_test)
print(classification_report(y_test, y_pred))
print("Best Parameters:", best_params)
print("Best Score:", best_score)

KNN Report:
              precision    recall  f1-score   support

           0       0.77      0.83      0.80       100
           1       0.63      0.54      0.58        54

    accuracy                           0.73       154
   macro avg       0.70      0.68      0.69       154
weighted avg       0.72      0.73      0.72       154

Best Parameters: {'KNN__algorithm': 'auto', 'KNN__n_neighbors': 7, 'KNN__weights': 'uniform', 'PCA__n_components': 4, 'PCA__random_state': 42, 'Scaler': StandardScaler()}
Best Score: 0.6013411996955929


## Naive Bayes

In [None]:
grid_search = grid_search_preset(get_naive_bayes_pipe, get_naive_bayes_grid_search)

grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Naive Bayes Scoring:")



TypeError: 'list' object is not a mapping

## Logistic Regression

In [None]:
# "UserWarning: Setting penalty='None' will ignore the C and l1_ratio parameters"
# Repeated warnings coming from logreg grid search can be suppressed with this file 
# Penalty=None is not supported for the liblinear solver, so 1/4 fail, but that's okay.
from remove_warnings import *

with suppress_stdout_stderr():
    warnings.filterwarnings('ignore', category=UserWarning)
    grid_search = grid_search_preset(get_log_reg_pipe, get_log_reg_grid_search)
    grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_
best_score = grid_search.best_score_
print("Best Parameters Logistic Regression:", best_params)
print("Best Score Logistic Regression:", best_score)

NameError: name 'grid_search_preset' is not defined

# Decision Trees

In [None]:
grid_search = grid_search_preset(get_decision_tree_pipe, get_decision_tree_grid_search)
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_
best_score = grid_search.best_score_
print("Best Parameters Decision Tree:", best_params)
print("Best Score Decision Tree:", best_score)

Best Parameters Decision Tree: {'DecisionTree__criterion': 'entropy', 'DecisionTree__max_depth': 5, 'DecisionTree__min_samples_leaf': 4, 'DecisionTree__min_samples_split': 2, 'PCA__n_components': 3, 'PCA__random_state': 42, 'Scaler': None}
Best Score Decision Tree: 0.7541383446621351


# Support Vector Machines

In [None]:
# Support Vector Machines
grid_search = grid_search_preset(get_svm_pipe, get_svm_grid_search)
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_
best_score = grid_search.best_score_
print("Best Parameters SVM:", best_params)
print("Best Score SVM:", best_score)

NameError: name 'grid_search_preset' is not defined