# Predicting Diabetes - Binary Classifcation
Macro F1 score is the evaluation metric

In [1]:
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

## Import and Split Data

In [2]:
df_train = pd.read_csv("diabetes_train.csv")

In [3]:
from sklearn.model_selection import train_test_split

Id_col = 'Id'
target_col = 'diabetes'

X = df_train.drop([Id_col, target_col], axis=1)
y = df_train[target_col]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Train Logistic Regression Model

In [4]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler 
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

In [5]:
# Create pipeline with feature engineering transformer and logistic regression
pipe = make_pipeline(StandardScaler(),
                    LogisticRegression(random_state=42))

### Apply KNN Imputer

In [8]:
from sklearn.impute import KNNImputer
from sklearn.compose import ColumnTransformer

In [9]:
# Make column transformer with KNN imputer
imputer_transformer = ColumnTransformer(transformers=[('knnimputer', KNNImputer(missing_values=0),
                                              ['plasma_glucose', 'DBP', 'triceps_skin', 'BMI'])])

In [10]:
# Create pipeline with KNN imputer transformer, scaler, and logistic regression
imputer_pipe = make_pipeline(imputer_transformer,
                    StandardScaler(),
                    LogisticRegression(random_state=42))

## Cross-validation

In [29]:
# Cross-validate the model and print the results for non-imputed data
cv_scores = cross_val_score(pipe, X_train, y_train, scoring='f1_macro', cv=5)

# Calculate mean and standard deviation of scores
avg = cv_scores.mean()
stddev = cv_scores.std()

# Print results
print("Scores:", [round(score, 4) for score in cv_scores])
print(f"Mean score: {round(avg, 4)}")
print(f"+/-2 std. dev. range within mean: ({avg - 2*stddev:.4f}, {avg + 2*stddev:.4f})")

Scores: [0.7527, 0.7734, 0.7117, 0.6133, 0.7986]
Mean score: 0.7299
+/-2 std. dev. range within mean: (0.6001, 0.8597)


In [14]:
# Cross-validate the model and print the results for KNN imputed data
cv_scores = cross_val_score(imputer_pipe, X_train, y_train, scoring='f1_macro', cv=5)

# Calculate mean and standard deviation of scores
avg = cv_scores.mean()
stddev = cv_scores.std()

# Print results
print("Scores:", [round(score, 4) for score in cv_scores])
print(f"Mean score: {round(avg, 4)}")
print(f"+/-2 std. dev. range within mean: ({avg - 2*stddev:.4f}, {avg + 2*stddev:.4f})")

Scores: [0.7774, 0.7482, 0.6823, 0.6689, 0.7879]
Mean score: 0.733
+/-2 std. dev. range within mean: (0.6354, 0.8305)


## Hyperparameter Tuning

Hyperparameter tuning knn imputer pipeline since it led to better cross-validation performance.

Hyperparameters to tune for logistic regression:

* `penalty`: type of regularization used
* `C`: regularization strength where making the value smaller increases the strength
* `solver`: optimization algorithm used

Different solver algorithms used in logistic regression support different sets of penalties. To try different solver hyperparameters without error, a grid is made for:

* `'newton-cg', 'lbfgs', 'sag',` and `'saga'` solvers with `'l2'` and `'none'` as the penalties
* `'saga'` solver with `'l1'` as the penality
* `'liblinear'` solver with `'l1'` and `'l2'` as the penalities

In [15]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from scipy.stats import uniform

### GridSearchCV

In [16]:
# Set potential hyperparameter grid for solvers that support 'l2' and 'none' penalities
gs_params1 = {'logisticregression__penalty': ('l2', 'none'),
          'logisticregression__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
          'logisticregression__solver': ('newton-cg', 'lbfgs', 'sag', 'saga')}

# Set potential hyperparameter grid for saga solver to evaluate how performant model is when penalty='l1'
gs_params2 = {'logisticregression__penalty': ['l1'],
          'logisticregression__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
          'logisticregression__solver': ['saga']}

# Set potential hyperparameter grid for liblinear solver which support 'l1' and 'l2' penalties
gs_params3 = {'logisticregression__penalty': ('l1', 'l2'),
          'logisticregression__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
          'logisticregression__solver': ['liblinear']}

In [17]:
# Perform GridSearchCV
logit_gs = GridSearchCV(imputer_pipe, param_grid=[gs_params1, gs_params2, gs_params3], 
                        scoring='f1_macro', cv=5, n_jobs=-1, return_train_score=True)

logit_gs.fit(X_train, y_train)



GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('columntransformer',
                                        ColumnTransformer(transformers=[('knnimputer',
                                                                         KNNImputer(missing_values=0),
                                                                         ['plasma_glucose',
                                                                          'DBP',
                                                                          'triceps_skin',
                                                                          'BMI'])])),
                                       ('standardscaler', StandardScaler()),
                                       ('logisticregression',
                                        LogisticRegression(random_state=42))]),
             n_jobs=-1,
             param_grid=[{'logisticregression__C': [0.001, 0.01, 0.1,...
                          'logisticregression__solver': ('newton-

In [22]:
# Print the hyperparameters, score, standard deviation, and standard deviation range of the 
# best performing model from GridSearchCV

avg = logit_gs.best_score_
stddev = logit_gs.cv_results_['std_test_score'][logit_gs.best_index_]

print(f"Best Hyperparameters: {logit_gs.best_params_}")
print(f"Best Mean Score: {avg:.4f}")
print(f"Best Mean Std. Dev.: {stddev:.4f}")
print(f"+/-2 std. dev. range within mean: ({avg - 2*stddev:.4f}, {avg + 2*stddev:.4f})")

Best Hyperparameters: {'logisticregression__C': 0.001, 'logisticregression__penalty': 'none', 'logisticregression__solver': 'newton-cg'}
Best Mean Score: 0.7359
Best Mean Std. Dev.: 0.0500
+/-2 std. dev. range within mean: (0.6358, 0.8360)


### RandomSearchCV
Randomized search is similar to grid search however, randomly selected values from a continuous distribution will be used for the `C` hyperparameter.

In [20]:
# Set potential hyperparameter grid for solvers that support 'l2' and 'none' penalities
rs_params1 = {'logisticregression__penalty': ('l2', 'none'),
          'logisticregression__C': uniform(0.0001, 10000),
          'logisticregression__solver': ('newton-cg', 'lbfgs', 'sag', 'saga')}

# Set potential hyperparameter grid for saga solver to evaluate how performant model is when penalty='l1'
rs_params2 = {'logisticregression__penalty': ['l1'],
          'logisticregression__C': uniform(0.0001, 10000),
          'logisticregression__solver': ['saga']}

# Set potential hyperparameter grid for liblinear solver which support 'l1' and 'l2' penalties
rs_params3 = {'logisticregression__penalty': ('l1', 'l2'),
          'logisticregression__C': uniform(0.0001, 10000),
          'logisticregression__solver': ['liblinear']}

In [21]:
# Perform RandomizedSearchCV 
logit_rs = RandomizedSearchCV(imputer_pipe, param_distributions=[rs_params1, rs_params2, rs_params3], 
                              n_iter=1000, scoring='f1_macro', cv=5, n_jobs=-1, return_train_score=True)

logit_rs.fit(X_train, y_train)

RandomizedSearchCV(cv=5,
                   estimator=Pipeline(steps=[('columntransformer',
                                              ColumnTransformer(transformers=[('knnimputer',
                                                                               KNNImputer(missing_values=0),
                                                                               ['plasma_glucose',
                                                                                'DBP',
                                                                                'triceps_skin',
                                                                                'BMI'])])),
                                             ('standardscaler',
                                              StandardScaler()),
                                             ('logisticregression',
                                              LogisticRegression(random_state=42))]),
                   n_iter=1000, n_jobs=-1,
       

In [23]:
# Print the hyperparameters, score, standard deviation, and standard deviation range of the 
# best performing model from RandomSearchCV

avg = logit_rs.best_score_
stddev = logit_rs.cv_results_['std_test_score'][logit_rs.best_index_]

print(f"Best Hyperparameters: {logit_rs.best_params_}")
print(f"Best Mean Score: {avg:.4f}")
print(f"Best Mean Std. Dev.: {stddev:.4f}")
print(f"+/-2 std. dev. range within mean: ({avg - 2*stddev:.4f}, {avg + 2*stddev:.4f})")

Best Hyperparameters: {'logisticregression__C': 6116.223924168812, 'logisticregression__penalty': 'l1', 'logisticregression__solver': 'liblinear'}
Best Mean Score: 0.7359
Best Mean Std. Dev.: 0.0500
+/-2 std. dev. range within mean: (0.6358, 0.8360)


## Evaluate on Test Data

In [24]:
from sklearn.metrics import classification_report

In [31]:
y_pred = logit_rs.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.84      0.90      0.87        81
           1       0.72      0.60      0.66        35

    accuracy                           0.81       116
   macro avg       0.78      0.75      0.76       116
weighted avg       0.80      0.81      0.80       116

