In [1]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate, RandomizedSearchCV, train_test_split
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, precision_score, recall_score
from sklearn.inspection import PartialDependenceDisplay
from sklearn.neural_network import MLPClassifier

from lightgbm import LGBMClassifier

In [2]:
# Read the two processed heart-disease files: `df_heart` keeps its text columns
# (turned into pandas categoricals below), `df_heart_num` is the source of the
# NumPy feature matrix `X` and target vector `y`.
df_heart = pd.read_csv('https://tinyurl.com/heartdiseaseprocessed')
df_heart_num = pd.read_csv('https://tinyurl.com/heartdiseaseprocessednumeric')

# convert appropriate features to categorical
object_cols = df_heart.select_dtypes(include='object').columns
df_heart = df_heart.astype({name: 'category' for name in object_cols})

# everything except the outcome column is a feature
y = df_heart_num['heart_disease'].to_numpy()
X = df_heart_num.drop(columns=['heart_disease']).to_numpy()

In [3]:
# Mean of the target (the positive-class share if y is coded 0/1 -- TODO confirm)
# and the larger of the two class shares, i.e. the accuracy of always
# predicting the more common class.
prevalence = y.mean()
majority = max(prevalence, 1 - prevalence)

## Penalized Linear Models/Elastic Net

In [4]:
# Logistic regression with an elastic-net penalty; l1_ratio=0.5 weights the
# L1 and L2 parts equally.
model_elastic = LogisticRegression(
    solver='saga',
    penalty='elasticnet',
    l1_ratio=0.5,
    max_iter=10000,
    random_state=42,
    verbose=False,
)

# use cross-validation to estimate performance
cv_elastic = cross_validate(model_elastic, X, y, scoring='accuracy', cv=5)

In [5]:
# The value labelled "Training accuracy" is the mean of the per-fold
# `test_score` entries from cross_validate; it is shown next to the
# majority-class rate for comparison.
print(
    'Training accuracy: ', np.round(np.mean(cv_elastic['test_score']), 3),
    '\nGuessing: ', np.round(majority, 3),
)

Training accuracy:  0.828 
Guessing:  0.539


## Tree-based Models

In [6]:
# Boosted trees: 1000 trees of depth <= 5 with a small (1e-3) learning rate.
model_boost = LGBMClassifier(
    max_depth=5,
    learning_rate=1e-3,
    n_estimators=1000,
    random_state=42,
    verbose=-1,
)

# 5-fold cross-validated accuracy, using the frame with categorical columns
model_boost_cv = cross_validate(
    estimator=model_boost,
    X=df_heart.drop(columns='heart_disease'),
    y=df_heart['heart_disease'],
    scoring='accuracy',
    cv=5,
)

In [None]:
# Mean of the per-fold `test_score` values for the boosted model, next to the
# majority-class rate.
print(
    'Training accuracy: ', np.round(model_boost_cv['test_score'].mean(), 3),
    '\nGuessing: ', np.round(majority, 3),
)

## Deep Learning Example

In [None]:
# Multilayer perceptron with three hidden layers of 200 units each.
# NOTE(review): no `solver` is passed, so scikit-learn's default ('adam') is
# used. Per the MLPClassifier documentation, `learning_rate='adaptive'` and
# `nesterovs_momentum` are only used when solver='sgd', and
# `validation_fraction` only when `early_stopping=True` -- confirm whether
# these settings were meant to be active.
model_mlp = MLPClassifier(
    hidden_layer_sizes=(200, 200, 200),
    learning_rate_init=0.001,
    learning_rate='adaptive',
    nesterovs_momentum=True,
    validation_fraction=.2,
    warm_start=True,
    shuffle=True,
    random_state=123,
    verbose=False,
)

# with the above settings, this will take a few seconds
# (no `scoring` is given, so the estimator's own `score` method is used)
model_mlp_cv = cross_validate(model_mlp, X, y, cv=5)

In [None]:
# Mean of the per-fold `test_score` values for the MLP, next to the
# majority-class rate.
print(
    'Training accuracy: ', np.round(model_mlp_cv['test_score'].mean(), 3),
    '\nGuessing: ', np.round(majority, 3),
)

## A Tuned Example

In [None]:
# train-test split (80% train / 20% test)
# NOTE(review): the features are taken from `df_heart` but the target from
# `df_heart_num`; this assumes both frames hold the same rows in the same
# order -- confirm.
X_train, X_test, y_train, y_test = train_test_split(
    df_heart.drop(columns=['heart_disease']),
    df_heart_num['heart_disease'],
    random_state=42,
    test_size=0.2,
)

# Base learner; only logging is configured here, the rest is set by the search.
model_boost = LGBMClassifier(verbose=-1)

# Candidate values for the search (2 * 3 * 4 * 3 = 72 possible combinations)
param_grid = dict(
    n_estimators=[500, 1000],
    learning_rate=[1e-3, 1e-2, 1e-1],
    max_depth=[3, 5, 7, 9],
    min_child_samples=[1, 5, 10],
)

# this will take a few seconds
# 10 of the combinations are sampled and each is scored with 5-fold CV
model_boost_cv_tune = RandomizedSearchCV(
    estimator=model_boost,
    param_distributions=param_grid,
    n_iter=10,
    scoring='accuracy',
    cv=5,
    random_state=42,
    n_jobs=-1,
)

model_boost_cv_tune.fit(X_train, y_train)

# Accuracy of the fitted search object's predictions on the held-out test split
test_predictions = model_boost_cv_tune.predict(X_test)
accuracy_score(y_true=y_test, y_pred=test_predictions)

## Model Comparison

In [None]:
def calculate_metrics(y_true, y_pred):
    """Summarize binary-classification performance from observed and predicted labels.

    Parameters
    ----------
    y_true : array-like
        Observed class labels.
    y_pred : array-like
        Predicted class labels, same length as ``y_true``.

    Returns
    -------
    dict
        Keys ``'acc'`` (accuracy), ``'tpr'`` (recall / sensitivity), ``'tnr'``
        (specificity), ``'f1'``, ``'ppv'`` (precision) and ``'npv'`` (negative
        predictive value). ``'tnr'`` and ``'npv'`` are ``nan`` when their
        denominator is zero.

    Raises
    ------
    ValueError
        If the confusion matrix cannot be reduced to four cells, e.g. more
        than two classes are present.
    """
    cm = confusion_matrix(y_true, y_pred)
    if cm.shape == (1, 1):
        # Only one class occurs in y_true and y_pred, so the matrix collapses
        # to 1x1 and the four-way unpack below would fail. Re-request it with
        # the 0/1 label set so the absent class contributes explicit zeros.
        cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()

    def _ratio(numerator, denominator):
        # A zero denominator means the rate is undefined; report nan without
        # triggering a divide-by-zero RuntimeWarning.
        return numerator / denominator if denominator else np.nan

    return {
        'acc': accuracy_score(y_true, y_pred),
        'tpr': recall_score(y_true, y_pred),  # True Positive Rate (Sensitivity)
        'tnr': _ratio(tn, tn + fp),  # True Negative Rate (Specificity)
        'f1': f1_score(y_true, y_pred),
        'ppv': precision_score(y_true, y_pred),  # Positive Predictive Value (Precision)
        'npv': _ratio(tn, tn + fn),  # Negative Predictive Value
    }

def evaluate_model(model, X_train, y_train, X_test, y_test):
    """Fit ``model`` on the training split and score its test-split predictions.

    The estimator passed in is fitted in place. The return value is whatever
    ``calculate_metrics`` produces for ``y_test`` versus the predictions on
    ``X_test``.
    """
    model.fit(X_train, y_train)
    return calculate_metrics(y_true=y_test, y_pred=model.predict(X_test))


# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define models
model_glmnet = LogisticRegression(solver='saga', max_iter=10000, random_state=42)
# verbose=-1 keeps LightGBM from logging on every one of the CV fits below,
# as in the earlier LGBMClassifier cells
model_boost = LGBMClassifier(random_state=42, verbose=-1)
model_mlp = MLPClassifier(max_iter=10000, random_state=42)

# Define the candidate hyperparameter values for each model
param_grid_glmnet = {
    'penalty': ['elasticnet'],
    'l1_ratio': [0.1, 0.5, 0.9],
    'C': [0.01, 0.1, 1, 10]
}

param_grid_boost = {
    'n_estimators': [250, 500, 1000],
    'learning_rate': [1e-3, 0.01, 0.1, 0.5],
    'max_depth': [2, 3, 5, 7],
    'min_child_samples': [1, 5, 10],
    'colsample_bytree': [0.75, 1.0],
    'reg_alpha': [0.0, 1],
    'reg_lambda': [0.0, 1]
}

param_grid_mlp = {
    'hidden_layer_sizes': [(100, 100), (200, 200), (100, 100, 100), (200, 200, 200)],
    'activation': ['relu', 'tanh'],
    'alpha': [0.0001, 0.001, 0.01]
}

# Perform hyperparameter tuning with a randomized search (n_iter is left at
# its default, so a subset of each candidate set is sampled) scored by
# 10-fold CV accuracy. random_state fixes which candidates are drawn so the
# selected models, and everything computed from them, reproduce across runs.
grid_search_glmnet = RandomizedSearchCV(
    model_glmnet, param_grid_glmnet, cv=10, scoring='accuracy', random_state=42
)
grid_search_boost = RandomizedSearchCV(
    model_boost, param_grid_boost, cv=10, scoring='accuracy', random_state=42
)
grid_search_mlp = RandomizedSearchCV(
    model_mlp, param_grid_mlp, cv=10, scoring='accuracy', random_state=42
)

# Fit the searches
grid_search_glmnet.fit(X_train, y_train)
grid_search_boost.fit(X_train, y_train)
grid_search_mlp.fit(X_train, y_train)

# Get the best models
best_glmnet = grid_search_glmnet.best_estimator_
best_boost = grid_search_boost.best_estimator_
best_mlp = grid_search_mlp.best_estimator_

In [None]:
# Evaluate the best models on the test set (same split for all three, in the
# order GLMNet, Boost, MLP)
results_glmnet, results_boost, results_mlp = (
    evaluate_model(tuned, X_train, y_train, X_test, y_test)
    for tuned in (best_glmnet, best_boost, best_mlp)
)

# Aggregate results into a DataFrame for comparison: one column per model,
# one row per metric
results_df = pd.DataFrame(
    dict(GLMNet=results_glmnet, Boost=results_boost, MLP=results_mlp)
)

In [None]:
# Flip to one row per model and expose the model label as a 'model' column
results = results_df.transpose().rename_axis('model').reset_index()

In [None]:
# Render the model-comparison table as this cell's output
results

In [None]:
# results.to_csv('../ml/data/ml-common-py_model_comparison.csv', index=False)

In [None]:
# # Make predictions on the test set
# y_pred_glmnet = best_glmnet.predict(X_test)
# y_pred_boost = best_boost.predict(X_test)
# y_pred_mlp = best_mlp.predict(X_test)

# # Combine predictions with observed values into a DataFrame
# df_predictions = pd.DataFrame({
#     'Observed': y_test,
#     'GLMNet_Predictions': y_pred_glmnet,
#     'Boost_Predictions': y_pred_boost,
#     'MLP_Predictions': y_pred_mlp
# })

# df_predictions.to_csv('../ml/data/ml-common-py_predictions.csv', index=False)


## Interpretation and Feature Importance

In [None]:
# Best estimator found by the randomized search fitted in "A Tuned Example"
best_model = model_boost_cv_tune.best_estimator_

# A bare expression in the middle of a cell is evaluated and thrown away (only
# the cell's last expression is rendered), so print the raw importances.
print(best_model.feature_importances_)

# you remember which feature is which, right? if not, do this:
pd.DataFrame({
    'Feature': best_model.feature_name_,
    'Importance': best_model.feature_importances_
}).sort_values('Importance', ascending=False)

In [None]:
# Partial dependence plots for `cholesterol` and `male` from the fitted search
# object, computed over the full feature frame. `male` is declared
# categorical, and the grid is built between the 0th and 90th percentiles
# with up to 75 points.
PartialDependenceDisplay.from_estimator(
    estimator=model_boost_cv_tune,
    X=df_heart.drop(columns=['heart_disease']),
    features=['cholesterol', 'male'],
    categorical_features=['male'],
    grid_resolution=75,
    percentiles=(0, .9),
)