In [2]:
import pandas as pd 
import numpy as np 
from plotnine import *
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.compose import make_column_selector, ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_predict

In [3]:
ha = pd.read_csv("https://www.dropbox.com/s/aohbr6yb9ifmc8w/heart_attack.csv?dl=1")
ha.head()

#sex, cp, restecg, output will need to be dummified 

Unnamed: 0,age,sex,cp,trtbps,chol,restecg,thalach,output
0,63,1,3,145,233,0,150,1
1,37,1,2,130,250,1,187,1
2,56,1,1,120,236,1,178,1
3,57,0,0,120,354,1,163,1
4,57,1,0,140,192,1,148,1


## Part One: Fitting Models
This section asks you to create a final best model for each of the model types studied this week. For each, you should:

Find the best model based on ROC AUC for predicting the target variable.

Report the (cross-validated!) ROC AUC metric.

Fit the final model.

Output a confusion matrix; that is, the counts of how many observations fell into each predicted class for each true class.

(Where applicable) Interpret the coefficients and/or estimates produced by the model fit.

You should certainly try multiple model pipelines to find the best model. You do not need to include the output for every attempted model, but you should describe all of the models explored. You should include any hyperparameter tuning steps in your writeup as well.

#### Which predictors were most important to predicting heart attack risk? (output)

#### Q1: KNN

In [13]:
X = ha.drop(["output"], axis = 1)
y = ha["output"]
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [11]:
#KNN 

ct = ColumnTransformer(
  [
    ("dummify",
    OneHotEncoder(sparse_output = False, handle_unknown='ignore'),
    make_column_selector(dtype_include = object)),
    ("standardize",
    StandardScaler(),
    make_column_selector(dtype_include=np.number))
  ],
  remainder = "passthrough"
)

#KNN 5
knn_pipeline = Pipeline(
  [("preprocessing", ct),
  ("knn",  KNeighborsClassifier())]
).set_output(transform = "pandas")

In [24]:
# tune hyperparameter k
param_grid1 = {
    "knn__n_neighbors": [2, 5, 8, 10, 15, 18, 20, 25, 30, 35, 40, 45, 50, 55, 60, 100]
}

knn_grid_search = GridSearchCV(knn_pipeline, param_grid1, cv=5, scoring='roc_auc')

knn_grid_search.fit(X, y)
print("best_params", knn_grid_search.best_params_)

best_params {'knn__n_neighbors': 55}
best_params {'knn__n_neighbors': 55}


In [25]:
knn_pipeline.set_params(knn__n_neighbors = 55)

knn_pipeline.fit(X_train, y_train)
y_pred = cross_val_predict(knn_pipeline, X, y, cv=5)

cm = confusion_matrix(y, y_pred)
cm

array([[ 89,  38],
       [ 23, 123]], dtype=int64)

In [26]:
#print(accuracy_score(y, y_pred))
print("roc auc for knn:", roc_auc_score(y, y_pred))
print(classification_report(y, y_pred))

roc auc for knn: 0.7716265774997304
              precision    recall  f1-score   support

           0       0.79      0.70      0.74       127
           1       0.76      0.84      0.80       146

    accuracy                           0.78       273
   macro avg       0.78      0.77      0.77       273
weighted avg       0.78      0.78      0.78       273

roc auc for knn: 0.7716265774997304
              precision    recall  f1-score   support

           0       0.79      0.70      0.74       127
           1       0.76      0.84      0.80       146

    accuracy                           0.78       273
   macro avg       0.78      0.77      0.77       273
weighted avg       0.78      0.78      0.78       273



In [22]:
#logistic regression
logr_pipeline = Pipeline(
  [("preprocessing", ct),
  ("logr",  LogisticRegression())]
).set_output(transform = "pandas")

In [32]:
#Decision Tree
param_grid2 = {
    'dtree__max_depth': [3, 5, 10],
    'dtree__min_samples_split': [2, 5, 10],
    'dtree__min_samples_leaf': [1, 2, 4]
}
dtree_pipeline = Pipeline(
  [("preprocessing", ct),
  ("dtree",  DecisionTreeClassifier())]
).set_output(transform = "pandas")

dtree_grid_search = GridSearchCV(dtree_pipeline, param_grid2, cv=5, scoring='roc_auc')

dtree_grid_search.fit(X, y)
print("best_params", dtree_grid_search.best_params_)

best_params {'dtree__max_depth': 3, 'dtree__min_samples_leaf': 4, 'dtree__min_samples_split': 2}
best_params {'dtree__max_depth': 3, 'dtree__min_samples_leaf': 4, 'dtree__min_samples_split': 2}


In [54]:
dtree_pipeline.set_params(dtree__max_depth = 3, dtree__min_samples_leaf = 4, dtree__min_samples_split = 2)

dtree.fit(X_train, y_train)
y_pred2 = cross_val_predict(best_dtree, X, y, cv=5)

cm = confusion_matrix(y, y_pred2)
cm

array([[ 99,  28],
       [ 37, 109]], dtype=int64)

In [50]:
print("roc auc for dtree:", roc_auc_score(y, y_pred2))
print(classification_report(y, y_pred2))

roc auc for dtree: 0.7630514507604358
              precision    recall  f1-score   support

           0       0.73      0.78      0.75       127
           1       0.80      0.75      0.77       146

    accuracy                           0.76       273
   macro avg       0.76      0.76      0.76       273
weighted avg       0.76      0.76      0.76       273

roc auc for dtree: 0.7630514507604358
              precision    recall  f1-score   support

           0       0.73      0.78      0.75       127
           1       0.80      0.75      0.77       146

    accuracy                           0.76       273
   macro avg       0.76      0.76      0.76       273
weighted avg       0.76      0.76      0.76       273



In [None]:
#knn importancces

In [None]:
#log regression importances

In [57]:
#dtree importances
best_dtree = dtree_grid_search.best_estimator_.named_steps['dtree']

if hasattr(ct, "get_feature_names_out"):
    feature_names = ct.get_feature_names_out()
else:
    feature_names = X_train.columns  # Fallback if no transformations were applied

importances = best_dtree.feature_importances_
feature_importances = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
feature_importances = feature_importances.sort_values(by='Importance', ascending=False)
feature_importances

Unnamed: 0,Feature,Importance
2,standardize__cp,0.584983
0,standardize__age,0.156569
6,standardize__thalach,0.116935
1,standardize__sex,0.102721
3,standardize__trtbps,0.038793
4,standardize__chol,0.0
5,standardize__restecg,0.0


## Q4: Interpretation

Which predictors were most important to predicting heart attack risk? 