In [1]:
import pandas as pd 


from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

import xgboost as xgb
from xgboost import XGBClassifier


from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold,StratifiedKFold

from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import precision_recall_curve, fbeta_score, f1_score 
from sklearn.metrics import roc_auc_score, roc_curve, auc
from sklearn.metrics import confusion_matrix, classification_report

from sklearn.preprocessing import StandardScaler

import imblearn.over_sampling


from sklearn.metrics import log_loss
from sklearn import metrics

import warnings
warnings.simplefilter(action='ignore')


import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the cleaned dataset from the working directory and display it.
df= pd.read_csv('cleaned_dataset.csv')
df

Unnamed: 0,heart_attack_disease,sex,age,race_ethn,bmi,overweight,diabetes,diagnose_strk,smoker,high_bp,high_chol,heavy_drinker,exercise,fruit,vege
0,0.0,1.0,70.0,0.0,15.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0
1,1.0,1.0,67.0,1.0,28.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0
2,1.0,1.0,72.0,1.0,28.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.0,1.0,62.0,0.0,33.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0
4,1.0,0.0,76.0,5.0,29.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
381142,0.0,1.0,30.0,1.0,25.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0
381143,0.0,0.0,80.0,5.0,28.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
381144,0.0,0.0,54.0,1.0,31.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0
381145,0.0,0.0,67.0,1.0,24.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0


In [3]:
# (rows, columns) of the loaded frame.
df.shape

(381147, 15)

In [4]:
# Absolute number of rows per value of the target column.
df.heart_attack_disease.value_counts()

0.0    347607
1.0     33540
Name: heart_attack_disease, dtype: int64

In [5]:
# Same counts expressed as fractions of all rows, to quantify the class imbalance.
df.heart_attack_disease.value_counts(normalize = True)

0.0    0.912002
1.0    0.087998
Name: heart_attack_disease, dtype: float64

In [6]:
# The classes are imbalanced (~91% negative vs. ~9% positive), so imbalance-handling techniques will be used to improve the metric scores.

In [7]:
# Target vector: the heart-attack/disease label.
y = df.heart_attack_disease
# Feature matrix: every remaining column of the frame.
X = df.drop(columns=['heart_attack_disease'])

In [8]:

# Plain random 70/30 split. These arrays feed the baseline models and
# make_confusion_matrix (which reads X_test_s / y_test).
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    random_state=42,
)

# Fit the scaler on the training fold only, then apply it to the test fold.
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_test_s = scaler.transform(X_test)


# Stratified 70/30 split (class proportions preserved in both folds),
# used for GridSearchCV runs that do not set a class_weight param.
X_train_strat, X_test_strat, y_train_strat, y_test_strat = train_test_split(
    X, y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

# Use a dedicated scaler for the stratified split. Re-fitting `scaler` here
# would overwrite its statistics, leaving it out of sync with the
# X_train_s / X_test_s arrays it produced above.
scaler_strat = StandardScaler()
X_train_strat_sc = scaler_strat.fit_transform(X_train_strat)
X_test_strat_sc = scaler_strat.transform(X_test_strat)

In [9]:
# Size of the training feature matrix after the 70/30 split.
X_train.shape

(266802, 14)

In [10]:
# Size of the held-out test feature matrix after the 70/30 split.
X_test.shape

(114345, 14)

In [11]:
# Training features with the target re-attached as the last column
# (built on a copy, so X_train itself is left untouched).
train_df = X_train.assign(heart_attack_disease=y_train)
train_df.head()

Unnamed: 0,sex,age,race_ethn,bmi,overweight,diabetes,diagnose_strk,smoker,high_bp,high_chol,heavy_drinker,exercise,fruit,vege,heart_attack_disease
300912,1.0,55.0,0.0,21.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0
40557,1.0,50.0,0.0,20.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0
219110,0.0,51.0,0.0,32.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0
343902,1.0,60.0,0.0,28.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
313360,0.0,66.0,0.0,31.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0


In [12]:
# Train and test feature-matrix shapes, one per line.
print(X_train.shape, X_test.shape, sep='\n')

(266802, 14)
(114345, 14)


In [13]:
def make_confusion_matrix(model, threshold=0.5, X=None, y=None):
    """Plot a confusion-matrix heatmap for a fitted probabilistic classifier.

    Parameters
    ----------
    model : fitted estimator exposing ``predict_proba``.
    threshold : float, default 0.5
        A sample is predicted as class 1 when its class-1 probability is
        greater than or equal to this value.
    X : array-like, optional
        Features to evaluate on. Defaults to the notebook-level ``X_test_s``.
    y : array-like, optional
        True labels matching ``X``. Defaults to the notebook-level ``y_test``.

    Returns
    -------
    None. The heatmap is drawn on a new matplotlib figure.
    """
    # Fall back to the notebook's scaled test split so existing calls such as
    # make_confusion_matrix(knn) keep working unchanged.
    if X is None:
        X = X_test_s
    if y is None:
        y = y_test

    # Column 1 of predict_proba is the probability of the positive class.
    # Cast the boolean mask to 0/1 so the predictions are numeric like the labels.
    y_predict = (model.predict_proba(X)[:, 1] >= threshold).astype(int)
    disease_confusion = confusion_matrix(y, y_predict)

    plt.figure(dpi=80)
    sns.heatmap(disease_confusion, cmap=plt.cm.Blues, annot=True, square=True, fmt='d',
                xticklabels=['no disease', 'disease'],
                yticklabels=['no disease', 'disease']);
    plt.xlabel('prediction')
    plt.ylabel('actual')

KNN + GridSearchCV

In [14]:
# Search space for the KNN grid search.
# Only hyperparameters that change the model's predictions are searched:
#   n_neighbors : 1..30
#   weights     : uniform vs. distance-weighted votes
#   p           : Minkowski power (1 = Manhattan, 2 = Euclidean)
# leaf_size and n_jobs are deliberately left out: per the scikit-learn docs
# they affect tree build/query speed and parallelism, not the predictions.
# Including them (49 x 2 values) inflated the grid from 120 to 11,760
# candidates, i.e. 58,800 fits at cv=5.
k_range = list(range(1, 31))
p_input = [1, 2]
weight_options = ['uniform', 'distance']

param_grid_knn = dict(n_neighbors=k_range, weights=weight_options, p=p_input)


In [16]:
knn = KNeighborsClassifier()

# 5-fold cross-validated search optimising F1 on the positive class.
# n_jobs=-1 runs candidate fits in parallel on all cores; parallelism belongs
# on the search object, not inside the parameter grid.
grid_knn = GridSearchCV(knn, param_grid_knn, cv=5, scoring='f1', n_jobs=-1)
grid_knn.fit(X_train_strat_sc, y_train_strat)

# Predictions from the best estimator (refit on the full training fold).
knn_grid_pred = grid_knn.predict(X_test_strat_sc)

# Fitted attributes carry a trailing underscore: `best_params` does not exist
# and raises AttributeError.
knn_grid_best_para = grid_knn.best_params_
knn_grid_best_para


KeyboardInterrupt: 

Logistic
1a. Class Weights 
1b. Best Threshold
1c. Best Hyperparameter

Random Forest
1a. Class Weights 
1b. Best Threshold
1c. Best Hyperparameter

XGBoost 
1a. Class Weights 
1b. Best Threshold
1c. Best Hyperparameter

Oversampling 

KNN
1a. Class Weights 
1b. Best Threshold
1c. Best Hyperparameter

Logistic
1a. Class Weights 
1b. Best Threshold
1c. Best Hyperparameter

Random Forest
1a. Class Weights 
1b. Best Threshold
1c. Best Hyperparameter

XGBoost 
1a. Class Weights 
1b. Best Threshold
1c. Best Hyperparameter

In [None]:
# Merge the per-model score DataFrames to compare the models, then select the best one to work with.

In [None]:
# KNN baseline: default hyperparameters, trained on the scaled (non-stratified) split.
# n_jobs=-1 only parallelises the neighbour search; predictions are unchanged.
knn = KNeighborsClassifier(n_jobs=-1)
knn.fit(X_train_s, y_train)
knn_pred = knn.predict(X_test_s)

# Accuracy, precision, recall and F1 on the test split.
# Accuracy is computed from the predictions already in hand:
# knn.score(X_test_s, y_test) would repeat the full neighbour search over
# the test set just to return the same number.
knn_acc = metrics.accuracy_score(y_test, knn_pred)
knn_precision = precision_score(y_test, knn_pred)
knn_recall = recall_score(y_test, knn_pred)
knn_f1 = f1_score(y_test, knn_pred)

print(f'KNN Baseline- Accuracy: {knn_acc:.6f}')
print(f'KNN Baseline- Precision: {knn_precision:.6f}')
print(f'KNN Baseline- Recall : {knn_recall:.6f}')
print(f'KNN Baseline- F1 : {knn_f1:.6f}')
    

In [None]:
# Confusion matrix for the KNN baseline at the default 0.5 threshold.
make_confusion_matrix(knn)

In [None]:
# Per-class precision / recall / F1 for the KNN baseline predictions.
print(classification_report(y_test, knn_pred))

In [None]:
# Logistic regression baseline on the scaled (non-stratified) split.
# C is the inverse regularization strength, so C=1000 makes the penalty very
# weak: this approximates, but is not exactly, an unregularized fit.
lr = LogisticRegression(C=1000)
lr.fit(X_train_s, y_train)
lr_pred = lr.predict(X_test_s)

# Test-split metrics: accuracy first, then precision / recall / F1 computed
# from the same predictions.
lr_acc = lr.score(X_test_s, y_test)
lr_precision, lr_recall, lr_f1 = (
    scorer(y_test, lr_pred)
    for scorer in (precision_score, recall_score, f1_score)
)

print(f'Logistic Baseline- Accuracy: {lr_acc:.6f}')
print(f'Logistic Baseline- Precision: {lr_precision:.6f}')
print(f'Logistic Baseline- Recall : {lr_recall:.6f}')
print(f'Logistic Baseline- F1 : {lr_f1:.6f}')
    

In [None]:
# Confusion matrix for the logistic-regression baseline at the default 0.5 threshold.
make_confusion_matrix(lr)

In [None]:
# Per-class precision / recall / F1 for the logistic-regression baseline predictions.
print(classification_report(y_test, lr_pred))

ROC Curve

In [None]:
# Positive-class probabilities for each baseline on the scaled test split.
knn_proba = knn.predict_proba(X_test_s)[:, 1]
lr_proba = lr.predict_proba(X_test_s)[:, 1]

# ROC curve points (false-positive rate, true-positive rate, thresholds).
knn_fpr, knn_tpr, knn_thresholds = roc_curve(y_test, knn_proba)
lr_fpr, lr_tpr, lr_thresholds = roc_curve(y_test, lr_proba)

# Area under each curve. roc_auc_score is used instead of auc(fpr, tpr) so
# this cell does not depend on the short name `auc`, which is easy to shadow
# with a results variable and would make the cell fail on re-run.
roc_auc_knn = roc_auc_score(y_test, knn_proba)
roc_auc_lr = roc_auc_score(y_test, lr_proba)

fig, ax = plt.subplots()
ax.plot(knn_fpr, knn_tpr, c='orange', lw=2, label=f'knn, auc = {roc_auc_knn}')
ax.plot(lr_fpr, lr_tpr, c='red', lw=2, label=f'Logistic Regression, auc = {roc_auc_lr}')
# Diagonal = performance of a random classifier, for reference.
ax.plot([0, 1], [0, 1], c='violet', ls='--')
ax.set_xlim([-0.05, 1.05])
ax.set_ylim([-0.05, 1.05])

ax.set_xlabel('False positive rate')
ax.set_ylabel('True positive rate')
ax.legend(loc='lower right')
ax.set_title('ROC curves');


In [None]:
# NOTE(review): looks like a leftover debugging check of the type returned by
# precision_score — confirm and remove before sharing the notebook.
print(type(knn_precision))

In [None]:
# Collect the baseline metrics, one list per metric, ordered as in index_labels.
acc = [knn_acc, lr_acc]
precision = [knn_precision, lr_precision]
recall = [knn_recall, lr_recall]
f1 = [knn_f1, lr_f1]
# Named `auc_scores`, not `auc`: binding a list to `auc` would shadow the
# sklearn.metrics.auc function imported at the top of the notebook and break
# any cell that calls it afterwards (including on re-run).
auc_scores = [roc_auc_knn, roc_auc_lr]

index_labels = ['knn', 'logistic regression']

# One row per model, one column per metric.
scores_df = pd.DataFrame(
    {
        'accuracy': acc,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'auc': auc_scores,
    },
    index=index_labels,
)

scores_df

In [None]:
# NOTE(review): `x` is not referenced anywhere in the visible notebook — looks
# like leftover scratch; confirm and remove.
x = 5553533.023423422



In [None]:
def validation_metrics(model, y_test, X_test_s): 
    