In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
plt.style.use('ggplot')
# https://www.kaggle.com/fedesoriano/stroke-prediction-dataset 
df = pd.read_csv("..\\..\\data\\healthcare-dataset-stroke-data.csv")

In [None]:
# Fill missing values
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

DecisionTreePip = Pipeline(steps=[ 
                               ('Scale',StandardScaler()),
                               ('DecisionTreeReg',DecisionTreeRegressor(random_state = 42))
                              ])

X = df[['age','gender','bmi']]
X.gender = X.gender.replace({'Male' : 0, 'Female' : 1 , 'Other' : -1}).astype(np.uint8)

# create a dataframe containing the missing values of X
missing = X[X.bmi.isna()]

# remove the missing values from X 
X = X.dropna()

# creates Y by removing bmi from X
Y = X.pop('bmi')

# fit the pipeline
DecisionTreePip.fit(X,Y)

# make the prediction 
predict_bmi = pd.Series(DecisionTreePip.predict(missing[['age', 'gender']]), index = missing.index)
df.loc[missing.index, 'bmi'] = predict_bmi

In [None]:
# Generate the data samples: Original, Oversampled, Undersampled
y = df[["stroke"]].copy()
X = df.copy()

In [None]:
# ALWAYS SPLIT THE DATA IN TRAIN AND TEST AND THEN OVERSAMPLE/DOWNSAMPLE
# see here: https://stackoverflow.com/questions/48805063/balance-classes-in-cross-validation/48810493#48810493 
from sklearn.model_selection import train_test_split
# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=3, shuffle=True) # Set shuffle to true to have data of both labels in test set

In [None]:
X_train.drop(["id", "stroke"], inplace=True, axis=1)
X_test.drop(["id", "stroke"], inplace=True, axis=1)

In [None]:
# The oversampled/undersampled data will be used for training ONLY! not for testing

from src.Resample import undersample_kmeans
from src.Resample import oversample

df_input = X_train.copy()
df_input["stroke"] = y_train

# Get over and undersampled data
undersampled = undersample_kmeans(df_input)
oversampled = oversample(df_input)

y_over_train = oversampled[["stroke"]].copy()
y_under_train = undersampled[["stroke"]].copy()

oversampled.drop(columns=["stroke"], inplace=True)
undersampled.drop(columns=["stroke"], inplace=True)

X_over_train = oversampled.copy()
X_under_train = undersampled.copy()
X_under_train = X_under_train[["gender","age","hypertension","heart_disease","ever_married","work_type","Residence_type","avg_glucose_level","bmi","smoking_status"]] # Only do this because column order must be same as X_test column order!


In [200]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier


num_pipe = Pipeline([
    #('impute', SimpleImputer(strategy='median', copy=False)),
    ('minmax_scaler', MinMaxScaler(copy=False))
])

ordinal_pipe = Pipeline([
    ('one_hot', OneHotEncoder(sparse=False, handle_unknown='ignore'))
])

binary_pipe = Pipeline([
    ('label_encoder', OrdinalEncoder()),
])

# two pipelines combined in the column transformer
full_transform = ColumnTransformer([
    ("num", num_pipe, ["age", "avg_glucose_level", "bmi"]),
    ("ord", ordinal_pipe, ["gender", "work_type", "smoking_status"]),
    ("binary", binary_pipe, ["ever_married", "Residence_type"]),
])

full_pipeline = Pipeline([
    ('trf', full_transform),
    # ('knn', KNeighborsClassifier(n_neighbors=7, metric='manhattan'))
    # ('svm', SVC(random_state =0))
    # ('rforest', RandomForestClassifier(criterion='entropy', n_estimators= 100, random_state= 0))
])

In [None]:
# Evaluate model
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, ConfusionMatrixDisplay, precision_score, recall_score, f1_score, classification_report, roc_curve, plot_roc_curve, auc, precision_recall_curve, plot_precision_recall_curve, average_precision_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

In [None]:
# Fit model/pipeline on TRAINING DATA; On either Oversampled, Undersampled, or Original data
full_pipeline.fit(X_over_train, y_over_train.values.flatten())

y_pred = full_pipeline.predict(X_test)

# KFold cross validation
cv = KFold(n_splits=3, random_state=None)
accuracies = cross_val_score(estimator = full_pipeline, X = X_over_train, y = y_over_train.values.flatten(), cv = cv) # Set input here! Over, under or original data. Only training data!

cm = confusion_matrix(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

stroke_recall = cm[1][1]/cm[1].sum()
stroke_cases = len(y_test[y_test["stroke"]==1])
'''
Most important metric we want to look for is the percentage of actual positives that we have identified.
This corresponds to HIGH Recall for Stroke True Positive (Recall = Percantage of accurately identified positives)
'''
print('Confusion Matrix')
print("There are {} stroke cases in test set".format(stroke_cases))
print(cm)
print("The Recall of Stroke is: {:.2f} %".format(stroke_recall*100))
print('')
print("The General Recall is: {:.2f} %".format(recall*100))
print('')
print('Accuracy Score: ',accuracy_score(y_test, y_pred))
print('')
print("K-Fold Validation Mean Accuracy: {:.2f} %".format(accuracies.mean()*100))
print('')
print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))
print('')
print('Precision: {:.2f}'.format(precision))
print('')
print('Recall: {:.2f}'.format(recall))
print('')
print('F1: {:.2f}'.format(f1))
print('-----------------------------------')

In [None]:
# ROC curve
# code inspired from https://www.analyticsvidhya.com/blog/2020/06/auc-roc-curve-machine-learning/?fbclid=IwAR0g5dlUGF53mbQQ0h9kwV8Ne9-NzYiGjlYwe72GpHbidMEEPAUD2Sgu1yo
import sklearn.metrics as metrics
import scikitplot as skplt
import matplotlib.pyplot as plt

fpr, tpr, threshold = metrics.roc_curve(y_test, y_pred)
roc_auc = metrics.auc(fpr, tpr)

plt.title('ROC curve')
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

In [201]:
X_transformed = full_pipeline.fit_transform(X_over_train, y_over_train)

In [None]:
# Do parameter tuning
# The GridSearchCV is a library function that is a member of sklearn's model_selection package. It helps to loop through predefined hyperparameters and fit your estimator (model) on your training set. So, in the end, you can select the best parameters from the listed hyperparameters.
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

grid_models = [(LogisticRegression(),[{'C':[0.25,0.5,0.75,1],'random_state':[0]}]), 
               (KNeighborsClassifier(),[{'n_neighbors':[5,7,8,10], 'metric': ['euclidean', 'manhattan', 'chebyshev', 'minkowski']}]), 
               (SVC(),[{'C':[0.25,0.5,0.75,1],'kernel':['linear', 'rbf'],'random_state':[0]}]), 
               (DecisionTreeClassifier(),[{'criterion':['gini','entropy'],'random_state':[0]}]), 
               (RandomForestClassifier(),[{'n_estimators':[100,150,200],'criterion':['gini','entropy'],'random_state':[0]}]), 
              (XGBClassifier(), [{'learning_rate': [0.01, 0.05, 0.1], 'eval_metric': ['error']}])]

# Transform the input data before fitting it to GridSearch
X_transformed = full_pipeline.fit_transform(X_over_train, y_over_train)

for i,j in grid_models:
    grid = GridSearchCV(estimator = i, param_grid = j, scoring = 'recall', cv = 10)
    grid.fit(X_transformed, y_over_train.values.flatten())
    best_accuracy = grid.best_score_
    best_param = grid.best_params_
    print('{}:\nBest Recall : {:.2f}%'.format(i,best_accuracy*100))
    print('Best Parameters : ',best_param)
    print('')
    print('----------------')
    print('')


In [202]:
from sklearn.metrics import classification_report

scores = ["precision", "recall"]

for score in scores:
    # Transform the input data before fitting it to GridSearch
    X_transformed = full_pipeline.fit_transform(X_over_train, y_over_train)

    for i,j in grid_models:
        grid = GridSearchCV(estimator = i, param_grid = j, scoring = score, cv = 10)
        print("Best parameters set found on development set:")
        print()
        print(grid.best_params_)
        print()
        print("Grid scores on development set:")
        print()
        means = grid.cv_results_["mean_test_score"]
        stds = grid.cv_results_["std_test_score"]
        for mean, std, params in zip(means, stds, grid.cv_results_["params"]):
            print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
        print()

        print("Detailed classification report:")
        print()
        print("The model is trained on the full development set.")
        print("The scores are computed on the full evaluation set.")
        print()
        y_true, y_pred = y_test, grid.predict(X_transformed)
        print(classification_report(y_true, y_pred))
        print()



Best parameters set found on development set:



AttributeError: 'GridSearchCV' object has no attribute 'best_params_'