In [50]:
import pandas as pd
import joblib 

from sklearn.model_selection import train_test_split 

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

from sklearn.decomposition import PCA

from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import accuracy_score

In [2]:
df = pd.read_csv('data/heart.csv')

In [3]:
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [14]:
X = df.drop(columns='HeartDisease')
y = df['HeartDisease']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [15]:
#X_train.to_csv('base_Xtr.csv')
#X_test.to_csv('base_Xtst.csv')
#y_train.to_csv('base_ytr.csv')
#y_test.to_csv('base_ytst.csv')

In [16]:
numeric_scaling = Pipeline([('scaler', StandardScaler())])
categoric_encoding = Pipeline([('encoder',OneHotEncoder())])

In [17]:
num_cols = df.select_dtypes(include=['int64','float64']).columns.drop('HeartDisease')

cat_cols = df.select_dtypes(include=['object']).columns

In [18]:
preprocessing = ColumnTransformer([
    ('numeric_features',numeric_scaling, num_cols),
    ('categoric_features',categoric_encoding,cat_cols)],remainder='passthrough')

In [19]:
base = Pipeline(steps=[('processing', preprocessing),
                           ('classifier',KNeighborsClassifier())])

In [20]:
base.fit(X_train,y_train)

Pipeline(steps=[('processing',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('numeric_features',
                                                  Pipeline(steps=[('scaler',
                                                                   StandardScaler())]),
                                                  Index(['Age', 'RestingBP', 'Cholesterol', 'FastingBS', 'MaxHR', 'Oldpeak'], dtype='object')),
                                                 ('categoric_features',
                                                  Pipeline(steps=[('encoder',
                                                                   OneHotEncoder())]),
                                                  Index(['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope'], dtype='object'))])),
                ('classifier', KNeighborsClassifier())])

In [22]:
y_pred = base.predict(X_test)
accuracy_score(y_pred,y_test)

0.8623188405797102

In [25]:
#joblib.dump(base, 'saved-models/baseKNclf.joblib') 

['saved-models/baseKNclf.joblib']

#### Grid Search on base model

In [51]:
%%time
# Find the best hyperparameters using GridSearchCV on the train set
param_grid = {'classifier__n_neighbors':[4,5,6,7,8,9],
             'classifier__leaf_size':[10,20,30,40]
             }
base_grid = GridSearchCV(base, param_grid=param_grid, cv=5)
base_grid.fit(X_train, y_train)

CPU times: user 5.38 s, sys: 556 ms, total: 5.94 s
Wall time: 3.25 s


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('processing',
                                        ColumnTransformer(remainder='passthrough',
                                                          transformers=[('numeric_features',
                                                                         Pipeline(steps=[('scaler',
                                                                                          StandardScaler())]),
                                                                         Index(['Age', 'RestingBP', 'Cholesterol', 'FastingBS', 'MaxHR', 'Oldpeak'], dtype='object')),
                                                                        ('categoric_features',
                                                                         Pipeline(steps=[('encoder',
                                                                                          OneHotEncoder())]),
                                                                   

In [52]:
best_base = base_grid.best_estimator_
#joblib.dump(best_base,'KNclf_gridsearch.joblib')

['KNclf_gridsearch.joblib']

In [53]:
best_base.get_params

<bound method Pipeline.get_params of Pipeline(steps=[('processing',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('numeric_features',
                                                  Pipeline(steps=[('scaler',
                                                                   StandardScaler())]),
                                                  Index(['Age', 'RestingBP', 'Cholesterol', 'FastingBS', 'MaxHR', 'Oldpeak'], dtype='object')),
                                                 ('categoric_features',
                                                  Pipeline(steps=[('encoder',
                                                                   OneHotEncoder())]),
                                                  Index(['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope'], dtype='object'))])),
                ('classifier',
                 KNeighborsClassifier(leaf_size=10, n_neighbors=7))])>

In [54]:
y_pred = best_base.predict(X_test)
accuracy_score(y_pred,y_test)

0.8623188405797102

#### Grid Search on base model with PCA

In [56]:
base_pca = Pipeline(steps=[('processing', preprocessing),
                           ('pca',PCA()),
                           ('classifier',KNeighborsClassifier())])

In [74]:
%%time
# Find the best hyperparameters using GridSearchCV on the train set
param_grid = {'pca__n_components':[6,8,10,12],
             'classifier__n_neighbors':[4,5,6,7,8,9],
             'classifier__leaf_size':[10,20,30,40]
             }
base_pca_grid = GridSearchCV(base_pca, param_grid=param_grid, cv=5)
base_pca_grid.fit(X_train, y_train)

CPU times: user 24.7 s, sys: 2.17 s, total: 26.9 s
Wall time: 15 s


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('processing',
                                        ColumnTransformer(remainder='passthrough',
                                                          transformers=[('numeric_features',
                                                                         Pipeline(steps=[('scaler',
                                                                                          StandardScaler())]),
                                                                         Index(['Age', 'RestingBP', 'Cholesterol', 'FastingBS', 'MaxHR', 'Oldpeak'], dtype='object')),
                                                                        ('categoric_features',
                                                                         Pipeline(steps=[('encoder',
                                                                                          OneHotEncoder())]),
                                                                   

In [75]:
best_pca_base = base_pca_grid.best_estimator_
#joblib.dump(best_pca_base,'KNclf_gridsearch_pca.joblib')

['KNclf_gridsearch_pca.joblib']

In [76]:
best_pca_base.get_params

<bound method Pipeline.get_params of Pipeline(steps=[('processing',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('numeric_features',
                                                  Pipeline(steps=[('scaler',
                                                                   StandardScaler())]),
                                                  Index(['Age', 'RestingBP', 'Cholesterol', 'FastingBS', 'MaxHR', 'Oldpeak'], dtype='object')),
                                                 ('categoric_features',
                                                  Pipeline(steps=[('encoder',
                                                                   OneHotEncoder())]),
                                                  Index(['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope'], dtype='object'))])),
                ('pca', PCA(n_components=12)),
                ('classifier',
                 KNeighborsClas

In [77]:
y_pred = best_pca_base.predict(X_test)
accuracy_score(y_pred,y_test)

0.8659420289855072