In [None]:
import numpy as np
import pandas as pd
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides DeprecationWarning/FutureWarning
# messages, so API changes in the libraries used below will go unnoticed.
warnings.filterwarnings('ignore')

In [None]:
# Load the label table and display it.
labels = pd.read_csv('Data/labels.csv')
labels

In [None]:
# Load the feature table and display it.
values = pd.read_csv('Data/values.csv')
values

In [None]:
# Join features and labels on the shared key instead of gluing them together by
# row position: if the two files are ordered differently, a positional concat
# silently pairs patients with the wrong label.
# `validate` raises if a patient_id is duplicated in either table.
data = values.merge(labels, on='patient_id', how='inner', validate='one_to_one')
# An inner join drops unmatched rows, so fail loudly if any patient is missing
# from one of the two files.
assert len(data) == len(values) == len(labels), 'values/labels patient_id mismatch'
data

In [None]:
# Column dtypes and non-null counts of the combined table.
data.info()

In [None]:
# Number of distinct values per column.
data.nunique()

In [None]:
# Percentage share of each `thal` level, rounded to two decimals.
data.thal.value_counts(normalize=True).mul(100).round(2)

In [None]:
## Discrete (categorical-like) features: numeric columns with fewer than 5 distinct values.
# The id column is excluded by name rather than by position, so this cell gives
# the same answer whether or not `patient_id` has already been dropped from `data`.
# Column order is preserved, which the later `cat_num_feat[:-1]` slices rely on.
candidate_cols = data.columns.drop('patient_id', errors='ignore')
# is_numeric_dtype (instead of `dtype != 'object'`) also rejects text columns
# that pandas stores with its dedicated string dtype.
cat_num_feat = [feat for feat in candidate_cols
                if pd.api.types.is_numeric_dtype(data[feat]) and data[feat].nunique() < 5]
cat_num_feat

In [None]:
# Percentage share of every level, one discrete feature at a time.
for feat in cat_num_feat:
    level_share = data[feat].value_counts(normalize=True).mul(100).round(2)
    print(level_share, '\n')

In [None]:
## Continuous numeric features: every numeric column not already listed in cat_num_feat.
# The id column is excluded by name rather than by position, so this cell gives
# the same answer whether or not `patient_id` has already been dropped from `data`.
candidate_cols = data.columns.drop('patient_id', errors='ignore')
# is_numeric_dtype (instead of `dtype != 'object'`) also rejects text columns
# that pandas stores with its dedicated string dtype.
num_cont_feat = [feat for feat in candidate_cols
                 if pd.api.types.is_numeric_dtype(data[feat]) and feat not in cat_num_feat]
num_cont_feat

In [None]:
# Summary statistics of the numeric columns.
data.describe()

In [None]:
# Summary of the `thal` column.
# NOTE(review): pandas documents `include` as ignored for a Series, so the
# argument presumably has no effect here — confirm before relying on it.
data.thal.describe(include='O')

In [None]:
import matplotlib.pyplot as plt
import seaborn as sb
import sweetviz as sv

In [None]:
# Build an automated sweetviz profile of the whole table and render it as HTML.
report = sv.analyze(data)
report.show_html()

In [None]:
# Frequency of each `thal` level.
sb.countplot(x='thal', data=data)
plt.show()

In [None]:
# One count plot per discrete feature, laid out on a 3x3 grid.
plt.figure(figsize=(25, 20))
for position, feat in enumerate(cat_num_feat, start=1):
    plt.subplot(3, 3, position)
    sb.countplot(data=data, x=feat)
    plt.xlabel(feat, fontsize=15)
    plt.ylabel('count', fontsize=15)
plt.show()

In [None]:
# One histogram per continuous feature, laid out on a 2x3 grid.
plt.figure(figsize=(25, 20))
for position, feat in enumerate(num_cont_feat, start=1):
    plt.subplot(2, 3, position)
    sb.histplot(data=data, x=feat)
    plt.xlabel(feat, fontsize=15)
    plt.ylabel('Count', fontsize=15)
plt.show()

In [None]:
## Density plot of continuous features

# `sb.distplot` is deprecated in seaborn; the documented replacement for its
# histogram-plus-KDE output is `histplot` with kde=True and stat='density'
# (cut=3 reproduces how far distplot extended the KDE curve past the data).
plt.figure(figsize=(25, 20))
for position, feat in enumerate(num_cont_feat, start=1):
    plt.subplot(2, 3, position)
    sb.histplot(data[feat], kde=True, stat='density', kde_kws=dict(cut=3))
    plt.xlabel(feat, fontsize=15)
    plt.ylabel('Density', fontsize=15)
plt.show()

In [None]:
# `thal` level counts, split by the heart_disease_present label.
sb.countplot(x='thal', hue='heart_disease_present', data=data)
plt.show()

In [None]:
# Count plot of every discrete feature except the last list entry, split by the
# heart_disease_present label, on a 3x3 grid.
plt.figure(figsize=(20, 25))
for position, feat in enumerate(cat_num_feat[:-1], start=1):
    plt.subplot(3, 3, position)
    sb.countplot(data=data, x=feat, hue='heart_disease_present')
    plt.xlabel(feat, fontsize=15)
    plt.ylabel('count', fontsize=15)
plt.show()

In [None]:
## Label counts faceted by `thal`: one panel per `thal` level.

sb.catplot(x='heart_disease_present', col='thal', kind='count', data=data)
plt.show()

In [None]:
## Label counts faceted by each discrete feature: one figure per feature,
## one panel per level. The last entry of cat_num_feat is skipped.

for feat in cat_num_feat[:-1]:
    sb.catplot(x='heart_disease_present', col=feat, kind='count', data=data)

In [None]:
# Histogram of every continuous feature, coloured by the heart_disease_present
# label, on a 2x3 grid.
plt.figure(figsize=(20, 15))
for position, feat in enumerate(num_cont_feat, start=1):
    plt.subplot(2, 3, position)
    sb.histplot(data=data, x=feat, hue='heart_disease_present')
    plt.xlabel(feat, fontsize=15)
    plt.ylabel('Count', fontsize=15)
plt.show()

In [None]:
## stripplot

# Strip plot of every continuous feature against the label, on a 2x3 grid.
plt.figure(figsize=(20, 15))
for position, feat in enumerate(num_cont_feat, start=1):
    plt.subplot(2, 3, position)
    sb.stripplot(data=data, x='heart_disease_present', y=feat)
    plt.xlabel('heart_disease_present', fontsize=15)
    plt.ylabel(feat, fontsize=15)
plt.show()

In [None]:
## Pairwise scatter/distribution grid of the columns in `data`, coloured by the label.

sb.pairplot(data, hue='heart_disease_present')
plt.show()

In [None]:
# Remove the identifier column before modelling. Rebinding `data` with
# errors='ignore' keeps the cell idempotent: running it a second time is a
# no-op instead of a KeyError.
data = data.drop(columns=['patient_id'], errors='ignore')

In [None]:
# Missing-value count per column.
data.isnull().sum()

In [None]:
# Box plot of every continuous feature, on a 2x3 grid, to eyeball outliers.
plt.figure(figsize=(20, 15))
for position, feat in enumerate(num_cont_feat, start=1):
    plt.subplot(2, 3, position)
    sb.boxplot(data=data, x=feat)
plt.show()

In [None]:
from scipy import stats

# Sample skewness of each continuous feature.
for feat in num_cont_feat:
    skewness = stats.skew(data[feat])
    print(feat, f'has skewness: {skewness}')

In [None]:
# Kurtosis of each continuous feature (scipy's default settings).
for feat in num_cont_feat:
    kurt = stats.kurtosis(data[feat])
    print(feat, f'has kurtosis: {kurt}')

In [None]:
# Summary statistics of resting_blood_pressure before outlier handling.
data.resting_blood_pressure.describe()

In [None]:
# Quartiles and inter-quartile range of resting_blood_pressure.
# The IQR is derived from the very q1/q3 that the outlier fences are built on;
# computing it separately with a different interpolation rule ('midpoint' vs.
# the quantile default) can make the fences inconsistent with the quartiles.
q1 = data.resting_blood_pressure.quantile(0.25)
q3 = data.resting_blood_pressure.quantile(0.75)
iqr = q3 - q1
print(iqr)
print(q1, ' ', q3)

In [None]:
# 1.5 * IQR fences for resting_blood_pressure.
# NOTE(review): only upper_limit is used by the cells visible below;
# lower_limit is computed and printed but never applied.
lower_limit = q1-1.5*iqr
upper_limit = q3+1.5*iqr
print(lower_limit, ' ', upper_limit)

In [None]:
# Rows whose resting_blood_pressure lies above the upper fence.
data.loc[data.resting_blood_pressure>upper_limit]

In [None]:
# Overwrite values above the upper fence with the column median. The median is
# evaluated before the assignment, i.e. with the outliers still included.
data.loc[data.resting_blood_pressure>upper_limit, 'resting_blood_pressure'] = data.resting_blood_pressure.median()

In [None]:
# Re-run the filter to check which rows still exceed the upper fence.
data.loc[data.resting_blood_pressure>upper_limit]

In [None]:
# Summary statistics of serum_cholesterol_mg_per_dl before outlier handling.
data.serum_cholesterol_mg_per_dl.describe()

In [None]:
# Quartiles and inter-quartile range of serum_cholesterol_mg_per_dl.
# The IQR is derived from the very quartiles that the outlier fences are built
# on; computing it separately with a different interpolation rule ('midpoint'
# vs. the quantile default) can make the fences inconsistent with the quartiles.
q1_2 = data.serum_cholesterol_mg_per_dl.quantile(0.25)
q3_2 = data.serum_cholesterol_mg_per_dl.quantile(0.75)
iqr2 = q3_2 - q1_2
print(iqr2)
print(q1_2, ' ', q3_2)

In [None]:
# 1.5 * IQR fences for serum_cholesterol_mg_per_dl.
# NOTE(review): only upper_limit_2 is used by the cells visible below.
lower_limit_2 = q1_2-1.5*iqr2 
upper_limit_2 = q3_2+1.5*iqr2 
print(lower_limit_2, ' ', upper_limit_2)

In [None]:
# Rows whose serum_cholesterol_mg_per_dl lies above the upper fence.
data.loc[data.serum_cholesterol_mg_per_dl>upper_limit_2]

In [None]:
# Overwrite values above the upper fence with the column median (evaluated
# before the assignment, outliers included), then re-run the filter to check
# which rows still exceed the fence.
data.loc[data.serum_cholesterol_mg_per_dl>upper_limit_2, 'serum_cholesterol_mg_per_dl'] = data.serum_cholesterol_mg_per_dl.median()
data.loc[data.serum_cholesterol_mg_per_dl>upper_limit_2]

In [None]:
# Summary statistics of oldpeak_eq_st_depression before outlier handling.
data.oldpeak_eq_st_depression.describe()

In [None]:
# Quartiles and inter-quartile range of oldpeak_eq_st_depression.
# The IQR is derived from the very quartiles that the outlier fences are built
# on; computing it separately with a different interpolation rule ('midpoint'
# vs. the quantile default) can make the fences inconsistent with the quartiles.
q1_3 = data.oldpeak_eq_st_depression.quantile(0.25)
q3_3 = data.oldpeak_eq_st_depression.quantile(0.75)
iqr3 = q3_3 - q1_3
print(iqr3)
# Quartiles are printed as well, matching the other two outlier sections.
print(q1_3, ' ', q3_3)

In [None]:
# 1.5 * IQR fences for oldpeak_eq_st_depression.
# NOTE(review): only upper_limit_3 is used by the cells visible below.
lower_limit_3 = q1_3-1.5*iqr3
upper_limit_3 = q3_3+1.5*iqr3
print(lower_limit_3, ' ', upper_limit_3)

In [None]:
# Rows whose oldpeak_eq_st_depression lies above the upper fence.
data.loc[data.oldpeak_eq_st_depression>upper_limit_3]

In [None]:
# Overwrite values above the upper fence with the column median (evaluated
# before the assignment, outliers included), then re-run the filter to check
# which rows still exceed the fence.
data.loc[data.oldpeak_eq_st_depression>upper_limit_3, 'oldpeak_eq_st_depression'] = data.oldpeak_eq_st_depression.median()
data.loc[data.oldpeak_eq_st_depression>upper_limit_3]

In [None]:
# Re-check dtypes and non-null counts after the outlier replacements.
data.info()

In [None]:
# `thal` level counts before encoding.
data.thal.value_counts()

In [None]:
## Manual label encoding of the 'thal' feature.

# The encoded column is assigned back explicitly. Calling
# `.replace(..., inplace=True)` on `data['thal']` is a chained in-place update:
# under pandas copy-on-write it modifies a temporary and leaves `data` unchanged.
thal_codes = {'fixed_defect': 0, 'normal': 1, 'reversible_defect': 2}
# Encode only while the column still holds text, so re-running the cell is a no-op.
if not pd.api.types.is_numeric_dtype(data['thal']):
    # A level missing from thal_codes maps to NaN, which makes astype raise
    # instead of letting an unencoded value slip through to the models.
    data['thal'] = data['thal'].map(thal_codes).astype('int64')

In [None]:
# `thal` level counts after encoding, to compare with the counts before.
data.thal.value_counts()

In [None]:
# First rows of the table after encoding.
data.head()

In [None]:
# Dtypes after encoding.
data.info()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline as imbpipeline
from imblearn.over_sampling import SMOTE 
from collections import Counter
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, StratifiedKFold

In [None]:
# Feature matrix = every column but the last; target = the last column.
# NOTE(review): this is positional — it assumes the label is the last column of `data`.
X = data.iloc[:, :-1]
y = data.iloc[:, -1]

In [None]:
# Stratified 80/20 hold-out split. A fixed random_state makes the split — and
# every number computed from it below — reproducible across kernel restarts.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.20, shuffle=True, stratify=y, random_state=42)

In [None]:
# Class share (in percent) of the train and test labels.
print(y_train.value_counts(normalize=True).mul(100).round(2))
print(y_test.value_counts(normalize=True).mul(100).round(2))

In [None]:
# Oversample the training split with SMOTE. A fixed random_state makes the
# synthetic samples reproducible across runs.
sm = SMOTE(random_state=42)
X_train_sm, y_train_sm = sm.fit_resample(X_train_raw, y_train)
# Class counts before and after resampling.
print('X_train size:', Counter(y_train))
print('X_train_sm size:', Counter(y_train_sm))

In [None]:
# Class share (in percent) of the resampled training labels.
y_train_sm.value_counts(normalize=True).mul(100).round(2)

In [None]:
# Re-attach the labels as the last column: resampled data for training,
# untouched hold-out data for testing.
train_data = pd.concat([X_train_sm, y_train_sm], axis=1)
test_data = pd.concat([X_test_raw, y_test], axis=1)

In [None]:
# Standardise the feature columns: fit on the training frame only, then apply
# the same transform to the test frame.
scaler = StandardScaler()
# Every column except the trailing label column.
feature_cols = list(train_data.columns[:-1])
# Assign by column label so each column is replaced by its float result;
# writing floats through `.iloc[:, :-1] = ...` into integer columns is an
# incompatible-dtype in-place assignment that newer pandas rejects.
train_data[feature_cols] = scaler.fit_transform(train_data[feature_cols])
test_data[feature_cols] = scaler.transform(test_data[feature_cols])

In [None]:
# Display the scaled training frame.
train_data

In [None]:
# Pairwise Pearson correlation of the training frame (features and label).
train_data.corr()

In [None]:
## Annotated heatmap of the training-frame correlation matrix.

plt.figure(figsize=(20, 10))
sb.heatmap(train_data.corr(), annot=True, annot_kws={'size':14})

As we can see, there is not much multicollinearity in the data.

In [None]:
## Feature-importance check with an ExtraTrees classifier

from sklearn.ensemble import ExtraTreesClassifier
# The forest is randomised; a fixed random_state keeps the importances (and the
# bar chart built from them) identical from run to run.
check_feat = ExtraTreesClassifier(random_state=42)
# Features = all columns but the last, label = last column of the training frame.
check_feat.fit(train_data.iloc[:,:-1], train_data.iloc[:, -1])

In [None]:
# Print the raw importances, then plot them as a bar chart labelled with the
# feature column names.
print(check_feat.feature_importances_)
feat_imp = pd.Series(check_feat.feature_importances_, index=train_data.columns[:-1])
feat_imp.plot(kind='bar')
plt.show()

In [None]:
# Final modelling arrays, taken from the scaled frames (label = last column).
# NOTE(review): this rebinds `y_train` (and `y_test`) that the earlier
# train_test_split cell created; `y_train` now holds the resampled labels, so
# re-running the SMOTE cell after this one would pair X_train_raw with them.
X_train = train_data.iloc[:, :-1]
y_train = train_data.iloc[:, -1]

X_test = test_data.iloc[:, :-1]
y_test = test_data.iloc[:, -1]

# Sanity check of the resulting shapes.
print(X_train.shape, y_train.shape, '\n', X_test.shape, y_test.shape)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, classification_report

In [None]:
# Candidate classifiers as (short name, estimator) pairs, all with default
# hyper-parameters; the order here is the order used in the comparison below.
models = [
    ('LR', LogisticRegression()),
    ('KNN', KNeighborsClassifier()),
    ('SVC', SVC()),
    ('GNB', GaussianNB()),
    ('DT', DecisionTreeClassifier()),
    ('BC', BaggingClassifier()),
    ('RF', RandomForestClassifier()),
    ('ABC', AdaBoostClassifier()),
    ('GBC', GradientBoostingClassifier()),
    ('XGBC', XGBClassifier()),
    ('MLPC', MLPClassifier()),
]

In [None]:
# Cross-validated recall of every candidate model.
names = []
results = []
score = 'recall'
# One seeded splitter shared by all models: every candidate is scored on
# exactly the same five folds, so the comparison is like-for-like and
# reproducible. (An unseeded splitter built per model gives each model
# different folds.)
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for name, model in models:
    # SMOTE and scaling live inside the pipeline so they are re-fitted on the
    # training part of each fold only.
    pipeline = imbpipeline(steps=[('smote', SMOTE(random_state=42)),
                                  ('scaler', StandardScaler()),
                                  ('model', model)])
    # NOTE(review): X and y are the full data set, so the rows later used as
    # the hold-out test set also take part in this model selection.
    cv_result = cross_val_score(pipeline, X, y, scoring=score, cv=kfold)
    names.append(name)
    results.append(cv_result)
    msg = '%s: %f (%f)'%(name, cv_result.mean(), cv_result.std())
    print(msg, '\n', cv_result, '\n')

In [None]:
# Box plot of the per-fold scores, one box per model, labelled with the model's
# short name.
fig = plt.figure()
ax = fig.add_subplot(111)
fig.suptitle('Algorithm Comparison (recall score)')
ax.boxplot(results)
ax.set_xticklabels(names)
plt.show()

In [None]:
# Baseline Gaussian Naive Bayes fitted on the training arrays; the last line
# shows its score on that same training data.
model_gnb = GaussianNB()
model_gnb.fit(X_train, y_train)
model_gnb.score(X_train, y_train)

In [None]:
# Predict the hold-out set and show the baseline model's score on it.
y_pred = model_gnb.predict(X_test)
model_gnb.score(X_test, y_test)

In [None]:
## Recall of the baseline model on the hold-out set.
recall_score(y_test, y_pred)

In [None]:
# Per-class precision/recall/F1 of the baseline model on the hold-out set.
print(classification_report(y_test, y_pred))

In [None]:
# Confusion matrix of the baseline model on the hold-out set.
confusion_matrix(y_test, y_pred)

In [None]:
from sklearn.model_selection import GridSearchCV

# Search var_smoothing over 100 log-spaced values from 1 down to 1e-9.
param = {
    'var_smoothing':np.logspace(0, -9, num=100)
}

model = GaussianNB()
# Seeded splitter: the folds, and therefore best_params_ / best_score_, are the
# same on every run instead of changing with each kernel restart.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# No `scoring` is passed, so candidates are ranked by the estimator's own
# score method.
grid_cv = GridSearchCV(model, param_grid=param, verbose=1, cv=kfold, n_jobs=-1)

grid_cv.fit(X_train, y_train)

In [None]:
# Mean cross-validated score of the best parameter setting.
grid_cv.best_score_

In [None]:
# Parameter setting that achieved the best cross-validated score.
grid_cv.best_params_

In [None]:
# Tuned Gaussian Naive Bayes. The hyper-parameters are read from the grid
# search itself rather than copied in by hand, so the model always matches the
# search result even when the search is re-run and picks a different value.
model_gnb_tnd = GaussianNB(**grid_cv.best_params_)
model_gnb_tnd.fit(X_train, y_train)
# Score on the training data.
model_gnb_tnd.score(X_train, y_train)

In [None]:
## Predict the hold-out set with the tuned model.

y_pred_tnd = model_gnb_tnd.predict(X_test)

In [None]:
# Per-class precision/recall/F1 of the tuned model on the hold-out set.
print(classification_report(y_test, y_pred_tnd))

In [None]:
# Confusion matrix of the tuned model on the hold-out set.
confusion_matrix(y_test, y_pred_tnd)