**DATA DICTIONARY**
* Age: Age of the patient [years]
* Sex: Sex of the patient [M: Male, F: Female]
* ChestPainType: [TA: Typical Angina, ATA: Atypical Angina, NAP: Non-Anginal Pain, ASY: Asymptomatic]
* RestingBP: Resting blood pressure [mm Hg]
* Cholesterol: Serum cholesterol [mg/dl]
* FastingBS: Fasting blood sugar [1: if FastingBS > 120 mg/dl, 0: otherwise]
* RestingECG: Resting electrocardiogram results [Normal: Normal, ST: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV), LVH: showing probable or definite left ventricular hypertrophy by Estes' criteria]
* MaxHR: Maximum heart rate achieved [Numeric value between 60 and 202]
* ExerciseAngina: Exercise-induced angina [Y: Yes, N: No]
* Oldpeak: ST depression induced by exercise relative to rest [Numeric value]
* ST_Slope: The slope of the peak exercise ST segment [Up: upsloping, Flat: flat, Down: downsloping]
* HeartDisease: Output class [1: heart disease, 0: Normal]

# Exploratory Data Analysis




In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import graphviz
import matplotlib.pyplot as plt
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score,StratifiedKFold,train_test_split,cross_validate,GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score,accuracy_score,confusion_matrix,classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier,export_graphviz
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier,plot_tree

# IPython magic: render matplotlib figures inside the notebook output cells.
%matplotlib inline
# Apply the 'fivethirtyeight' style sheet to every figure created below.
plt.style.use('fivethirtyeight')

In [2]:
# Load the dataset into `df`. The relative path looks like the Kaggle
# `../input/<dataset>/` layout -- adjust it when running elsewhere.
df = pd.read_csv('../input/heart-failure-prediction/heart.csv')

In [3]:
# Preview the first rows of the frame to check that it loaded as expected.
df.head()

In [4]:
# (number of rows, number of columns) of the frame.
df.shape

In [5]:
# Print the per-column dtype and non-null count summary.
df.info()

In [6]:
# Count the missing values in each column.
df.isnull().sum()

In [7]:
# Count the rows that are exact duplicates of an earlier row.
df.duplicated().sum()

In [8]:
# Class balance of the target: share (in percent) and absolute count per class.
hd_counts = df.HeartDisease.value_counts()
hd_shares = df.HeartDisease.value_counts(normalize=True)
pct_positive = round(hd_shares[1] * 100, 2)
pct_negative = round(hd_shares[0] * 100, 2)
print(
    f"Percentage of patient had a HeartDisease:  {pct_positive} %  --> ({hd_counts[1]} patient)\n"
    f"Percentage of patient did not have a HeartDisease: {pct_negative} %  --> ({hd_counts[0]} patient)"
)

In [9]:
# Age distribution: normalised histogram with a KDE overlay and quartile markers.
fig, ax = plt.subplots(figsize=(16, 8))

# Draw both layers on the explicit axes instead of relying on the "current axes".
df['Age'].plot(kind='hist', alpha=0.5, density=True, bins=20, ax=ax)
df['Age'].plot(kind='kde', ax=ax)

ax.set_xlabel('AGE')
# density=True and the KDE both put probability density (not counts) on the y-axis.
ax.set_ylabel("Density")
ax.set_title('Age Distribution', size=20, pad=10)
ax.set_xlim(25, 80)

# One quantile call returns the three quartiles in the requested order.
quant25, quant50, quant75 = df['Age'].quantile([0.25, 0.5, 0.75])

# Each entry: [x position, line alpha, line height as a fraction of the axes height].
quants = [[quant25, 0.8, 0.2], [quant50, 0.95, 0.4], [quant75, 0.8, 0.6]]
for x_pos, line_alpha, height in quants:
    ax.axvline(x=x_pos, alpha=line_alpha, ymax=height, linestyle=':', color='black')

# Label positions are given in data coordinates (age on x, density on y).
ax.text(quant25, 0.012, "25th", size=22, alpha=0.85)
ax.text(quant50, 0.025, "50th", size=25, alpha=1)
ax.text(quant75, 0.035, "75th", size=22, alpha=0.85)

ax.grid(False)
plt.show()

In [10]:
# 2x2 grid of kernel-density curves for the continuous measurements.
plt.figure(figsize=(15, 15))

# (column, line colour); None keeps the active style's default colour.
density_panels = [
    ('RestingBP', '#007482'),
    ('Cholesterol', 'orange'),
    ('MaxHR', 'red'),
    ('Oldpeak', None),
]

for position, (column, colour) in enumerate(density_panels, start=1):
    plt.subplot(2, 2, position)
    # Only forward `color` when one was chosen, so the default cycle still applies.
    line_style = {} if colour is None else {'color': colour}
    df[column].plot(kind='density', **line_style)
    plt.xlabel(column)
    # A KDE curve shows probability density, not a frequency count.
    plt.ylabel("Density")
    plt.grid(False)

plt.show()

In [11]:
# For every categorical feature draw two adjacent panels in a 3x4 grid:
# a count plot split by HeartDisease, then a pie chart of the category shares.
plt.figure(figsize=(15, 15))

categorical_cols = ['Sex', 'ChestPainType', 'RestingECG', 'ST_Slope', 'FastingBS', 'ExerciseAngina']
# Count-plot titles default to the column name; 'Sex' keeps its upper-case title.
count_titles = {'Sex': 'SEX'}

for pair_idx, column in enumerate(categorical_cols):
    # Odd grid slots hold the count plot. Keyword arguments are required here:
    # seaborn deprecated positional x in 0.11 and reserves the first positional
    # slot for `data` from 0.12 onwards.
    plt.subplot(3, 4, 2 * pair_idx + 1)
    sns.countplot(data=df, x=column, hue='HeartDisease')
    plt.title(count_titles.get(column, column))

    # Even grid slots hold the pie; size `explode` from the data so the call
    # does not break if a column has a different number of categories.
    plt.subplot(3, 4, 2 * pair_idx + 2)
    shares = df[column].value_counts()
    plt.pie(shares, explode=[0.1] * len(shares), autopct='%0.2f%%', shadow=True, labels=shares.index)
    plt.title(column)
    plt.axis('equal')

plt.show()

In [12]:
# Grid of pairwise relationships between columns, coloured by the target class.
pair_grid = sns.pairplot(data=df, hue='HeartDisease')
plt.show()

In [13]:
# Correlation matrix of the numeric columns only. Calling DataFrame.corr() on
# the raw frame raises in pandas >= 2.0 because the string-valued columns
# cannot be converted to float.
matrix = df.select_dtypes(include='number').corr()
# Mask the upper triangle (diagonal included) so each pair is shown once.
mask = np.triu(np.ones_like(matrix, dtype='bool'))

plt.figure(figsize=(22, 12))
sns.heatmap(matrix, mask=mask, annot=True, center=0, fmt='.2f', square=True, cmap="coolwarm")
plt.show()

## Preprocessing for Tree-Based Models

In [14]:
# Integer-encode only the string-valued columns for the tree-based models.
# Applying LabelEncoder to every column would also replace genuine numeric
# measurements (Age, Cholesterol, Oldpeak, ...) with ordinal ranks, losing their
# real values in the feature matrix and in any tree visualisation.
df_tree = df.copy()
for cat_col in [col for col in df_tree.columns if df_tree[col].dtype == 'object']:
    # A fresh encoder per column: each column gets its own 0..k-1 code mapping.
    df_tree[cat_col] = LabelEncoder().fit_transform(df_tree[cat_col])
df_tree.head()

In [15]:
# Features / target for the tree-based models.
X_tree, y_tree = df_tree.drop('HeartDisease', axis=1), df_tree['HeartDisease']
# 70/30 hold-out split. A fixed seed makes the notebook reproducible, and
# stratifying on the target keeps the class ratio equal in both parts.
xtree_train, xtree_test, ytree_train, ytree_test = train_test_split(
    X_tree, y_tree, test_size=0.3, stratify=y_tree, random_state=42
)

## Preprocessing for Non-Tree-Based Models

In [16]:
# One-hot encode the string-valued columns for the scale-sensitive models,
# keeping every dummy level (no reference category is dropped).
str_col = list(df.columns[df.dtypes == 'object'])
df_nontree = pd.get_dummies(df, columns=str_col, drop_first=False)
df_nontree.head()

In [17]:
# Features / target for the non-tree models (one-hot encoded frame).
X_nontree, y_nontree = df_nontree.drop('HeartDisease', axis=1), df_nontree['HeartDisease']
# 70/30 hold-out split. A fixed seed makes the notebook reproducible, and
# stratifying on the target keeps the class ratio equal in both parts.
xnontree_train, xnontree_test, ynontree_train, ynontree_test = train_test_split(
    X_nontree, y_nontree, test_size=0.3, stratify=y_nontree, random_state=42
)

# Training Our Machine Learning Models

### NON-TREE BASED ALGORITHMS

***Logistic Regression***

In [18]:
# Logistic regression on standardised features, scored with 5-fold stratified
# cross-validation on the one-hot encoded training split.
kf = StratifiedKFold(n_splits=5)
log = make_pipeline(StandardScaler(), LogisticRegression())
log_metrics = ['accuracy', 'roc_auc', 'recall', 'precision']
log_score = cross_validate(log, xnontree_train, ynontree_train, cv=kf, scoring=log_metrics)
log_score

***SVM***

In [19]:
# Support vector classifier on standardised features; the kernel is chosen by
# an inner grid search.
kf = StratifiedKFold(n_splits=5)
svc = make_pipeline(StandardScaler(), SVC())
svc_para_grid = {'svc__kernel': ['linear', 'poly', 'rbf']}
svc_grid = GridSearchCV(svc, param_grid=svc_para_grid, cv=kf, scoring='accuracy',
                        return_train_score=True, refit=True)
# Fit once on the training split so best_params_ / cv_results_ can be inspected.
svc_grid.fit(X=xnontree_train, y=ynontree_train)
# Score on the same one-hot encoded data the grid was tuned on. The previous
# call used the label-encoded tree split, so the SVC was evaluated on a
# different feature representation than the one prepared for it.
svc_score = cross_validate(svc_grid, X=xnontree_train, y=ynontree_train, cv=kf,
                           scoring=['accuracy', 'recall', 'precision', 'roc_auc'])
svc_score

***K-nearest Neighbors***

In [20]:
# K-nearest neighbours on standardised features; the neighbourhood size is
# chosen by an inner grid search.
kf = StratifiedKFold(n_splits=5)
kn = make_pipeline(StandardScaler(), KNeighborsClassifier())
kn_para_grid = {'kneighborsclassifier__n_neighbors': [4, 8, 16, 32, 64, 128]}
kn_grid = GridSearchCV(kn, param_grid=kn_para_grid, cv=kf, scoring='accuracy',
                       return_train_score=True, refit=True)
# Fit once on the training split so best_params_ / cv_results_ can be inspected.
kn_grid.fit(X=xnontree_train, y=ynontree_train)
# Score on the same one-hot encoded data the grid was tuned on. The previous
# call used the label-encoded tree split, where the arbitrary integer category
# codes distort the distances KNN relies on.
kn_score = cross_validate(kn_grid, X=xnontree_train, y=ynontree_train, cv=kf,
                          scoring=['accuracy', 'recall', 'precision', 'roc_auc'])
kn_score

### TREE BASED ALGORITHMS


***Decision Tree Classifier***

In [21]:
# Single decision tree with default settings, scored with 5-fold stratified
# cross-validation on the label-encoded training split.
kf = StratifiedKFold(n_splits=5)
dtree = DecisionTreeClassifier()
dtree_score = cross_validate(
    dtree,
    xtree_train,
    ytree_train,
    cv=kf,
    scoring=['accuracy', 'recall', 'precision', 'roc_auc'],
)
dtree_score

In [22]:
# Fit the tree on the full training split and render it through Graphviz.
dtree.fit(xtree_train, ytree_train)
dot_data = export_graphviz(
    dtree,
    out_file=None,  # return the DOT source as a string instead of writing a file
    feature_names=list(xtree_train.columns),
    class_names=['0', '1'],
    filled=True,
)
graph = graphviz.Source(dot_data, format='png')
graph

***Random Forest Classifier***

In [23]:
kf = StratifiedKFold(n_splits=5)
rfor = RandomForestClassifier(n_estimators=200)
rfor_score = cross_validate(rfor,X=xtree_train,y=ytree_train,cv=kf,scoring=['accuracy','recall','precision','roc_auc'])
rfor_score

In [24]:
# Feature importance of the random forest.
# cross_validate scores clones of the estimator, so fit `rfor` itself here
# before reading feature_importances_.
rfor.fit(xtree_train, ytree_train)
importance = pd.Series(rfor.feature_importances_, index=xtree_train.columns).sort_values()

# Horizontal bars, sorted ascending so the most important feature is on top.
ax = importance.plot.barh(color='#007482', fontsize=15)
ax.set_title('Feature importance')

# With horizontal bars the feature names sit on the y-axis.
ax.set_ylabel('Features', color='g', fontsize='18')

# Figure size as (width, height) in inches.
ax.figure.set_size_inches(20, 8)

plt.show()

***XGBoost***

In [25]:
# Gradient-boosted trees (XGBoost defaults), scored with 5-fold stratified
# cross-validation on the label-encoded training split.
kf = StratifiedKFold(n_splits=5)
xgb = XGBClassifier()
xgb_score = cross_validate(
    xgb,
    xtree_train,
    ytree_train,
    cv=kf,
    scoring=['accuracy', 'recall', 'precision', 'roc_auc'],
)
xgb_score

In [26]:
# Fit the booster on the training split, then draw its first tree (index 0)
# laid out left-to-right on a large canvas.
xgb.fit(xtree_train, ytree_train)
fig, ax = plt.subplots(figsize=(30, 30))
plot_tree(xgb, num_trees=0, rankdir="LR", ax=ax)
plt.show()

In [27]:
# Collapse each cross-validation result (one value per fold) to its per-metric
# mean. The *_score names are rebound to the resulting Series.
log_score, svc_score, kn_score, dtree_score, rfor_score, xgb_score = (
    pd.DataFrame(cv_result).mean()
    for cv_result in (log_score, svc_score, kn_score, dtree_score, rfor_score, xgb_score)
)

# One row per model, one column per averaged metric / timing.
model_names = ['LogisticRegression', 'SVC', 'KNN', 'DTree', 'RForest', 'XGBClassifier']
mean_scores = [log_score, svc_score, kn_score, dtree_score, rfor_score, xgb_score]
models_score = pd.DataFrame(data=mean_scores, index=model_names)
models_score