In [2]:
#Import relevant libraries

import numpy as np 
import pandas as pd 
import os
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px

from sklearn.preprocessing import OneHotEncoder,OrdinalEncoder,StandardScaler,MinMaxScaler,PowerTransformer,FunctionTransformer
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.metrics import confusion_matrix,classification_report,ConfusionMatrixDisplay
from sklearn.linear_model import LogisticRegression,RidgeClassifier,LassoCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, plot_confusion_matrix


# Print the full path of every file found below the Kaggle input directory
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

<h1 id="heading"><span style="font-size:36px;">Load Data:</span></h1>

In [3]:
#Read the heart-failure csv file into a dataframe object
csv_path='/kaggle/input/heart-failure-prediction/heart.csv'
data=pd.read_csv(csv_path)

<span style="font-size:18px;">Display first 5 rows and column info</span>

In [4]:
#Show overview of data structure
display(data.head())

#Look for missing values
#info() prints its report itself and returns None, so it must not be wrapped
#in print() (that only appends a stray "None" line to the output)
data.info()

There are no missing values in the dataset

<span style="font-size:36px;">EDA:</span>

<span style="font-size:18px;">Separate Numerical and Categorical Columns</span>
* This is helpful in plotting and analyzing the two types of data separately

In [5]:
#Collect the low cardinality (< 10 distinct values) object columns and the
#int64/float64 columns in two separate lists, preserving column order
cat_col = []
num_col = []
for coln in data.columns:
    column = data[coln]
    if column.dtype == 'object' and column.nunique() < 10:
        cat_col.append(coln)
    if column.dtype in ['int64','float64']:
        num_col.append(coln)

In [6]:
#Keep independent copies of the numerical and the categorical columns
data_num=data.loc[:,num_col].copy()
data_cat=data.loc[:,cat_col].copy()

In [7]:
#Create function to show description of numerical data
def describedata(data):      #data === dataframe name
    """Print a list holding the ``describe()`` summary of every column of ``data``.

    Nothing is returned; the list of per-column summaries is only printed.
    """
    summaries=[data[name].describe() for name in data.columns.tolist()]
    print(summaries)
    
describedata(data_num)

* RestingBP and Cholesterol have nonphysical values (0's) which are outliers: Investigate further

In [8]:
#Show the rows holding nonphysical 0 values so we can decide how to resolve them
zero_checks=(('RestingBP rows which are 0','RestingBP'),
             ('Cholesterol rows which are 0','Cholesterol'))
for message,column_name in zero_checks:
    print(message)
    print(data_num[data_num[column_name]==0])

* We can remove index 449 as it contains both problems  
* Cholesterol has a large percentage of 0's (172/918 rows), we can try to remove them and see how it affects the accuracy score

In [9]:
#Drop the problem row and create new dataset without "Cholesterol" outliers
#errors='ignore' keeps this cell re-runnable: `data` is reassigned here, so on a
#second run label 449 no longer exists and a plain drop would raise KeyError
data=data.drop(index=[449],errors='ignore')
data_drop_cholesterol=data[data['Cholesterol']>0]



<span style="font-size:36px;">Numerical Plots and Correlations:</span>

In [10]:
#Overview of the correlations between all variables
#Only numeric/bool columns are passed on: DataFrame.corr() no longer skips the
#object-dtype columns silently and raises on them in pandas >= 2.0
sns.heatmap(data.select_dtypes(include=['number','bool']).corr(),annot=True,cmap="YlGnBu")

*Negative Correlations with Heart Disease*
* Cholesterol
* MaxHR

*Positive Correlations with Heart Disease*
* Oldpeak
* Age
* FastingBS
* RestingBP


In [11]:
#Full print of correlation matrix
#Restricted to numeric/bool columns because DataFrame.corr() raises on
#object-dtype columns in pandas >= 2.0
corr_matrix=data.select_dtypes(include=['number','bool']).corr()
print(corr_matrix)

In [12]:
#Selects highly correlated independent features
#For every pair of columns whose absolute correlation exceeds the threshold,
#the column that appears LATER in the frame is the one that gets reported

def correlation(df,threshold):
    """Return the names of columns that are strongly correlated with an earlier column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to analyse. Non-numeric columns are ignored.
    threshold : float
        Absolute correlation above which a pair counts as correlated.

    Returns
    -------
    set of str
        For each correlated pair, the name of the column positioned later in ``df``.
    """
    coll_corr=set() #set of all the names of correlated columns
    #Keep numeric/bool columns only: DataFrame.corr() raises on string columns
    #in pandas >= 2.0 instead of dropping them silently
    corr_matrix=df.select_dtypes(include=['number','bool']).corr()
    for i in range(len(corr_matrix.columns)):
        for j in range(i): #lower triangle only, so each pair is checked once
            if abs(corr_matrix.iloc[i,j])>threshold:
                coll_corr.add(corr_matrix.columns[i])
    return coll_corr

In [13]:
#Call the function on every column except the last one and count the
#correlated features it reports
feature_frame=data.iloc[:,:-1]
corr_feature=correlation(feature_frame,0.85)
len(corr_feature)

* There are zero highly correlated variables
* Let's see whether any variables are multicollinear using a lower threshold

In [14]:
#Detect multicollinearity with the variance inflation factor (VIF)
from statsmodels.stats.outliers_influence import variance_inflation_factor

#NOTE(review): data_num was copied before row 449 was dropped from `data`, and
#num_col is built from every int64/float64 column - confirm this is the frame
#the VIF is meant to be computed on
vif_values=[variance_inflation_factor(data_num.values,position)
            for position in range(len(data_num.columns))]
vif_data=pd.DataFrame({"feature":data_num.columns,"VIF":vif_values})

#Report every feature whose VIF exceeds 10
for feature_name,vif in zip(vif_data["feature"],vif_data["VIF"]):
    if vif>10.0:
        print(feature_name+" has a large VIF")

vif_data=vif_data.sort_values('VIF',ascending=False)
display(vif_data)

* Age, RestingBP and MaxHR are multicollinear, each with a large VIF (> 10)
* We can try to keep **MaxHR** and discard the others, as MaxHR has the highest correlation with the target variable

In [15]:
#New df without the 'Age' and 'RestingBP' features
data_drop_colinear=data.drop(columns=['Age','RestingBP'])

#Same df, additionally restricted to the rows that have a Cholesterol measurement (> 0)
has_cholesterol=data_drop_colinear['Cholesterol']>0
data_drop_both=data_drop_colinear[has_cholesterol]

<span style="font-size:36px;">Numerical Distributions by Target Variable</span>

In [16]:
#Create function to plot numerical distributions

def num_plot(data, col):
    """Show a histogram of ``col`` coloured by HeartDisease, with a marginal box plot.

    data : dataframe to plot
    col  : name of the feature to plot
    """
    figure = px.histogram(data, x=col, color="HeartDisease", marginal="box")
    figure.update_traces(marker_line_width=1, marker_line_color="black")
    figure.update_layout(height=300, width=500, showlegend=True)
    figure.show()


#Draw one figure per numerical column
for col in num_col:
    num_plot(data, col)

* There are skewed features which can be normalized: Oldpeak, MaxHR(maybe)

<span style="font-size:36px;">Analyze Categorical Data</span>

In [17]:
#Group the notebook-level `data` frame by a feature and return the mean of the
#target per group, as a percentage sorted from highest to lowest
def mean_groupby(Feature,TargetVariable):
    group_means=data.groupby(Feature)[TargetVariable].mean()
    ranked=group_means.sort_values(ascending=False)
    return ranked*100

#Histogram of a feature, coloured by the value of the target variable
def plot_hist_cat(Feature,TargetVariable):
    chart_title=Feature + " vs " + TargetVariable
    figure = px.histogram(data_frame = data,
                          x = Feature,
                          color=TargetVariable,
                          title=chart_title,
                          pattern_shape_sequence=['x'],
                          width=500,
                          height=300)
    figure.show()

#Print the group percentages and draw the histogram for each categorical column
for col in cat_col:
    print(mean_groupby(col,'HeartDisease'))
    print("")
    plot_hist_cat(col,'HeartDisease')



* Twice as many men get heart disease as women
* ASY ChestPain is strong indicator of heart disease
* Exercise Angina is strong indicator of heart disease
* Up slope is strong indicator of no heart disease 

In [18]:
#Find the skewed numerical columns (FastingBS is left out of the check)

skew_limit=0.75 # absolute skewness above which a column counts as skewed
skew_vals=data[num_col].drop(columns='FastingBS').skew()
is_skewed=skew_vals.abs()>skew_limit
skew_col=skew_vals[is_skewed].sort_values(ascending=False)
print('Skewed Feature:',skew_col)

Oldpeak is a heavily skewed feature that can be renormalized

<span style="font-size:36px;">Split into Train/Test Sets</span>

In [19]:
#Separate the target (y) from the features (X) for every dataset variant
y=data['HeartDisease']
X=data.drop(columns=['HeartDisease'])

#variant: cholesterol=0 rows dropped
y_dropchol=data_drop_cholesterol['HeartDisease']
X_dropchol=data_drop_cholesterol.drop(columns=['HeartDisease'])

#variant: multicollinear columns dropped
X_dropcolin=data_drop_colinear.drop(columns=['HeartDisease'])

#variant: cholesterol=0 rows and multicollinear columns dropped
X_dropboth=data_drop_both.drop(columns=['HeartDisease'])

#Create function to split into training and test data
def splitdata(data,y):
    """Split features and labels 80/20 with a fixed seed (random_state=0).

    Returns the tuple (X_train, X_test, y_train, y_test).
    """
    parts=train_test_split(data,y,train_size=0.8,test_size=0.2,random_state=0)
    return tuple(parts)

In [20]:
#Split every dataset variant into its train and test parts
#all rows, all features
X_train_all, X_test_all, y_train, y_test = splitdata(X, y)
#cholesterol=0 rows dropped
X_train_chol, X_test_chol, y_train_chol, y_test_chol = splitdata(X_dropchol, y_dropchol)
#multicollinear columns dropped
X_train_colin, X_test_colin, y_train_colin, y_test_colin = splitdata(X_dropcolin, y)
#cholesterol=0 rows and multicollinear columns dropped
X_train_both, X_test_both, y_train_both, y_test_both = splitdata(X_dropboth, y_dropchol)

<span style="font-size:36px;">Preprocess Data</span>

Fix skewed feature

In [21]:
#Inputs for the skew fix (power transform to make data more gaussian like)
#Frames are stored as consecutive (train, test) pairs, one pair per dataset variant
datalist=[
    X_train_all,X_test_all,
    X_train_chol,X_test_chol,
    X_train_colin,X_test_colin,
    X_train_both,X_test_both,
]
#The same categorical column list is used for every entry of datalist
cat_col_list=[cat_col]*len(datalist)
skewfeature=['Oldpeak']

def fixskew(skewfeature,datalist):
    """Power-transform the skewed columns of consecutive (train, test) pairs.

    Parameters
    ----------
    skewfeature : list of str
        Columns to transform.
    datalist : list of DataFrame
        Frames ordered as [train_0, test_0, train_1, test_1, ...].

    Returns
    -------
    list of DataFrame
        Transformed copies in the same order. The transformer is fitted on each
        train frame and then applied to its matching test frame.
    """
    datalistnew=[]
    for i in range(0,len(datalist)-1,2):
        pt=PowerTransformer(standardize=False)
        #Work on copies: writing into the caller's frames would transform
        #'Oldpeak' a second time whenever this cell is re-run, and assigning
        #into the slices returned by train_test_split triggers pandas'
        #chained-assignment warning
        tempdata=datalist[i].copy()
        tempdata1=datalist[i+1].copy()
        tempdata[skewfeature]=pt.fit_transform(tempdata[skewfeature])
        tempdata1[skewfeature]=pt.transform(tempdata1[skewfeature])
        datalistnew.append(tempdata)
        datalistnew.append(tempdata1)
    return datalistnew


datalistnew=fixskew(skewfeature,datalist) #contains all split data with fixed skew


    

Encode Categorical Data

In [22]:
#Function to encode categorical data into columns of 1's and 0's
def EncodeCat(data_list,cat_col_list):
    """One-hot encode the categorical columns of consecutive (train, test) pairs.

    Parameters
    ----------
    data_list : list of DataFrame
        Frames ordered as [train_0, test_0, train_1, test_1, ...].
    cat_col_list : list of list of str
        ``cat_col_list[i]`` names the columns to encode for the pair that
        starts at position ``i`` of ``data_list``.

    Returns
    -------
    list of DataFrame
        Encoded frames in the same order. Each test frame has exactly the
        columns of its train frame, in the same order.
    """
    encode_data=[]
    for i in range(0,len(data_list)-1,2):
        tempcol=cat_col_list[i]
        temptrain=pd.get_dummies(data_list[i],columns=tempcol,drop_first=True)
        #Encode every level of the test frame and then align it to the train
        #layout. Using drop_first on the test frame separately would drop
        #whichever level happens to sort first in the test data, which differs
        #from the level dropped in train whenever a category is missing from
        #the test split.
        temptest=pd.get_dummies(data_list[i+1],columns=tempcol)
        temptest=temptest.reindex(columns=temptrain.columns,fill_value=0)
        encode_data.append(temptrain)
        encode_data.append(temptest)
    return encode_data



In [23]:
#Create list of encoded data
encode_data_list = EncodeCat(datalistnew,cat_col_list)

Scale all data columns with standard scaler
* Centers data based on the mean
* Normalizes data with standard deviation

In [24]:
#Use standard scaler to normalize data
def ScalerFunc(datalist):  #returns list of processed data
    """Standardize consecutive (train, test) pairs of frames.

    Parameters
    ----------
    datalist : list of DataFrame
        Frames ordered as [train_0, test_0, train_1, test_1, ...].

    Returns
    -------
    list of DataFrame
        Scaled frames in the same order. The scaler is fitted on each train
        frame and applied to its matching test frame.
    """
    data_prefinal=[]
    for i in range(0,len(datalist)-1,2):
        sc=StandardScaler()
        tempdatatrain=datalist[i]
        tempdatatest=datalist[i+1]
        X_train_scaled=sc.fit_transform(tempdatatrain)
        X_test_scaled=sc.transform(tempdatatest)
        #Carry the original row index over: a default RangeIndex would no
        #longer line up with the label Series that come from the same split
        X_train=pd.DataFrame(X_train_scaled,columns=tempdatatrain.columns,
                             index=tempdatatrain.index)
        X_test=pd.DataFrame(X_test_scaled,columns=tempdatatest.columns,
                            index=tempdatatest.index)
        data_prefinal.append(X_train)
        data_prefinal.append(X_test)
    return data_prefinal
 
#list of processed data
data_prefinal=ScalerFunc(encode_data_list)

In [25]:
#Labels in the same [train, test, ...] order as the processed feature frames.
#Each dataset variant uses the labels produced by its own split rather than
#borrowing another variant's labels (which only matched because every split
#shares the same random_state and row count).
y_list = [y_train,y_test,y_train_chol,y_test_chol,y_train_colin,y_test_colin,y_train_both,y_test_both]

def models_score(models,datalist,y_list):
    """Fit and evaluate every model on every (train, test) dataset pair.

    Parameters
    ----------
    models : dict
        Maps a display name to an estimator offering ``fit`` and ``predict``.
    datalist : list of DataFrame
        Feature frames ordered as [train_0, test_0, train_1, test_1, ...].
    y_list : list
        Labels in the same order as ``datalist``.

    Returns
    -------
    list of DataFrame
        One frame per dataset pair, indexed by model name, with a single
        'Score' column (test accuracy) sorted from best to worst.

    A report (accuracy, classification report, confusion matrix) is printed
    for every model/dataset combination. The estimators in ``models`` are
    refitted in place for each dataset.
    """
    model_list=[]
    for i in range(0,len(datalist)-1,2):
        X_train=datalist[i]
        X_test=datalist[i+1]
        y_train=y_list[i]
        y_test=y_list[i+1]
        #three empty lines separate the reports of consecutive datasets
        print("")
        print("")
        print("")
        #fresh dict per dataset so no score can leak over from the previous one
        scores={}
        for name, model in models.items():
            model.fit(X_train, y_train)
            y_pred=model.predict(X_test)
            #accuracy is computed once from the predictions we already have
            #instead of predicting a second time through model.score()
            accuracy=accuracy_score(y_test,y_pred)
            scores[name]=accuracy
            print("Model name: ",model)
            print("Accuracy :--->>",accuracy)
            print()
            print(classification_report(y_test,y_pred))
            print("Confusion matrix:--->>\n",confusion_matrix(y_test,y_pred))
            print("\n<<<<------------------------------------------------------------->>>>\n")

        model_scores=pd.DataFrame(scores, index=['Score']).transpose()
        model_scores=model_scores.sort_values('Score',ascending=False)
        model_list.append(model_scores)
    return model_list

In [26]:
#Dictionary of models to test
#* max_features='sqrt' replaces 'auto': for DecisionTreeClassifier 'auto' was an
#  alias of 'sqrt' and the 'auto' option has been removed from scikit-learn
#* random_state=0 (same seed as the train/test split) is set on the randomized
#  estimators so the reported scores can be reproduced on a re-run
models = {"LogisticRegression":LogisticRegression(solver="liblinear"),
          "SVC":SVC(C= 1.0,gamma= 0.05,kernel='rbf'),
          "DecisionTree":DecisionTreeClassifier(criterion='gini',max_depth=5,max_features='sqrt',splitter='random',random_state=0),
          "AdaBoost":AdaBoostClassifier(algorithm='SAMME',learning_rate= 0.1,n_estimators=150,random_state=0),
          "GradiantBoost":GradientBoostingClassifier(criterion='friedman_mse',learning_rate=0.05,loss='exponential',n_estimators=100,random_state=0),
         "RandomForest":RandomForestClassifier(criterion='gini',n_estimators=150,random_state=0),
         "XgBoost": XGBClassifier(learning_rate=0.1,n_estimators=150),
         "KNeighborsClassifier":KNeighborsClassifier(n_neighbors=7)}

In [27]:
#Run all models on various datasets and print results
model_scores = models_score(models,data_prefinal,y_list)

Prints accuracy score of each model set

In [28]:
def _as_score_table(score_frame):
    """Return ``score_frame`` with its model-name index moved into an 'Algorithms' column."""
    return score_frame.reset_index().rename({"index":"Algorithms"}, axis = 1)

#One table per dataset variant, in the order the variants were scored.
#The fourth table is now built from (and displays) model_scores[3]; it was
#previously derived from temp_model_scores2 by a copy-paste slip, so the
#third table was shown twice.
temp_model_scores=_as_score_table(model_scores[0])
temp_model_scores1=_as_score_table(model_scores[1])
temp_model_scores2=_as_score_table(model_scores[2])
temp_model_scores3=_as_score_table(model_scores[3])

#The former `.style.bar()` calls were dropped: they returned a Styler that was
#discarded, so they never affected what display() rendered.
score_tables=(
    ("Model which includes all training elements",temp_model_scores),
    ("Model which excludes empty 'Cholesterol' rows",temp_model_scores1),
    ("Model which excludes colinear training variables",temp_model_scores2),
    ("Model which excludes colinear training variables and empty cholesterol values",temp_model_scores3),
)
for caption,score_table in score_tables:
    print(caption)
    display(score_table)
    print('')


* ADA boost on the dataset with Cholesterol=0 rows removed had best score ~91%
* The dataset with colinear features and Cholesterol=0 rows removed had best average score

In [31]:
#Bar chart of the accuracy score of each model for the dataset without Cholesterol 0's

fig = px.bar(
    temp_model_scores1,
    x="Algorithms",
    y="Score",
    color="Algorithms",
    title="<b>Models Score without Cholesterol 0's</b>",
    template='plotly_dark',
)
fig.update_layout(bargap=0.2)
fig.show()