# Prediction of heart disease

<b>Natthapol Jinavanich</b>

_Use Case identification:_ find any trends in heart data to predict certain cardiovascular events or find any clear indications of heart health.

Data Source: https://www.kaggle.com/ronitf/heart-disease-uci (Initial Data Source: https://archive.ics.uci.edu/ml/datasets/Heart+Disease)

Data Attribute Information:

- age
- sex
- chest pain type (4 values)
- resting blood pressure
- serum cholesterol in mg/dl
- fasting blood sugar > 120 mg/dl
- resting electrocardiographic results (values 0,1,2)
- maximum heart rate achieved
- exercise induced angina
- oldpeak = ST depression induced by exercise relative to rest
- the slope of the peak exercise ST segment
- number of major vessels (0-3) colored by fluoroscopy
- thal: 3 = normal; 6 = fixed defect; 7 = reversible defect

In [None]:
import sys
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import scipy.stats as st
%matplotlib inline
import types
from botocore.client import Config
import ibm_boto3

# preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

# models
import sklearn
from sklearn.linear_model import LogisticRegression, Perceptron, RidgeClassifier, SGDClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
import xgboost as xgb
from xgboost import XGBClassifier

# NN models
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras import optimizers
from keras.wrappers.scikit_learn import KerasClassifier
from keras.callbacks import EarlyStopping, ModelCheckpoint

# Silence FutureWarning messages for the rest of the session
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)

# Report the interpreter and library versions in use
print('Python:', sys.version)
print('Pandas:', pd.__version__)
print('Numpy:', np.__version__)
print('Sklearn:', sklearn.__version__)
print('Matplotlib:', matplotlib.__version__)
print('Keras:', keras.__version__)

In [None]:
# Load the heart-disease records from the local CSV and preview the first rows
df = pd.read_csv(filepath_or_buffer="heart.csv")
df.head(n=5)

##  Data Exploration

In [None]:
# Data quality check.
# A notebook cell only echoes its last expression, so the per-column
# missing-value counts are printed explicitly; otherwise they are discarded.
print(df.isnull().sum())
# Column dtypes and non-null counts (info() prints its own report)
df.info()

In [None]:
# Only the last expression of a cell is echoed, so the intermediate
# results are printed explicitly to make all three checks visible.

# Number of distinct values in the target variable
print(df['target'].nunique())
# Count of records for each target value
print(df['target'].value_counts())
# Distribution of the target values within each sex
df.groupby('sex')['target'].value_counts()

In [None]:
# One histogram per column, drawn on a single 12x12-inch figure
df.hist(figsize=(12, 12))
plt.show()

In [None]:
# Box plots of every discrete attribute against the target, one panel each.
# Each title is derived from the column actually plotted, so a panel and its
# label cannot drift apart (the hand-written titles named the wrong
# attributes, e.g. the 'ca' panel was labelled "Chest Pain").
discrete_columns = ['ca', 'cp', 'exang', 'fbs', 'restecg', 'sex', 'slope', 'thal']
fig, axes = plt.subplots(ncols=len(discrete_columns), figsize=(20, 3))
for ax, column in zip(axes, discrete_columns):
    sns.boxplot(x='target', y=column, data=df, palette='winter', ax=ax)
    ax.set_title('{} vs Target'.format(column))

In [None]:
# Remaining (non-discrete) attributes plus the target column
rest_data = ['age', 'chol', 'oldpeak', 'thalach', 'trestbps', 'target']
# Pairwise scatter plots with histograms on the diagonal
sns.pairplot(data=df[rest_data], kind='scatter', diag_kind='hist')
plt.show()

In [None]:
# Same pair plot, with points coloured by the target value
rest_data_vs_target = df.loc[:, rest_data]
sns.pairplot(data=rest_data_vs_target, hue="target")

In [None]:
# Pairwise correlation matrix of all columns; then list every attribute's
# correlation with the target, largest value first
correlation = df.corr()
correlation.loc[:, 'target'].sort_values(ascending=False)

In [None]:
# Annotated heatmap of the full correlation matrix
plt.figure(figsize=(16, 12))
plt.title('Correlation Heatmap of Heart Disease Dataset')
a = sns.heatmap(
    data=correlation,
    annot=True,
    fmt='.2f',
    cmap='YlGnBu',
    square=True,
    linecolor='green',
)
# Rotate the tick labels so the attribute names stay readable
a.set_xticklabels(a.get_xticklabels(), rotation=90)
a.set_yticklabels(a.get_yticklabels(), rotation=30)
plt.show()

## Modeling (Machine Learning)

In [None]:
# Separate the label column from the feature columns before modelling
target_name = 'target'
data = df.drop(columns=[target_name])
data_target = df[target_name]

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
train, test, target, target_test = train_test_split(
    data,
    data_target,
    test_size=0.2,
    random_state=0,
)

Logistic Regression

In [None]:
# Logistic Regression: fit on the training split and report training accuracy (%)
logreg = LogisticRegression().fit(train, target)
acc_log = round(100 * logreg.score(train, target), 2)
acc_log

In [None]:
# Accuracy (%) of the logistic-regression model on the held-out test split
acc_test_log = round(100 * logreg.score(test, target_test), 2)
acc_test_log

Support Vector Machines

In [None]:
# Support Vector Machine with default settings: fit and report training accuracy (%)
svc = SVC().fit(train, target)
acc_svc = round(100 * svc.score(train, target), 2)
acc_svc

In [None]:
# Accuracy (%) of the SVC model on the held-out test split
acc_test_svc = round(100 * svc.score(test, target_test), 2)
acc_test_svc

Linear Support Vector Machines (SVC)

In [None]:
# Linear SVC: fit and report training accuracy (%).
# dual=False is used because n_samples > n_features.
linear_svc = LinearSVC(dual=False).fit(train, target)
acc_linear_svc = round(100 * linear_svc.score(train, target), 2)
acc_linear_svc

In [None]:
# Accuracy (%) of the linear SVC model on the held-out test split
acc_test_linear_svc = round(100 * linear_svc.score(test, target_test), 2)
acc_test_linear_svc

k-Nearest Neighbors algorithm (KNN)

In [None]:
# k-Nearest Neighbors: choose n_neighbors from {2, 3} with a 10-fold grid search,
# then report training accuracy (%) together with the selected parameters
knn = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid={'n_neighbors': [2, 3]},
    cv=10,
)
knn.fit(train, target)
acc_knn = round(100 * knn.score(train, target), 2)
print(acc_knn, knn.best_params_)

In [None]:
# Accuracy (%) of the tuned KNN model on the held-out test split
acc_test_knn = round(100 * knn.score(test, target_test), 2)
acc_test_knn

Gaussian Naive Bayes

In [None]:
# Gaussian Naive Bayes: fit and report training accuracy (%)
gaussian = GaussianNB().fit(train, target)
acc_gaussian = round(100 * gaussian.score(train, target), 2)
acc_gaussian

In [None]:
# Accuracy (%) of the Gaussian Naive Bayes model on the held-out test split
acc_test_gaussian = round(100 * gaussian.score(test, target_test), 2)
acc_test_gaussian

Perceptron

In [None]:
# Perceptron with default settings: fit and report training accuracy (%)
perceptron = Perceptron().fit(train, target)
acc_perceptron = round(100 * perceptron.score(train, target), 2)
acc_perceptron

In [None]:
# Accuracy (%) of the perceptron on the held-out test split
acc_test_perceptron = round(100 * perceptron.score(test, target_test), 2)
acc_test_perceptron

Stochastic Gradient Descent

In [None]:
# Stochastic Gradient Descent classifier: fit and report training accuracy (%)
sgd = SGDClassifier().fit(train, target)
acc_sgd = round(100 * sgd.score(train, target), 2)
acc_sgd

In [None]:
# Accuracy (%) of the SGD classifier on the held-out test split.
# Score the `sgd` model itself; the previous code scored `perceptron`, so the
# SGD row of the comparison table duplicated the perceptron's test result.
acc_test_sgd = round(sgd.score(test, target_test) * 100, 2)
acc_test_sgd

Decision Tree Classifier

In [None]:
# Decision Tree with default settings: fit and report training accuracy (%)
decision_tree = DecisionTreeClassifier().fit(train, target)
acc_decision_tree = round(100 * decision_tree.score(train, target), 2)
acc_decision_tree

In [None]:
# Accuracy (%) of the decision tree on the held-out test split
acc_test_decision_tree = round(100 * decision_tree.score(test, target_test), 2)
acc_test_decision_tree

Random Forest

In [None]:
# Random Forest: choose n_estimators from {100, 300} with a 5-fold grid search,
# then report training accuracy (%) together with the selected parameters.
# The search is fitted exactly once; a second fit() would repeat the whole
# cross-validated search and throw away the first result.
random_forest = GridSearchCV(
    estimator=RandomForestClassifier(),
    param_grid={'n_estimators': [100, 300]},
    cv=5,
)
random_forest.fit(train, target)
acc_random_forest = round(random_forest.score(train, target) * 100, 2)
print(acc_random_forest, random_forest.best_params_)

In [None]:
# Accuracy (%) of the tuned random forest on the held-out test split
acc_test_random_forest = round(100 * random_forest.score(test, target_test), 2)
acc_test_random_forest

Ridge Classifier

In [None]:
# Ridge Classifier with default settings: fit and report training accuracy (%)
ridge_classifier = RidgeClassifier().fit(train, target)
acc_ridge_classifier = round(100 * ridge_classifier.score(train, target), 2)
acc_ridge_classifier

In [None]:
# Accuracy (%) of the ridge classifier on the held-out test split
acc_test_ridge_classifier = round(100 * ridge_classifier.score(test, target_test), 2)
acc_test_ridge_classifier

In [None]:
# Comparison table: one row per model with its training and test accuracy (%)
models = pd.DataFrame(
    [
        ('Logistic Regression', acc_log, acc_test_log),
        ('Support Vector Machines', acc_svc, acc_test_svc),
        ('Linear SVC', acc_linear_svc, acc_test_linear_svc),
        ('k-Nearest Neighbors', acc_knn, acc_test_knn),
        ('Naive Bayes', acc_gaussian, acc_test_gaussian),
        ('Perceptron', acc_perceptron, acc_test_perceptron),
        ('Stochastic Gradient Decent', acc_sgd, acc_test_sgd),
        ('Decision Tree Classifier', acc_decision_tree, acc_test_decision_tree),
        ('Random Forest', acc_random_forest, acc_test_random_forest),
        ('RidgeClassifier', acc_ridge_classifier, acc_test_ridge_classifier),
    ],
    columns=['Model', 'Score_train', 'Score_test'],
)

In [None]:
# Rank the models by test accuracy, breaking ties by training accuracy (both descending)
models.sort_values(by=['Score_test', 'Score_train'], ascending=[False, False])

## Modeling (Deep learning)

In [None]:
# Create the X (features) and y (labels) arrays for the neural networks
from sklearn import model_selection

# `axis` is passed by keyword: the positional form drop(labels, 1) is
# deprecated and raises a TypeError on pandas >= 2.0.
X = np.array(df.drop(['target'], axis=1))
y = np.array(df['target'])

# Hold out 20% of the rows for validation/testing
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.2)

In [None]:
# Convert the integer labels to one-hot (categorical) vectors.
# Import from the public `keras.utils` namespace: the private
# `keras.utils.np_utils` module is no longer available in newer Keras
# releases, while `keras.utils.to_categorical` exists in old and new ones.
from keras.utils import to_categorical

# num_classes=None lets Keras infer the class count from the labels
Y_train = to_categorical(y_train, num_classes=None)
Y_test = to_categorical(y_test, num_classes=None)
print(Y_train.shape)
print(Y_train[:10])

In [None]:
# Building and Training the Neural Network

from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam

# define a function to build the keras model
def create_model():
    """Build and compile the softmax classifier.

    The network takes 13 input features and stacks Dense(16, relu),
    Dense(8, relu) and a 2-unit softmax output. It is compiled with
    categorical cross-entropy, Adam (lr=0.001) and the accuracy metric.

    Returns:
        The compiled Sequential model.
    """
    network = Sequential([
        Dense(16, input_dim=13, kernel_initializer='normal', activation='relu'),
        Dense(8, kernel_initializer='normal', activation='relu'),
        Dense(2, activation='softmax'),
    ])
    network.compile(
        loss='categorical_crossentropy',
        optimizer=Adam(lr=0.001),
        metrics=['accuracy'],
    )
    return network

# Instantiate the categorical network and show its layer summary.
# summary() prints the table itself and returns None, so it is called
# directly; wrapping it in print() appended a stray "None" line.
model = create_model()

model.summary()

In [None]:
# Train the categorical model for 200 epochs in mini-batches of 10,
# evaluating on the held-out split after every epoch
history = model.fit(
    x=X_train,
    y=Y_train,
    batch_size=10,
    epochs=200,
    verbose=10,
    validation_data=(X_test, Y_test),
)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

# Model accuracy
plt.plot(history.history['acc'])
plt.plot(history.history['val_acc'])
plt.title('Model Accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'test'])
plt.show()

In [None]:
# Model loss per epoch: training curve first, validation curve second
for series in ('loss', 'val_loss'):
    plt.plot(history.history[series])
plt.title('Model Loss')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend(['train', 'test'])
plt.show()

In [None]:
# Collapse the labels to a binary problem (heart disease vs. no heart disease):
# work on copies so y_train / y_test stay untouched, and map every positive
# label to 1
Y_train_binary = y_train.copy()
Y_train_binary[y_train > 0] = 1

Y_test_binary = y_test.copy()
Y_test_binary[y_test > 0] = 1

print(Y_train_binary[:20])

In [None]:
# define a new keras model for binary classification
def create_binary_model():
    """Build and compile the sigmoid (binary) classifier.

    The network takes 13 input features and stacks Dense(16, relu),
    Dense(8, relu) and a single sigmoid output unit. It is compiled with
    binary cross-entropy, Adam (lr=0.001) and the accuracy metric.

    Returns:
        The compiled Sequential model.
    """
    network = Sequential([
        Dense(16, input_dim=13, kernel_initializer='normal', activation='relu'),
        Dense(8, kernel_initializer='normal', activation='relu'),
        Dense(1, activation='sigmoid'),
    ])
    network.compile(
        loss='binary_crossentropy',
        optimizer=Adam(lr=0.001),
        metrics=['accuracy'],
    )
    return network

# Instantiate the binary network and show its layer summary.
# summary() prints the table itself and returns None, so it is called
# directly; wrapping it in print() appended a stray "None" line.
binary_model = create_binary_model()

binary_model.summary()

In [None]:
# Train the binary model for 200 epochs in mini-batches of 10,
# evaluating on the held-out split after every epoch
history = binary_model.fit(
    x=X_train,
    y=Y_train_binary,
    batch_size=10,
    epochs=200,
    verbose=10,
    validation_data=(X_test, Y_test_binary),
)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
# Model accuracy
plt.plot(history.history['acc'])
plt.plot(history.history['val_acc'])
plt.title('Model Accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'test'])
plt.show()

In [None]:
# Model loss per epoch: training curve first, validation curve second
for series in ('loss', 'val_loss'):
    plt.plot(history.history[series])
plt.title('Model Loss')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend(['train', 'test'])
plt.show()

In [None]:
# Classification report for the categorical (softmax) model
from sklearn.metrics import classification_report, accuracy_score

# The predicted class is the index of the largest softmax output in each row
categorical_pred = model.predict(X_test).argmax(axis=1)

print('Results for Categorical Model')
print(accuracy_score(y_test, categorical_pred))
print(classification_report(y_test, categorical_pred))

In [None]:
# Classification report for the binary (sigmoid) model
from sklearn.metrics import classification_report, accuracy_score

# Round each sigmoid output to the nearest integer to get the 0/1 prediction
binary_pred = binary_model.predict(X_test).round().astype(int)

print('Results for Binary Model')
print(accuracy_score(Y_test_binary, binary_pred))
print(classification_report(Y_test_binary, binary_pred))