# Application 1: Heart Disease Detection

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn import datasets, preprocessing, metrics 
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier, plot_tree, export_text, export_graphviz
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import *

import warnings
warnings.filterwarnings('ignore')

In [2]:
from __future__ import print_function
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
from IPython.display import clear_output

In [3]:
# Names for the 14 columns of the file, in file order (the CSV has no header row).
# The last column, 'num', is the diagnosis label used as the prediction target.
columns = [
    'age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
    'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'num',
]

# na_values='?': the preprocessing cell imputes NaNs in 'ca' and 'thal' with the column mean.
# Presumably the file marks missing entries with '?' (as the original UCI release does -
# TODO confirm); without this they would be parsed as strings and .mean() would fail.
# If the file contains no '?', this argument has no effect.
data = pd.read_csv('processed.cleveland.csv', header=None, names=columns, na_values='?')

In [6]:
# Preprocessing: impute missing values, then binarise the diagnosis label.

# Missing entries in these columns are replaced by the mean of the column.
for column_name in ('ca', 'thal'):
    data[column_name] = data[column_name].fillna(data[column_name].mean())

# A goal value below 2 is treated as a healthy person, 2 and above as unhealthy:
#   0 - Healthy
#   1 - Unhealthy
# NOTE: this cell is not idempotent. Running it a second time sends the already
# binarised label 1 back through the map (1 -> 0) and leaves every sample at 0.
label_map = {0: 0, 1: 0, 2: 1, 3: 1, 4: 1}
data['num'] = data['num'].map(label_map)

In [10]:
min_max_scaler = preprocessing.MinMaxScaler()

# Every column except the last is a feature; the last column is the label.
X_raw = data.iloc[:, :-1].values
Y = data.iloc[:, -1].values

# Split BEFORE scaling: fitting the scaler on the full matrix would let the min/max
# of the held-out rows leak into the training features.
X_train_raw, X_test_raw, Y_train, Y_test = train_test_split(
    X_raw, Y, test_size=0.3, random_state=0
)

# Normalize features: learn the min/max from the training rows only, then apply
# the same transformation to the test rows (which may fall slightly outside [0, 1]).
X_train = min_max_scaler.fit_transform(X_train_raw)
X_test = min_max_scaler.transform(X_test_raw)

# Normalised copy of the full feature matrix, kept under its original name.
X = min_max_scaler.transform(X_raw)

### Logistic Regression

In [12]:
# Logistic regression with an L2 (ridge) penalty; the settings are spelled out
# explicitly so they are easy to tweak.
lr_settings = {
    'penalty': 'l2',        # Ridge
    'tol': 0.0001,
    'fit_intercept': True,
    'random_state': None,
    'max_iter': 100,
}
lr_classifier = LogisticRegression(**lr_settings)

In [13]:
# Train the logistic regression on the training split; the returned estimator
# is assigned to `_` so that the cell shows no output.
_ = lr_classifier.fit(X=X_train, y=Y_train)

LogisticRegression()

In [15]:
# Predicted labels of the logistic regression for the held-out test split.
y_pred = lr_classifier.predict(X=X_test)

In [18]:
# Gaussian Naive Bayes classifier, constructed without any explicit arguments.
gnb_classifier = GaussianNB()

In [19]:
# Train Naive Bayes on the training split and predict the test split in one chain
# (fit() returns the fitted estimator itself).
y_pred = gnb_classifier.fit(X_train, Y_train).predict(X_test)

In [22]:
# Support vector classifier with an RBF kernel; every setting is listed explicitly.
svc_settings = {
    'C': 1.0,
    'kernel': 'rbf',
    'degree': 3,
    'gamma': 'scale',
    'coef0': 0.0,
    'shrinking': True,
    'probability': False,
    'tol': 0.001,
    'cache_size': 200,
    'class_weight': None,
    'verbose': False,
    'max_iter': -1,
    'decision_function_shape': 'ovr',
    'break_ties': False,
    'random_state': None,
}
svc_classifier = SVC(**svc_settings)

In [23]:
# Train the SVM on the training split and predict the test split in one chain
# (fit() returns the fitted estimator itself).
y_pred = svc_classifier.fit(X_train, Y_train).predict(X_test)

In [26]:
# k-nearest-neighbours classifier: 5 equally weighted neighbours.
kn_settings = {
    'n_neighbors': 5,
    'weights': 'uniform',
    'algorithm': 'auto',    # used to identify nearest neighbours
    'p': 2,                 # euclidean_distance
    'metric': 'minkowski',
}
kn_classifier = KNeighborsClassifier(**kn_settings)

In [27]:
# Train the k-NN classifier on the training split (the fitted estimator is the cell output).
kn_classifier.fit(X=X_train, y=Y_train)

KNeighborsClassifier()

In [29]:
# Predicted labels of the k-NN classifier for the held-out test split.
y_pred = kn_classifier.predict(X=X_test)

### Linear Discriminant Analysis

In [32]:
# Linear Discriminant Analysis classifier; every setting is listed explicitly.
lda_classifier = LinearDiscriminantAnalysis(solver='svd', 
                                            shrinkage=None, 
                                            priors=None, 
                                            n_components=None, 
                                            store_covariance=False, 
                                            tol=0.0001, 
                                            covariance_estimator=None
                                           )

# Fit the model here: 'LDA' is selectable in the prediction dropdown, and calling
# predict() on an estimator that was never fitted raises NotFittedError.
_ = lda_classifier.fit(X_train, Y_train)

# Open questions:
# How many dimensions / features were used for the classification?
# If a subset was used, how was it decided?
# Can we see the mean and scatter of each feature used?

In [36]:
# Decision tree classifier using the Gini criterion, grown without a depth limit.
dt_classifier = DecisionTreeClassifier(criterion='gini', 
                                       splitter='best', 
                                       max_depth=None, 
                                       min_samples_split=2, 
                                       min_samples_leaf=1, 
                                       min_weight_fraction_leaf=0.0, 
                                       max_features=None, 
                                       # Fixed seed: the features are randomly permuted at each split even
                                       # with splitter='best', so without a seed the fitted tree (and its
                                       # predictions) can differ from one run of the notebook to the next.
                                       random_state=0, 
                                       max_leaf_nodes=None, 
                                       min_impurity_decrease=0.0, 
                                       class_weight=None, 
                                       ccp_alpha=0.0
                                      )

In [37]:
# Train the decision tree on the training split; the returned estimator is
# assigned to `_` so that the cell shows no output.
_ = dt_classifier.fit(X=X_train, y=Y_train)

# Things still to look at for the fitted tree:
#   - Root
#   - Visualize the tree
#   - Depth of the tree?
#   - Gini Index of nodes?

DecisionTreeClassifier()

In [48]:
# Random forest of 100 Gini trees.
# Changes relative to the original argument list:
#   - `min_impurity_split` is no longer passed: the parameter was removed in
#     scikit-learn 1.0 and passing it raises a TypeError (None was its default anyway).
#   - `max_features='sqrt'` replaces 'auto': for classifiers 'auto' meant 'sqrt',
#     and the 'auto' spelling was removed in scikit-learn 1.3.
#   - `random_state` is fixed so that the forest, and therefore the predictions shown
#     to the user, are reproducible across runs of the notebook.
rf_classifier = RandomForestClassifier(
    n_estimators=100,
    criterion='gini',
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.0,
    max_features='sqrt',
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    bootstrap=True,
    oob_score=False,
    n_jobs=None,
    random_state=0,
    verbose=0,
    warm_start=False,
    class_weight=None,
    ccp_alpha=0.0,
    max_samples=None,
)

In [49]:
# Train the random forest on the training split (the fitted estimator is the cell output).
rf_classifier.fit(X=X_train, y=Y_train)

RandomForestClassifier()

In [55]:
# Inspect the iteration count recorded on the fitted logistic regression
# (its max_iter was set to 100 when it was constructed).
lr_classifier.n_iter_

array([23])

In [57]:
# One free-text input box per feature, created in the column order of the dataset.
# The boxes are bound to module-level names because the predict handler reads them.
print("Please enter the details of the person:")
(age, sex, cp, trestbps, chol, fbs, restecg,
 thalach, exang, oldpeak, slope, ca, thal) = (
    widgets.Text(description=label)
    for label in ('age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
                  'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal')
)

In [58]:
# Render the thirteen input boxes, one below the other, in feature order.
display(
    age, sex, cp, trestbps, chol, fbs, restecg,
    thalach, exang, oldpeak, slope, ca, thal,
)

Text(value='', description='age')

Text(value='', description='sex')

Text(value='', description='cp')

Text(value='', description='trestbps')

Text(value='', description='chol')

Text(value='', description='fbs')

Text(value='', description='restecg')

Text(value='', description='thalach')

Text(value='', description='exang')

Text(value='', description='oldpeak')

Text(value='', description='slope')

Text(value='', description='ca')

Text(value='', description='thal')

In [59]:
# (label shown to the user, short code read by the predict handler) pairs.
algorithm_choices = [
    ('Logistic Regression', 'LR'),
    ('Linear Discriminant Analysis ', 'LDA'),
    ('Support Vector Machines', 'SVM'),
    ('K-Nearest Neighbors', 'KN'),
    ('Naive Bayes', 'NB'),
    ('Decision Trees', 'DT'),
    ('Random Forest', 'RF'),
]

# Dropdown from which the user picks the classifier used for the prediction.
algorithm = widgets.Dropdown(options=algorithm_choices, disabled=False)

print('Select Algorithm')
display(algorithm)

Select Algorithm


Dropdown(options=(('Logistic Regression', 'LR'), ('Linear Discriminant Analysis ', 'LDA'), ('Support Vector Ma…

In [60]:
# Button that triggers a prediction ...
button_predict = widgets.Button(description="Predict")

# ... and the output area the click handler prints its result into.
prediction = widgets.Output()

def on_button_predict_clicked(b):
    """Predict the class of the person described by the input boxes.

    Reads the thirteen feature text boxes, converts them to floats, applies the
    fitted ``min_max_scaler`` and runs the classifier selected in the
    ``algorithm`` dropdown. The selected algorithm code and the predicted label
    (0 - healthy, 1 - unhealthy) are printed into the ``prediction`` output
    widget, replacing whatever was shown there before.

    The classifiers are trained on min-max normalised features, so the raw
    values typed by the user must go through the same scaler before
    ``predict`` is called; feeding raw values produces meaningless predictions.

    Parameters
    ----------
    b
        The button instance handed over by ``Button.on_click``; not used.

    Notes
    -----
    An empty or non-numeric field, or an unknown algorithm code, does not
    raise: a message is printed into the output widget instead.
    """
    # Input boxes keyed by feature name, in the column order used for training.
    feature_widgets = {
        'age': age,
        'sex': sex,
        'cp': cp,
        'trestbps': trestbps,
        'chol': chol,
        'fbs': fbs,
        'restecg': restecg,
        'thalach': thalach,
        'exang': exang,
        'oldpeak': oldpeak,
        'slope': slope,
        'ca': ca,
        'thal': thal,
    }

    # Dropdown code -> trained classifier.
    classifiers = {
        'LR': lr_classifier,
        'LDA': lda_classifier,
        'SVM': svc_classifier,
        'KN': kn_classifier,
        'NB': gnb_classifier,
        'DT': dt_classifier,
        'RF': rf_classifier,
    }

    selected_algorithm = algorithm.value

    with prediction:
        clear_output(True)

        try:
            input_data = {name: float(box.value) for name, box in feature_widgets.items()}
        except ValueError:
            # float('') or float('abc'): tell the user instead of failing inside the callback.
            print('Please enter a numeric value for every field.')
            return

        classifier = classifiers.get(selected_algorithm)
        if classifier is None:
            print(f'Unknown algorithm: {selected_algorithm}')
            return

        user_input = pd.DataFrame(input_data, columns=list(feature_widgets), index=[0])

        # Apply the same normalisation as the training data; a plain array is
        # passed on because the scaler and the classifiers were fitted on arrays.
        scaled_input = min_max_scaler.transform(user_input.values)

        print(f'Selected Algorithm = {selected_algorithm}')
        print(classifier.predict(scaled_input)[0])
        
# Register the handler so that it runs whenever the "Predict" button is clicked.
button_predict.on_click(on_button_predict_clicked)

In [61]:
# Show the "Predict" button with the output area for the result right below it.
display(button_predict, prediction)

Button(description='Predict', style=ButtonStyle())

Output()