# Classification Models

In [5]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets, linear_model, metrics

In [None]:
#Read the data
df=pd.read_csv("heart.csv")
df.head()

In [None]:
# Dataset columns
df.columns

In [None]:
# Data column, type and null information, and missing values
df.info()

In [None]:
df.isnull().sum()

As we can see on df.info() and df.isnull().sum() there is no missing value inside df.

In [None]:
# Check the proportion of 0 and 1 in HeartDisease label
df['HeartDisease'].value_counts()

In [None]:
import seaborn as sns
sns.countplot(data = df, x='HeartDisease')

- Most of the people in our data are infected.
- Our target considered balanced target.

In [None]:
from sklearn.utils import resample

#create teo different dataframe of majority and minority class 
df_majority = df[(df['HeartDisease']== 1)] 
df_minority = df[(df['HeartDisease']== 0)] 

# upsample minority class
df_minority_upsampled = resample(df_minority,
                          replace=True,      # sample with replacement
                          n_samples=508,     # to match majority class
                          random_state=27)   # reproducible results

# combine majority and upsampled minority
df_upsampled = pd.concat([df_minority_upsampled, df_majority])

In [None]:
# check new class counts
df_upsampled['HeartDisease'].value_counts()

Our targets have been balanced

In [None]:
#Create a new dataFrame that contains features that correlate closely with 'HeartDisease'
#Correlatian Heatmap
colormap = plt.cm.RdBu
plt.figure(figsize=(20,15))
plt.title('SalePrice Correlation with 3 Features', y=1.05, size=15)
sns.heatmap(df.corr(),linewidths=0.1,vmax=1.0, 
            square=True, cmap=colormap, linecolor='white', annot=True)

We can see in Correlation_Heatmap that the 'MaxHR' and 'Cholesterol' features do not have a close correlation because coefficients close to 0 or < 0 are meaningless, so we don't need to use them. 

In [None]:
#To load a new DataFrame
df = df[['Age', 'Sex', 'ChestPainType', 'RestingBP', 'FastingBS', 'RestingECG',
         'Oldpeak', 'ST_Slope', 'HeartDisease']]

In [None]:
df.head()

In [None]:
#One Hot Encoding
df = pd.get_dummies(df)
df.head()

In [None]:
df.to_csv('heart_data_encoding.csv.gz', index = False)

In [None]:
df.info()

It can be seen in df.head() with the one hot encoding method can represent or make category type data as binary vectors with integer values, 0 and 1, where all elements will be worth 0 except for one element that is worth 1, that is, the element that has the value of that category.

In [None]:
df.columns

In [None]:
# defining feature matrix(X) and response vector(y)
X = df.loc[:, df.columns != 'HeartDisease']
y = df['HeartDisease']

In [None]:
# splitting X and y into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                    random_state=50)

In [None]:
##Train two models with X_train and y_train (use Hyperparameter Tuning for random forest)

In [None]:
## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr = lr.fit(X_train, y_train)

In [None]:
## Random Forest
from sklearn.ensemble import RandomForestClassifier

classifier_rf = RandomForestClassifier(random_state=50, n_jobs=-1, max_depth=5,
                                       n_estimators=100, oob_score=True)

classifier_rf.fit(X_train, y_train)

In [None]:
## Hyperparameter Tuning for random forest

rf = RandomForestClassifier(random_state=50, n_jobs=-1)

In [None]:
#Create one variable (params) to deposit whatever we will try to do with the model
params = {
    'max_depth': [2,3,5,10,20],
    'min_samples_leaf': [5,10,20,50,100,200],
    'n_estimators': [10,25,30,50,100,200]
}

In [None]:
from sklearn.model_selection import GridSearchCV

# Instantiate the grid search model
grid_search = GridSearchCV(estimator=rf,
                           param_grid=params,
                           cv = 5,
                           n_jobs=-1, verbose=1, scoring="accuracy")

grid_search.fit(X_train, y_train)

In [None]:
# To see best_score
grid_search.best_score_

In [None]:
# To see best_estimator (best_max_depth, best_min_samples_leaf)
rf_best = grid_search.best_estimator_
rf_best.fit(X_train, y_train)

In [None]:
# Save the model
import joblib
joblib.dump(lr, 'model_logisticregression.pkl')
joblib.dump(classifier_rf, 'model_classifier_rf.pkl')
joblib.dump(rf_best, 'model_classifier_rf_hypertuned.pkl')

In [None]:
##Evaluate the result with confusion matrix, classification report, and AUC

In [None]:
#For X_test predictions
# Logistic Regression
y_lr = lr.predict(X_test)

# Random Forest Awal
y_rf_before = classifier_rf.predict(X_test)

# Random Forest dengan Hyperparameter Tuning
y_rf_after = rf_best.predict(X_test)

In [None]:
#To display confusion_matrix results
from sklearn.metrics import confusion_matrix
print("Logistic Regression : \n", confusion_matrix(y_test, y_lr))
print("Random Forest Awal : \n", confusion_matrix(y_test, y_rf_before))
print("Random Forest dengan Hyperparameter Tuning: \n", confusion_matrix(y_test, y_rf_after))

In [None]:
##Evaluate the result with classification report

In [None]:
from sklearn.metrics import classification_report
print("Logistic Regression : \n\n", classification_report(y_test, y_lr))
print("Random Forest Awal : \n\n", classification_report(y_test, y_rf_before))
print("Random Forest dengan Hyperparameter Tuning: \n\n", classification_report(y_test, y_rf_after))

In [None]:
##Evaluate the result with AUC

In [None]:
from sklearn.metrics import roc_curve, auc
fpr, tpr, thresholds = roc_curve(y_test, y_lr, pos_label=1) # pos_label: positive label
print("Logistic Regression :", auc(fpr, tpr))
fpr, tpr, thresholds = roc_curve(y_test, y_rf_before, pos_label=1) # pos_label: positive label
print("Random Forest Awal :", auc(fpr, tpr))
fpr, tpr, thresholds = roc_curve(y_test, y_rf_after, pos_label=1) # pos_label: positive label
print("Random Forest dengan Hyperparameter Tuning:", auc(fpr, tpr))

In [None]:
## Which model is better at predicting Heart Disease? Interpret the reason.

A better model for predicting Heart Disease is 'Random Forest'

Reason:
After evalution to several models with 3 metrics (confusion matrix, classification report, and AUC), we can see that the prediction results with the 'Random Forest' model have the highest accuracy. In evaluating metrics with classification_report it can be seen that there are several combined metrics, to see how accurate our model is in predicting true positive and true negative heart disease (normal) then the matching metric is the 'recall' metric because the 'recall' metric shows the positive true rate of a model, the highest accuracy of the recall metric is in the prediction of the 'Random Forest' model of 86%. And in the evaluation with the AUC metric, it can be seen that the highest accuracy is in the prediction of the 'Random Forest' model of 85%, because the higher the AUC value, the better the model is in distinguishing between patients with heart disease and no heart disease (normal).