### Data Collection and Processing

In [1]:
# Load the heart-disease CSV into a pandas DataFrame; every later cell reads `heart_data`.
# 'heart.csv' is a relative path, so it is resolved against the kernel's working directory.
import pandas as pd
heart_data = pd.read_csv('heart.csv')

In [2]:
# display the first 5 rows of the dataset (head() returns 5 rows by default)
heart_data.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [3]:
# summary statistics (count, mean, std, min, quartiles, max) for each numeric column
heart_data.describe()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
count,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0
mean,54.366337,0.683168,0.966997,131.623762,246.264026,0.148515,0.528053,149.646865,0.326733,1.039604,1.39934,0.729373,2.313531,0.544554
std,9.082101,0.466011,1.032052,17.538143,51.830751,0.356198,0.52586,22.905161,0.469794,1.161075,0.616226,1.022606,0.612277,0.498835
min,29.0,0.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,47.5,0.0,0.0,120.0,211.0,0.0,0.0,133.5,0.0,0.0,1.0,0.0,2.0,0.0
50%,55.0,1.0,1.0,130.0,240.0,0.0,1.0,153.0,0.0,0.8,1.0,0.0,2.0,1.0
75%,61.0,1.0,2.0,140.0,274.5,0.0,1.0,166.0,1.0,1.6,2.0,1.0,3.0,1.0
max,77.0,1.0,3.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,2.0,4.0,3.0,1.0


In [4]:
# size of the dataset as a (rows, columns) tuple
heart_data.shape

(303, 14)

In [5]:
# column names, non-null counts, dtypes and memory usage of the DataFrame
heart_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    int64  
 1   sex       303 non-null    int64  
 2   cp        303 non-null    int64  
 3   trestbps  303 non-null    int64  
 4   chol      303 non-null    int64  
 5   fbs       303 non-null    int64  
 6   restecg   303 non-null    int64  
 7   thalach   303 non-null    int64  
 8   exang     303 non-null    int64  
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    int64  
 11  ca        303 non-null    int64  
 12  thal      303 non-null    int64  
 13  target    303 non-null    int64  
dtypes: float64(1), int64(13)
memory usage: 33.3 KB


In [6]:
# number of missing (null) values in each column
heart_data.isnull().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

In [7]:
# class balance of the target column: how many rows carry each label
heart_data['target'].value_counts()

target
1    165
0    138
Name: count, dtype: int64

1 --> Defective Heart

0 --> Healthy Heart

### Model Training

In [8]:
# The wildcard import is kept on purpose: the cells below call compare_models,
# create_model, ensemble_model, predict_model and calibrate_model unqualified.
from pycaret.classification import *

# Set up the classification experiment on `heart_data`, predicting the 'target' column.
#
# - `preprocess` is a boolean switch, not the name of a scaler. The previous
#   value 'scale' was merely truthy: the setup summary lists no normalization
#   step, so it behaved exactly like preprocess=True.
# - `session_id` pins the random seed so the train/test split and the CV folds
#   are identical on every Restart & Run All. 2665 is the id the recorded run
#   drew at random, so the split matches the outputs already in this notebook.
#
# NOTE(review): if feature scaling was the intent, the arguments for it are
# normalize=True (optionally normalize_method='zscore'). That is not enabled
# here because the last cell pickles only the model object; assuming that is
# the bare estimator without the preprocessing pipeline, a consumer of that
# file would have to apply the same scaling itself.
clf_setup = setup(
    data=heart_data,
    target='target',
    preprocess=True,
    session_id=2665,
)

Unnamed: 0,Description,Value
0,Session id,2665
1,Target,target
2,Target type,Binary
3,Original data shape,"(303, 14)"
4,Transformed data shape,"(303, 14)"
5,Transformed train set shape,"(212, 14)"
6,Transformed test set shape,"(91, 14)"
7,Numeric features,13
8,Preprocess,scale
9,Imputation type,simple


#### Base Model

Looking for the best base model

In [9]:
# train and rank the candidate classifiers; the returned value is the top-ranked model
best_model = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,0.8022,0.8541,0.8697,0.7897,0.8254,0.5983,0.6071,0.252
xgboost,Extreme Gradient Boosting,0.7874,0.8518,0.8258,0.799,0.8085,0.5691,0.5766,0.01
ridge,Ridge Classifier,0.7831,0.0,0.8697,0.7678,0.8126,0.5581,0.5708,0.004
lda,Linear Discriminant Analysis,0.7831,0.8597,0.8697,0.7678,0.8126,0.5581,0.5708,0.004
nb,Naive Bayes,0.7734,0.8603,0.828,0.7769,0.799,0.5414,0.5471,0.003
et,Extra Trees Classifier,0.769,0.8768,0.7917,0.7932,0.7868,0.534,0.5434,0.02
rf,Random Forest Classifier,0.7639,0.8716,0.8091,0.7792,0.7863,0.5232,0.5374,0.023
lightgbm,Light Gradient Boosting Machine,0.7543,0.8665,0.7909,0.7672,0.7761,0.5035,0.5089,0.077
qda,Quadratic Discriminant Analysis,0.75,0.8431,0.7508,0.7826,0.7599,0.5014,0.509,0.004
ada,Ada Boost Classifier,0.74,0.7998,0.8083,0.75,0.7741,0.4704,0.4781,0.011


Processing:   0%|          | 0/65 [00:00<?, ?it/s]

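Logistic Regression ranks first: it has the highest cross-validated accuracy (0.8022) and F1 (0.8254), although Extra Trees achieves the highest AUC (0.8768).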

In [10]:
# fit logistic regression ('lr') and show its per-fold cross-validation scores
lr = create_model('lr')

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8182,0.8667,0.9167,0.7857,0.8462,0.6271,0.6383
1,0.7273,0.825,0.75,0.75,0.75,0.45,0.45
2,0.8095,0.8519,0.8333,0.8333,0.8333,0.6111,0.6111
3,0.9048,0.9537,1.0,0.8571,0.9231,0.8,0.8165
4,0.8095,0.9167,0.8333,0.8333,0.8333,0.6111,0.6111
5,0.7143,0.7636,0.8182,0.6923,0.75,0.422,0.4301
6,0.9524,0.9636,1.0,0.9167,0.9565,0.9041,0.9083
7,0.7619,0.7909,0.9091,0.7143,0.8,0.5161,0.5394
8,0.7619,0.7909,0.7273,0.8,0.7619,0.5249,0.5273
9,0.7619,0.8182,0.9091,0.7143,0.8,0.5161,0.5394


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

#### Bagging

In [11]:
# wrap the logistic regression in a bagging ensemble and cross-validate it
bagged_lr = ensemble_model(lr, method = 'Bagging')

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8182,0.8833,0.9167,0.7857,0.8462,0.6271,0.6383
1,0.7727,0.85,0.8333,0.7692,0.8,0.5378,0.5401
2,0.8571,0.8796,0.9167,0.8462,0.88,0.7042,0.7077
3,0.8095,0.9352,1.0,0.75,0.8571,0.5882,0.6455
4,0.7619,0.9167,0.75,0.8182,0.7826,0.5205,0.523
5,0.7143,0.7545,0.8182,0.6923,0.75,0.422,0.4301
6,0.9524,0.9727,1.0,0.9167,0.9565,0.9041,0.9083
7,0.7143,0.7727,0.9091,0.6667,0.7692,0.4167,0.4523
8,0.7143,0.7909,0.7273,0.7273,0.7273,0.4273,0.4273
9,0.7619,0.8182,0.9091,0.7143,0.8,0.5161,0.5394


Processing:   0%|          | 0/6 [00:00<?, ?it/s]

#### Evaluation

Predicting Label and Score (probability of predicted class) using the trained model on the holdout set.

In [12]:
# no `data` argument is passed, so the bagged model is scored on the holdout split
pred_holdout = predict_model(bagged_lr)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Logistic Regression,0.9011,0.9712,0.92,0.902,0.9109,0.7998,0.8


#### Calibrating
`calibrate_model` calibrates the predicted probabilities of a given estimator using either Platt’s method (sigmoid) or isotonic regression, which is a non-parametric approach. Its output is a score grid with cross-validation scores by fold.

In [13]:
# calibrate the predicted probabilities of the bagged model (calibration method left at its default)
calibrated_model = calibrate_model(bagged_lr)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8182,0.8667,0.9167,0.7857,0.8462,0.6271,0.6383
1,0.7727,0.8167,0.8333,0.7692,0.8,0.5378,0.5401
2,0.9048,0.8704,1.0,0.8571,0.9231,0.8,0.8165
3,0.8095,0.9537,1.0,0.75,0.8571,0.5882,0.6455
4,0.8095,0.9074,0.8333,0.8333,0.8333,0.6111,0.6111
5,0.7143,0.7455,0.8182,0.6923,0.75,0.422,0.4301
6,0.9524,0.9636,1.0,0.9167,0.9565,0.9041,0.9083
7,0.8095,0.7818,1.0,0.7333,0.8462,0.6111,0.6633
8,0.7619,0.7909,0.7273,0.8,0.7619,0.5249,0.5273
9,0.7619,0.8273,0.9091,0.7143,0.8,0.5161,0.5394


Processing:   0%|          | 0/6 [00:00<?, ?it/s]

In [14]:
# score the calibrated model on the holdout split, for comparison with the uncalibrated result
pred = predict_model(calibrated_model)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Logistic Regression,0.9011,0.9741,0.92,0.902,0.9109,0.7998,0.8


### Saving the trained model

In [15]:
import pickle

In [16]:
# Persist the calibrated model to disk with pickle.
# The `with` block closes the file handle (and flushes the bytes to disk) even
# if pickle.dump raises; a bare open() call would leave that to garbage collection.
# NOTE(review): this pickles only the `calibrated_model` object — presumably
# without any preprocessing applied by setup(); confirm that whatever loads
# this file feeds it inputs in the form the model was trained on.
filename = 'heart_disease_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(calibrated_model, model_file)