In [22]:
import pandas as pd
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv("data/heart.csv")
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [3]:
df.describe()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease
count,918.0,918.0,918.0,918.0,918.0,918.0,918.0
mean,53.510893,132.396514,198.799564,0.233115,136.809368,0.887364,0.553377
std,9.432617,18.514154,109.384145,0.423046,25.460334,1.06657,0.497414
min,28.0,0.0,0.0,0.0,60.0,-2.6,0.0
25%,47.0,120.0,173.25,0.0,120.0,0.0,0.0
50%,54.0,130.0,223.0,0.0,138.0,0.6,1.0
75%,60.0,140.0,267.0,0.0,156.0,1.5,1.0
max,77.0,200.0,603.0,1.0,202.0,6.2,1.0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    object 
 2   ChestPainType   918 non-null    object 
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 86.2+ KB


In [9]:
df_train = df.sample(frac=0.8, random_state=42)  
df_test = df.drop(df_train.index)
df_validation = df_train.sample(frac=0.2, random_state=42)
df_train = df_train.drop(df_validation.index)

In [26]:
target = 'HeartDisease'
column_names = ['Age',
                'ChestPainType',
                'Cholesterol',
                'ExerciseAngina',
                'FastingBS',
                'MaxHR',
                'Oldpeak',
                'RestingBP',
                'RestingECG',
                'ST_Slope',
                'Sex']
cat_features = ['ChestPainType',
                'ExerciseAngina',
                'FastingBS',
                'RestingECG',
                'ST_Slope',
                'Sex']

train_pool = Pool(df_train[column_names], label=df_train[target], cat_features=cat_features,
                  feature_names=column_names)
val_pool = Pool(df_validation[column_names], label=df_validation[target], cat_features=cat_features,
                feature_names=column_names)
model = CatBoostClassifier(iterations=100, depth=3, learning_rate=0.1, verbose=True, auto_class_weights= "Balanced")
model.fit(train_pool)
model.fit(train_pool, eval_set=val_pool, use_best_model=True, early_stopping_rounds=50)

0:	learn: 0.6656361	total: 447us	remaining: 44.3ms
1:	learn: 0.6389838	total: 1.07ms	remaining: 52.5ms
2:	learn: 0.6075974	total: 1.4ms	remaining: 45.2ms
3:	learn: 0.5785666	total: 1.59ms	remaining: 38.3ms
4:	learn: 0.5575105	total: 1.9ms	remaining: 36.1ms
5:	learn: 0.5392288	total: 2.21ms	remaining: 34.7ms
6:	learn: 0.5222478	total: 2.62ms	remaining: 34.8ms
7:	learn: 0.5048071	total: 2.8ms	remaining: 32.2ms
8:	learn: 0.4894182	total: 3ms	remaining: 30.4ms
9:	learn: 0.4737321	total: 3.4ms	remaining: 30.6ms
10:	learn: 0.4593097	total: 3.59ms	remaining: 29ms
11:	learn: 0.4470935	total: 3.76ms	remaining: 27.6ms
12:	learn: 0.4366140	total: 3.94ms	remaining: 26.4ms
13:	learn: 0.4277497	total: 4.1ms	remaining: 25.2ms
14:	learn: 0.4202309	total: 5.01ms	remaining: 28.4ms
15:	learn: 0.4116467	total: 5.21ms	remaining: 27.3ms
16:	learn: 0.4061686	total: 5.37ms	remaining: 26.2ms
17:	learn: 0.4016444	total: 5.51ms	remaining: 25.1ms
18:	learn: 0.3950921	total: 6.54ms	remaining: 27.9ms
19:	learn: 0.3

<catboost.core.CatBoostClassifier at 0x16c85a190>

In [27]:
predictions = model.predict(df_test[column_names])
print(classification_report(df_test[target], predictions))
print(confusion_matrix(df_test[target], predictions))

              precision    recall  f1-score   support

           0       0.93      0.80      0.86        92
           1       0.83      0.93      0.88        92

    accuracy                           0.87       184
   macro avg       0.88      0.87      0.87       184
weighted avg       0.88      0.87      0.87       184

[[74 18]
 [ 6 86]]
