In [15]:
import pandas as pd
import numpy as np

import category_encoders as ce  # new lib for encoding categorical variables

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.impute import SimpleImputer

from summarytools import dfSummary

In [3]:
heart_df = pd.read_csv('heart.csv')
dfSummary(heart_df)

No,Variable,Stats / Values,Freqs / (% of Valid),Graph,Missing
1,Age [int64],Mean (sd) : 53.5 (9.4) min < med < max: 28.0 < 54.0 < 77.0 IQR (CV) : 13.0 (5.7),50 distinct values,,0 (0.0%)
2,Sex [object],1. M 2. F,725 (79.0%) 193 (21.0%),,0 (0.0%)
3,ChestPainType [object],1. ASY 2. NAP 3. ATA 4. TA,496 (54.0%) 203 (22.1%) 173 (18.8%) 46 (5.0%),,0 (0.0%)
4,RestingBP [int64],Mean (sd) : 132.4 (18.5) min < med < max: 0.0 < 130.0 < 200.0 IQR (CV) : 20.0 (7.2),67 distinct values,,0 (0.0%)
5,Cholesterol [int64],Mean (sd) : 198.8 (109.4) min < med < max: 0.0 < 223.0 < 603.0 IQR (CV) : 93.8 (1.8),222 distinct values,,0 (0.0%)
6,FastingBS [int64],Mean (sd) : 0.2 (0.4) min < med < max: 0.0 < 0.0 < 1.0 IQR (CV) : 0.0 (0.6),2 distinct values,,0 (0.0%)
7,RestingECG [object],1. Normal 2. LVH 3. ST,552 (60.1%) 188 (20.5%) 178 (19.4%),,0 (0.0%)
8,MaxHR [int64],Mean (sd) : 136.8 (25.5) min < med < max: 60.0 < 138.0 < 202.0 IQR (CV) : 36.0 (5.4),119 distinct values,,0 (0.0%)
9,ExerciseAngina [object],1. N 2. Y,547 (59.6%) 371 (40.4%),,0 (0.0%)
10,Oldpeak [float64],Mean (sd) : 0.9 (1.1) min < med < max: -2.6 < 0.6 < 6.2 IQR (CV) : 1.5 (0.8),53 distinct values,,0 (0.0%)


In [None]:
X = heart_df.drop('HeartDisease', axis=1)
y = heart_df['HeartDisease']

X, y

Antes de fazer qualquer anteração com os dados (preprocessamento) com os dados para a construção do modelo, vamos antes separar eles em treino e validação.

Se separarmos depois de fazer o preprocessamento, preencher colunas vazias por exemplo, o nosso dado treino vai "ver" o dado de teste, e isso vai prejudicar o modelo.

In [27]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y) 

In [28]:
cat_cols = [col for col in X_train.columns if X_train[col].dtype == 'object']
num_cols = [col for col in X_train.columns if X_train[col].dtype == 'int64']

cat_cols, num_cols

(['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope'],
 ['Age', 'RestingBP', 'Cholesterol', 'FastingBS', 'MaxHR'])

In [29]:
# usados para fazer transformações em tipos diferentes de colunas
cat_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', ce.OneHotEncoder())
])

transformer = ColumnTransformer([
    ("cat_transform", cat_pipe, cat_cols),
    ("num_transform", SimpleImputer(strategy='median'), num_cols)
])

In [31]:
X_train_transform = transformer.fit_transform(X_train, y_train)
X_valid_transform = transformer.transform(X_valid) 

In [35]:
tree = DecisionTreeClassifier()

In [37]:
tree.fit(X_train_transform, y_train)
predict = tree.predict(X_valid_transform)

In [38]:
import sklearn.metrics as skmetrics

In [41]:
print(f"Acurária: {round(skmetrics.accuracy_score(y_valid, predict) * 100, 2)}%")

Acurária: 76.96%
