In [1]:
import numpy as np 
import pandas as pd 
from sklearn.model_selection import train_test_split
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/heartdisease/heart_test.csv
/kaggle/input/heartdisease/heart_train.csv
/kaggle/input/heartdisease/heart_sample_submission.csv


# 1. Read data from csv files

In [2]:
# Read the data
X_full = pd.read_csv('/kaggle/input/heartdisease/heart_train.csv', index_col='id')
X_test = pd.read_csv('/kaggle/input/heartdisease/heart_test.csv', index_col='id')

X_full.dropna(axis=0, subset=['HeartDisease'], inplace=True)
y = X_full['HeartDisease']
X_full.drop(labels=['HeartDisease'], axis=1, inplace=True)

# 2. Examine data
Take an overview of what is the data look like

In [3]:
X_full

Unnamed: 0_level_0,Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up
1,1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat
2,2,37,M,ATA,130,283,0,ST,98,N,0.0,Up
3,3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat
4,4,39,M,NAP,120,339,0,Normal,170,N,0.0,Up
...,...,...,...,...,...,...,...,...,...,...,...,...
638,638,45,M,TA,110,264,0,Normal,132,N,1.2,Flat
639,639,68,M,ASY,144,193,1,Normal,141,N,3.4,Flat
640,640,57,M,ASY,130,131,0,Normal,115,Y,1.2,Flat
641,641,57,F,ATA,130,236,0,LVH,174,N,0.0,Flat


# 3. Data cleaning & Features engineering
Separating features of object type and numerical type in order to prepare for encoding or process any missing value. Also, splitting the test data and validation data for further analysis.

In [4]:
# Get ready for one-hot encoding 
# Get good cardinalirty features and numerical columns
good_cardinality_cols = [col for col in X_full if X_full[col].nunique() < 10 and X_full[col].dtype=='object']
numerical_cols = [col for col in X_full if X_full[col].dtype=='int64' or X_full[col].dtype=='float64']

chosen_cols = good_cardinality_cols + numerical_cols
X_full=X_full[chosen_cols].copy()
X_test=X_test[chosen_cols].copy()

# Split training and validity test
X_train, X_valid, y_train, y_valid = train_test_split(X_full, y, test_size=0.2, train_size=0.8, random_state=0)
X_valid

Unnamed: 0_level_0,Sex,ChestPainType,RestingECG,ExerciseAngina,ST_Slope,Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
637,F,ASY,Normal,Y,Flat,637,57,140,241,0,123,0.2
165,M,ASY,Normal,Y,Flat,165,54,200,198,0,142,2.0
467,F,NAP,Normal,N,Flat,467,44,118,242,0,149,0.3
311,M,ASY,ST,Y,Flat,311,64,144,0,0,122,1.0
432,M,ASY,LVH,Y,Flat,432,67,120,229,0,129,2.6
...,...,...,...,...,...,...,...,...,...,...,...,...
422,F,ATA,LVH,Y,Up,422,74,120,269,0,121,0.2
233,M,ASY,Normal,Y,Down,233,64,110,0,1,114,1.3
64,F,ATA,Normal,Y,Flat,64,53,140,216,0,142,2.0
439,F,NAP,LVH,N,Up,439,51,120,295,0,157,0.6


# 4. Build model
The model I am going to use is Decision Tree along with OneHotEncoder to encode that type 'object' feature. I packed all of them inside a pipeline for easier implementation

In [5]:
# Preprocess the data
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_absolute_error


numerical_transformer = SimpleImputer(strategy='mean')
categorical_transformer = Pipeline(steps=[
#     ('imputer1', SimpleImputer(strategy='most_frequent')),
    ('encoding', OneHotEncoder(handle_unknown='ignore')),
    ('imputer2', SimpleImputer(strategy='median')),
])

preprocessor = ColumnTransformer(transformers=[
    ('cat', categorical_transformer, good_cardinality_cols),
    ('num', numerical_transformer, numerical_cols),
])

In [6]:
# Cross-validation to find the most optimal parameter for the model
from sklearn.model_selection import cross_val_score
def get_score(n_esti, max_leafs):
    pip = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', RandomForestClassifier(n_estimators=n_esti, max_leaf_nodes=max_leafs, random_state=0))
    ])
    scores = -1 * cross_val_score(pip, X_train, y_train, cv=50, scoring='neg_mean_absolute_error', verbose=False)
    return scores.mean()

Brute force to find the most optimal hyperparameter for DecisionTree model

In [7]:
optimal_n_estimators=-1
optimal_max_leaf_node=-1
optimal_scores=1000000
for n_estimators in range(50, 100, 50):
    for max_leaf_node in range(10, 100, 40):
        temp = get_score(n_estimators, max_leaf_node)
        if temp < optimal_scores:
            optimal_scores = temp
            optimal_n_estimators = n_estimators
            optimal_max_leaf_node = max_leaf_node
print(optimal_n_estimators, optimal_max_leaf_node, optimal_scores)

50 90 0.11563636363636362


In [8]:
# build model
pip = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', RandomForestClassifier(n_estimators=50, max_leaf_nodes=50, random_state=0))
])
pip.fit(X_train, y_train)

In [9]:
def output(pred):
    output = pd.DataFrame({'id': X_test.index, 'output': pred})
    output.to_csv('submission.csv', index=False)

In [10]:
pred = pip.predict(X_test)
output(pred)

This model yields the accuracy of **85.06%** on the private data and **85.3%** on the public one. Which shows that this model is not likely to experience overfitting, however, underfitting might be the case.

# 5. Improve the model
The implementation below utilize XGBoost with some hyperparameter tunning

In [11]:
# Improve by using XGBoost
from xgboost import XGBClassifier
pip = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', XGBClassifier(n_estimators=500, learning_rate=0.05, n_jobs=4,  early_stopping_rounds=5))
])
fit_params = {
    'model__eval_set': [(pip.named_steps['preprocessor'].transform(X_valid), y_valid)],
    'model__verbose': False,
}
pip.fit(X_train, y_train, **fit_params)

In [12]:
pred = pip.predict(X_test)
output(pred)

This model trends to perform slightly better than the previous one with RandomForestClassifier with **85.8%** accuracy on the private data and **83.8%** on the public one.