In [90]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import OrdinalEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
from sklearn.metrics import mean_absolute_error, precision_recall_curve, roc_auc_score

In [84]:
data = pd.read_csv('../dataset/heart.csv')

In [4]:
data.head(2)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1


In [5]:
# Check total size of the dataset
data.shape

(918, 12)

In [8]:
# Check missing
print(f'Any missing values {data.isnull().any().any()}')
data.isnull().sum()

Any missing values False


Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
HeartDisease      0
dtype: int64

In [9]:
# Check imbalanceness of target
data.HeartDisease.value_counts()

1    508
0    410
Name: HeartDisease, dtype: int64

In [36]:
# Feature/target separation
y = data['HeartDisease']
X = data.drop(['HeartDisease'], axis = 1)

In [37]:
#Encode categorical variables
discrete_features = X.select_dtypes(['object']).columns.tolist()
ordinal_encoder = OrdinalEncoder()
X_discrete = ordinal_encoder.fit_transform(X[discrete_features])
X_discrete = pd.DataFrame(X_discrete, columns = discrete_features)
X[discrete_features] = X_discrete

In [38]:
# Calculate MI for features and target
mi_score = mutual_info_classif(X = X, y = y, discrete_features = False)
mi_score_df = pd.DataFrame(mi_score, index = X.columns, columns = ['MI']).sort_values('MI', ascending = False)
mi_score_df

Unnamed: 0,MI
ST_Slope,0.205065
ChestPainType,0.154064
Oldpeak,0.143053
MaxHR,0.110842
ExerciseAngina,0.105182
Cholesterol,0.076944
FastingBS,0.049631
Age,0.046302
RestingECG,0.022812
RestingBP,0.018562


In [40]:
# Feature/target separation
y = data['HeartDisease']
X = data.drop(['HeartDisease'], axis = 1)

In [None]:
# Use pipeline in sklearn to setup 
ordinal_encoder = OrdinalEncoder()
scaler = MinMaxScaler()

preprocessor = ColumnTransformer(
    transformer = []
)

In [43]:
discrete_features = X.select_dtypes(['object']).columns.tolist()
numerical_features = X.select_dtypes(exclude=['object']).columns.tolist()

In [46]:
numerical_transformer = Pipeline(steps = [('scaler', MinMaxScaler())])
categorical_transfomer = Pipeline(steps = [('ode', OrdinalEncoder())])

In [47]:
col_tranformer = ColumnTransformer(transformers = [
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transfomer, discrete_features)
],
remainder = 'drop')

In [48]:
preprocessor = Pipeline(steps = [('col_tf', col_tranformer)])

In [57]:
pipeline = Pipeline(steps = [
    ('preprocessor', preprocessor),
    ('xgb', XGBClassifier(random_state = 0, eval_metric = 'auc'))
])

In [58]:
param_grid = {
    'xgb__n_estimators':[100,200,300],
    'xgb__learning_rate':[0.01,0.05,0.1]
}

In [59]:
grid_search = GridSearchCV(pipeline, param_grid, cv=5)
grid_search.fit(X, y)



GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('preprocessor',
                                        Pipeline(steps=[('col_tf',
                                                         ColumnTransformer(transformers=[('num',
                                                                                          Pipeline(steps=[('scaler',
                                                                                                           MinMaxScaler())]),
                                                                                          ['Age',
                                                                                           'RestingBP',
                                                                                           'Cholesterol',
                                                                                           'FastingBS',
                                                                                           'MaxHR',
      

In [62]:
model= grid_search.best_estimator_

In [67]:
y_pred = model.predict_proba(X)

In [71]:
roc_auc_score(y, y_pred[:,1])

0.996951219512195

In [None]:
precision_recall_curve()

In [73]:
precision, recall, thr = precision_recall_curve(y, y_pred[:,1])

In [77]:
ordinal_encoder = OrdinalEncoder(handle_unknown = 'use_encoded_value', unknown_value='Unknown')

In [78]:
data.head(2)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1


In [87]:
# Manual CV split by stratefication
data_0 = data[data.HeartDisease == 0]
data_1 = data[data.HeartDisease == 1]

In [89]:
data_0 = data_0.sample(frac = 1)
data_1 = data_1.sample(frac = 1)

In [97]:
data_0_split = np.array_split(data_0, 5)
data_1_split = np.array_split(data_1, 5)

In [98]:
data_split = []
for data_0_part, data_1_part in zip(data_0_split, data_1_split):
    data_split.append(pd.concat([data_0_part, data_1_part], axis = 0))

In [104]:
data_split[0].HeartDisease.value_counts()

1    102
0     82
Name: HeartDisease, dtype: int64

In [107]:
K_fold = len(data_split)
for k in range(K_fold):
    test = data_split[k]
    train = pd.concat([d for i, d in enumerate(data_split) if i != k])
    print(test.shape)
    print(train.shape)

(184, 12)
(734, 12)
(184, 12)
(734, 12)
(184, 12)
(734, 12)
(183, 12)
(735, 12)
(183, 12)
(735, 12)


In [116]:
data.groupby('HeartDisease')['Age'].transform(max)

483    77
634    76
277    77
89     76
452    77
       ..
578    77
316    77
152    76
341    77
107    76
Name: Age, Length: 918, dtype: int64

In [117]:
data_sample = data.sample(frac = 0.8)

In [118]:
data.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
483,58,M,ASY,120,0,0,LVH,106,Y,1.5,Down,1
634,40,M,TA,140,199,0,Normal,178,Y,1.4,Up,0
277,52,M,ASY,170,223,0,Normal,126,Y,1.5,Flat,1
89,55,M,ASY,140,229,0,Normal,110,Y,0.5,Flat,0
452,60,M,ASY,140,281,0,ST,118,Y,1.5,Flat,1


In [121]:
data.index

Int64Index([483, 634, 277,  89, 452, 772, 491, 456, 312, 167,
            ...
            613,  65, 373, 157, 409, 578, 316, 152, 341, 107],
           dtype='int64', length=918)

In [122]:
data_rest = data.drop(data_sample.index)

In [124]:
data_rest.shape

(184, 12)