In [193]:
from argparse import Namespace

In [198]:
np = Namespace(**{'input_data_path': '../data/raw/heart.csv', 
        'serialize_model': True, })


In [199]:
np.input_data_path

'../data/raw/heart.csv'

In [127]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 

from sklearn.ensemble import RandomForestClassifier 
from sklearn.metrics import roc_curve, auc 
from sklearn.metrics import classification_report 
from sklearn.metrics import confusion_matrix 
from sklearn.model_selection import train_test_split 

from pandas.api.types import CategoricalDtype

# import eli5 
# from eli5.sklearn import PermutationImportance
# import shap 
# from pdpbox import pdp, info_plots 
np.random.seed(123) 

pd.options.mode.chained_assignment = None     # hide any pandas warnings
import warnings
warnings.filterwarnings("ignore")

In [128]:
pd.options.display.max_columns = 250

In [129]:
DATA_FOLDER = '../data/'     # slash at the end!

It's a clean, easy to understand set of data. However, the meaning of some of the column headers are not obvious. Here's what they mean,

- age: The person's age in years
- sex: The person's sex (1 = male, 0 = female)
- cp: The chest pain experienced (Value 1: typical angina, Value 2: atypical angina, Value 3: non-anginal pain, Value 4: asymptomatic)
- trestbps: The person's resting blood pressure (mm Hg on admission to the hospital)
- chol: The person's cholesterol measurement in mg/dl
- fbs: The person's fasting blood sugar (> 120 mg/dl, 1 = true; 0 = false)
- restecg: Resting electrocardiographic measurement (0 = normal, 1 = having ST-T wave abnormality, 2 = showing probable or definite left ventricular hypertrophy by Estes' criteria)
- thalach: The person's maximum heart rate achieved
- exang: Exercise induced angina (1 = yes; 0 = no)
- oldpeak: ST depression induced by exercise relative to rest ('ST' relates to positions on the ECG plot. See more here)
- slope: the slope of the peak exercise ST segment (Value 1: upsloping, Value 2: flat, Value 3: downsloping)
- ca: The number of major vessels (0-3)
- thal: A blood disorder called thalassemia (3 = normal; 6 = fixed defect; 7 = reversable defect)
- target: Heart disease (0 = no, 1 = yes)


In [168]:
df = pd.read_csv(DATA_FOLDER + 'heart.csv')

In [131]:
set(df.columns)

{'age',
 'ca',
 'chol',
 'cp',
 'exang',
 'fbs',
 'oldpeak',
 'restecg',
 'sex',
 'slope',
 'target',
 'thal',
 'thalach',
 'trestbps'}

In [132]:
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [133]:
df = df.rename(columns={
    'age': 'age', 
    'sex': 'sex', 
    'cp': 'chest_pain_type', 
    'trestbps': 'resting_blood_pressure', 
    'chol': 'cholesterol', 
    'fbs': 'fasting_blood_sugar', 
    'restecg': 'rest_ecg', 
    'thalach': 'max_heart_rate_achieved', 
    'exang': 'exercise_induced_angina', 
    'oldpeak': 'st_depression', 
    'slope': 'st_slope', 
    'ca': 'num_major_vessels', 
    'thal': 'thalassemia', 
    'target': 'target', 
})

In [134]:
df['sex'][df['sex'] == 0] = 'female'
df['sex'][df['sex'] == 1] = 'male'

df['chest_pain_type'][df['chest_pain_type'] == 0] = 'no'
df['chest_pain_type'][df['chest_pain_type'] == 1] = 'typical_angina'
df['chest_pain_type'][df['chest_pain_type'] == 2] = 'atypical_angina'
df['chest_pain_type'][df['chest_pain_type'] == 3] = 'non_anginal_pain'
df['chest_pain_type'][df['chest_pain_type'] == 4] = 'asymptomatic'

df['fasting_blood_sugar'][df['fasting_blood_sugar'] == 0] = 'lower_120mg_ml'
df['fasting_blood_sugar'][df['fasting_blood_sugar'] == 1] = 'greater_120mg_ml'

df['rest_ecg'][df['rest_ecg'] == 0] = 'normal'
df['rest_ecg'][df['rest_ecg'] == 1] = 'wave_abnormality'
df['rest_ecg'][df['rest_ecg'] == 2] = 'hypertrophy'

df['exercise_induced_angina'][df['exercise_induced_angina'] == 0] = 'no'
df['exercise_induced_angina'][df['exercise_induced_angina'] == 1] = 'yes'

df['st_slope'][df['st_slope'] == 0] = 'no'
df['st_slope'][df['st_slope'] == 1] = 'upsloping'
df['st_slope'][df['st_slope'] == 2] = 'flat'
df['st_slope'][df['st_slope'] == 3] = 'downsloping'

df['thalassemia'][df['thalassemia'] == 0] = 'no'
df['thalassemia'][df['thalassemia'] == 1] = 'normal'
df['thalassemia'][df['thalassemia'] == 2] = 'fixed_defect'
df['thalassemia'][df['thalassemia'] == 3] = 'reversable_defect'

In [135]:
# df['sex'] = df['sex'].astype('object')
# df['chest_pain_type'] = df['chest_pain_type'].astype('object')
# df['fasting_blood_sugar'] = df['fasting_blood_sugar'].astype('object')
# df['rest_ecg'] = df['rest_ecg'].astype('object')
# df['exercise_induced_angina'] = df['exercise_induced_angina'].astype('object')
# df['st_slope'] = df['st_slope'].astype('object')
# df['thalassemia'] = df['thalassemia'].astype('object')

In [136]:
cat_cols = [
    'sex',
    'chest_pain_type',
    'fasting_blood_sugar',
    'rest_ecg',
    'exercise_induced_angina',
    'st_slope',
    'thalassemia',
]

for c in cat_cols:
    print(c)
    print(df[c].astype('category').cat.categories.tolist())
    print()

sex
['female', 'male']

chest_pain_type
['atypical_angina', 'no', 'non_anginal_pain', 'typical_angina']

fasting_blood_sugar
['greater_120mg_ml', 'lower_120mg_ml']

rest_ecg
['hypertrophy', 'normal', 'wave_abnormality']

exercise_induced_angina
['no', 'yes']

st_slope
['flat', 'no', 'upsloping']

thalassemia
['fixed_defect', 'no', 'normal', 'reversable_defect']



In [141]:
df['sex'] = df['sex'].fillna('male').astype(
    CategoricalDtype(['female', 'male'])
)
df['chest_pain_type'] = df['chest_pain_type'].fillna('no').astype(
    CategoricalDtype(['atypical_angina', 'no', 'non_anginal_pain', 'typical_angina'])
)
df['fasting_blood_sugar'] = df['fasting_blood_sugar'].fillna('low').astype(
    CategoricalDtype(['high', 'low'])
)
df['rest_ecg'] = df['rest_ecg'].fillna('normal').astype(
    CategoricalDtype(['hypertrophy', 'normal', 'wave_abnormality'])
)
df['exercise_induced_angina'] = df['exercise_induced_angina'].fillna('no').astype(
    CategoricalDtype(['no', 'yes'])
)
df['st_slope'] = df['st_slope'].fillna('no').astype(
    CategoricalDtype(['flat', 'no', 'upsloping', 'downsloping'])
)
df['thalassemia'] = df['thalassemia'].fillna('no').astype(
    CategoricalDtype(['fixed_defect', 'no', 'normal', 'reversable_defect'])
)


In [142]:
df

Unnamed: 0,age,sex,chest_pain_type,resting_blood_pressure,cholesterol,fasting_blood_sugar,rest_ecg,max_heart_rate_achieved,exercise_induced_angina,st_depression,st_slope,num_major_vessels,thalassemia,target
0,63,male,non_anginal_pain,145,233,low,normal,150,no,2.3,no,0,normal,1
1,37,male,atypical_angina,130,250,low,wave_abnormality,187,no,3.5,no,0,fixed_defect,1
2,41,female,typical_angina,130,204,low,normal,172,no,1.4,flat,0,fixed_defect,1
3,56,male,typical_angina,120,236,low,wave_abnormality,178,no,0.8,flat,0,fixed_defect,1
4,57,female,no,120,354,low,wave_abnormality,163,yes,0.6,flat,0,fixed_defect,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,female,no,140,241,low,wave_abnormality,123,yes,0.2,upsloping,0,reversable_defect,0
299,45,male,non_anginal_pain,110,264,low,wave_abnormality,132,no,1.2,upsloping,0,reversable_defect,0
300,68,male,no,144,193,low,wave_abnormality,141,no,3.4,upsloping,2,reversable_defect,0
301,57,male,no,130,131,low,wave_abnormality,115,yes,1.2,upsloping,1,reversable_defect,0


In [143]:
dff = pd.get_dummies(df, drop_first=False)

In [144]:
dff.head(10)

Unnamed: 0,age,resting_blood_pressure,cholesterol,max_heart_rate_achieved,st_depression,num_major_vessels,target,sex_female,sex_male,chest_pain_type_atypical_angina,chest_pain_type_no,chest_pain_type_non_anginal_pain,chest_pain_type_typical_angina,fasting_blood_sugar_high,fasting_blood_sugar_low,rest_ecg_hypertrophy,rest_ecg_normal,rest_ecg_wave_abnormality,exercise_induced_angina_no,exercise_induced_angina_yes,st_slope_flat,st_slope_no,st_slope_upsloping,st_slope_downsloping,thalassemia_fixed_defect,thalassemia_no,thalassemia_normal,thalassemia_reversable_defect
0,63,145,233,150,2.3,0,1,0,1,0,0,1,0,0,1,0,1,0,1,0,0,1,0,0,0,0,1,0
1,37,130,250,187,3.5,0,1,0,1,1,0,0,0,0,1,0,0,1,1,0,0,1,0,0,1,0,0,0
2,41,130,204,172,1.4,0,1,1,0,0,0,0,1,0,1,0,1,0,1,0,1,0,0,0,1,0,0,0
3,56,120,236,178,0.8,0,1,0,1,0,0,0,1,0,1,0,0,1,1,0,1,0,0,0,1,0,0,0
4,57,120,354,163,0.6,0,1,1,0,0,1,0,0,0,1,0,0,1,0,1,1,0,0,0,1,0,0,0
5,57,140,192,148,0.4,0,1,0,1,0,1,0,0,0,1,0,0,1,1,0,0,0,1,0,0,0,1,0
6,56,140,294,153,1.3,0,1,1,0,0,0,0,1,0,1,0,1,0,1,0,0,0,1,0,1,0,0,0
7,44,120,263,173,0.0,0,1,0,1,0,0,0,1,0,1,0,0,1,1,0,1,0,0,0,0,0,0,1
8,52,172,199,162,0.5,0,1,0,1,1,0,0,0,0,1,0,0,1,1,0,1,0,0,0,0,0,0,1
9,57,150,168,174,1.6,0,1,0,1,1,0,0,0,0,1,0,0,1,1,0,1,0,0,0,1,0,0,0


In [186]:
# transformer = PreprocessRawData()
# transformer.fit_transform(df)

In [145]:
X_train, X_test, y_train, y_test = train_test_split(
    df.drop('target', 1), df['target'], test_size=0.2, random_state=10
)

In [49]:
model = RandomForestClassifier(max_depth=5)
model.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=5, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [42]:
y_predict = model.predict(X_test)
y_pred_quant = model.predict_proba(X_test)[:, 1]
y_pred_bin = model.predict(X_test)

In [200]:
    params = {
        'input_data_path': 'data/raw/heart.csv', 
        'serialize_model': True, 
        'output_model_path': 'models/model.pkl', 
        'splitting_strategy': 'holdout', 
        'splitting_params': {
            'val_size': 0.1, 
            'random_state': 3
        }, 
        'predict_raw_data_path': 'data/raw/example_for_predict.csv', 
        'predict_out_data_path': 'data/output/example_predicts.csv', 
        'model': {
            'name': 'rf', 
            'train_params': {
                'train_features': 'default', 
                'model_params': {
                    'n_estimators': 120, 
                    'random_state': 10, 
                    'max_depth': None
                }
            }
        }
    }
    cfg = yaml.dump(params)

In [201]:
cfg

'input_data_path: data/raw/heart.csv\nmodel:\n  name: rf\n  train_params:\n    model_params:\n      max_depth: null\n      n_estimators: 120\n      random_state: 10\n    train_features: default\noutput_model_path: models/model.pkl\npredict_out_data_path: data/output/example_predicts.csv\npredict_raw_data_path: data/raw/example_for_predict.csv\nserialize_model: true\nsplitting_params:\n  random_state: 3\n  val_size: 0.1\nsplitting_strategy: holdout\n'

In [204]:
cfg = cfg = """input_data_path: data/raw/heart.csv
             model:
               name: rf
               train_params:
                 model_params:
                   max_depth: null
                   n_estimators: 120
                   random_state: 10
                 train_features: default
             output_model_path: models/model.pkl
             predict_out_data_path: data/output/example_predicts.csv
             predict_raw_data_path: data/raw/example_for_predict.csv
             serialize_model: true
             splitting_params:
               random_state: 3
               val_size: 0.1
             splitting_strategy: holdout"""

In [205]:
print(cfg)

input_data_path: data/raw/heart.csv
             model:
               name: rf
               train_params:
                 model_params:
                   max_depth: null
                   n_estimators: 120
                   random_state: 10
                 train_features: default
             output_model_path: models/model.pkl
             predict_out_data_path: data/output/example_predicts.csv
             predict_raw_data_path: data/raw/example_for_predict.csv
             serialize_model: true
             splitting_params:
               random_state: 3
               val_size: 0.1
             splitting_strategy: holdout


In [210]:
msg = 'ROC AUC train: 1.00000 val: 0.95614'


In [211]:
import re


In [None]:
re.findall('ROC AUC train')