In [2]:
from google.colab import drive
drive.mount('/content/gdrive/', force_remount=True)

Mounted at /content/gdrive/


In [3]:
path = 'gdrive/MyDrive/MADE_ML_PROD_2021_Hw_01_Prod_Ready_Example'

#downloaded from https://www.kaggle.com/ronitf/heart-disease-uci
!unzip $path/archive.zip

Archive:  gdrive/MyDrive/MADE_ML_PROD_2021_Hw_01_Prod_Ready_Example/archive.zip
  inflating: heart.csv               


In [4]:
import pandas as pd

In [6]:
df = pd.read_csv('heart.csv')
df.shape

(303, 14)

In [21]:
df.columns = ['age', 'sex', 'chest_pain_type', 'resting_blood_pressure',
              'cholesterol', 'fasting_blood_sugar', 'rest_ecg',
              'max_heart_rate_achieved', 'exercise_induced_angina',
              'st_depression', 'st_slope', 'num_major_vessels',
              'thalassemia', 'target']

# better naming for categorial features

df.loc[df['sex'] == 0, 'sex'] = 'female'
df.loc[df['sex'] == 1, 'sex'] = 'male'

df.loc[df['chest_pain_type'] == 0, 'chest_pain_type'] = 'unknown angina'
df.loc[df['chest_pain_type'] == 1, 'chest_pain_type'] = 'typical angina'
df.loc[df['chest_pain_type'] == 2, 'chest_pain_type'] = 'atypical angina'
df.loc[df['chest_pain_type'] == 3, 'chest_pain_type'] = 'non-anginal pain'
df.loc[df['chest_pain_type'] == 4, 'chest_pain_type'] = 'asymptomatic'

df.loc[df['fasting_blood_sugar'] == 0, 'fasting_blood_sugar'] = 'lower than 120mg/ml'
df.loc[df['fasting_blood_sugar'] == 1, 'fasting_blood_sugar'] = 'greater than 120mg/ml'

df.loc[df['rest_ecg'] == 0, 'rest_ecg'] = 'normal'
df.loc[df['rest_ecg'] == 1, 'rest_ecg'] = 'ST-T wave abnormality'
df.loc[df['rest_ecg'] == 2, 'rest_ecg'] = 'left ventricular hypertrophy'

df.loc[df['exercise_induced_angina'] == 0, 'exercise_induced_angina'] = 'no'
df.loc[df['exercise_induced_angina'] == 1, 'exercise_induced_angina'] = 'yes'

df.loc[df['st_slope'] == 0, 'st_slope'] = 'unknown'
df.loc[df['st_slope'] == 1, 'st_slope'] = 'upsloping'
df.loc[df['st_slope'] == 2, 'st_slope'] = 'flat'
df.loc[df['st_slope'] == 3, 'st_slope'] = 'downsloping'

df.loc[df['thalassemia'] == 0, 'thalassemia'] = 'unknown'
df.loc[df['thalassemia'] == 1, 'thalassemia'] = 'normal'
df.loc[df['thalassemia'] == 2, 'thalassemia'] = 'fixed defect'
df.loc[df['thalassemia'] == 3, 'thalassemia'] = 'reversable defect'

categorial_features = ['sex', 'chest_pain_type', 'fasting_blood_sugar',
                       'rest_ecg', 'exercise_induced_angina',
                       'st_slope', 'thalassemia']

numerical_features = set(df.columns) - set(categorial_features) - set(['target'])
numerical_features = list(numerical_features)

In [14]:
df.head()

Unnamed: 0,age,sex,chest_pain_type,resting_blood_pressure,cholesterol,fasting_blood_sugar,rest_ecg,max_heart_rate_achieved,exercise_induced_angina,st_depression,st_slope,num_major_vessels,thalassemia,target
0,63,male,non-anginal pain,145,233,greater than 120mg/ml,normal,150,no,2.3,unknown,0,normal,1
1,37,male,atypical angina,130,250,lower than 120mg/ml,ST-T wave abnormality,187,no,3.5,unknown,0,fixed defect,1
2,41,female,typical angina,130,204,lower than 120mg/ml,normal,172,no,1.4,flat,0,fixed defect,1
3,56,male,typical angina,120,236,lower than 120mg/ml,ST-T wave abnormality,178,no,0.8,flat,0,fixed defect,1
4,57,female,unknown angina,120,354,lower than 120mg/ml,ST-T wave abnormality,163,yes,0.6,flat,0,fixed defect,1


In [43]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

feature_generation = ColumnTransformer(
    transformers=[
        ('OneHot', OneHotEncoder(), categorial_features),
        ('Norma', MinMaxScaler(), numerical_features)
    ],
    remainder='drop',
    sparse_threshold=1
)

pipe = Pipeline(
    verbose=True,
    steps=[
        ('feature_generation', feature_generation),
        ('clf', RandomForestClassifier())
    ]
)

parameters = {
    'clf__n_estimators': [50, 100, 200],
    'clf__max_depth': [5, 7, 9, 11, 13],
    'clf__random_state': [90520]
}

search = GridSearchCV(pipe, parameters, n_jobs=-1)

In [44]:
search.fit(df.drop('target', 1), df['target'])

[Pipeline]  (step 1 of 2) Processing feature_generation, total=   0.0s
[Pipeline] ............... (step 2 of 2) Processing clf, total=   0.1s


GridSearchCV(cv=None, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('feature_generation',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='drop',
                                                          sparse_threshold=1,
                                                          transformer_weights=None,
                                                          transformers=[('OneHot',
                                                                         OneHotEncoder(categories='auto',
                                                                                       drop=None,
                                                                                       dtype=<class 'numpy.float64'>,
                                                                                       handle_unknown='error',
                             

In [45]:
print("Best parameter (CV score=%0.3f):" % search.best_score_)
print(search.best_params_)

Best parameter (CV score=0.832):
{'clf__max_depth': 13, 'clf__n_estimators': 50, 'clf__random_state': 90520}
