## 1. Load Libraries

In [11]:
# Data manipulation libraries
import pandas as pd
import numpy as np

##### Scikit Learn modules needed for Logistic Regression
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree 
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelEncoder,MinMaxScaler , StandardScaler

## Below packages are needed for Hyper Parameter Tuning of an Algorithm in Scikit Learn
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Plotting libraries
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(color_codes = True)
%matplotlib inline

### 2. Load Data

In [2]:
df = pd.read_csv("heart_failure_clinical_records_dataset.csv")

In [3]:
print(f"Size of data: {df.shape}")

Size of data: (299, 13)


In [5]:
# Blanks cells
df.isna().sum()

age                         0
anaemia                     0
creatinine_phosphokinase    0
diabetes                    0
ejection_fraction           0
high_blood_pressure         0
platelets                   0
serum_creatinine            0
serum_sodium                0
sex                         0
smoking                     0
time                        0
DEATH_EVENT                 0
dtype: int64

### 3. Split Data

In [7]:
df.columns

Index(['age', 'anaemia', 'creatinine_phosphokinase', 'diabetes',
       'ejection_fraction', 'high_blood_pressure', 'platelets',
       'serum_creatinine', 'serum_sodium', 'sex', 'smoking', 'time',
       'DEATH_EVENT'],
      dtype='object')

In [9]:
X = df.loc[:,['high_blood_pressure','platelets','sex','smoking','diabetes']]
print(f"Shape of Inputdata : {X.shape}")
Y = df.loc[:,"DEATH_EVENT"]
print(f"Shape of Target : {Y.shape}")

Shape of Inputdata : (299, 5)
Shape of Target : (299,)


In [10]:
# Train & Test split
x_train, x_test, y_train, y_test = train_test_split(X,Y,test_size=0.10,
                                                    random_state=21)

print('Shape of Training Xs:{}'.format(x_train.shape))
print('Shape of Test Xs:{}'.format(x_test.shape))
print('Shape of Training y:{}'.format(y_train.shape))
print('Shape of Test y:{}'.format(y_test.shape))

Shape of Training Xs:(269, 5)
Shape of Test Xs:(30, 5)
Shape of Training y:(269,)
Shape of Test y:(30,)


### 4. Build Pipeline

In [12]:
# We create the preprocessing pipelines for both numeric and categorical data.
numeric_features = X.columns
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', MinMaxScaler())])

# categorical_features = 
# categorical_transformer = Pipeline(steps=[
#     ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
#     ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        #('cat', categorical_transformer, categorical_features)
    ])

# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', RandomForestClassifier(random_state= 42))])

### 5. Test model

In [14]:
clf.fit(x_train,y_train)
print(f"Accuracy of model on test data: {clf.score(x_test, y_test)}")

Accuracy of model on test data: 0.5


In [15]:
param_grid = {
    'preprocessor__num__imputer__strategy': ['mean', 'median'],
    'preprocessor__num__scaler': [StandardScaler(), MinMaxScaler()],
    'classifier__criterion': ["gini","entropy"],
    'classifier__max_features': ["auto","sqrt","log2"],
    'classifier__max_depth':[10,50,100],
    'classifier__n_estimators':[10,50,150,200]
}

grid_search = GridSearchCV(clf, param_grid, cv=2, iid=False,verbose = 1,n_jobs= -1)
grid_search.fit(x_train, y_train)

print(("best Model from grid search: %.3f"
       % grid_search.score(x_test, y_test)))
# Print your best combination of hyper parameters
print("Optimum setting of hyperparameters:................")
grid_search.best_params_

Fitting 2 folds for each of 288 candidates, totalling 576 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   11.6s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   46.1s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  1.4min


best Model from grid search: 0.500
Optimum setting of hyperparameters:................


[Parallel(n_jobs=-1)]: Done 576 out of 576 | elapsed:  1.9min finished


{'classifier__criterion': 'gini',
 'classifier__max_depth': 10,
 'classifier__max_features': 'auto',
 'classifier__n_estimators': 50,
 'preprocessor__num__imputer__strategy': 'mean',
 'preprocessor__num__scaler': StandardScaler()}