In [35]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [37]:
data = pd.read_csv('../data/heart-disease.csv')
data

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


In [38]:
X = data.drop('target', axis=1)
X

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3


In [39]:
y = data['target']
y

0      1
1      1
2      1
3      1
4      1
      ..
298    0
299    0
300    0
301    0
302    0
Name: target, Length: 303, dtype: int64

In [40]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test  = scaler.transform(X_test)

In [41]:
from sklearn.ensemble import RandomForestClassifier

classifier = RandomForestClassifier(n_estimators=100)
classifier.fit(X_train, y_train)

y_preds = classifier.predict(X_test)
y_preds

array([1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0,
       0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1], dtype=int64)

In [42]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

print(accuracy_score(y_test, y_preds))
print()
print(confusion_matrix(y_test, y_preds))
print()
print(classification_report(y_test, y_preds))

0.8524590163934426

[[16  6]
 [ 3 36]]

              precision    recall  f1-score   support

           0       0.84      0.73      0.78        22
           1       0.86      0.92      0.89        39

    accuracy                           0.85        61
   macro avg       0.85      0.83      0.83        61
weighted avg       0.85      0.85      0.85        61



In [43]:
from sklearn.model_selection import cross_val_score

cross_val_scores = cross_val_score(RandomForestClassifier(n_estimators=100), X, y, cv=10)
cross_val_scores

array([0.90322581, 0.80645161, 0.83870968, 0.93548387, 0.90322581,
       0.8       , 0.83333333, 0.86666667, 0.75862069, 0.82758621])

In [44]:
np.mean(cross_val_scores)

0.8473303670745272

### Hyperparameters Tuning

In [45]:
classifier.get_params()

{'bootstrap': True,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [46]:
### Rebuild the prediction model

# Shuffle the data
data = data.sample(frac=1)
X = data.drop('target', axis=1)
y = data['target']

In [47]:
# 70% of data for training
# 15% of data for testing
# 15% of data for validation
train_split = round(.7 * len(data))
valid_split = round(train_split + 0.15 * len(data))
X_train, y_train = X[:train_split], y[:train_split]
X_valid, y_valid = X[train_split:valid_split], y[train_split:valid_split]
X_test, y_test = X[valid_split:], y[valid_split:]

In [48]:
len(X_train), len(X_valid), len(X_test)

(212, 45, 46)

### Baseline model (first model by hand)

In [49]:
# Instantiate and train the model
classifier = RandomForestClassifier()
classifier.fit(X_train, y_train)

# Predict the validation data
y_preds = classifier.predict(X_valid)



In [50]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# Create a function to display the performance
def evaluate_model(y_test, y_preds):
    print(accuracy_score(y_test, y_preds))
    print()
    print(confusion_matrix(y_test, y_preds))
    print()
    print(classification_report(y_test, y_preds))

In [51]:
evaluate_model(y_valid, y_preds)

0.8

[[18  2]
 [ 7 18]]

              precision    recall  f1-score   support

           0       0.72      0.90      0.80        20
           1       0.90      0.72      0.80        25

    accuracy                           0.80        45
   macro avg       0.81      0.81      0.80        45
weighted avg       0.82      0.80      0.80        45



### Second model (improving model by hand)

In [52]:
classifier = RandomForestClassifier(n_estimators=100)
classifier.fit(X_train, y_train)
# Predict the validation data
y_preds = classifier.predict(X_valid)
evaluate_model(y_valid, y_preds)

0.8444444444444444

[[18  2]
 [ 5 20]]

              precision    recall  f1-score   support

           0       0.78      0.90      0.84        20
           1       0.91      0.80      0.85        25

    accuracy                           0.84        45
   macro avg       0.85      0.85      0.84        45
weighted avg       0.85      0.84      0.84        45



### Third model (improving by using RandomizedSearchCV)

In [53]:
from sklearn.model_selection import RandomizedSearchCV

# Check the hyperparameters of the model first
classifier.get_params()

{'bootstrap': True,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [54]:
hyperparameters = {'n_estimators': [10, 100, 200, 500, 1000, 12000],
                   'max_depth': [None, 5, 10, 20, 30],
                   'max_features': ['auto', 'sqrt'],
                   'min_samples_split': [2, 4, 6],
                   'min_samples_leaf': [1, 2, 4]
                  }

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)
X_train.shape, X_test.shape

((242, 13), (61, 13))

In [55]:
classifier = RandomForestClassifier(n_jobs=-1)
rs_clf = RandomizedSearchCV(estimator=classifier,
                            param_distributions=hyperparameters,
                            cv=10,
                            verbose=2,
                            n_iter=10)
rs_clf.fit(X_train, y_train)

Fitting 10 folds for each of 10 candidates, totalling 100 fits
[CV] n_estimators=200, min_samples_split=6, min_samples_leaf=4, max_features=auto, max_depth=30 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  n_estimators=200, min_samples_split=6, min_samples_leaf=4, max_features=auto, max_depth=30, total=   2.0s
[CV] n_estimators=200, min_samples_split=6, min_samples_leaf=4, max_features=auto, max_depth=30 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.9s remaining:    0.0s


[CV]  n_estimators=200, min_samples_split=6, min_samples_leaf=4, max_features=auto, max_depth=30, total=   0.2s
[CV] n_estimators=200, min_samples_split=6, min_samples_leaf=4, max_features=auto, max_depth=30 
[CV]  n_estimators=200, min_samples_split=6, min_samples_leaf=4, max_features=auto, max_depth=30, total=   0.2s
[CV] n_estimators=200, min_samples_split=6, min_samples_leaf=4, max_features=auto, max_depth=30 
[CV]  n_estimators=200, min_samples_split=6, min_samples_leaf=4, max_features=auto, max_depth=30, total=   0.2s
[CV] n_estimators=200, min_samples_split=6, min_samples_leaf=4, max_features=auto, max_depth=30 
[CV]  n_estimators=200, min_samples_split=6, min_samples_leaf=4, max_features=auto, max_depth=30, total=   0.3s
[CV] n_estimators=200, min_samples_split=6, min_samples_leaf=4, max_features=auto, max_depth=30 
[CV]  n_estimators=200, min_samples_split=6, min_samples_leaf=4, max_features=auto, max_depth=30, total=   0.2s
[CV] n_estimators=200, min_samples_split=6, min_samp

[CV]  n_estimators=10, min_samples_split=6, min_samples_leaf=1, max_features=auto, max_depth=30, total=   0.1s
[CV] n_estimators=10, min_samples_split=6, min_samples_leaf=1, max_features=auto, max_depth=30 
[CV]  n_estimators=10, min_samples_split=6, min_samples_leaf=1, max_features=auto, max_depth=30, total=   0.1s
[CV] n_estimators=10, min_samples_split=6, min_samples_leaf=1, max_features=auto, max_depth=30 
[CV]  n_estimators=10, min_samples_split=6, min_samples_leaf=1, max_features=auto, max_depth=30, total=   0.1s
[CV] n_estimators=10, min_samples_split=6, min_samples_leaf=1, max_features=auto, max_depth=30 
[CV]  n_estimators=10, min_samples_split=6, min_samples_leaf=1, max_features=auto, max_depth=30, total=   0.1s
[CV] n_estimators=10, min_samples_split=6, min_samples_leaf=1, max_features=auto, max_depth=30 
[CV]  n_estimators=10, min_samples_split=6, min_samples_leaf=1, max_features=auto, max_depth=30, total=   0.1s
[CV] n_estimators=10, min_samples_split=6, min_samples_leaf=1

[CV]  n_estimators=500, min_samples_split=2, min_samples_leaf=2, max_features=auto, max_depth=20, total=   0.3s
[CV] n_estimators=500, min_samples_split=2, min_samples_leaf=2, max_features=auto, max_depth=20 
[CV]  n_estimators=500, min_samples_split=2, min_samples_leaf=2, max_features=auto, max_depth=20, total=   0.3s
[CV] n_estimators=500, min_samples_split=2, min_samples_leaf=2, max_features=auto, max_depth=20 
[CV]  n_estimators=500, min_samples_split=2, min_samples_leaf=2, max_features=auto, max_depth=20, total=   0.4s
[CV] n_estimators=500, min_samples_split=2, min_samples_leaf=2, max_features=auto, max_depth=20 
[CV]  n_estimators=500, min_samples_split=2, min_samples_leaf=2, max_features=auto, max_depth=20, total=   0.3s
[CV] n_estimators=500, min_samples_split=2, min_samples_leaf=2, max_features=auto, max_depth=20 
[CV]  n_estimators=500, min_samples_split=2, min_samples_leaf=2, max_features=auto, max_depth=20, total=   0.3s
[CV] n_estimators=500, min_samples_split=2, min_samp

[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:   37.1s finished


RandomizedSearchCV(cv=10, error_score='raise-deprecating',
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
                                                    n_estimators='warn',
                                                    n_jobs=-1,

In [56]:
rs_clf.best_params_

{'n_estimators': 200,
 'min_samples_split': 6,
 'min_samples_leaf': 1,
 'max_features': 'auto',
 'max_depth': 5}

In [57]:
y_preds = rs_clf.predict(X_test)
y_preds

array([1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1,
       1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1], dtype=int64)

In [58]:
evaluate_model(y_test, y_preds)

0.8032786885245902

[[18  5]
 [ 7 31]]

              precision    recall  f1-score   support

           0       0.72      0.78      0.75        23
           1       0.86      0.82      0.84        38

    accuracy                           0.80        61
   macro avg       0.79      0.80      0.79        61
weighted avg       0.81      0.80      0.80        61



In [62]:
#### Fourth model (improving using GridSearchCV)
# Only run the Grid Search on a limitied number of hyperparameters

from sklearn.model_selection import GridSearchCV

hyperparameters = {'n_estimators': [10, 100, 200],
                   'max_depth': [None, 5, 10],
                   'min_samples_split': [2, 4, 6],
                  }

grid_search = GridSearchCV(RandomForestClassifier(100), hyperparameters, cv=10, verbose=2)
grid_search.fit(X_train, y_train)

Fitting 10 folds for each of 27 candidates, totalling 270 fits
[CV] max_depth=None, min_samples_split=2, n_estimators=10 ............
[CV]  max_depth=None, min_samples_split=2, n_estimators=10, total=   0.0s
[CV] max_depth=None, min_samples_split=2, n_estimators=10 ............
[CV]  max_depth=None, min_samples_split=2, n_estimators=10, total=   0.0s
[CV] max_depth=None, min_samples_split=2, n_estimators=10 ............
[CV]  max_depth=None, min_samples_split=2, n_estimators=10, total=   0.0s
[CV] max_depth=None, min_samples_split=2, n_estimators=10 ............
[CV]  max_depth=None, min_samples_split=2, n_estimators=10, total=   0.0s
[CV] max_depth=None, min_samples_split=2, n_estimators=10 ............
[CV]  max_depth=None, min_samples_split=2, n_estimators=10, total=   0.0s
[CV] max_depth=None, min_samples_split=2, n_estimators=10 ............
[CV]  max_depth=None, min_samples_split=2, n_estimators=10, total=   0.0s
[CV] max_depth=None, min_samples_split=2, n_estimators=10 .........

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s


[CV]  max_depth=None, min_samples_split=2, n_estimators=100, total=   0.1s
[CV] max_depth=None, min_samples_split=2, n_estimators=100 ...........
[CV]  max_depth=None, min_samples_split=2, n_estimators=100, total=   0.1s
[CV] max_depth=None, min_samples_split=2, n_estimators=100 ...........
[CV]  max_depth=None, min_samples_split=2, n_estimators=100, total=   0.1s
[CV] max_depth=None, min_samples_split=2, n_estimators=100 ...........
[CV]  max_depth=None, min_samples_split=2, n_estimators=100, total=   0.1s
[CV] max_depth=None, min_samples_split=2, n_estimators=100 ...........
[CV]  max_depth=None, min_samples_split=2, n_estimators=100, total=   0.1s
[CV] max_depth=None, min_samples_split=2, n_estimators=100 ...........
[CV]  max_depth=None, min_samples_split=2, n_estimators=100, total=   0.1s
[CV] max_depth=None, min_samples_split=2, n_estimators=100 ...........
[CV]  max_depth=None, min_samples_split=2, n_estimators=100, total=   0.1s
[CV] max_depth=None, min_samples_split=2, n_estim

[CV]  max_depth=None, min_samples_split=6, n_estimators=100, total=   0.1s
[CV] max_depth=None, min_samples_split=6, n_estimators=100 ...........
[CV]  max_depth=None, min_samples_split=6, n_estimators=100, total=   0.1s
[CV] max_depth=None, min_samples_split=6, n_estimators=100 ...........
[CV]  max_depth=None, min_samples_split=6, n_estimators=100, total=   0.1s
[CV] max_depth=None, min_samples_split=6, n_estimators=100 ...........
[CV]  max_depth=None, min_samples_split=6, n_estimators=100, total=   0.1s
[CV] max_depth=None, min_samples_split=6, n_estimators=100 ...........
[CV]  max_depth=None, min_samples_split=6, n_estimators=100, total=   0.1s
[CV] max_depth=None, min_samples_split=6, n_estimators=100 ...........
[CV]  max_depth=None, min_samples_split=6, n_estimators=100, total=   0.1s
[CV] max_depth=None, min_samples_split=6, n_estimators=100 ...........
[CV]  max_depth=None, min_samples_split=6, n_estimators=100, total=   0.1s
[CV] max_depth=None, min_samples_split=6, n_estim

[CV]  max_depth=5, min_samples_split=4, n_estimators=100, total=   0.1s
[CV] max_depth=5, min_samples_split=4, n_estimators=100 ..............
[CV]  max_depth=5, min_samples_split=4, n_estimators=100, total=   0.1s
[CV] max_depth=5, min_samples_split=4, n_estimators=100 ..............
[CV]  max_depth=5, min_samples_split=4, n_estimators=100, total=   0.1s
[CV] max_depth=5, min_samples_split=4, n_estimators=100 ..............
[CV]  max_depth=5, min_samples_split=4, n_estimators=100, total=   0.1s
[CV] max_depth=5, min_samples_split=4, n_estimators=100 ..............
[CV]  max_depth=5, min_samples_split=4, n_estimators=100, total=   0.1s
[CV] max_depth=5, min_samples_split=4, n_estimators=100 ..............
[CV]  max_depth=5, min_samples_split=4, n_estimators=100, total=   0.1s
[CV] max_depth=5, min_samples_split=4, n_estimators=100 ..............
[CV]  max_depth=5, min_samples_split=4, n_estimators=100, total=   0.1s
[CV] max_depth=5, min_samples_split=4, n_estimators=100 ..............

[CV]  max_depth=10, min_samples_split=2, n_estimators=100, total=   0.1s
[CV] max_depth=10, min_samples_split=2, n_estimators=100 .............
[CV]  max_depth=10, min_samples_split=2, n_estimators=100, total=   0.1s
[CV] max_depth=10, min_samples_split=2, n_estimators=100 .............
[CV]  max_depth=10, min_samples_split=2, n_estimators=100, total=   0.1s
[CV] max_depth=10, min_samples_split=2, n_estimators=100 .............
[CV]  max_depth=10, min_samples_split=2, n_estimators=100, total=   0.1s
[CV] max_depth=10, min_samples_split=2, n_estimators=100 .............
[CV]  max_depth=10, min_samples_split=2, n_estimators=100, total=   0.1s
[CV] max_depth=10, min_samples_split=2, n_estimators=100 .............
[CV]  max_depth=10, min_samples_split=2, n_estimators=100, total=   0.1s
[CV] max_depth=10, min_samples_split=2, n_estimators=100 .............
[CV]  max_depth=10, min_samples_split=2, n_estimators=100, total=   0.1s
[CV] max_depth=10, min_samples_split=2, n_estimators=100 ......

[CV]  max_depth=10, min_samples_split=6, n_estimators=100, total=   0.1s
[CV] max_depth=10, min_samples_split=6, n_estimators=100 .............
[CV]  max_depth=10, min_samples_split=6, n_estimators=100, total=   0.1s
[CV] max_depth=10, min_samples_split=6, n_estimators=100 .............
[CV]  max_depth=10, min_samples_split=6, n_estimators=100, total=   0.1s
[CV] max_depth=10, min_samples_split=6, n_estimators=100 .............
[CV]  max_depth=10, min_samples_split=6, n_estimators=100, total=   0.1s
[CV] max_depth=10, min_samples_split=6, n_estimators=100 .............
[CV]  max_depth=10, min_samples_split=6, n_estimators=100, total=   0.1s
[CV] max_depth=10, min_samples_split=6, n_estimators=100 .............
[CV]  max_depth=10, min_samples_split=6, n_estimators=100, total=   0.1s
[CV] max_depth=10, min_samples_split=6, n_estimators=100 .............
[CV]  max_depth=10, min_samples_split=6, n_estimators=100, total=   0.1s
[CV] max_depth=10, min_samples_split=6, n_estimators=100 ......

[Parallel(n_jobs=1)]: Done 270 out of 270 | elapsed:   18.3s finished


GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False,
                                              random_state=None, verbose=0,
                                              warm_start=False),
             iid='

In [63]:
grid_search.best_params_

{'max_depth': 5, 'min_samples_split': 4, 'n_estimators': 100}

In [64]:
y_preds = grid_search.predict(X_test)

In [65]:
evaluate_model(y_test, y_preds)

0.8360655737704918

[[16  7]
 [ 3 35]]

              precision    recall  f1-score   support

           0       0.84      0.70      0.76        23
           1       0.83      0.92      0.88        38

    accuracy                           0.84        61
   macro avg       0.84      0.81      0.82        61
weighted avg       0.84      0.84      0.83        61

