In [69]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

In [41]:
# Classification data set used by the tuning cells that reference `heart_disease`.
heart_disease = pd.read_csv("heart-disease.csv")

from sklearn.datasets import load_boston

# Load the Boston data set ONCE and reuse the returned object; calling
# load_boston() separately for the data, the column names and the target
# re-reads the same data three times for no benefit.
# NOTE(review): scikit-learn's documentation lists load_boston as deprecated
# and removed in later releases — confirm it exists in the installed version.
boston = load_boston()
boston_df = pd.DataFrame(boston["data"], columns=boston["feature_names"])
boston_df["target"] = boston["target"]

In [42]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def evaluate_classification(y_true, y_preds):
    """Print and return accuracy, precision, recall and F1 for a set of predictions.

    All four metrics are computed with the scikit-learn metric functions
    called with their default arguments.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_preds : array-like
        Predicted labels, aligned with ``y_true``.

    Returns
    -------
    dict
        ``{"accuracy": ..., "precision": ..., "recall": ..., "f1": ...}``
        holding the raw (unrounded) metric values. Accuracy is returned as a
        fraction; only the printed line is scaled to a percentage.
    """
    accuracy = accuracy_score(y_true, y_preds)
    precision = precision_score(y_true, y_preds)
    recall = recall_score(y_true, y_preds)
    f1 = f1_score(y_true, y_preds)

    print(f"Accuracy: {accuracy * 100 :.2f} %")
    print(f"Precision: {precision :.2f}")
    print(f"Recall: {recall :.2f}")
    print(f"F1: {f1 :.2f}")

    # The value is the F1 score, so it is stored under "f1"; the previous key
    # "f2" mislabelled it as a different metric (F-beta with beta=2).
    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}


# Improving a Model

First predictions = baseline predictions
First model = baseline model

### Things to consider:
- Could we collect more data?
- Could we improve the data?
- Is there a better model we could use?
- Could we improve the current model?

In [43]:
# Build a classifier with no arguments so its default hyperparameters can be inspected.
model = RandomForestClassifier()

# Last expression of the cell: displays the hyperparameter dictionary.
model.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

### Ways to adjust hyperparameters:

1. By hand
2. Randomly with RandomizedSearchCV
3. Exhaustively with GridSearchCV

## Tuning hyperparameters by hand

To adjust the hyperparameters you need to have three sets:

1. Train
2. Validate
3. Test

We are going to change the following hyperparameters:

- `max_depth`
- `max_features`
- `min_samples_leaf`
- `min_samples_split`
- `n_estimators`

In [44]:
# Manual three-way split of the heart-disease data: train / validation / test.

np.random.seed(22)

# Shuffle all rows (frac=1 keeps every row, only the order changes).
heart_disease_shuffled = heart_disease.sample(frac=1)

# Features are every column except "target"; the label is "target" itself.
Y = heart_disease_shuffled["target"]
X = heart_disease_shuffled.drop("target", axis=1)

# Cut points: the first 70 % of rows train, the next 15 % validate,
# and whatever remains is the test set.
n_rows = len(heart_disease_shuffled.index)
train = round(0.7 * n_rows)
validate = round(train + 0.15 * n_rows)

X_TRAIN, X_VALID, X_TEST = X[:train], X[train:validate], X[validate:]
Y_TRAIN, Y_VALID, Y_TEST = Y[:train], Y[train:validate], Y[validate:]

In [64]:
# Baseline: a forest with default hyperparameters, scored on the validation split.
np.random.seed(22)

# fit() returns the fitted estimator, so construction and training can be chained.
model = RandomForestClassifier().fit(X_TRAIN, Y_TRAIN)
y_pred = model.predict(X_VALID)

baseline_metrics = evaluate_classification(y_true=Y_VALID, y_preds=y_pred)

Accuracy: 77.78 %
Precision: 0.77
Recall: 0.77
F1: 0.77


In [68]:
# Adjust the hyperparameters on a second classifier and compare it with the baseline.
np.random.seed(22)

# Same training data as the baseline; only n_estimators and max_depth differ.
model_2 = RandomForestClassifier(max_depth=30, n_estimators=140).fit(X_TRAIN, Y_TRAIN)
y_pred = model_2.predict(X_VALID)

# Last expression of the cell: prints the metrics and displays the returned dict.
evaluate_classification(y_true=Y_VALID, y_preds=y_pred)

Accuracy: 82.22 %
Precision: 0.82
Recall: 0.82
F1: 0.82


{'accuracy': 0.8222222222222222,
 'precision': 0.8181818181818182,
 'recall': 0.8181818181818182,
 'f2': 0.8181818181818182}

## Tuning hyperparameters using RandomizedSearchCV

In [93]:
from sklearn.model_selection import RandomizedSearchCV

# Candidate values the randomized search samples from.
# NOTE(review): scikit-learn documents 'auto' as equivalent to 'sqrt' for
# classifiers and drops it in later releases — confirm against the installed
# version; if so, the two max_features options are duplicates.
grid = dict(
    n_estimators=[10, 100, 200, 500, 1000, 1200],
    max_depth=[None, 5, 10, 20, 30],
    max_features=["auto", "sqrt"],
    min_samples_split=[2, 4, 6],
    min_samples_leaf=[1, 2, 4],
)

np.random.seed(22)

# Fresh 80/20 split. This rebinds X_TRAIN / X_TEST / Y_TRAIN / Y_TEST, replacing
# the values assigned by the manual three-way split; cross-validation inside
# the search plays the role of the validation set.
X_TRAIN, X_TEST, Y_TRAIN, Y_TEST = train_test_split(X, Y, test_size=0.2)

model = RandomForestClassifier(n_jobs=-1)

# Try 10 sampled combinations, each scored with 5-fold cross-validation.
rs_model = RandomizedSearchCV(
    model,
    grid,
    n_iter=10,
    cv=5,
    verbose=2,
)
rs_model.fit(X_TRAIN, Y_TRAIN)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] n_estimators=1000, min_samples_split=2, min_samples_leaf=1, max_features=sqrt, max_depth=20 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  n_estimators=1000, min_samples_split=2, min_samples_leaf=1, max_features=sqrt, max_depth=20, total=   3.7s
[CV] n_estimators=1000, min_samples_split=2, min_samples_leaf=1, max_features=sqrt, max_depth=20 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    3.6s remaining:    0.0s


[CV]  n_estimators=1000, min_samples_split=2, min_samples_leaf=1, max_features=sqrt, max_depth=20, total=   1.5s
[CV] n_estimators=1000, min_samples_split=2, min_samples_leaf=1, max_features=sqrt, max_depth=20 
[CV]  n_estimators=1000, min_samples_split=2, min_samples_leaf=1, max_features=sqrt, max_depth=20, total=   1.5s
[CV] n_estimators=1000, min_samples_split=2, min_samples_leaf=1, max_features=sqrt, max_depth=20 
[CV]  n_estimators=1000, min_samples_split=2, min_samples_leaf=1, max_features=sqrt, max_depth=20, total=   1.6s
[CV] n_estimators=1000, min_samples_split=2, min_samples_leaf=1, max_features=sqrt, max_depth=20 
[CV]  n_estimators=1000, min_samples_split=2, min_samples_leaf=1, max_features=sqrt, max_depth=20, total=   1.5s
[CV] n_estimators=10, min_samples_split=6, min_samples_leaf=1, max_features=auto, max_depth=20 
[CV]  n_estimators=10, min_samples_split=6, min_samples_leaf=1, max_features=auto, max_depth=20, total=   0.0s
[CV] n_estimators=10, min_samples_split=6, min_

[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:   31.6s finished


RandomizedSearchCV(cv=5, estimator=RandomForestClassifier(n_jobs=-1),
                   param_distributions={'max_depth': [None, 5, 10, 20, 30],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 4, 6],
                                        'n_estimators': [10, 100, 200, 500,
                                                         1000, 1200]},
                   verbose=2)

In [94]:
# Display the hyperparameter combination the randomized search selected.
rs_model.best_params_

{'n_estimators': 100,
 'min_samples_split': 2,
 'min_samples_leaf': 4,
 'max_features': 'auto',
 'max_depth': 10}

In [95]:
# Score the randomized-search model on the held-out test split.
y_preds = rs_model.predict(X_TEST)

rs_model_eval = evaluate_classification(y_true=Y_TEST, y_preds=y_preds)

Accuracy: 70.49 %
Precision: 0.78
Recall: 0.74
F1: 0.76


## Hyperparameter tuning with GridSearchCV

GridSearchCV exhaustively evaluates every combination of the hyperparameter values in the grid.

In [96]:
# Display the search space that was used for the randomized search.
grid

{'n_estimators': [10, 100, 200, 500, 1000, 1200],
 'max_depth': [None, 5, 10, 20, 30],
 'max_features': ['auto', 'sqrt'],
 'min_samples_split': [2, 4, 6],
 'min_samples_leaf': [1, 2, 4]}

In [97]:
# Search space for the exhaustive grid search:
# 3 * 3 * 2 * 3 * 3 = 162 combinations.
grid_2 = dict(
    n_estimators=[100, 200, 300],
    max_depth=[5, 10, 15],
    max_features=["auto", "sqrt"],
    min_samples_split=[2, 3, 4],
    min_samples_leaf=[2, 3, 4],
)

In [99]:
from sklearn.model_selection import GridSearchCV

np.random.seed(22)

# Same seed and split call as the randomized-search cell; rebinds the
# X_TRAIN / X_TEST / Y_TRAIN / Y_TEST names again.
X_TRAIN, X_TEST, Y_TRAIN, Y_TEST = train_test_split(X, Y, test_size=0.2)

model = RandomForestClassifier(n_jobs=-1)

# Every combination in grid_2 is scored with 5-fold cross-validation.
gs_model = GridSearchCV(estimator=model, param_grid=grid_2, cv=5, verbose=2)

gs_model.fit(X_TRAIN, Y_TRAIN)

Fitting 5 folds for each of 162 candidates, totalling 810 fits
[CV] max_depth=5, max_features=auto, min_samples_leaf=2, min_samples_split=2, n_estimators=100 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  max_depth=5, max_features=auto, min_samples_leaf=2, min_samples_split=2, n_estimators=100, total=   2.3s
[CV] max_depth=5, max_features=auto, min_samples_leaf=2, min_samples_split=2, n_estimators=100 
[CV]  max_depth=5, max_features=auto, min_samples_leaf=2, min_samples_split=2, n_estimators=100, total=   0.2s

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.2s remaining:    0.0s



[CV] max_depth=5, max_features=auto, min_samples_leaf=2, min_samples_split=2, n_estimators=100 
[CV]  max_depth=5, max_features=auto, min_samples_leaf=2, min_samples_split=2, n_estimators=100, total=   0.2s
[CV] max_depth=5, max_features=auto, min_samples_leaf=2, min_samples_split=2, n_estimators=100 
[CV]  max_depth=5, max_features=auto, min_samples_leaf=2, min_samples_split=2, n_estimators=100, total=   0.2s
[CV] max_depth=5, max_features=auto, min_samples_leaf=2, min_samples_split=2, n_estimators=100 
[CV]  max_depth=5, max_features=auto, min_samples_leaf=2, min_samples_split=2, n_estimators=100, total=   0.2s
[CV] max_depth=5, max_features=auto, min_samples_leaf=2, min_samples_split=2, n_estimators=200 
[CV]  max_depth=5, max_features=auto, min_samples_leaf=2, min_samples_split=2, n_estimators=200, total=   0.3s
[CV] max_depth=5, max_features=auto, min_samples_leaf=2, min_samples_split=2, n_estimators=200 
[CV]  max_depth=5, max_features=auto, min_samples_leaf=2, min_samples_split

[Parallel(n_jobs=1)]: Done 810 out of 810 | elapsed:  5.1min finished


GridSearchCV(cv=5, estimator=RandomForestClassifier(n_jobs=-1),
             param_grid={'max_depth': [5, 10, 15],
                         'max_features': ['auto', 'sqrt'],
                         'min_samples_leaf': [2, 3, 4],
                         'min_samples_split': [2, 3, 4],
                         'n_estimators': [100, 200, 300]},
             verbose=2)

In [100]:
# Display the hyperparameter combination the grid search selected.
gs_model.best_params_

{'max_depth': 5,
 'max_features': 'sqrt',
 'min_samples_leaf': 4,
 'min_samples_split': 4,
 'n_estimators': 200}

In [102]:
# Score the grid-search model on the held-out test split.
gs_model_preds = gs_model.predict(X_TEST)

gs_model_eval = evaluate_classification(y_true=Y_TEST, y_preds=gs_model_preds)

Accuracy: 72.13 %
Precision: 0.78
Recall: 0.79
F1: 0.78


# Saving a Model

Two ways to save and load a model:

1. `pickle`
2. `joblib`

## Using `pickle`

In [103]:
import pickle

# Write the fitted search object to disk. The context manager guarantees the
# file is flushed and closed even if pickling raises; the previous inline
# open(...) left the handle open.
with open("gs_model_heart_disease.pkl", "wb") as model_file:
    pickle.dump(gs_model, model_file)

In [104]:
# Read the pickled model back. The context manager closes the file handle
# that the previous inline open(...) left open.
# NOTE: unpickling can execute arbitrary code — only load files you trust.
with open("gs_model_heart_disease.pkl", "rb") as model_file:
    loaded_gs_model = pickle.load(model_file)

In [120]:
# Score the reloaded model on the test split to check it survived the round trip.
loaded_gs_model.score(X_TEST, Y_TEST)

0.7213114754098361

In [115]:
# Peek at a single entry of the shuffled target Series (slice 4:5).
Y[4:5]

41    1
Name: target, dtype: int64

## Using `joblib`


In [116]:
from joblib import dump, load

# Persist the fitted search object with joblib; the call's return value
# is displayed as the cell output.
dump(gs_model, "gs_model_heart_disease.joblib")

['gs_model_heart_disease.joblib']

In [118]:
# Read the joblib file back; this rebinds loaded_gs_model (previously the pickle-loaded copy).
loaded_gs_model = load("gs_model_heart_disease.joblib")

In [121]:
# Score the joblib-reloaded model on the test split.
loaded_gs_model.score(X_TEST, Y_TEST)

0.7213114754098361