## Checkpoints Analysis on Scikit Learn using Heart Disease Dataset

In [2]:
import pickle
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sbn
from matplotlib import cm
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [3]:
# Read the heart-disease table from the local data folder and preview its first five rows
heart_disease = pd.read_csv("data/heart-disease.csv")
heart_disease.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0


In [4]:
# Random Forest estimator used for the baseline model
from sklearn.ensemble import RandomForestClassifier

# Seed NumPy's global RNG: neither the split nor the forest is given a random_state,
# so both draw from this global state
np.random.seed(23)

# Label is the "target" column; features are every remaining column
y = heart_disease["target"]
X = heart_disease.drop(columns="target")

# Hold out 20% of the rows for evaluation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Baseline forest: 110 bootstrapped trees, each sampling 80% of the training rows,
# trained in a single fit call
clf = RandomForestClassifier(
    n_estimators=110,
    bootstrap=True,
    max_samples=0.80,
).fit(X_train, y_train)

# Mean accuracy on the held-out rows
clf.score(X_test, y_test)

0.9902439024390244

Let us try the approach from Data Science Stack Exchange, where we set `warm_start=True` so that each new call to `fit` keeps the trees that were already trained and only adds new ones.

In [5]:
# Checkpointing a random forest with warm_start, following:
# https://datascience.stackexchange.com/questions/49012/checkpoints-in-sklearn
# https://stackoverflow.com/questions/42757892/how-to-use-warm-start

# Setup random seed (the split and the forest below do not set random_state themselves)
np.random.seed(23)

# Separate the data
X = heart_disease.drop("target", axis=1)
y = heart_disease["target"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Instantiate Random Forest Classifier.
# warm_start=True makes every later fit() reuse the trees already grown and only
# train the additional ones requested through n_estimators.
clf = RandomForestClassifier(n_estimators=110, bootstrap=True, max_samples=0.80, warm_start=True)
number_of_checkpoints = 10

# Create the output folder up front so the first open(..., "wb") cannot fail with
# FileNotFoundError on a fresh checkout; a no-op if the folder already exists.
checkpoint_dir = Path("saved_models/random-forest")
checkpoint_dir.mkdir(parents=True, exist_ok=True)

for checkpoint in range(number_of_checkpoints):
    # NOTE: the increment itself grows with the loop index (0, 10, 20, ..., 90), so the
    # forest size goes 110, 120, 140, 170, ... and reaches 560 trees at checkpoint 9.
    clf.n_estimators += checkpoint * 10
    clf.fit(X_train, y_train)

    # Save model checkpoint for each fit
    with open(checkpoint_dir / "random_forest_ckp_{}.pkl".format(checkpoint), "wb") as f:
        pickle.dump(clf, f)


In [6]:
# Load an early checkpoint (index 1) back from disk and score it on the held-out
# split, to double check that the pickled model round-trips.
# NOTE: pickle.load can execute arbitrary code - only load files this notebook wrote.
# The with-block closes the file handle as soon as the model has been read.
with open("saved_models/random-forest/random_forest_ckp_1.pkl", "rb") as f:
    loaded_model = pickle.load(f)
loaded_model.score(X_test, y_test)

0.9902439024390244

In [7]:
# Load the checkpoint with index 9 and score it on the same held-out split for
# comparison with the earlier checkpoint.
# NOTE: pickle.load can execute arbitrary code - only load files this notebook wrote.
# The with-block closes the file handle as soon as the model has been read.
with open("saved_models/random-forest/random_forest_ckp_9.pkl", "rb") as f:
    loaded_model = pickle.load(f)
loaded_model.score(X_test, y_test)

1.0

By adding more trees to the already fitted random forest estimator, we can see a change in the accuracy of our model.