# Scikit Learn Worklow Overview

1. Prepare data
2. Choose correct algorithm
3. Use algorithm to create model to fit training data
4. Evaluate model
5. Improve model
6. Save and load trained model

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

### 1. Prepare Data

In [2]:
heart_disease = pd.read_csv("heart_disease.csv")
heart_disease.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [6]:
# X = features matrix
X = heart_disease.drop("target", axis=1)

# Y = labels matrix
y = heart_disease["target"]

### 2. Choose Algorithm

In [7]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=100)
clf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

### 3. Fit Model to Data

In [9]:
# Use 80% of data for training, 20% for testing
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [11]:
clf.fit(X_train, y_train);

In [15]:
y_preds = clf.predict(X_test)
y_preds

array([1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0,
       0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0,
       1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0])

### 4. Evaluate Model

In [16]:
clf.score(X_train, y_train)

1.0

In [17]:
clf.score(X_test, y_test)

0.8688524590163934

In [20]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(classification_report(y_test, y_preds))

              precision    recall  f1-score   support

           0       0.88      0.81      0.85        27
           1       0.86      0.91      0.89        34

    accuracy                           0.87        61
   macro avg       0.87      0.86      0.87        61
weighted avg       0.87      0.87      0.87        61



In [21]:
confusion_matrix(y_test, y_preds)

array([[22,  5],
       [ 3, 31]])

In [22]:
accuracy_score(y_test, y_preds)

0.8688524590163934

### 5. Improve Model

In [23]:
# Change number of n_estimators
np.random.seed(50)

for i in range(10,100,10):
    print(f"Trying model with {i} estimators:")
    clf = RandomForestClassifier(n_estimators=i).fit(X_train, y_train)
    print(f"Model accuracy = {clf.score(X_test, y_test) * 100:.2f}")
    print("\n")

Trying model with 10 estimators:
Model accuracy = 85.25


Trying model with 20 estimators:
Model accuracy = 90.16


Trying model with 30 estimators:
Model accuracy = 85.25


Trying model with 40 estimators:
Model accuracy = 88.52


Trying model with 50 estimators:
Model accuracy = 88.52


Trying model with 60 estimators:
Model accuracy = 85.25


Trying model with 70 estimators:
Model accuracy = 85.25


Trying model with 80 estimators:
Model accuracy = 85.25


Trying model with 90 estimators:
Model accuracy = 88.52




### Save and Load Model

In [28]:
import pickle

# Save model
pickle.dump(clf, open("random_forest_model_1.pk1", "wb"))

# Load model
loaded_model = pickle.load(open("random_forest_model_1.pk1", "rb"))

In [29]:
loaded_model.score(X_test, y_test)

0.8852459016393442