# intro to sklearn

we will cover

0. high level workflow
1. get the data read
2. choose the right estimator for our problem
3. fit the model
4. evaluate the mode
5. improve the mode
6. save and load
7. put all together

## 0. and e2e workflow

In [42]:
# standard import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [23]:
# 1. get data ready
df = pd.read_csv('data/heart-disease.csv')
df.head(2)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1


In [24]:
# create X (feature matrix)
X = df.drop(columns=['target'])

# create y (label)
y = df.target

In [25]:
# 2. choose the right model and hyperparameters
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier()

# keep the default parameters
clf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [26]:
# 3. fit the model
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [27]:
clf.fit(X_train, y_train)

In [28]:
# make a prediction
y_preds = clf.predict(X_test)
y_preds

array([0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1,
       0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0,
       0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0])

In [29]:
# 4. evaluate the model
clf.score(X_train, y_train)

1.0

In [30]:
clf.score(X_test, y_test)

0.8289473684210527

In [31]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(classification_report(y_test, y_preds))

              precision    recall  f1-score   support

           0       0.85      0.78      0.82        37
           1       0.81      0.87      0.84        39

    accuracy                           0.83        76
   macro avg       0.83      0.83      0.83        76
weighted avg       0.83      0.83      0.83        76



In [32]:
confusion_matrix(y_test, y_preds)

array([[29,  8],
       [ 5, 34]])

In [33]:
accuracy_score(y_test, y_preds)

0.8289473684210527

In [36]:
# 5. improve a model
# try different n_estimators
np.random.seed(42)

for i in range(10, 100, 10):
    print(f'try model with {i} estimators')
    clf = RandomForestClassifier(n_estimators=i)
    clf.fit(X_train, y_train)
    score = clf.score(X_test, y_test)
    print(f'accuracy is {score * 100:.2f}%')
    print('')

try model with 10 estimators
accuracy is 77.63%

try model with 20 estimators
accuracy is 82.89%

try model with 30 estimators
accuracy is 81.58%

try model with 40 estimators
accuracy is 82.89%

try model with 50 estimators
accuracy is 82.89%

try model with 60 estimators
accuracy is 84.21%

try model with 70 estimators
accuracy is 81.58%

try model with 80 estimators
accuracy is 86.84%

try model with 90 estimators
accuracy is 82.89%



In [37]:
# 6. save the model and load it
import pickle

pickle.dump(clf, open('models/random_forest_1.pkl', 'wb'))

In [39]:
loaded_model = pickle.load(open('models/random_forest_1.pkl', 'rb'))

In [40]:
loaded_model.score(X_test, y_test)

0.8289473684210527

In [41]:
# ignore warnings if you are sure
import warnings
warnings.filterwarnings('ignore')

## putting it all together!!

In [44]:
data = pd.read_csv('data/car-sales-extended-missing-data.csv')
data.head(2)

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0


In [47]:
data.dtypes

Make              object
Colour            object
Odometer (KM)    float64
Doors            float64
Price            float64
dtype: object

In [49]:
# count missing data
data.isna().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

steps we want to take (all in one cell)
1. fill missing data
2. convert obj data into numbers
3. build a model on the data

In [51]:
# get data ready
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Modelling
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV

# setup random seed
np.random.seed(42)

# import data and drop rows with missing label
data = pd.read_csv('data/car-sales-extended-missing-data.csv')
data.dropna(subset=['Price'], inplace=True)

# define different features and transformer pipeline
cat_feats = ['Make', 'Colour']
cat_trans = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')) 
])

door_feats = ['Doors']
door_trans = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value=4)),
])

num_feats = ['Odometer (KM)']
num_trans = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
])

# setup preprossing steps (fill missing values and convert to numbers)
preprocessor = ColumnTransformer(transformers=[
    ('cat', cat_trans, cat_feats),
    ('door', door_trans, door_feats),
    ('num', num_trans, num_feats)
])

# creating a preprocessing and modelling pipeline
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor())
])

# split data
X = data.drop(columns=['Price'])
y = data.Price

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# fit and score the model
model.fit(X_train, y_train)
model.score(X_test, y_test)

0.22188417408787875

it's also possible to use GridSearchCV or RandomizedSearchCV with our pipeline

In [52]:
# use Grid search CV
pipe_grid = {
    'preprocessor__num__imputer__strategy': ['mean', 'median'],
    'model__n_estimators': [100, 1000],
    'model__max_depth': [None, 5],
    "model__max_features": ['auto'],
    "model__min_samples_split": [2,4]
}

gs_model = GridSearchCV(model, pipe_grid, cv=5, verbose=2)
gs_model.fit(X_train, y_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV] END model__max_depth=None, model__max_features=auto, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time=   0.3s
[CV] END model__max_depth=None, model__max_features=auto, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time=   0.3s
[CV] END model__max_depth=None, model__max_features=auto, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time=   0.3s
[CV] END model__max_depth=None, model__max_features=auto, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time=   0.3s
[CV] END model__max_depth=None, model__max_features=auto, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time=   0.3s
[CV] END model__max_depth=None, model__max_features=auto, model__min_sampl

[CV] END model__max_depth=5, model__max_features=auto, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=median; total time=   0.2s
[CV] END model__max_depth=5, model__max_features=auto, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=median; total time=   0.2s
[CV] END model__max_depth=5, model__max_features=auto, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=median; total time=   0.2s
[CV] END model__max_depth=5, model__max_features=auto, model__min_samples_split=2, model__n_estimators=1000, preprocessor__num__imputer__strategy=mean; total time=   1.7s
[CV] END model__max_depth=5, model__max_features=auto, model__min_samples_split=2, model__n_estimators=1000, preprocessor__num__imputer__strategy=mean; total time=   1.9s
[CV] END model__max_depth=5, model__max_features=auto, model__min_samples_split=2, model__n_estimators=1000, preprocessor__num__imputer__strat

In [53]:
gs_model.score(X_test, y_test)

0.3339554263158365