# 6. Putting it all together

### Things to remember 
* All data should be numerical
* There should be no missing values
* Manipulate the test sets the same as the trainning set
* Never test on data on your trainned on
* Tune hyperparameters on validation set OR use cross-validation
* One best performance metric doesn't mean the best model

In [1]:
# Getting data ready 
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Modelling
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV 

# Setup random seed
np.random.seed(42)

# Import data and drop rows with missing labels
data = pd.read_csv("../../data/car-sales-extended-missing-data.csv")
data.dropna(subset=["Price"], axis=0, inplace=True)

# Define different features and transformer pipeline
cat_features = ["Make","Colour"]
cat_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

door_features = ["Doors"]
door_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant", fill_value=int(data["Doors"].mean()))),
    ("onehot", OneHotEncoder())
])

num_features = ["Odometer (KM)"]
num_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean"))
])
# Setup preprocessing steps (fill missing values, then covert them to number)
preprocessor = ColumnTransformer(transformers=[
    ("cat", cat_transformer, cat_features),
    ("door", door_transformer, door_features),
    ("num", num_transformer, num_features)
])

#Createing a processing and modelling pipline
model = Pipeline(steps=[
    ("preprocessor", preprocessor) ,
    ("model", RandomForestRegressor())
])

# Split data 
x = data.drop("Price", axis=1)
y = data["Price"]

# Split data into trainning and test set
x_train, x_test, y_train, y_test = train_test_split(x,y,train_size=0.8)

#Fit and score model
model.fit(x_train,y_train)
model.score(x_test,y_test)

0.21735623151692096

In [2]:
model.get_params()

{'memory': None,
 'steps': [('preprocessor',
   ColumnTransformer(transformers=[('cat',
                                    Pipeline(steps=[('imputer',
                                                     SimpleImputer(fill_value='missing',
                                                                   strategy='constant')),
                                                    ('onehot',
                                                     OneHotEncoder(handle_unknown='ignore'))]),
                                    ['Make', 'Colour']),
                                   ('door',
                                    Pipeline(steps=[('imputer',
                                                     SimpleImputer(fill_value=4,
                                                                   strategy='constant')),
                                                    ('onehot', OneHotEncoder())]),
                                    ['Doors']),
                                   ('num',


In [3]:
# User GridSearchCV and RandomizedSearchCV with Pipeline
pipe_grid = {
    "preprocessor__num__imputer__strategy" : ["mean", "median"], 
    'model__n_estimators': [50,100,200],
    'model__max_depth': [None,10],
    'model__max_features': ['auto'],    
    'model__min_samples_leaf': [1,2],
    'model__min_samples_split': [2,4],    
}

gs_model = GridSearchCV(model,pipe_grid,cv=5,verbose=2)
gs_model.fit(x_train, y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits
[CV] model__max_depth=None, model__max_features=auto, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=50, preprocessor__num__imputer__strategy=mean 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  model__max_depth=None, model__max_features=auto, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=50, preprocessor__num__imputer__strategy=mean, total=   0.4s
[CV] model__max_depth=None, model__max_features=auto, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=50, preprocessor__num__imputer__strategy=mean 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.4s remaining:    0.0s


[CV]  model__max_depth=None, model__max_features=auto, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=50, preprocessor__num__imputer__strategy=mean, total=   0.3s
[CV] model__max_depth=None, model__max_features=auto, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=50, preprocessor__num__imputer__strategy=mean 
[CV]  model__max_depth=None, model__max_features=auto, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=50, preprocessor__num__imputer__strategy=mean, total=   0.3s
[CV] model__max_depth=None, model__max_features=auto, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=50, preprocessor__num__imputer__strategy=mean 
[CV]  model__max_depth=None, model__max_features=auto, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=50, preprocessor__num__imputer__strategy=mean, total=   0.3s
[CV] model__max_depth=None, model__max_features=auto, model__min_samples_le

[CV]  model__max_depth=None, model__max_features=auto, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=200, preprocessor__num__imputer__strategy=mean, total=   1.1s
[CV] model__max_depth=None, model__max_features=auto, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=200, preprocessor__num__imputer__strategy=median 
[CV]  model__max_depth=None, model__max_features=auto, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=200, preprocessor__num__imputer__strategy=median, total=   1.1s
[CV] model__max_depth=None, model__max_features=auto, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=200, preprocessor__num__imputer__strategy=median 
[CV]  model__max_depth=None, model__max_features=auto, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=200, preprocessor__num__imputer__strategy=median, total=   1.1s
[CV] model__max_depth=None, model__max_features=auto, model__m

[CV]  model__max_depth=None, model__max_features=auto, model__min_samples_leaf=1, model__min_samples_split=4, model__n_estimators=100, preprocessor__num__imputer__strategy=median, total=   0.4s
[CV] model__max_depth=None, model__max_features=auto, model__min_samples_leaf=1, model__min_samples_split=4, model__n_estimators=100, preprocessor__num__imputer__strategy=median 
[CV]  model__max_depth=None, model__max_features=auto, model__min_samples_leaf=1, model__min_samples_split=4, model__n_estimators=100, preprocessor__num__imputer__strategy=median, total=   0.4s
[CV] model__max_depth=None, model__max_features=auto, model__min_samples_leaf=1, model__min_samples_split=4, model__n_estimators=100, preprocessor__num__imputer__strategy=median 
[CV]  model__max_depth=None, model__max_features=auto, model__min_samples_leaf=1, model__min_samples_split=4, model__n_estimators=100, preprocessor__num__imputer__strategy=median, total=   0.4s
[CV] model__max_depth=None, model__max_features=auto, model_

[CV]  model__max_depth=None, model__max_features=auto, model__min_samples_leaf=2, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=mean, total=   0.4s
[CV] model__max_depth=None, model__max_features=auto, model__min_samples_leaf=2, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=mean 
[CV]  model__max_depth=None, model__max_features=auto, model__min_samples_leaf=2, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=mean, total=   0.4s
[CV] model__max_depth=None, model__max_features=auto, model__min_samples_leaf=2, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=mean 
[CV]  model__max_depth=None, model__max_features=auto, model__min_samples_leaf=2, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=mean, total=   0.5s
[CV] model__max_depth=None, model__max_features=auto, model__min_sampl

[CV]  model__max_depth=None, model__max_features=auto, model__min_samples_leaf=2, model__min_samples_split=4, model__n_estimators=50, preprocessor__num__imputer__strategy=mean, total=   0.2s
[CV] model__max_depth=None, model__max_features=auto, model__min_samples_leaf=2, model__min_samples_split=4, model__n_estimators=50, preprocessor__num__imputer__strategy=mean 
[CV]  model__max_depth=None, model__max_features=auto, model__min_samples_leaf=2, model__min_samples_split=4, model__n_estimators=50, preprocessor__num__imputer__strategy=mean, total=   0.2s
[CV] model__max_depth=None, model__max_features=auto, model__min_samples_leaf=2, model__min_samples_split=4, model__n_estimators=50, preprocessor__num__imputer__strategy=median 
[CV]  model__max_depth=None, model__max_features=auto, model__min_samples_leaf=2, model__min_samples_split=4, model__n_estimators=50, preprocessor__num__imputer__strategy=median, total=   0.2s
[CV] model__max_depth=None, model__max_features=auto, model__min_sample

[CV]  model__max_depth=None, model__max_features=auto, model__min_samples_leaf=2, model__min_samples_split=4, model__n_estimators=200, preprocessor__num__imputer__strategy=median, total=   0.8s
[CV] model__max_depth=None, model__max_features=auto, model__min_samples_leaf=2, model__min_samples_split=4, model__n_estimators=200, preprocessor__num__imputer__strategy=median 
[CV]  model__max_depth=None, model__max_features=auto, model__min_samples_leaf=2, model__min_samples_split=4, model__n_estimators=200, preprocessor__num__imputer__strategy=median, total=   0.8s
[CV] model__max_depth=None, model__max_features=auto, model__min_samples_leaf=2, model__min_samples_split=4, model__n_estimators=200, preprocessor__num__imputer__strategy=median 
[CV]  model__max_depth=None, model__max_features=auto, model__min_samples_leaf=2, model__min_samples_split=4, model__n_estimators=200, preprocessor__num__imputer__strategy=median, total=   0.7s
[CV] model__max_depth=None, model__max_features=auto, model_

[CV]  model__max_depth=10, model__max_features=auto, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=median, total=   0.4s
[CV] model__max_depth=10, model__max_features=auto, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=200, preprocessor__num__imputer__strategy=mean 
[CV]  model__max_depth=10, model__max_features=auto, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=200, preprocessor__num__imputer__strategy=mean, total=   0.7s
[CV] model__max_depth=10, model__max_features=auto, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=200, preprocessor__num__imputer__strategy=mean 
[CV]  model__max_depth=10, model__max_features=auto, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=200, preprocessor__num__imputer__strategy=mean, total=   0.7s
[CV] model__max_depth=10, model__max_features=auto, model__min_samples_leaf=1,

[CV]  model__max_depth=10, model__max_features=auto, model__min_samples_leaf=1, model__min_samples_split=4, model__n_estimators=100, preprocessor__num__imputer__strategy=mean, total=   0.3s
[CV] model__max_depth=10, model__max_features=auto, model__min_samples_leaf=1, model__min_samples_split=4, model__n_estimators=100, preprocessor__num__imputer__strategy=mean 
[CV]  model__max_depth=10, model__max_features=auto, model__min_samples_leaf=1, model__min_samples_split=4, model__n_estimators=100, preprocessor__num__imputer__strategy=mean, total=   0.3s
[CV] model__max_depth=10, model__max_features=auto, model__min_samples_leaf=1, model__min_samples_split=4, model__n_estimators=100, preprocessor__num__imputer__strategy=mean 
[CV]  model__max_depth=10, model__max_features=auto, model__min_samples_leaf=1, model__min_samples_split=4, model__n_estimators=100, preprocessor__num__imputer__strategy=mean, total=   0.3s
[CV] model__max_depth=10, model__max_features=auto, model__min_samples_leaf=1, m

[CV]  model__max_depth=10, model__max_features=auto, model__min_samples_leaf=2, model__min_samples_split=2, model__n_estimators=50, preprocessor__num__imputer__strategy=median, total=   0.2s
[CV] model__max_depth=10, model__max_features=auto, model__min_samples_leaf=2, model__min_samples_split=2, model__n_estimators=50, preprocessor__num__imputer__strategy=median 
[CV]  model__max_depth=10, model__max_features=auto, model__min_samples_leaf=2, model__min_samples_split=2, model__n_estimators=50, preprocessor__num__imputer__strategy=median, total=   0.2s
[CV] model__max_depth=10, model__max_features=auto, model__min_samples_leaf=2, model__min_samples_split=2, model__n_estimators=50, preprocessor__num__imputer__strategy=median 
[CV]  model__max_depth=10, model__max_features=auto, model__min_samples_leaf=2, model__min_samples_split=2, model__n_estimators=50, preprocessor__num__imputer__strategy=median, total=   0.2s
[CV] model__max_depth=10, model__max_features=auto, model__min_samples_leaf

[CV]  model__max_depth=10, model__max_features=auto, model__min_samples_leaf=2, model__min_samples_split=2, model__n_estimators=200, preprocessor__num__imputer__strategy=median, total=   0.6s
[CV] model__max_depth=10, model__max_features=auto, model__min_samples_leaf=2, model__min_samples_split=2, model__n_estimators=200, preprocessor__num__imputer__strategy=median 
[CV]  model__max_depth=10, model__max_features=auto, model__min_samples_leaf=2, model__min_samples_split=2, model__n_estimators=200, preprocessor__num__imputer__strategy=median, total=   0.7s
[CV] model__max_depth=10, model__max_features=auto, model__min_samples_leaf=2, model__min_samples_split=4, model__n_estimators=50, preprocessor__num__imputer__strategy=mean 
[CV]  model__max_depth=10, model__max_features=auto, model__min_samples_leaf=2, model__min_samples_split=4, model__n_estimators=50, preprocessor__num__imputer__strategy=mean, total=   0.2s
[CV] model__max_depth=10, model__max_features=auto, model__min_samples_leaf=

[CV]  model__max_depth=10, model__max_features=auto, model__min_samples_leaf=2, model__min_samples_split=4, model__n_estimators=200, preprocessor__num__imputer__strategy=mean, total=   0.7s
[CV] model__max_depth=10, model__max_features=auto, model__min_samples_leaf=2, model__min_samples_split=4, model__n_estimators=200, preprocessor__num__imputer__strategy=mean 
[CV]  model__max_depth=10, model__max_features=auto, model__min_samples_leaf=2, model__min_samples_split=4, model__n_estimators=200, preprocessor__num__imputer__strategy=mean, total=   0.7s
[CV] model__max_depth=10, model__max_features=auto, model__min_samples_leaf=2, model__min_samples_split=4, model__n_estimators=200, preprocessor__num__imputer__strategy=mean 
[CV]  model__max_depth=10, model__max_features=auto, model__min_samples_leaf=2, model__min_samples_split=4, model__n_estimators=200, preprocessor__num__imputer__strategy=mean, total=   0.7s
[CV] model__max_depth=10, model__max_features=auto, model__min_samples_leaf=2, m

[Parallel(n_jobs=1)]: Done 240 out of 240 | elapsed:  1.9min finished


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('cat',
                                                                         Pipeline(steps=[('imputer',
                                                                                          SimpleImputer(fill_value='missing',
                                                                                                        strategy='constant')),
                                                                                         ('onehot',
                                                                                          OneHotEncoder(handle_unknown='ignore'))]),
                                                                         ['Make',
                                                                          'Colour']),
                                                                        ('door',
         

In [4]:
gs_model.best_params_

{'model__max_depth': 10,
 'model__max_features': 'auto',
 'model__min_samples_leaf': 2,
 'model__min_samples_split': 2,
 'model__n_estimators': 50,
 'preprocessor__num__imputer__strategy': 'median'}

In [5]:
gs_model.score(x_test,y_test)

0.2948427538205114