# Block 6 Exercise 2: finding the best parameters for predicting the fare of taxi rides
We return to our Random Forest Regression and want to automatically optimize all free parameters ...

In [1]:
import pandas as pd
import numpy as np
import folium


In [2]:
# we load the data we have saved after wrangling and pre-processing in block I
X=pd.read_csv('../../DATA/train_cleaned.csv')
drop_columns=['Unnamed: 0','Unnamed: 0.1','Unnamed: 0.1.1','key','pickup_datetime','pickup_date','pickup_latitude_round3','pickup_longitude_round3','dropoff_latitude_round3','dropoff_longitude_round3']
X=X.drop(drop_columns,axis=1)
X=pd.get_dummies(X)# one hot coding
#generate labels
y=X['fare_amount']
X=X.drop(['fare_amount'],axis=1)

In [3]:
#reduce the data for the first test the the computing ist faster
X=X.head(1000)
y=y.head(1000)

print(X.shape)
print(y.shape)

(1000, 31)
(1000,)


### Scikit Optimize
Scikit Optimize (https://scikit-optimize.github.io/stable/index.html) is a AutoML toolbox wrapped around Scikit-Learn. It allows us to use state-of-the-art automatic hyper-parameter optimization on top of our learning algorithms.   



In [4]:
# install 
!pip install scikit-optimize



### E 2.1 Bayesian Optimization of a Random Forest Regression Model
use Bayesian Optimization with Cross-Validation (https://scikit-optimize.github.io/stable/modules/generated/skopt.BayesSearchCV.html#skopt.BayesSearchCV) to find the best regression model. Compare
* linear regression (https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html#sklearn.linear_model.LinearRegression) 
* Random Forest regression (https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html#sklearn.ensemble.RandomForestRegressor)
* and SVM regression (https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVR.html#sklearn.svm.SVR)

NOTES: this can become quite compute intensive! Hence,
* use a smaller subset of the training data to run the experiments 
* think about the range of your parameters (e.g. larger number of trees in RF or high C-values in SMV will make models expensive)
* optimize only the following parameters per model type:
    * linear: no parameters to optimize
    * RF: #trees and depth
    * SVM: C and gamma (use RBF kernel)
* parallelize -> n_jobs
* use CoLab to rum the job for up to 12h 


In [5]:
# Imports
from skopt import BayesSearchCV
# parameter ranges are specified by one of below
from skopt.space import Real, Categorical, Integer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline


In [6]:
#first split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1, random_state=42)

In [7]:
lin_param = {
              'model' : Categorical([LinearRegression()]),
             }

#Random Forest Param
rf_param ={
                'model': Categorical([RandomForestRegressor()]),
                'model__n_estimators': Integer(10,200, prior='log-uniform'),
                'model__max_depth': Integer(2,20,prior='log-uniform')
                }

# SVR
svr_param = {
            'model': Categorical([SVR()]),
            'model__C': Real(1e-6, 1e+6, prior='log-uniform'),
            'model__gamma': Real(1e-6, 1e+1, prior='log-uniform'),
            }
# Create pipeline
pipe = Pipeline([('model', SVR())])

# Baysian optimization
opt = BayesSearchCV(
                    pipe,
                    [(svr_param, 10), (rf_param, 10), (lin_param,1)],
                    cv=5,
                    n_iter=32,
                    random_state=42,
                    n_jobs=6, #parallelize
)

In [8]:
%%time
opt.fit(X_train,y_train)

Wall time: 8min 29s


BayesSearchCV(cv=5, estimator=Pipeline(steps=[('model', SVR())]), n_iter=32,
              n_jobs=6, random_state=42,
              search_spaces=[({'model': Categorical(categories=(SVR(),), prior=None),
                               'model__C': Real(low=1e-06, high=1000000.0, prior='log-uniform', transform='identity'),
                               'model__gamma': Real(low=1e-06, high=10.0, prior='log-uniform', transform='identity')},
                              10),
                             ({'model': Categorical(categories=(RandomForestRegressor(),), prior=None),
                               'model__max_depth': Integer(low=2, high=20, prior='log-uniform', transform='identity'),
                               'model__n_estimators': Integer(low=10, high=200, prior='log-uniform', transform='identity')},
                              10),
                             ({'model': Categorical(categories=(LinearRegression(),), prior=None)},
                              1)])

In [9]:
import pandas as pd
search_results = pd.DataFrame(opt.cv_results_,dtype='float', columns = ['split0_test_score','split1_test_score','split2_test_score','split3_test_score',
                                                                      'mean_test_score','rank_test_score'])
search_results

Unnamed: 0,split0_test_score,split1_test_score,split2_test_score,split3_test_score,mean_test_score,rank_test_score
0,0.703547,0.896362,0.744551,0.803667,0.735431,2.0
1,-0.121133,-0.084757,-0.070136,-0.113842,-0.104293,20.0
2,0.082603,0.108616,0.09627,0.086913,0.074529,17.0
3,0.708471,0.899009,0.753089,0.792177,0.735227,3.0
4,0.706992,0.904139,0.745177,0.789624,0.739524,1.0
5,0.307722,0.147218,0.265837,0.21918,0.208591,15.0
6,-0.001784,-0.000632,-0.008683,0.000249,-0.003531,18.0
7,0.144319,0.137822,0.16608,0.164499,0.15336,16.0
8,-0.098351,-0.062924,-0.050021,-0.092691,-0.084028,19.0
9,-0.121431,-0.084718,-0.070069,-0.114137,-0.104433,21.0


In [10]:
search_param_svr = pd.DataFrame(opt.cv_results_,dtype='float', columns = ['param_model__C','param_model__gamma'])
search_param_svr

Unnamed: 0,param_model__C,param_model__gamma
0,105844.978743,5e-06
1,0.000115,0.015358
2,0.180447,0.004857
3,187256.478217,1.4e-05
4,2015.452401,0.000326
5,109675.776342,0.002987
6,255.539085,1.263466
7,6515.687336,0.154518
8,0.020034,0.002544
9,0.004865,5.265766


In [11]:
search_param_rf = pd.DataFrame(opt.cv_results_,dtype='float', columns = ['param_model__n_estimators','param_model__max_depth'])
search_param_rf

Unnamed: 0,param_model__n_estimators,param_model__max_depth
0,115.0,8.0
1,137.0,6.0
2,133.0,7.0
3,18.0,19.0
4,196.0,6.0
5,16.0,13.0
6,63.0,6.0
7,137.0,6.0
8,166.0,5.0
9,64.0,8.0


In [12]:
opt.best_params_

OrderedDict([('model', SVR(C=2015.4524013260261, gamma=0.0003259571586662193)),
             ('model__C', 2015.4524013260261),
             ('model__gamma', 0.0003259571586662193)])

In [13]:
#check score for the test Data
opt.score(X_test,y_test)

0.7721152836921392