# Block 6 Exercise 2: finding the best parameters for predicting the fare of taxi rides
We return to our Random Forest Regression and want to automatically optimize all free parameters ...

In [1]:
import pandas as pd
import numpy as np
import folium
from sklearn.model_selection import train_test_split

In [50]:
# we load the data we have saved after wrangling and pre-processing in block I
X=pd.read_csv('../../DATA/train_cleaned.csv')
drop_columns=['Unnamed: 0','Unnamed: 0.1','Unnamed: 0.1.1','key','pickup_datetime','pickup_date','pickup_latitude_round3','pickup_longitude_round3','dropoff_latitude_round3','dropoff_longitude_round3']
X=X.drop(drop_columns,axis=1)
X=pd.get_dummies(X)# one hot coding
#generate labels
y=X['fare_amount']
X=X.drop(['fare_amount'],axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75)

### Scikit Optimize
Scikit Optimize (https://scikit-optimize.github.io/stable/index.html) is a AutoML toolbox wrapped around Scikit-Learn. It allows us to use state-of-the-art automatic hyper-parameter optimization on top of our learning algorithms.   



In [5]:
# install 
!pip install scikit-optimize

Collecting scikit-optimize
  Downloading scikit_optimize-0.8.1-py2.py3-none-any.whl (101 kB)
Collecting pyaml>=16.9
  Downloading pyaml-20.4.0-py2.py3-none-any.whl (17 kB)
Installing collected packages: pyaml, scikit-optimize
Successfully installed pyaml-20.4.0 scikit-optimize-0.8.1


You should consider upgrading via the 'C:\ProgramData\Anaconda3\python.exe -m pip install --upgrade pip' command.


### E 2.1 Bayesian Optimization of a Random Forest Regression Model
use Bayesian Optimization with Cross-Validation (https://scikit-optimize.github.io/stable/modules/generated/skopt.BayesSearchCV.html#skopt.BayesSearchCV) to find the best regression model. Compare
* linear regression (https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html#sklearn.linear_model.LinearRegression) 
* Random Forest regression (https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html#sklearn.ensemble.RandomForestRegressor)
* and SVM regression (https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVR.html#sklearn.svm.SVR)

NOTES: this can become quite compute intensive! Hence,
* use a smaller subset of the training data to run the experiments 
* think about the range of your parameters (e.g. larger number of trees in RF or high C-values in SMV will make models expensive)
* optimize only the following parameters per model type:
    * linear: no parameters to optimize
    * RF: #trees and depth
    * SVM: C and gamma (use RBF kernel)
* parallelize -> n_jobs
* use CoLab to rum the job for up to 12h 


In [23]:
from skopt.space import Real, Categorical, Integer
from skopt import BayesSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error


In [51]:
reg = LinearRegression().fit(X_train, y_train)


In [52]:
print('MSE Train:', mean_squared_error(y_train, reg.predict(X_train)))
print('MSE Test: ', mean_squared_error(y_test, reg.predict(X_test)))

MSE Train: 27.30084147750869
MSE Test:  23.07074545478391


In [6]:
opt_RF = BayesSearchCV(
        RandomForestRegressor(),
        {
            'n_estimators': Integer(10,50),
            'max_depth': Integer(1,10)
        },
        n_iter=10,
        random_state=0,
        n_jobs=10
    )
opt_RF.fit(X_train, y_train)

BayesSearchCV(estimator=RandomForestRegressor(), n_iter=10, n_jobs=10,
              random_state=0,
              search_spaces={'max_depth': Integer(low=1, high=10, prior='uniform', transform='identity'),
                             'n_estimators': Integer(low=10, high=50, prior='uniform', transform='identity')})

In [7]:
opt_RF.best_params_

OrderedDict([('max_depth', 9), ('n_estimators', 50)])

In [24]:
print('MSE Train:', mean_squared_error(y_train, opt_RF.predict(X_train)))
print('MSE Test: ', mean_squared_error(y_test, opt_RF.predict(X_test)))

MSE Train: 12.87630274116533
MSE Test:  20.351363577955834


In [45]:
opt_SVM = BayesSearchCV(
        make_pipeline(
            StandardScaler(),
            SVR(max_iter=500)
        ),
        {
            'svr__C': Real(0.1, 10.0), 
            'svr__epsilon':  Real(0.1, 1.0)
        },
        n_iter=10,
        random_state=0,
        n_jobs=10
    )
opt_SVM.fit(X_train, y_train)



BayesSearchCV(estimator=Pipeline(steps=[('standardscaler', StandardScaler()),
                                        ('svr', SVR(max_iter=500))]),
              n_iter=10, n_jobs=10, random_state=0,
              search_spaces={'svr__C': Real(low=0.1, high=10.0, prior='uniform', transform='identity'),
                             'svr__epsilon': Real(low=0.1, high=1.0, prior='uniform', transform='identity')})

In [46]:
opt_SVM.best_params_

OrderedDict([('svr__C', 5.356545233026259),
             ('svr__epsilon', 0.7084715579892222)])

In [47]:
print('MSE Train:', mean_squared_error(y_train, opt_SVM.predict(X_train)))
print('MSE Test: ', mean_squared_error(y_test, opt_SVM.predict(X_test)))

MSE Train: 75.39614633544002
MSE Test:  73.55479841230316
