In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ParameterGrid
from sklearn import metrics

In [None]:
# Load the preprocessed 2015-2019 dataset from the parent directory.
# A forward slash is used instead of '..\d...': '\d' is an invalid escape sequence
# (it triggers a warning on recent Python versions) and a backslash only acts as a
# path separator on Windows, whereas '/' works on every platform.
df_preprocessed = pd.read_csv('../df_preprocessed_2015-2019.csv')

In [None]:
import re

# Strip every character that is not a letter, digit or underscore from the column names.
# The result is built from df_preprocessed (the frame loaded from CSV) so that df1 is
# actually defined here; previously df1 was used before any visible cell created it,
# which fails on a fresh kernel.
df1 = df_preprocessed.rename(columns=lambda name: re.sub('[^A-Za-z0-9_]+', '', name))

In [None]:
# Keep only the columns whose dtype is not 'object'.
non_object_cols = df1.select_dtypes(exclude=['object'])

# Remove the row(s) whose TO value is exactly 39360 (the unusual taxi record).
# NOTE(review): the original note described this as "6660 minutes"; 39360 does not
# obviously correspond to that figure - confirm the unit of TO and the intended value.
outlier_index = non_object_cols.index[non_object_cols['TO'] == 39360]
df1 = non_object_cols.drop(index=outlier_index)

In [None]:
# Time-based split: rows with aobt_year == 2019 form the test set,
# every other row goes to the training set.
train = df1.loc[df1['aobt_year'].ne(2019)]
test = df1.loc[df1['aobt_year'].eq(2019)]

# Features are all columns except the target 'TO'.
is_feature = df1.columns != 'TO'
x_train = train.loc[:, is_feature]
x_test = test.loc[:, is_feature]

# Targets are kept as single-column DataFrames.
y_train = train[['TO']]
y_test = test[['TO']]

In [None]:
# Manual grid search over ExtraTreesRegressor hyper-parameters.
# Four parameters are tuned with three candidate values each (3**4 = 81 models);
# everything else is left at the scikit-learn defaults, except n_jobs and random_state.
#
# NOTE(review): each candidate is scored on the 2019 test set, so the best score found
# here is an optimistic estimate. Consider selecting hyper-parameters with GridSearchCV
# (already imported) or a validation split taken from the training years.
N_JOBS = 4         # parallel workers used to fit/predict each forest
RANDOM_STATE = 42  # fixed seed so repeated runs give the same scores

grid = ParameterGrid({
    'n_estimators': [50, 25, 125],
    'max_features': [10, 25, 30],
    'min_samples_split': [5, 25, 50],
    'min_samples_leaf': [5, 20, 50],
})

# Flatten the single-column targets once; they do not change between iterations.
y_train_flat = y_train.values.ravel()
y_test_flat = y_test.values.ravel()

search_results = []
for parameters in grid:
    regressor = ExtraTreesRegressor(n_jobs=N_JOBS, random_state=RANDOM_STATE, **parameters)
    model = regressor.fit(x_train, y_train_flat)
    y_pred = model.predict(x_test)

    # The square root of the mean squared error is the RMSE, so label it as such.
    rmse = np.sqrt(metrics.mean_squared_error(y_test_flat, y_pred))
    print("RMSE: %.2f" % rmse)

    # Show the tuned parameters of this iteration.
    print(parameters)

    search_results.append({**parameters, 'rmse': rmse})

# Rank all candidates by RMSE (lowest first) so the best configuration is easy to read off.
results_df = pd.DataFrame(search_results).sort_values('rmse').reset_index(drop=True)
results_df.head(10)