### Importing

In [31]:
import numpy as np
import pandas as pd
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import train_test_split

air_canada = pd.read_csv('../datasets/participant_data.csv')

# Separate the dataset into the rows used for training (label present) and
# the rows to predict (label missing in the `choice` column).
has_label = air_canada['choice'].notna()
air_canada_to_predict = air_canada[~has_label]
air_canada = air_canada[has_label]

# Features are every column except the label; the label is `choice`.
# NOTE(review): the previous version dropped columns that do not exist in this
# frame and read the target from an undefined `cn` frame, so it could not run.
# `choice` is assumed to be the target because it is the column that is blank
# for the rows to predict -- confirm.
# NOTE(review): the regressors fitted later need numeric inputs and a numeric
# target; columns such as `od` or the datetime columns may need to be encoded
# or dropped first -- confirm dtypes before training.
X, y = air_canada.drop(columns=['choice']), air_canada['choice']

# Hold out a validation set for the training / tuning cells, which all read
# X_train, y_train, X_val and y_val.
# NOTE(review): no split was defined anywhere in the notebook; the 80/20 ratio
# is an assumption. The seed matches the random_state used by the models.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [34]:
# Display the column index of the labelled frame as the cell output.
air_canada.columns

Index(['id', 'ticket_id', 'od', 'flight_departure_datetime',
       'purchase_datetime', 'trip_type', 'branded_fare', 'number_of_pax',
       'ADVS_price', 'PREF_price', 'ADVS_capacity', 'PREF_capacity',
       'ADVS_inventory', 'PREF_inventory', 'choice'],
      dtype='object')

### Training

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestRegressor

# Baseline random forest with default hyper-parameters and a fixed seed.
# It is wrapped in a pipeline so its parameters can be addressed with the
# `randomforestregressor__` prefix used by the grid search.
forest_reg = make_pipeline(
    RandomForestRegressor(random_state=42)
)
forest_reg.fit(X_train, y_train)

# Predictions on the held-out validation features.
y_pred_forest = forest_reg.predict(X_val)

# Validation RMSE; scikit-learn metrics take (y_true, y_pred) in that order.
root_mean_squared_error(y_val, y_pred_forest)

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import ExtraTreesRegressor

# Baseline extra-trees model with default hyper-parameters and a fixed seed.
# It is wrapped in a pipeline so its parameters can be addressed with the
# `extratreesregressor__` prefix used by the grid search.
trees_reg = make_pipeline(
    ExtraTreesRegressor(random_state=42)
)
trees_reg.fit(X_train, y_train)

# Predictions on the held-out validation features.
y_pred_trees = trees_reg.predict(X_val)

# Validation RMSE; scikit-learn metrics take (y_true, y_pred) in that order.
root_mean_squared_error(y_val, y_pred_trees)

### Tuning

#### Random Forest

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid for the random forest step of `forest_reg`.
# 3 * 4 * 3 * 3 * 2 = 216 candidate combinations are evaluated.
param_grid = {
    # Number of trees
    'randomforestregressor__n_estimators': [100, 200, 300],
    # Maximum depth of each tree
    'randomforestregressor__max_depth': [None, 10, 20, 30],
    # Minimum number of samples required to split a node
    'randomforestregressor__min_samples_split': [2, 5, 10],
    # Minimum number of samples required at each leaf node
    'randomforestregressor__min_samples_leaf': [1, 2, 4],
    # Whether bootstrap samples are used when building trees
    'randomforestregressor__bootstrap': [True, False],
}

# Exhaustive 3-fold search over the grid, using every available core and
# scoring each candidate by negated RMSE.
rnd_forest_search = GridSearchCV(
    estimator=forest_reg,
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',
    cv=3,
    n_jobs=-1,
    verbose=3,
)

rnd_forest_search.fit(X_train, y_train)

In [None]:
# Display the parameter combination selected by the random forest grid search.
rnd_forest_search.best_params_

In [None]:
# Score the best random forest pipeline found by the grid search on the
# validation set.
y_pred_best = rnd_forest_search.best_estimator_.predict(X_val)

# Validation RMSE; scikit-learn metrics take (y_true, y_pred) in that order.
root_mean_squared_error(y_val, y_pred_best)

#### Extra Trees

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid for the extra-trees step of `trees_reg`.
# 3 * 4 * 3 * 3 * 2 = 216 candidate combinations are evaluated.
param_grid = {
    # Number of trees
    'extratreesregressor__n_estimators': [100, 200, 300],
    # Maximum depth of each tree
    'extratreesregressor__max_depth': [None, 10, 20, 30],
    # Minimum number of samples required to split a node
    'extratreesregressor__min_samples_split': [2, 5, 10],
    # Minimum number of samples required at each leaf node
    'extratreesregressor__min_samples_leaf': [1, 2, 4],
    # Whether bootstrap samples are used when building trees
    'extratreesregressor__bootstrap': [True, False],
}

# Exhaustive 3-fold search over the grid, using every available core and
# scoring each candidate by negated RMSE.
rnd_trees_search = GridSearchCV(
    estimator=trees_reg,
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',
    cv=3,
    n_jobs=-1,
    verbose=3,
)

rnd_trees_search.fit(X_train, y_train)

In [None]:
# Display the parameter combination selected by the extra-trees grid search.
rnd_trees_search.best_params_

In [None]:
# Score the best extra-trees pipeline found by the grid search on the
# validation set.
y_pred_best = rnd_trees_search.best_estimator_.predict(X_val)

# Validation RMSE; scikit-learn metrics take (y_true, y_pred) in that order.
root_mean_squared_error(y_val, y_pred_best)

### Saving

In [None]:
import os

import joblib

# Persist the tuned pipelines. The output directory is created first so the
# dumps do not fail on a fresh checkout where ./estimators does not exist yet.
os.makedirs("./estimators", exist_ok=True)

# NOTE: joblib files are pickle-based; only load them back from trusted sources.
joblib.dump(rnd_forest_search.best_estimator_, "./estimators/forest.pkl")
joblib.dump(rnd_trees_search.best_estimator_, "./estimators/trees.pkl")