### Importing

In [43]:
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

air_canada = pd.read_csv('../datasets/participant_data.csv')

# seperate dataset into training and predicting
air_canada_to_predict = air_canada[air_canada['choice'] != air_canada['choice']]
air_canada = air_canada[air_canada['choice'] == air_canada['choice']]

X, y = air_canada.drop(['id', 'ticket_id', 'choice'], axis=1), air_canada[['choice']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

In [44]:
air_canada.sample(5)

Unnamed: 0,id,ticket_id,od,flight_departure_datetime,purchase_datetime,trip_type,branded_fare,number_of_pax,ADVS_price,PREF_price,ADVS_capacity,PREF_capacity,ADVS_inventory,PREF_inventory,choice
197228,197228,2956954957234691,CCCDDD,2024-10-06 19:00,2024-07-17 13:14,0,0,2,27,66,250,50,243,38,nochoice
40609,40609,2300724984576949,AAABBB,2024-02-27 23:00,2023-12-14 15:21,1,1,2,47,76,250,50,245,43,pref
82786,82786,6969648397409098,AAABBB,2024-04-27 23:00,2024-03-13 08:44,0,2,2,47,78,250,50,238,26,nochoice
51881,51881,230731469138725,CCCEEE,2024-03-15 07:00,2024-01-09 15:35,0,2,1,40,79,250,50,230,33,pref
135922,135922,6189996093029142,CCCDDD,2024-07-12 19:00,2024-07-11 20:57,0,2,2,29,77,250,50,198,0,advs


In [45]:
X.columns

Index(['od', 'flight_departure_datetime', 'purchase_datetime', 'trip_type',
       'branded_fare', 'number_of_pax', 'ADVS_price', 'PREF_price',
       'ADVS_capacity', 'PREF_capacity', 'ADVS_inventory', 'PREF_inventory'],
      dtype='object')

### Preprocessing

In [47]:
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [48]:
num_pipeline = make_pipeline(
  StandardScaler()
)

cat_pipeline = make_pipeline(
  OneHotEncoder()
)
preprocessing = ColumnTransformer([
    ('cat', cat_pipeline, ['od']),
    ('num', num_pipeline, ['trip_type', 'branded_fare', 'number_of_pax', 'ADVS_price', 'PREF_price',
       'ADVS_capacity', 'PREF_capacity', 'ADVS_inventory', 'PREF_inventory'])
  ], remainder='drop')

In [50]:
X_train_prep = preprocessing.fit_transform(X_train)
X_test_prep = preprocessing.transform(X_test)

### Training

In [49]:
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestRegressor

forest_reg = make_pipeline(
    RandomForestRegressor(random_state=42)
)
forest_reg.fit(X_train, y_train)

y_pred_svm = forest_reg.predict(X_val)

root_mean_squared_error(y_pred_svm, y_val)

ValueError: could not convert string to float: 'CCCEEE'

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import ExtraTreesRegressor

trees_reg = make_pipeline(
    ExtraTreesRegressor(random_state=42)
)
trees_reg.fit(X_train, y_train)

y_pred_svm = trees_reg.predict(X_val)

root_mean_squared_error(y_pred_svm, y_val)

### Tuning

#### Random Forest

In [None]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'randomforestregressor__n_estimators': [100, 200, 300],          # Number of trees
    'randomforestregressor__max_depth': [None, 10, 20, 30],          # Maximum depth of each tree
    'randomforestregressor__min_samples_split': [2, 5, 10],          # Minimum number of samples required to split a node
    'randomforestregressor__min_samples_leaf': [1, 2, 4],            # Minimum number of samples required at each leaf node
    'randomforestregressor__bootstrap': [True, False]                # Whether bootstrap samples are used when building trees
}

rnd_forest_search = GridSearchCV(forest_reg,
    param_grid=param_grid, verbose=3, cv=3, n_jobs=-1,
    scoring='neg_root_mean_squared_error')

rnd_forest_search.fit(X_train, y_train)

In [None]:
rnd_forest_search.best_params_

In [None]:
y_pred_best = rnd_forest_search.best_estimator_.predict(X_val)

root_mean_squared_error(y_pred_best, y_val)

#### Extra Trees

In [None]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'extratreesregressor__n_estimators': [100, 200, 300],          # Number of trees
    'extratreesregressor__max_depth': [None, 10, 20, 30],          # Maximum depth of each tree
    'extratreesregressor__min_samples_split': [2, 5, 10],          # Minimum number of samples required to split a node
    'extratreesregressor__min_samples_leaf': [1, 2, 4],            # Minimum number of samples required at each leaf node
    'extratreesregressor__bootstrap': [True, False]                # Whether bootstrap samples are used when building trees
}

rnd_trees_search = GridSearchCV(trees_reg,
    param_grid=param_grid, verbose=3, cv=3, n_jobs=-1,
    scoring='neg_root_mean_squared_error')

rnd_trees_search.fit(X_train, y_train)

In [None]:
rnd_trees_search.best_params_

In [None]:
y_pred_best = rnd_trees_search.best_estimator_.predict(X_val)

root_mean_squared_error(y_pred_best, y_val)

### Saving

In [None]:
import joblib

joblib.dump(rnd_forest_search.best_estimator_, "./estimators/forest.pkl")
joblib.dump(rnd_trees_search.best_estimator_, "./estimators/trees.pkl")