### Importing

In [15]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

air_canada = pd.read_csv('../datasets/new_data.csv')

# seperate dataset into training and predicting
air_canada_to_predict = air_canada[air_canada['choice'] != air_canada['choice']]
air_canada = air_canada[air_canada['choice'] == air_canada['choice']]

X, y = air_canada.drop(['Unnamed: 0', 'id', 'ticket_id', 'choice'], axis=1), air_canada[['choice']]

y['choice'] = y['choice'].map({ 'nochoice': 0, 'pref': 1, 'advs': 2 })

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y['choice'] = y['choice'].map({ 'nochoice': 0, 'pref': 1, 'advs': 2 })


In [17]:
X.sample(5)

Unnamed: 0,od,flight_departure_datetime,purchase_datetime,date_difference,trip_type,branded_fare,number_of_pax,ADVS_price,PREF_price,ADVS_capacity,PREF_capacity,ADVS_inventory,PREF_inventory
212187,BBBCCC,2024-10-27 10:00,2024-10-18 12:07,769980.0,1,1,2,47,115,250,50,236,37
180105,DDDEEE,2024-09-13 05:00,2024-08-31 10:07,1104780.0,0,1,3,54,91,110,30,103,15
129016,AAABBB,2024-07-02 23:00,2024-06-29 20:39,267660.0,0,0,1,58,63,250,50,197,1
136144,AAABBB,2024-07-12 23:00,2024-07-08 19:42,357480.0,0,1,1,37,79,250,50,185,0
242202,CCCDDD,2024-12-09 19:00,2024-12-08 15:42,98280.0,1,1,1,41,66,250,50,198,0


In [18]:
X.columns

Index(['od', 'flight_departure_datetime', 'purchase_datetime',
       'date_difference', 'trip_type', 'branded_fare', 'number_of_pax',
       'ADVS_price', 'PREF_price', 'ADVS_capacity', 'PREF_capacity',
       'ADVS_inventory', 'PREF_inventory'],
      dtype='object')

### Preprocessing

In [7]:
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [19]:
num_pipeline = make_pipeline(
  StandardScaler()
)

cat_pipeline = make_pipeline(
  OneHotEncoder()
)
preprocessing = ColumnTransformer([
    ('cat', cat_pipeline, ['od']),
    ('num', num_pipeline, ['date_difference', 'trip_type', 'branded_fare', 'number_of_pax', 'ADVS_price', 'PREF_price',
       'ADVS_capacity', 'PREF_capacity', 'ADVS_inventory', 'PREF_inventory'])
  ], remainder='drop')

In [20]:
X_train_prep = preprocessing.fit_transform(X_train)
X_test_prep = preprocessing.transform(X_test)

In [21]:
X_train_prep[0]

array([ 1.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.33457694, -0.50955519,  0.99935705, -0.84349707, -0.21395759,
       -1.30594895,  0.62246236,  0.58914259,  0.78113592,  0.97334197])

### Training

In [22]:
from xgboost import XGBClassifier

bst = XGBClassifier(learning_rate=.3, objective='multi:softprob')
bst.fit(X_train_prep, y_train)


y_pred = bst.predict(X_test_prep)
accuracy_score(y_pred, y_test)

0.7511168831168831

### Tuning

In [12]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [100, 200, 300],           # Number of boosting rounds
    'learning_rate': [0.01, 0.1, 0.2],         # Step size shrinkage
    'max_depth': [3, 5, 7],                    # Maximum depth of the trees
    'min_child_weight': [1, 3, 5],             # Minimum sum of instance weight needed in a child
    'subsample': [0.6, 0.8, 1.0],              # Subsample ratio of the training instances
    'colsample_bytree': [0.6, 0.8, 1.0],       # Subsample ratio of columns when constructing each tree
    'gamma': [0, 0.1, 0.2],                    # Minimum loss reduction to make a further partition on a leaf node
}

rnd_search = GridSearchCV(bst,
    param_grid=param_grid, verbose=3, cv=3, n_jobs=-1,
    scoring='neg_root_mean_squared_error')

rnd_search.fit(X_train_prep, y_train)

Fitting 3 folds for each of 2187 candidates, totalling 6561 fits
[CV 2/3] END colsample_bytree=0.6, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=100, subsample=1.0;, score=-0.853 total time=   6.7s
[CV 3/3] END colsample_bytree=0.6, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=100, subsample=0.8;, score=-0.853 total time=   7.1s
[CV 1/3] END colsample_bytree=0.6, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=100, subsample=0.6;, score=-0.855 total time=   8.4s
[CV 1/3] END colsample_bytree=0.6, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=100, subsample=1.0;, score=-0.855 total time=   8.1s
[CV 2/3] END colsample_bytree=0.6, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=100, subsample=0.8;, score=-0.852 total time=   7.9s
[CV 1/3] END colsample_bytree=0.6, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=100, subsa

KeyboardInterrupt: 

### Predicting

In [23]:
X_to_predict, y_to_predict = air_canada_to_predict.drop(['id', 'ticket_id', 'choice'], axis=1), air_canada_to_predict[['id']]
X_to_predict_prep = preprocessing.transform(X_to_predict)
y_to_predict['choice'] = bst.predict(X_to_predict_prep)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y_to_predict['choice'] = bst.predict(X_to_predict_prep)


In [24]:
y_to_predict['choice'] = y_to_predict['choice'].map({ 0: 'nochoice', 1: 'pref', 2: 'advs'})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y_to_predict['choice'] = y_to_predict['choice'].map({ 0: 'nochoice', 1: 'pref', 2: 'advs'})


In [25]:
y_final = pd.concat([y_to_predict, air_canada])[['id', 'choice']]

### Saving

In [27]:
pd.DataFrame.to_csv(y_final, '../predictions/pred2.csv', index=False)