### Importing

In [4]:
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
!pip install catboost
from catboost import CatBoostClassifier

air_canada = pd.read_csv('new_branded_data.csv')

# seperate dataset into training and predicting
air_canada_to_predict = air_canada[air_canada['choice'] != air_canada['choice']]
air_canada = air_canada[air_canada['choice'] == air_canada['choice']]

X, y = air_canada.drop(['Unnamed: 0', 'id', 'ticket_id', 'choice'], axis=1), air_canada[['choice']]

y['choice'] = y['choice'].map({ 'nochoice': 0, 'pref': 1, 'advs': 2 })

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y['choice'] = y['choice'].map({ 'nochoice': 0, 'pref': 1, 'advs': 2 })


In [5]:
X.columns
X.head()

Unnamed: 0,od,flight_departure_datetime,purchase_datetime,time_diff,trip_type,branded_fare,number_of_pax,ADVS_price,PREF_price,ADVS_capacity,PREF_capacity,ADVS_inventory,PREF_inventory,pref_inv_full,advs_ratio,pref_ratio,time_day
2,BBBDDD,2024-01-01 03:00,2023-08-03 06:03,13035420.0,0,1,2,35,80,50,10,50,7,1,1.0,0.7,night
3,BBBDDD,2024-01-01 03:00,2023-08-13 17:41,12129540.0,0,3,1,0,0,50,10,50,5,1,1.0,0.5,night
4,BBBDDD,2024-01-01 03:00,2023-08-20 22:04,11508960.0,0,3,1,0,0,50,10,50,5,1,1.0,0.5,night
7,BBBDDD,2024-01-01 03:00,2023-09-17 22:16,9089040.0,1,3,1,0,0,50,10,48,4,1,0.96,0.4,night
8,BBBDDD,2024-01-01 03:00,2023-09-25 00:53,8474820.0,1,0,3,44,81,50,10,48,3,1,0.96,0.3,night


### Preprocessing

In [6]:
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [9]:
num_pipeline = make_pipeline(
  StandardScaler()
)

cat_pipeline = make_pipeline(
  OneHotEncoder()
)
preprocessing = ColumnTransformer([
    ('cat', cat_pipeline, ['od', 'trip_type', 'branded_fare']),
    ('num', num_pipeline, ['time_diff', 'number_of_pax', 'ADVS_price', 'PREF_price',
                           'ADVS_capacity', 'PREF_capacity', 'ADVS_inventory', 'PREF_inventory',
                           'pref_inv_full', 'advs_ratio', 'pref_ratio'])
  ], remainder='drop')

In [10]:
X_train_prep = preprocessing.fit_transform(X_train)
X_test_prep = preprocessing.transform(X_test)

array([[ 0.        ,  0.        ,  0.        , ..., -2.71051428,
        -1.33098964, -1.48024725],
       [ 0.        ,  1.        ,  0.        , ..., -2.71051428,
        -1.17418108, -1.48024725],
       [ 0.        ,  0.        ,  0.        , ...,  0.36893368,
         0.4697796 ,  1.02015016],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  0.36893368,
         0.61647147,  1.00091633],
       [ 0.        ,  1.        ,  0.        , ...,  0.36893368,
         0.34332108, -0.51855594],
       [ 1.        ,  0.        ,  0.        , ...,  0.36893368,
        -0.55200519, -0.49932211]])

### Training

In [15]:
import lightgbm as lgb


bst = CatBoostClassifier(iterations=2,
                           learning_rate=1,
                           depth=2)
bst.fit(X_train_prep, y_train)

y_pred = bst.predict(X_test_prep)

print("Accuracy:", accuracy_score(y_pred, y_test))
print("F1 Score:", f1_score(y_pred, y_test, average='weighted'))

0:	learn: 0.7309733	total: 117ms	remaining: 117ms
1:	learn: 0.6921469	total: 176ms	remaining: 0us
Accuracy: 0.7243116883116884
F1 Score: 0.7548180336349275


2

### Tuning

In [None]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    # 'n_estimators': [100, 200, 300],           # Number of boosting rounds
    'iterations': [800, 1000, 1200],
     'learning_rate': [0.01, 0.1, 0.2],         # Step size shrinkage
     "depth": [1, 5, 10],
    "subsample": [0.05, 0.5, 1.0],
    "colsample_bylevel": [0.05, 0.5, 1.0],
    "min_data_in_leaf": [1, 20, 80, 100],
    # 'max_depth': [-1, 3, 5],                    # Maximum depth of the trees
    # 'min_child_weight': [1, 3, 5],             # Minimum sum of instance weight needed in a child
    # 'subsample': [0.6, 0.8, 1.0],              # Subsample ratio of the training instances
 #   'colsample_bytree': [0.6, 0.8, 1.0],       # Subsample ratio of columns when constructing each tree
#    'gamma': [0, 0.1, 0.2],                    # Minimum loss reduction to make a further partition on a leaf node
    # 'max_bin': [255, 300, 500],
    # 'num_leaves': [30, 40, 50, 60],
}

rnd_search = GridSearchCV(bst,
    param_grid=param_grid, verbose=4, cv=2,  n_jobs=-1,
    scoring='f1_weighted')

rnd_search.fit(X_train_prep, y_train)

Fitting 2 folds for each of 972 candidates, totalling 1944 fits


In [None]:
rnd_search.best_params_

{'n_estimators': 300}

In [None]:
y_pred_best = rnd_search.best_estimator_.predict(X_test_prep)

f1_score(y_pred_best, y_test, average='weighted')



0.7821897958305488

### Predicting

In [None]:
X_to_predict, y_to_predict = air_canada_to_predict.drop(['id', 'ticket_id', 'choice'], axis=1), air_canada_to_predict[['id']]
X_to_predict_prep = preprocessing.transform(X_to_predict)
y_to_predict['choice'] = rnd_search.best_estimator_.predict(X_to_predict_prep)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y_to_predict['choice'] = bst.predict(X_to_predict_prep)


In [None]:
y_to_predict['choice'] = y_to_predict['choice'].map({ 0: 'nochoice', 1: 'pref', 2: 'advs'})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y_to_predict['choice'] = y_to_predict['choice'].map({ 0: 'nochoice', 1: 'pref', 2: 'advs'})


In [None]:
y_final = pd.concat([y_to_predict, air_canada])[['id', 'choice']]

### Saving

In [None]:
pd.DataFrame.to_csv(y_final, '../predictions/pred8.csv', index=False)