In [1]:
# Import the necessary packages
import os

# For calculating
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

#for modeling
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, make_scorer, classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor, CatBoostClassifier

# for plotting
import matplotlib.pyplot as plt
import seaborn as sns




In [2]:
X_train = pd.read_csv("../data/X_train_resampled.csv")
y_train = pd.read_csv("../data/y_train_resampled.csv")
X_test = pd.read_csv("../data/X_test_resampled.csv")
y_test = pd.read_csv("../data/y_test_resampled.csv")

In [3]:
X_train.head()

Unnamed: 0,one_hot__ac_5K 343TQY,one_hot__ac_5K 345TFX,one_hot__ac_5M 343FOX,one_hot__ac_5M 343JAI,one_hot__ac_5M 343SUN,one_hot__ac_6P M87TRJ,one_hot__ac_BJ 320INA,one_hot__ac_BJ 320INB,one_hot__ac_BJ 320INC,one_hot__ac_BJ 320INH,...,one_hot__type_dep_medium_airport,one_hot__type_dep_small_airport,one_hot__type_arr_large_airport,one_hot__type_arr_medium_airport,one_hot__type_arr_small_airport,passthrough__is_same_country,passthrough__s_std_time_dec,passthrough__c_std_time_dec,passthrough__s_std_day_year,passthrough__c_std_day_year
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,False,0.156434,-0.987688,0.746972,0.664855
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,False,-0.99863,0.052336,0.835925,-0.548843
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,False,-0.45399,-0.891007,0.47116,-0.882048
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,False,0.649448,-0.760406,0.46355,0.886071
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,False,0.5,-0.866025,-0.845249,-0.534373


In [4]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 138590 entries, 0 to 138589
Columns: 208 entries, one_hot__ac_5K 343TQY to passthrough__c_std_day_year
dtypes: bool(1), float64(207)
memory usage: 219.0 MB


### CatBoost with default parameters

In [5]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 138590 entries, 0 to 138589
Columns: 208 entries, one_hot__ac_5K 343TQY to passthrough__c_std_day_year
dtypes: bool(1), float64(207)
memory usage: 219.0 MB


In [6]:
# Initialize the CatBoostClassifier with indices
model = CatBoostClassifier(learning_rate=0.2, l2_leaf_reg=2, depth=8, iterations=450)

# Fit the model
model.fit(X_train, y_train)

0:	learn: 1.5672953	total: 96.8ms	remaining: 43.5s
1:	learn: 1.5435004	total: 133ms	remaining: 29.9s
2:	learn: 1.5263592	total: 180ms	remaining: 26.9s
3:	learn: 1.5134608	total: 223ms	remaining: 24.9s
4:	learn: 1.4999603	total: 266ms	remaining: 23.7s
5:	learn: 1.4929139	total: 313ms	remaining: 23.2s
6:	learn: 1.4870545	total: 430ms	remaining: 27.2s
7:	learn: 1.4807263	total: 493ms	remaining: 27.2s
8:	learn: 1.4740240	total: 535ms	remaining: 26.2s
9:	learn: 1.4669103	total: 577ms	remaining: 25.4s
10:	learn: 1.4623093	total: 622ms	remaining: 24.8s
11:	learn: 1.4592315	total: 663ms	remaining: 24.2s
12:	learn: 1.4557407	total: 698ms	remaining: 23.5s
13:	learn: 1.4523295	total: 740ms	remaining: 23.1s
14:	learn: 1.4494009	total: 776ms	remaining: 22.5s
15:	learn: 1.4468941	total: 820ms	remaining: 22.2s
16:	learn: 1.4437158	total: 853ms	remaining: 21.7s
17:	learn: 1.4401928	total: 888ms	remaining: 21.3s
18:	learn: 1.4367343	total: 920ms	remaining: 20.9s
19:	learn: 1.4342459	total: 959ms	remain

<catboost.core.CatBoostClassifier at 0x165e7fb90>

In [7]:
# for y_test without dummies

# cat_features_indices = [X_train.columns.get_loc(col) for col in [
#     'airline', 'std_day', 'sta_day', 'std_month', 'sta_month',
#     'iso_country_dep', 'type_dep',
#     'iso_country_arr', 'type_arr', 'is_same_country',
# ]]

# # Initialize the CatBoostRegressor with indices
# model = CatBoostRegressor(cat_features=cat_features_indices)

# # Fit the model
# model.fit(X_train, y_train)

In [8]:
# Get feature importance
feature_importance = model.get_feature_importance()

# Create a DataFrame for better visualization
feature_importance_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': feature_importance
})

# Sort the DataFrame by importance
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Display the feature importance
print(feature_importance_df.head(20))

                              Feature  Importance
207       passthrough__c_std_day_year   13.591151
206       passthrough__s_std_day_year   13.475733
204       passthrough__s_std_time_dec   13.421064
205       passthrough__c_std_time_dec    9.354899
198  one_hot__type_dep_medium_airport    2.228376
114       one_hot__iso_country_dep_FR    1.600337
197   one_hot__type_dep_large_airport    1.551881
63            one_hot__std_day_Sunday    1.491093
62          one_hot__std_day_Saturday    1.419651
200   one_hot__type_arr_large_airport    1.382274
194       one_hot__iso_country_arr_TN    1.365743
201  one_hot__type_arr_medium_airport    1.333214
60            one_hot__std_day_Friday    1.327959
145       one_hot__iso_country_dep_TN    1.279771
164       one_hot__iso_country_arr_FR    1.215863
64          one_hot__std_day_Thursday    1.196537
203      passthrough__is_same_country    1.130557
61            one_hot__std_day_Monday    1.073584
65           one_hot__std_day_Tuesday    0.983002


In [9]:
# Make predictions
y_pred_test = model.predict(X_test)
y_pred_train = model.predict(X_train)

In [10]:
# Evaluate on the test set
print("Test Set Results:")
print("Accuracy:", accuracy_score(y_test, y_pred_test))
print("\nClassification Report (Test):")
print(classification_report(y_test, y_pred_test))
print("\nConfusion Matrix (Test):")
print(confusion_matrix(y_test, y_pred_test))


Test Set Results:
Accuracy: 0.42511742100768574

Classification Report (Test):
              precision    recall  f1-score   support

           0       0.54      0.59      0.57      4858
           1       0.54      0.39      0.46      6904
           2       0.36      0.34      0.35      3741
           3       0.22      0.29      0.25      1798
           4       0.24      0.42      0.31      1435

    accuracy                           0.43     18736
   macro avg       0.38      0.41      0.38     18736
weighted avg       0.45      0.43      0.43     18736


Confusion Matrix (Test):
[[2863 1108  288  282  317]
 [1583 2708 1332  685  596]
 [ 416  787 1261  698  579]
 [ 210  257  392  528  411]
 [ 198  136  234  262  605]]


In [11]:
# Evaluate on the training set
print("\nTraining Set Results:")
print("Accuracy:", accuracy_score(y_train, y_pred_train))
print("\nClassification Report (Train):")
print(classification_report(y_train, y_pred_train))
print("\nConfusion Matrix (Train):")
print(confusion_matrix(y_train, y_pred_train))


Training Set Results:
Accuracy: 0.6388484017605888

Classification Report (Train):
              precision    recall  f1-score   support

           0       0.64      0.69      0.66     27718
           1       0.53      0.48      0.51     27718
           2       0.61      0.55      0.58     27718
           3       0.69      0.67      0.68     27718
           4       0.71      0.80      0.75     27718

    accuracy                           0.64    138590
   macro avg       0.63      0.64      0.64    138590
weighted avg       0.63      0.64      0.64    138590


Confusion Matrix (Train):
[[19134  4564  1546  1227  1247]
 [ 5295 13420  4264  2427  2312]
 [ 2325  3910 15298  3182  3003]
 [ 1880  2051  2595 18506  2686]
 [ 1409  1161  1471  1497 22180]]


## with hyperparameter search

In [12]:
# # Initialize the CatBoostClassifier
# model_hp = CatBoostClassifier(verbose=0, random_state=42)

# # Define the parameter grid for random search
# param_grid = {
#     'iterations': np.arange(100, 500, 50),  # Number of boosting iterations
#     'learning_rate': np.linspace(0.01, 0.3, 10),  # Learning rate
#     'depth': np.arange(4, 10),  # Depth of the trees
#     'l2_leaf_reg': np.arange(1, 10),  # L2 regularization coefficient
# }

# # Initialize RandomizedSearchCV
# random_search = RandomizedSearchCV(
#     estimator=model_hp,
#     param_distributions=param_grid,
#     n_iter=50,  # Number of random combinations to try
#     scoring='accuracy',  # Use accuracy as the scoring metric
#     cv=5,  # 5-fold cross-validation
#     verbose=1,
#     random_state=42,
#     n_jobs=-1  # Use all available cores for parallel computation
# )

In [13]:
# # Fit RandomizedSearchCV on the training data
# random_search.fit(X_train, y_train)

# # Display the best parameters and best score
# print("Best Parameters:", random_search.best_params_)
# print("Best Accuracy Score:", random_search.best_score_)

In [14]:
# # Use the best model for predictions
# best_model = random_search.best_estimator_

In [15]:
# # Get feature importance
# feature_importance = best_model.get_feature_importance()

# # Create a DataFrame for better visualization
# feature_importance_df = pd.DataFrame({
#     'Feature': X_train.columns,
#     'Importance': feature_importance
# })

# # Sort the DataFrame by importance
# feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# # Display the feature importance
# print(feature_importance_df.head(20))

In [16]:
# # Make predictions
# y_pred_test = best_model.predict(X_test)
# y_pred_train = best_model.predict(X_train)

In [17]:
# # Evaluate on the test set
# print("Test Set Results:")
# print("Accuracy:", accuracy_score(y_test, y_pred_test))
# print("\nClassification Report (Test):")
# print(classification_report(y_test, y_pred_test))
# print("\nConfusion Matrix (Test):")
# print(confusion_matrix(y_test, y_pred_test))

In [18]:
# # Evaluate on the training set
# print("\nTraining Set Results:")
# print("Accuracy:", accuracy_score(y_train, y_pred_train))
# print("\nClassification Report (Train):")
# print(classification_report(y_train, y_pred_train))
# print("\nConfusion Matrix (Train):")
# print(confusion_matrix(y_train, y_pred_train))