In [1]:
# Import the necessary packages
import os

# For calculating
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

#for modeling
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, make_scorer, classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor, CatBoostClassifier

# for plotting
import matplotlib.pyplot as plt
import seaborn as sns




In [2]:
X_train = pd.read_csv("../data/X_train_resampled.csv")
y_train = pd.read_csv("../data/y_train_resampled.csv")
X_test = pd.read_csv("../data/X_test_resampled.csv")
y_test = pd.read_csv("../data/y_test_resampled.csv")

In [3]:
X_train.head()

Unnamed: 0,one_hot__airline_5K,one_hot__airline_5M,one_hot__airline_6P,one_hot__airline_BJ,one_hot__airline_D4,one_hot__airline_GJ,one_hot__airline_GW,one_hot__airline_OL,one_hot__airline_PS,one_hot__airline_QS,...,one_hot__iso_country_arr_TR,one_hot__iso_country_arr_UA,one_hot__type_dep_large_airport,one_hot__type_dep_medium_airport,one_hot__type_dep_small_airport,one_hot__type_arr_large_airport,one_hot__type_arr_medium_airport,one_hot__type_arr_small_airport,passthrough__is_same_country,passthrough__std_time_dec
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,False,11.4
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,False,18.2
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,False,13.8
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,False,9.3
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,False,10.0


In [4]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 138590 entries, 0 to 138589
Columns: 159 entries, one_hot__airline_5K to passthrough__std_time_dec
dtypes: bool(1), float64(158)
memory usage: 167.2 MB


### CatBoost with default parameters

In [5]:
# Initialize the CatBoostClassifier with indices
model = CatBoostClassifier()

# Fit the model
model.fit(X_train, y_train)

Learning rate set to 0.101754
0:	learn: 1.5922897	total: 95.4ms	remaining: 1m 35s
1:	learn: 1.5787452	total: 121ms	remaining: 1m
2:	learn: 1.5684694	total: 147ms	remaining: 48.7s
3:	learn: 1.5600740	total: 165ms	remaining: 41.1s
4:	learn: 1.5527720	total: 182ms	remaining: 36.2s
5:	learn: 1.5463900	total: 207ms	remaining: 34.3s
6:	learn: 1.5408846	total: 227ms	remaining: 32.2s
7:	learn: 1.5362080	total: 247ms	remaining: 30.7s
8:	learn: 1.5325053	total: 268ms	remaining: 29.5s
9:	learn: 1.5284181	total: 290ms	remaining: 28.8s
10:	learn: 1.5259446	total: 313ms	remaining: 28.1s
11:	learn: 1.5230419	total: 337ms	remaining: 27.7s
12:	learn: 1.5197319	total: 362ms	remaining: 27.5s
13:	learn: 1.5173802	total: 381ms	remaining: 26.8s
14:	learn: 1.5151364	total: 399ms	remaining: 26.2s
15:	learn: 1.5132302	total: 418ms	remaining: 25.7s
16:	learn: 1.5112905	total: 437ms	remaining: 25.3s
17:	learn: 1.5095484	total: 456ms	remaining: 24.9s
18:	learn: 1.5080969	total: 474ms	remaining: 24.5s
19:	learn: 1

<catboost.core.CatBoostClassifier at 0x16526a9d0>

In [6]:
# for y_test without dummies

# cat_features_indices = [X_train.columns.get_loc(col) for col in [
#     'airline', 'std_day', 'sta_day', 'std_month', 'sta_month',
#     'iso_country_dep', 'type_dep',
#     'iso_country_arr', 'type_arr', 'is_same_country',
# ]]

# # Initialize the CatBoostRegressor with indices
# model = CatBoostRegressor(cat_features=cat_features_indices)

# # Fit the model
# model.fit(X_train, y_train)

In [18]:
# Get feature importance
feature_importance = model.get_feature_importance()

# Create a DataFrame for better visualization
feature_importance_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': feature_importance
})

# Sort the DataFrame by importance
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Display the feature importance
print(feature_importance_df.head(20))

                              Feature  Importance
158         passthrough__std_time_dec   27.186395
152  one_hot__type_dep_medium_airport    4.441372
99        one_hot__iso_country_dep_TN    3.240976
151   one_hot__type_dep_large_airport    2.801645
29          one_hot__std_month_August    2.414249
68        one_hot__iso_country_dep_FR    2.316068
155  one_hot__type_arr_medium_airport    2.210377
154   one_hot__type_arr_large_airport    2.169465
148       one_hot__iso_country_arr_TN    2.132640
157      passthrough__is_same_country    1.984174
118       one_hot__iso_country_arr_FR    1.761564
17            one_hot__std_day_Sunday    1.403861
16          one_hot__std_day_Saturday    1.386273
18          one_hot__std_day_Thursday    1.256389
19           one_hot__std_day_Tuesday    1.248895
33            one_hot__std_month_July    1.199566
38         one_hot__std_month_October    1.191360
14            one_hot__std_day_Friday    1.183517
32         one_hot__std_month_January    1.164636


In [19]:
# Make predictions
y_pred_test = model.predict(X_test)
y_pred_train = model.predict(X_train)

In [20]:
# Evaluate on the test set
print("Test Set Results:")
print("Accuracy:", accuracy_score(y_test, y_pred_test))
print("\nClassification Report (Test):")
print(classification_report(y_test, y_pred_test))
print("\nConfusion Matrix (Test):")
print(confusion_matrix(y_test, y_pred_test))


Test Set Results:
Accuracy: 0.3930401366353544

Classification Report (Test):
              precision    recall  f1-score   support

           0       0.52      0.55      0.54      4858
           1       0.53      0.37      0.44      6904
           2       0.34      0.30      0.32      3741
           3       0.18      0.26      0.21      1798
           4       0.18      0.37      0.24      1435

    accuracy                           0.39     18736
   macro avg       0.35      0.37      0.35     18736
weighted avg       0.43      0.39      0.40     18736


Confusion Matrix (Test):
[[2689 1026  313  353  477]
 [1528 2539 1288  745  804]
 [ 435  782 1141  717  666]
 [ 266  252  359  463  458]
 [ 222  161  245  275  532]]


In [21]:
# Evaluate on the training set
print("\nTraining Set Results:")
print("Accuracy:", accuracy_score(y_train, y_pred_train))
print("\nClassification Report (Train):")
print(classification_report(y_train, y_pred_train))
print("\nConfusion Matrix (Train):")
print(confusion_matrix(y_train, y_pred_train))


Training Set Results:
Accuracy: 0.45313514683599104

Classification Report (Train):
              precision    recall  f1-score   support

           0       0.51      0.59      0.55     27718
           1       0.39      0.38      0.38     27718
           2       0.41      0.38      0.40     27718
           3       0.45      0.40      0.42     27718
           4       0.49      0.52      0.50     27718

    accuracy                           0.45    138590
   macro avg       0.45      0.45      0.45    138590
weighted avg       0.45      0.45      0.45    138590


Confusion Matrix (Train):
[[16475  5407  1733  1907  2196]
 [ 5938 10462  4902  3072  3344]
 [ 2931  5113 10512  4542  4620]
 [ 3421  3438  4804 11021  5034]
 [ 3659  2596  3419  3714 14330]]


## with hyperparameter search

In [11]:
# # Initialize the CatBoostClassifier
# model_hp = CatBoostClassifier(verbose=0, random_state=42)

# # Define the parameter grid for random search
# param_grid = {
#     'iterations': np.arange(100, 500, 50),  # Number of boosting iterations
#     'learning_rate': np.linspace(0.01, 0.3, 10),  # Learning rate
#     'depth': np.arange(4, 10),  # Depth of the trees
#     'l2_leaf_reg': np.arange(1, 10),  # L2 regularization coefficient
# }

# # Initialize RandomizedSearchCV
# random_search = RandomizedSearchCV(
#     estimator=model_hp,
#     param_distributions=param_grid,
#     n_iter=50,  # Number of random combinations to try
#     scoring='accuracy',  # Use accuracy as the scoring metric
#     cv=5,  # 5-fold cross-validation
#     verbose=1,
#     random_state=42,
#     n_jobs=-1  # Use all available cores for parallel computation
# )

In [12]:
# # Fit RandomizedSearchCV on the training data
# random_search.fit(X_train, y_train)

# # Display the best parameters and best score
# print("Best Parameters:", random_search.best_params_)
# print("Best Accuracy Score:", random_search.best_score_)

In [13]:
# # Use the best model for predictions
# best_model = random_search.best_estimator_

In [14]:
# # Get feature importance
# feature_importance = best_model.get_feature_importance()

# # Create a DataFrame for better visualization
# feature_importance_df = pd.DataFrame({
#     'Feature': X_train.columns,
#     'Importance': feature_importance
# })

# # Sort the DataFrame by importance
# feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# # Display the feature importance
# print(feature_importance_df.head(20))

In [15]:
# # Make predictions
# y_pred_test = best_model.predict(X_test)
# y_pred_train = best_model.predict(X_train)

In [16]:
# # Evaluate on the test set
# print("Test Set Results:")
# print("Accuracy:", accuracy_score(y_test, y_pred_test))
# print("\nClassification Report (Test):")
# print(classification_report(y_test, y_pred_test))
# print("\nConfusion Matrix (Test):")
# print(confusion_matrix(y_test, y_pred_test))

In [17]:
# # Evaluate on the training set
# print("\nTraining Set Results:")
# print("Accuracy:", accuracy_score(y_train, y_pred_train))
# print("\nClassification Report (Train):")
# print(classification_report(y_train, y_pred_train))
# print("\nConfusion Matrix (Train):")
# print(confusion_matrix(y_train, y_pred_train))