In [None]:
# Import necessary libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from scipy import sparse
from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import (
    GridSearchCV,
    RandomizedSearchCV,
    cross_val_predict,
    train_test_split,
)
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor


# Shared random seed passed as `random_state` to the splits and estimators below.
RSEED = 42

In [None]:
# Load the training table from disk.
df_train_prepro = pd.read_csv(filepath_or_buffer='data/data_train.csv')

In [None]:
# Separate the feature columns (X) from the regression target (y).
X = df_train_prepro.drop('target', axis=1)
y = df_train_prepro['target']

In [None]:
# Columns that are standard-scaled before modelling.
num_col = [
    'duration', 'dep_temp', 'dep_precip', 'dep_wind',
    'arr_temp', 'arr_precip', 'arr_wind',
    'holiday_length', 'num_passenger_year',
    'distance_km', 'expected_duration',
    'delay_relative_to_expected', 'duration_ratio',
    'dep_lat', 'dep_long', 'arr_lat', 'arr_long',
]

# Columns that are one-hot encoded (and passed to CatBoost as categorical features).
cat_col = [
    'departure_point', 'arrival_point', 'flight_status', 'aircraft_code',
    'dep_hour', 'dep_day', 'dep_month', 'dep_dayofweek', 'dep_quarter',
    'dep_season', 'dep_is_weekend', 'dep_time_of_day',
    'arr_hour', 'arr_day', 'arr_month', 'arr_dayofweek', 'arr_quarter',
    'arr_season', 'arr_is_weekend', 'arr_time_of_day',
    'route', 'is_holiday', 'Country', 'City', 'aircraft_model',
]

In [None]:
# Build the model matrix: scaled numeric columns followed by one-hot encoded
# categorical columns, stored as a float32 CSR matrix.
# NOTE(review): both transformers are fitted on every row before the split below,
# so holdout rows influence the scaling/encoding — confirm this is intended.
scaler = StandardScaler()
encoder = OneHotEncoder(handle_unknown='ignore')

# Numeric part: dense float32 array
X_num_scaled = scaler.fit_transform(X[num_col]).astype(np.float32)

# Categorical part: sparse one-hot matrix
X_cat_sparse = encoder.fit_transform(X[cat_col])

# Column order is numeric first, then one-hot
X_encoded_scaled = sparse.hstack(
    [X_num_scaled, X_cat_sparse.astype(np.float32)]
).tocsr()

# 80/20 split of the encoded matrix (X_train_2 is the holdout part)
X_train_1, X_train_2, y_train_1, y_train_2 = train_test_split(
    X_encoded_scaled,
    y,
    test_size=0.2,
    stratify=y,
    random_state=RSEED,
)

In [None]:
# Repeat the split on row positions (same test_size, stratify and seed as the
# encoded split) so the raw, un-encoded rows can be selected for CatBoost.
row_positions = np.arange(len(X))
train_idx, test_idx = train_test_split(
    row_positions,
    test_size=0.2,
    stratify=y,
    random_state=RSEED,
)

X_train_1_raw, X_train_2_raw = X.iloc[train_idx], X.iloc[test_idx]
y_train_1_raw, y_train_2_raw = y.iloc[train_idx], y.iloc[test_idx]

# Positions of the categorical columns within X
cat_features_idx = list(map(X.columns.get_loc, cat_col))

In [None]:
# Base models - HYPE
# Each estimator below is only configured here; fitting happens in a later cell.

# max_features=1.0 (all features) is what 'auto' meant for RandomForestRegressor;
# the 'auto' string is rejected by scikit-learn >= 1.3.
random_forest_hype = RandomForestRegressor(max_depth=30,
                                      max_features=1.0,
                                      min_samples_leaf=2,
                                      min_samples_split=20,
                                      n_estimators=300,
                                      random_state=RSEED)
xgb_hype = XGBRegressor(objective='reg:squarederror',
                   colsample_bytree=0.5111,
                   gamma=3.6609,
                   learning_rate=0.0583,
                   max_depth=10,
                   n_estimators=266,
                   reg_lambda=9.6965,
                   subsample=0.5241,
                   random_state=RSEED)
ridge_hype = Ridge(alpha=1.5,
                   random_state=RSEED,
                   solver="sag")
knn_hype = KNeighborsRegressor(weights='distance',
                               p=1,
                               n_neighbors=28)
lgbm_hype = LGBMRegressor(subsample=0.8,
                     reg_lambda=1.0,
                     reg_alpha=1.0,
                     num_leaves=63,
                     n_estimators=300,
                     max_depth=-1,
                     learning_rate=0.05,
                     colsample_bytree=1.0,
                     random_state=RSEED)
catboost_hype = CatBoostRegressor(random_strength=10,
                             learning_rate=0.1,
                             l2_leaf_reg=9,
                             iterations=500,
                             depth=8,
                             border_count=64,
                             bagging_temperature=0.5,
                             random_state=RSEED)
# AdaBoost boosts an XGBoost regressor rather than the default base estimator
adaboost_hype = AdaBoostRegressor(estimator=XGBRegressor(max_depth=5),
                                  learning_rate=0.1,
                                  n_estimators=50,
                                  random_state=RSEED)
gbr_hype = GradientBoostingRegressor(n_estimators=200,
                                learning_rate=0.1,
                                max_depth=7,
                                subsample=0.8,
                                random_state=RSEED)
svr_hype = SVR(C=41.54172090104322,
               epsilon=0.001,
               gamma=0.0020588729828704562)

In [None]:
# Meta-model to combine all the base models - HYPE
# Seeded with the notebook-wide RSEED so every estimator shares one seed setting.
meta_xgb_hype = XGBRegressor(objective='reg:squarederror', random_state=RSEED)

In [None]:
# --- Base models trained directly on the sparse encoded/scaled matrix ---
rf_model_hype = random_forest_hype.fit(X_train_1, y_train_1)
xgb_model_hype = xgb_hype.fit(X_train_1, y_train_1)

# Ridge is trained on a dense copy of the training matrix
ridge_model_hype = ridge_hype.fit(X_train_1.toarray(), y_train_1)

# KNN is trained on the sparse matrix as-is (no dense conversion happens here)
knn_model_hype = knn_hype.fit(X_train_1, y_train_1)

lgbm_model_hype = lgbm_hype.fit(X_train_1, y_train_1)

# --- CatBoost is trained on the raw (un-encoded) rows instead ---
# The date/time columns are removed before fitting
datetime_cols = ['departure_time', 'arrival_time', 'departure_date', 'arrival_date']
X_train_1_raw_catboost = X_train_1_raw.drop(columns=datetime_cols)

# Positions of the categorical columns in the reduced frame
cat_features_idx_catboost = [
    X_train_1_raw_catboost.columns.get_loc(name)
    for name in cat_col
    if name in X_train_1_raw_catboost.columns
]

catboost_model_hype = catboost_hype.fit(
    X_train_1_raw_catboost,
    y_train_1_raw,
    cat_features=cat_features_idx_catboost,
)

# --- Remaining models on the encoded/scaled matrix ---
adaboost_model_hype = adaboost_hype.fit(X_train_1, y_train_1)
gbr_model_hype = gbr_hype.fit(X_train_1, y_train_1)

# SVR is trained on a dense copy of the training matrix
svr_model_hype = svr_hype.fit(X_train_1.toarray(), y_train_1)

In [None]:
# Holdout predictions from every fitted base model.
# CatBoost scores the raw holdout rows (date/time columns removed); SVR scores a
# dense copy; all other models score the sparse encoded matrix.
rf_pred_hype = rf_model_hype.predict(X_train_2)
xgb_pred_hype = xgb_model_hype.predict(X_train_2)
ridge_pred_hype = ridge_model_hype.predict(X_train_2)
knn_pred_hype = knn_model_hype.predict(X_train_2)
lgbm_pred_hype = lgbm_model_hype.predict(X_train_2)
catboost_pred_hype = catboost_model_hype.predict(
    X_train_2_raw.drop(columns=datetime_cols)
)
adaboost_pred_hype = adaboost_model_hype.predict(X_train_2)
gbr_pred_hype = gbr_model_hype.predict(X_train_2)
svr_pred_hype = svr_model_hype.predict(X_train_2.toarray())

# One column per base model, in a fixed order, as input for the meta-model
combine_X_pred_test_hype = pd.concat(
    [
        pd.DataFrame(pred)
        for pred in (
            rf_pred_hype,
            xgb_pred_hype,
            ridge_pred_hype,
            knn_pred_hype,
            lgbm_pred_hype,
            catboost_pred_hype,
            adaboost_pred_hype,
            gbr_pred_hype,
            svr_pred_hype,
        )
    ],
    axis=1,
)

In [None]:
# The concatenated frame has repeated column labels; give each base-model
# column its own name before fitting the meta-model.
n_meta_features = combine_X_pred_test_hype.shape[1]
combine_X_pred_test_hype.columns = [f'hype_model_{i}' for i in range(n_meta_features)]

# Train the meta-model on the base-model holdout predictions
meta_xgb_hype.fit(combine_X_pred_test_hype, y_train_2)

In [None]:
# The meta-model was fitted on exactly these rows, so predicting them with the
# fitted model would give in-sample (over-optimistic) stacking metrics.
# cross_val_predict fits clones of the meta-model on 4/5 of the holdout and
# predicts the remaining fold, so every row is scored by a model that never saw it.
# The already-fitted meta_xgb_hype is left untouched.
y_pred_hype = cross_val_predict(meta_xgb_hype, combine_X_pred_test_hype, y_train_2, cv=5)

In [None]:
# Floor negative stacked predictions at zero (written back into the same array)
y_pred_hype[:] = np.where(y_pred_hype < 0, 0, y_pred_hype)
# Apply the same floor to the holdout target
y_train_2 = y_train_2.mask(y_train_2 < 0, 0)

In [None]:
# Holdout metrics for the stacked predictions
mse_hype = mean_squared_error(y_train_2, y_pred_hype)
rmse_hype = np.sqrt(mse_hype)
r2_hype = r2_score(y_train_2, y_pred_hype)

print(f'Hype Mean Squared Error: {mse_hype}')
print(f'Hype R2 Score: {r2_hype}')
print(f"Hype Stacking RMSE: {rmse_hype:.2f}")

In [None]:
# Holdout RMSE of every base model, for comparison with the stacked model.
# One loop replaces nine copy-pasted clip/print pairs; the printed lines are unchanged.
holdout_preds_by_model = {
    'RF': rf_pred_hype,
    'XGB': xgb_pred_hype,
    'Ridge': ridge_pred_hype,
    'KNN': knn_pred_hype,
    'LGBM': lgbm_pred_hype,
    'CatBoost': catboost_pred_hype,
    'AdaBoost': adaboost_pred_hype,
    'GBR': gbr_pred_hype,
    'SVR': svr_pred_hype,
}

for model_label, model_preds in holdout_preds_by_model.items():
    # Floor negative predictions at zero, in place, so the *_pred_hype arrays
    # themselves are updated exactly as before
    model_preds[model_preds < 0] = 0
    print(f"Hype {model_label} RMSE: {np.sqrt(mean_squared_error(y_train_2, model_preds))}")

In [None]:
# X_encoded_scaled holds the scaled numeric columns first and the one-hot columns
# after them, so the labels must cover both groups in that order. Passing only
# the one-hot names gives a length mismatch and raises ValueError.
encoded_train_df = pd.DataFrame(
    X_encoded_scaled.toarray(),
    columns=num_col + list(encoder.get_feature_names_out(cat_col)),
)

In [None]:
# Load the already-encoded test table from disk.
encoded_test_data = pd.read_csv(filepath_or_buffer='data/encoded_test_data.csv')

In [None]:
# Store the encoded test table as a CSR sparse matrix (via its numpy array)
X_encoded_test_sparse = sparse.csr_matrix(encoded_test_data.to_numpy())

In [None]:
# Dense frame of the encoded test matrix, labelled with the columns of the loaded CSV
encoded_test_df = pd.DataFrame(
    data=X_encoded_test_sparse.toarray(),
    columns=encoded_test_data.columns,
)

In [None]:
# Ensure test DataFrame has same columns as train DataFrame after one-hot encoding

def align_test_to_train(train_df, test_df):
    """Return a copy of ``test_df`` whose columns match ``train_df`` exactly.

    Columns that exist in ``train_df`` but not in ``test_df`` are added and
    filled with 0, columns that exist only in ``test_df`` are dropped, and the
    result follows the column order of ``train_df``.

    Args:
        train_df: DataFrame whose columns define the target layout.
        test_df: DataFrame to align. It is not modified.

    Returns:
        A new DataFrame with the rows of ``test_df`` and the columns of
        ``train_df``.
    """
    # reindex adds, drops and reorders in one pass; unlike inserting the missing
    # columns one at a time it neither mutates the caller's frame nor fragments it.
    return test_df.reindex(columns=train_df.columns, fill_value=0)

# Column labels of the training matrix: numeric names first, then one-hot names
encoded_cat_cols = encoder.get_feature_names_out(cat_col)
all_feature_names = [*num_col, *encoded_cat_cols]

# Dense frames for the training matrix and the encoded test matrix
train_df = pd.DataFrame(data=X_encoded_scaled.toarray(), columns=all_feature_names)
test_df = pd.DataFrame(
    data=X_encoded_test_sparse.toarray(),
    columns=encoded_test_data.columns,
)

# Test frame with the training column layout
aligned_test_df = align_test_to_train(train_df, test_df)

In [None]:
# Raw (un-encoded) test rows for the CatBoost model.
# Adjust the path/filename below if the raw test file lives elsewhere.
raw_test_data = pd.read_csv('data/data_test.csv')

# Remove the same date/time columns that were removed before CatBoost training
datetime_cols = ['departure_time', 'arrival_time', 'departure_date', 'arrival_date']
raw_test_data_catboost = raw_test_data.drop(datetime_cols, axis=1)

# Positions of the categorical columns in the reduced test frame
cat_features_idx_catboost = [
    raw_test_data_catboost.columns.get_loc(name)
    for name in cat_col
    if name in raw_test_data_catboost.columns
]

In [None]:
# Select and order the test columns exactly like the CatBoost training frame
raw_test_data_catboost = raw_test_data_catboost[X_train_1_raw_catboost.columns]

# Cast every categorical column that is present to string
categorical_as_str = {
    name: str for name in cat_col if name in raw_test_data_catboost.columns
}
raw_test_data_catboost = raw_test_data_catboost.astype(categorical_as_str)

In [None]:
# Generate base model predictions for the test set.
# CatBoost scores the raw test rows; Ridge and SVR score a numpy copy of the
# aligned frame; the other models score the aligned frame directly.
rf_pred_test = rf_model_hype.predict(aligned_test_df)
xgb_pred_test = xgb_model_hype.predict(aligned_test_df)
ridge_pred_test = ridge_model_hype.predict(aligned_test_df.to_numpy())
knn_pred_test = knn_model_hype.predict(aligned_test_df)
lgbm_pred_test = lgbm_model_hype.predict(aligned_test_df)
catboost_pred_test = catboost_model_hype.predict(raw_test_data_catboost)
adaboost_pred_test = adaboost_model_hype.predict(aligned_test_df)
gbr_pred_test = gbr_model_hype.predict(aligned_test_df)
svr_pred_test = svr_model_hype.predict(aligned_test_df.to_numpy())

# Combine predictions into a DataFrame whose nine columns carry the same names
# and order the meta-model was fitted with (hype_model_0 .. hype_model_8).
# A repeated 'hype_model_5' key would silently drop the CatBoost column and
# shift every later name by one.
combine_X_pred_test = pd.DataFrame({
    'hype_model_0': rf_pred_test,
    'hype_model_1': xgb_pred_test,
    'hype_model_2': ridge_pred_test,
    'hype_model_3': knn_pred_test,
    'hype_model_4': lgbm_pred_test,
    'hype_model_5': catboost_pred_test,
    'hype_model_6': adaboost_pred_test,
    'hype_model_7': gbr_pred_test,
    'hype_model_8': svr_pred_test
})

In [None]:
# Meta-model input for the test set: one uniquely named column per base model,
# nine in total, in the order RF, XGB, Ridge, KNN, LGBM, CatBoost, AdaBoost, GBR, SVR.
combine_X_pred_test = pd.DataFrame(dict(
    hype_model_0=rf_pred_test,
    hype_model_1=xgb_pred_test,
    hype_model_2=ridge_pred_test,
    hype_model_3=knn_pred_test,
    hype_model_4=lgbm_pred_test,
    hype_model_5=catboost_pred_test,  # CatBoost predictions (raw_test_data_catboost)
    hype_model_6=adaboost_pred_test,
    hype_model_7=gbr_pred_test,
    hype_model_8=svr_pred_test,
))

In [None]:
# Generate base model predictions for the test set.
rf_pred_test = rf_model_hype.predict(aligned_test_df)
xgb_pred_test = xgb_model_hype.predict(aligned_test_df)
ridge_pred_test = ridge_model_hype.predict(aligned_test_df.to_numpy())
knn_pred_test = knn_model_hype.predict(aligned_test_df)
lgbm_pred_test = lgbm_model_hype.predict(aligned_test_df)
# CatBoost scores the raw (un-encoded) test rows. predict() takes no
# `cat_features` argument — passing one raises TypeError — and the categorical
# columns were already declared when the model was fitted.
catboost_pred_test = catboost_model_hype.predict(raw_test_data_catboost)
adaboost_pred_test = adaboost_model_hype.predict(aligned_test_df)
gbr_pred_test = gbr_model_hype.predict(aligned_test_df)
svr_pred_test = svr_model_hype.predict(aligned_test_df.to_numpy())

# Combine predictions into a DataFrame with the nine column names the meta-model
# was fitted on (hype_model_0 .. hype_model_8). A repeated 'hype_model_5' key
# would drop the CatBoost column and leave only eight, misnamed, columns.
combine_X_pred_test = pd.DataFrame({
    'hype_model_0': rf_pred_test,
    'hype_model_1': xgb_pred_test,
    'hype_model_2': ridge_pred_test,
    'hype_model_3': knn_pred_test,
    'hype_model_4': lgbm_pred_test,
    'hype_model_5': catboost_pred_test,
    'hype_model_6': adaboost_pred_test,
    'hype_model_7': gbr_pred_test,
    'hype_model_8': svr_pred_test
})

In [None]:
# Final stacked prediction: the meta-model scores the base-model test predictions
y_pred_test = meta_xgb_hype.predict(X=combine_X_pred_test)

In [None]:
# Floor negative test predictions at zero (written back into the same array)
y_pred_test[:] = np.where(y_pred_test < 0, 0, y_pred_test)
# Apply the same floor to the holdout target
y_train_2 = y_train_2.mask(y_train_2 < 0, 0)

In [None]:
# Single-column frame holding the final test predictions
y_pred_test_df = pd.DataFrame({'target': y_pred_test})

In [None]:
# Feature importances of the base models that expose them.
# KNN and SVR are skipped: no importance attribute is read from them here.
rf_importances = rf_model_hype.feature_importances_
xgb_importances = xgb_model_hype.feature_importances_

# Ridge: absolute coefficient value is used as the importance proxy
ridge_importances = np.abs(ridge_model_hype.coef_)

lgbm_importances = lgbm_model_hype.feature_importances_

# CatBoost was fitted on the raw columns, not on the one-hot matrix, so its
# importances are kept separately and are not merged into the table below.
catboost_importances = catboost_model_hype.get_feature_importance()

adaboost_importances = adaboost_model_hype.feature_importances_
gbr_importances = gbr_model_hype.feature_importances_

# Labels of the encoded matrix: numeric names first, then one-hot names
encoded_cat_cols = encoder.get_feature_names_out(cat_col)
feature_names = num_col + list(encoded_cat_cols)

# One row per encoded feature, one column per model (raw importance values)
importances_df = pd.DataFrame({
    'feature': feature_names,
    'RandomForest': rf_importances,
    'XGBoost': xgb_importances,
    'Ridge': ridge_importances,
    'LightGBM': lgbm_importances,
    'AdaBoost': adaboost_importances,
    'GradientBoosting': gbr_importances
})

# The models report importances on different scales (e.g. absolute Ridge
# coefficients next to tree-based scores), so a plain mean would be dominated
# by whichever model uses the largest numbers. Each model column is rescaled
# to sum to 1 before averaging; the raw columns above are left as they are.
model_cols = [
    'RandomForest',
    'XGBoost',
    'Ridge',
    'LightGBM',
    'AdaBoost',
    'GradientBoosting',
]
importance_shares = importances_df[model_cols] / importances_df[model_cols].sum(axis=0)
importances_df['avg_importance'] = importance_shares.mean(axis=1)

# Show top 15 features by average importance across models
importances_df.sort_values('avg_importance', ascending=False).head(15)