In [1]:
# Import necessary libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.model_selection import cross_val_predict
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from xgboost import XGBRegressor

# Seed passed as `random_state` to the estimators built later in this notebook.
RSEED = 42

In [2]:
# Load the dataset
# Both CSV files are read relative to the notebook's working directory:
# the first becomes the labelled training frame, the second the frame that
# is later passed to the fitted pipeline's predict().
df_real_train_final, df_real_test_final = (
    pd.read_csv(csv_path)
    for csv_path in ('data/data_with_distance.csv', 'data/Test.csv')
)

In [5]:
# Separate the 'target' column (what the regressors learn to predict) from
# the remaining feature columns.
y = df_real_train_final['target']
X = df_real_train_final.drop(columns=['target'])

# Hold out 20% of the rows for evaluation. The split is seeded with the
# notebook-wide RSEED instead of a second hard-coded literal, so changing the
# seed in one place changes it everywhere.
# NOTE(review): stratify=y treats every distinct target value as its own
# class. For a regression target that only works while each value occurs at
# least twice; otherwise train_test_split raises ValueError. Confirm this is
# intended, or stratify on binned target values instead.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RSEED, stratify=y)

In [None]:
# NOTE(review): a class with this same name is defined again in a later cell
# and shadows this definition once that cell runs; consider keeping only one.
class StackingTransformer(BaseEstimator, TransformerMixin):
    """Two-level stacking ensemble that can be the final step of a ``Pipeline``.

    ``fit`` trains every base model on ``X``/``y`` and then trains the meta
    model on a matrix with one column of base-model predictions per model.
    ``transform`` returns that prediction matrix for new data, and ``predict``
    passes it through the meta model.

    Parameters
    ----------
    base_models : list of estimators
        Level-0 models. They are fitted in place; the objects returned by
        their ``fit`` calls are stored in ``fitted_base_models``.
    meta_model : estimator
        Level-1 model, fitted in place on the base-model predictions.
    cv : int, cross-validation splitter or None, default=None
        Controls how the meta model's training matrix is built.
        ``None`` keeps the original behaviour: each base model predicts the
        same rows it was just fitted on, so the meta model is trained on
        in-sample (optimistic) predictions. Any other value is forwarded to
        ``cross_val_predict`` so the meta model is trained on out-of-fold
        predictions; the base models are still refitted on all rows
        afterwards for use in ``transform``/``predict``.

    Raises
    ------
    ValueError
        From ``fit`` when ``base_models`` is empty.
    """

    def __init__(self, base_models, meta_model, cv=None):
        self.base_models = base_models
        self.meta_model = meta_model
        self.cv = cv

    @staticmethod
    def _model_input(model, X):
        """Return ``X`` as it is handed to ``model``.

        Ridge and SVR instances receive ``X.toarray()`` whenever ``X`` exposes
        ``toarray``; every other model receives ``X`` unchanged.
        """
        if isinstance(model, (Ridge, SVR)) and hasattr(X, "toarray"):
            return X.toarray()
        return X

    def fit(self, X, y):
        """Fit all base models, then fit the meta model on their predictions."""
        if not self.base_models:
            raise ValueError("StackingTransformer needs at least one base model.")

        self.fitted_base_models = []
        meta_columns = []
        for model in self.base_models:
            model_X = self._model_input(model, X)
            if self.cv is not None:
                # Out-of-fold predictions: every row is predicted by a copy of
                # the model that did not see that row during training.
                meta_columns.append(cross_val_predict(model, model_X, y, cv=self.cv))
            fitted = model.fit(model_X, y)
            self.fitted_base_models.append(fitted)
            if self.cv is None:
                # Original behaviour: predict the rows the model was fitted on.
                meta_columns.append(fitted.predict(model_X))

        self.meta_model.fit(np.column_stack(meta_columns), y)
        return self

    def transform(self, X):
        """Return the base-model predictions for ``X``, one column per model."""
        return np.column_stack([
            fitted.predict(self._model_input(fitted, X))
            for fitted in self.fitted_base_models
        ])

    def predict(self, X):
        """Predict with the meta model on top of the base-model predictions."""
        return self.meta_model.predict(self.transform(X))

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

# Custom transformer for encoding and scaling
class EncodeScaleTransformer(BaseEstimator, TransformerMixin):
    """One-hot encode the categorical columns and standardise the numeric ones.

    Parameters
    ----------
    num_col : list of str
        Names of the columns passed through ``StandardScaler``.
    cat_col : list of str
        Names of the columns passed through ``OneHotEncoder``; categories not
        seen during ``fit`` are ignored at transform time.
    """

    def __init__(self, num_col, cat_col):
        self.num_col, self.cat_col = num_col, cat_col
        self.encoder = OneHotEncoder(handle_unknown='ignore')
        self.scaler = StandardScaler()

    def fit(self, X, y=None):
        """Fit the encoder and the scaler on their respective columns of ``X``."""
        for estimator, columns in ((self.encoder, self.cat_col),
                                   (self.scaler, self.num_col)):
            estimator.fit(X[columns])
        return self

    def transform(self, X):
        """Return one dense array: scaled numeric columns, then one-hot columns."""
        from scipy import sparse

        scaled_numeric = self.scaler.transform(X[self.num_col]).astype(np.float32)
        one_hot = self.encoder.transform(X[self.cat_col]).astype(np.float32)
        # The combined matrix is densified so that every downstream model can
        # consume it, at the cost of materialising all one-hot columns.
        combined = sparse.hstack([scaled_numeric, one_hot])
        return combined.toarray()

# Custom transformer for stacking predictions
class StackingTransformer(BaseEstimator, TransformerMixin):
    """Two-level stacking ensemble that can be the final step of a ``Pipeline``.

    ``fit`` trains every base model on ``X``/``y`` and then trains the meta
    model on a matrix with one column of base-model predictions per model.
    ``transform`` returns that prediction matrix for new data, and ``predict``
    passes it through the meta model.

    Parameters
    ----------
    base_models : list of estimators
        Level-0 models. They are fitted in place; the objects returned by
        their ``fit`` calls are stored in ``fitted_base_models``.
    meta_model : estimator
        Level-1 model, fitted in place on the base-model predictions.
    cv : int, cross-validation splitter or None, default=None
        Controls how the meta model's training matrix is built.
        ``None`` keeps the original behaviour: each base model predicts the
        same rows it was just fitted on, so the meta model is trained on
        in-sample (optimistic) predictions. Any other value is forwarded to
        ``cross_val_predict`` so the meta model is trained on out-of-fold
        predictions; the base models are still refitted on all rows
        afterwards for use in ``transform``/``predict``.

    Raises
    ------
    ValueError
        From ``fit`` when ``base_models`` is empty.
    """

    def __init__(self, base_models, meta_model, cv=None):
        self.base_models = base_models
        self.meta_model = meta_model
        self.cv = cv

    @staticmethod
    def _model_input(model, X):
        """Return ``X`` as it is handed to ``model``.

        Ridge and SVR instances receive ``X.toarray()`` whenever ``X`` exposes
        ``toarray``; every other model receives ``X`` unchanged.
        """
        if isinstance(model, (Ridge, SVR)) and hasattr(X, "toarray"):
            return X.toarray()
        return X

    def fit(self, X, y):
        """Fit all base models, then fit the meta model on their predictions."""
        if not self.base_models:
            raise ValueError("StackingTransformer needs at least one base model.")

        self.fitted_base_models = []
        meta_columns = []
        for model in self.base_models:
            model_X = self._model_input(model, X)
            if self.cv is not None:
                # Out-of-fold predictions: every row is predicted by a copy of
                # the model that did not see that row during training.
                meta_columns.append(cross_val_predict(model, model_X, y, cv=self.cv))
            fitted = model.fit(model_X, y)
            self.fitted_base_models.append(fitted)
            if self.cv is None:
                # Original behaviour: predict the rows the model was fitted on.
                meta_columns.append(fitted.predict(model_X))

        self.meta_model.fit(np.column_stack(meta_columns), y)
        return self

    def transform(self, X):
        """Return the base-model predictions for ``X``, one column per model."""
        return np.column_stack([
            fitted.predict(self._model_input(fitted, X))
            for fitted in self.fitted_base_models
        ])

    def predict(self, X):
        """Predict with the meta model on top of the base-model predictions."""
        return self.meta_model.predict(self.transform(X))

# Column groups consumed by EncodeScaleTransformer.
# num_col is standard-scaled; cat_col is one-hot encoded. The order of the
# names determines the order of the generated feature columns.
num_col = [
    'duration',
    'dep_temp', 'dep_precip', 'dep_wind',
    'arr_temp', 'arr_precip', 'arr_wind',
    'holiday_length', 'num_passenger_year', 'distance_km',
]
cat_col = [
    'departure_point', 'arrival_point', 'flight_status', 'aircraft_code',
    # departure-time parts
    'dep_hour', 'dep_day', 'dep_month', 'dep_dayofweek',
    'dep_quarter', 'dep_season', 'dep_is_weekend', 'dep_time_of_day',
    # arrival-time parts
    'arr_hour', 'arr_day', 'arr_month', 'arr_dayofweek',
    'arr_quarter', 'arr_season', 'arr_is_weekend', 'arr_time_of_day',
    'route', 'is_holiday', 'Country', 'City', 'aircraft_model',
]

# Level-0 learners for the stack. These are freshly constructed, unfitted
# estimators (mostly default hyper-parameters); StackingTransformer.fit
# trains each of them. Every estimator that accepts a seed gets RSEED.
base_models = [
    RandomForestRegressor(random_state=RSEED),
    XGBRegressor(objective='reg:squarederror', random_state=RSEED),
    Ridge(random_state=RSEED),
    KNeighborsRegressor(),
    LGBMRegressor(random_state=RSEED),
    CatBoostRegressor(verbose=0, random_state=RSEED),
    AdaBoostRegressor(n_estimators=100, random_state=RSEED),
    GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=RSEED),
    SVR()
]
# Level-1 learner trained on the base-model predictions; seeded with RSEED
# like the base models rather than a separate hard-coded literal.
meta_model = XGBRegressor(objective='reg:squarederror', random_state=RSEED)

# Chain preprocessing and stacking so both are fitted and applied together.
stacking_pipeline = Pipeline(steps=[
    ('encode_scale', EncodeScaleTransformer(num_col=num_col, cat_col=cat_col)),
    ('stacking', StackingTransformer(base_models=base_models, meta_model=meta_model)),
])

# Train on the training split.
stacking_pipeline.fit(X_train, y_train)

# Predict for the rows loaded into df_real_test_final.
# NOTE(review): the first pipeline step selects every name in num_col and
# cat_col from its input, so df_real_test_final must contain all of those
# columns - confirm the test file carries the same engineered features.
y_pred = stacking_pipeline.predict(df_real_test_final)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.020935 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3680
[LightGBM] [Info] Number of data points in the train set: 79797, number of used features: 804
[LightGBM] [Info] Start training from score 9.881625


In [None]:
# Floor negative values at zero: predictions are updated in place, the
# training labels are rebound to a floored copy.
# NOTE(review): in notebook order this runs after the pipeline was fitted,
# so flooring y_train here does not influence the trained models.
np.maximum(y_pred, 0, out=y_pred)
y_train = y_train.mask(y_train < 0, 0)

In [None]:
# Evaluate the model
# Metrics are computed on the held-out split (X_test / y_test) produced by
# train_test_split. y_pred cannot be used for scoring: it holds predictions
# for the rows of df_real_test_final, which do not correspond row-for-row to
# y_train (or to any label vector used in this notebook).
holdout_pred = np.clip(stacking_pipeline.predict(X_test), 0, None)  # no negative predictions
holdout_true = y_test.clip(lower=0)  # same zero floor on the reference labels

mse = mean_squared_error(holdout_true, holdout_pred)
r2 = r2_score(holdout_true, holdout_pred)
rmse = np.sqrt(mse)
print(f'Mean Squared Error: {mse}')
print(f'R2 Score: {r2}')
print(f"Stacking RMSE: {rmse:.2f}")