In [None]:
# Import necessary libraries
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import Ridge, LinearRegression
from lightgbm import LGBMRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import RandomizedSearchCV
import numpy as np
import pandas as pd

RSEED = 42

In [2]:
df_train_prepro = pd.read_csv('data/preprocessed_train_data.csv')

In [3]:
y = df_train_prepro['target']
X = df_train_prepro.drop(columns=['target', 'Unnamed: 0'])

In [17]:
 # Define the categorical features
num_col = ['duration']
cat_col = ['departure_point', 'arrival_point', 'flight_status', 'aircraft_code']

# Fit and transform categorical columns
encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
X_cat = encoder.fit_transform(X[cat_col])

# Get feature names for encoded columns (optional)
encoded_cat_cols = encoder.get_feature_names_out(cat_col)

# Combine with numerical columns
X_num = X[num_col].values
X_encoded = np.hstack([X_num, X_cat])

# Now split the encoded data
X_train_1, X_train_2, y_train_1, y_train_2 = train_test_split(X_encoded, y, stratify=y, test_size=0.2, random_state=RSEED)



XGB RandomisedSearchCV

Find the best parameters for XGBRegressor

In [50]:
# Fit model to training data
xgb_best = XGBRegressor(objective='reg:squarederror', random_state=42)

In [51]:
#hyperparameter grid
xgb_param_grid = {
    'n_estimators': [50, 100, 200, 300],
    'max_depth': [3, 4, 5, 6, 8, 10],
    'learning_rate': [0.01, 0.05, 0.1, 0.2, 0.3],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'gamma': [0, 0.1, 0.2, 0.5, 1],
    'min_child_weight': [1, 3, 5, 7],
    'scale_pos_weight': [1, 2, 3]
}

xgb_ransearcv_best = RandomizedSearchCV(estimator=xgb_best, param_distributions=xgb_param_grid, 
                          n_iter=100, scoring='accuracy', cv=3, 
                          verbose=1, random_state=42, n_jobs=-1)
xgb_ransearcv_best.fit(X_train_1, y_train_1)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


OSError: [WinError 1450] Insufficient system resources exist to complete the requested service

In [None]:
rf_param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [5, 10, None],
    'min_samples_split': [2, 5]
}

xgb_param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2]
}

ridge_param_grid = {
    'alpha': [0.1, 1.0, 10.0]
}

lgbm_param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2]
}

In [None]:
from sklearn.model_selection import GridSearchCV

# Random Forest
rf_search = GridSearchCV(RandomForestRegressor(random_state=42), rf_param_grid, cv=3, scoring='neg_root_mean_squared_error', n_jobs=-1)
rf_search.fit(X_train_1, y_train_1)
best_rf = rf_search.best_estimator_

# XGBoost
xgb_search = GridSearchCV(XGBRegressor(objective='reg:squarederror', random_state=42), xgb_param_grid, cv=3, scoring='neg_root_mean_squared_error', n_jobs=-1)
xgb_search.fit(X_train_1, y_train_1)
best_xgb = xgb_search.best_estimator_

# Ridge
ridge_search = GridSearchCV(Ridge(random_state=42), ridge_param_grid, cv=3, scoring='neg_root_mean_squared_error', n_jobs=-1)
ridge_search.fit(X_train_1, y_train_1)
best_ridge = ridge_search.best_estimator_

# LightGBM
lgbm_search = GridSearchCV(LGBMRegressor(random_state=42), lgbm_param_grid, cv=3, scoring='neg_root_mean_squared_error', n_jobs=-1)
lgbm_search.fit(X_train_1, y_train_1)
best_lgbm = lgbm_search.best_estimator_

Create Base models + Meta model

In [23]:
# Base models
random_forest = RandomForestRegressor(random_state=42)
xgb = XGBRegressor(objective='reg:squarederror', random_state=42)
ridge = Ridge(random_state=42)
lgbm = LGBMRegressor(random_state=42)

# Meta-model
linear_model = LinearRegression()

In [5]:
X_train_1, X_train_2, y_train_1, y_train_2 = train_test_split(X, y, stratify=y, test_size=0.2, random_state=RSEED)

In [None]:
# Fit all the base estimators on the 1st half of the train dataset
rf_model = random_forest.fit(X_train_1, y_train_1)
xgb_model = xgb.fit(X_train_1, y_train_1)
ridge_model = ridge.fit(X_train_1, y_train_1)
lgbm_model = lgbm.fit(X_train_1, y_train_1)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009805 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 613
[LightGBM] [Info] Number of data points in the train set: 79797, number of used features: 211
[LightGBM] [Info] Start training from score 9.986817


In [None]:
# Then with the second half of the train dataset we predict the values from the base estimators
rf_pred = rf_model.predict(X_train_2)
xgb_pred = xgb_model.predict(X_train_2)
ridge_pred = ridge_model.predict(X_train_2)
lgbm_pred = lgbm_model.predict(X_train_2)

# Combine base model predictions for meta-model input
combine_X_pred_test = pd.concat([
	pd.DataFrame(rf_pred),
	pd.DataFrame(xgb_pred),
	pd.DataFrame(ridge_pred),
	pd.DataFrame(lgbm_pred)
], axis=1)

In [42]:
# Fit the final estimator on the combined probabilities and target values
linear_model.fit(combine_X_pred_test, y_train_2)

In [None]:
# Predict with meta-model
y_pred = linear_model.predict(combine_X_pred_test)

In [43]:
# Ensure predictions are non-negative
y_pred[y_pred < 0] = 0
y_train_2 = y_train_2.clip(lower=0)

In [44]:
# Evaluate the model
mse = mean_squared_error(y_train_2, y_pred)
r2 = r2_score(y_train_2, y_pred)
rmse = np.sqrt(mean_squared_error(y_train_2, y_pred))
print(f'Mean Squared Error: {mse}')
print(f'R2 Score: {r2}')
print(f"Stacking RMSE: {rmse:.2f}")

Mean Squared Error: 1235.243718722728
R2 Score: 0.07648609153972563
Stacking RMSE: 35.15
