In [None]:
# Import necessary libraries
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import Ridge, LinearRegression

from sklearn.model_selection import train_test_split
from sklearn.ensemble import VotingClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor


import numpy as np
import pandas as pd

RSEED = 42

In [None]:
# 1. Load dataset
df = pd.read_csv('data_new/preprocessed_train_data_with_date_hol_concat.csv')
df.columns
df.isna().sum()

In [None]:
y = df['target']
X = df.drop(columns=['target'])

In [None]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
# Define the categorical features
num_col = ['duration','dep_temp', 'dep_precip', 'dep_wind', 'arr_temp',
       'arr_precip', 'arr_wind', 'holiday_length']
cat_col = ['departure_point', 'arrival_point', 'flight_status', 'aircraft_code','dep_hour',
       'dep_day', 'dep_month', 'dep_dayofweek', 'dep_quarter', 'dep_season',
       'dep_is_weekend', 'dep_time_of_day', 'arr_hour', 'arr_day', 'arr_month',
       'arr_dayofweek', 'arr_quarter', 'arr_season', 'arr_is_weekend',
       'arr_time_of_day', 'route', 'is_holiday', 'Country', 'City']



# Use sparse output for OneHotEncoder to save memory
encoder = OneHotEncoder(sparse=True, handle_unknown='ignore')
scaler = StandardScaler()

# Fit and transform categorical columns (sparse matrix)
X_cat_sparse = encoder.fit_transform(X[cat_col])

# Scale only the numerical columns and convert to float32
X_num_scaled = scaler.fit_transform(X[num_col]).astype(np.float32)

# Convert sparse matrix to float32 and combine with numerical features
from scipy import sparse
X_encoded_scaled = sparse.hstack([X_num_scaled, X_cat_sparse.astype(np.float32)]).tocsr()

# Split the encoded and scaled data
X_train_1, X_train_2, y_train_1, y_train_2 = train_test_split(
    X_encoded_scaled, y, stratify=y, test_size=0.2, random_state=RSEED
)

In [None]:
X_encoded_scaled.shape

In [19]:
# 2. Train-test split
from sklearn.ensemble import GradientBoostingRegressor, VotingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import BaggingRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor

# Create the column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', num_col),
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_col)
    ])


# -------------------
# BAGGING REGRESSOR
# -------------------
bagging_regressor = BaggingRegressor(
    base_estimator=DecisionTreeRegressor(),
    n_estimators=50,
    random_state=42
)

bagging_regressor_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', bagging_regressor)
])



bagging_regressor_pipeline.fit(X_train, y_train)
bagging_preds = bagging_regressor_pipeline.predict(X_test)
bagging_mse = mean_squared_error(y_test, bagging_preds)
print(f"Bagging Regressor MSE: {bagging_mse:.3f}")

# -------------------
# BOOSTING REGRESSOR
# -------------------
boosting_regressor = GradientBoostingRegressor(
    n_estimators=50,
    learning_rate=0.1,
    random_state=42
)

boosting_regressor_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', boosting_regressor)
])
boosting_regressor_pipeline.fit(X_train, y_train)
boosting_preds = boosting_regressor_pipeline.predict(X_test)
boosting_mse = mean_squared_error(y_test, boosting_preds)
print(f"Boosting Regressor MSE: {boosting_mse:.3f}")

Bagging Regressor MSE: 2249.129
Boosting Regressor MSE: 2388.588


In [20]:
rmse = np.sqrt(mean_squared_error(y_test, boosting_preds))
print(f"Boosting RMSE: {rmse:.2f}")

rmse = np.sqrt(mean_squared_error(y_test, bagging_preds))
print(f"Begging RMSE: {rmse:.2f}")

Boosting RMSE: 48.87
Begging RMSE: 47.42


In [21]:
# -------------------
# BAGGING REGRESSOR
# -------------------
bagging_regressor = BaggingRegressor(
        base_estimator=XGBRegressor(
            n_estimators=10,
            learning_rate=0.2,
            max_depth=3,
            objective='reg:squarederror',
            random_state=42
        ),
        n_estimators=10,
        random_state=42
    )

bagging_pipeline = Pipeline([
    ('preprocess', preprocessor),
    ('regressor', bagging_regressor)
])



bagging_pipeline.fit(X_train, y_train)
bagging_preds = bagging_pipeline.predict(X_test)
bagging_mse = mean_squared_error(y_test, bagging_preds)
print(f"Bagging Regressor(XGBRegressor) MSE: {bagging_mse:.3f}")
rmse = np.sqrt(mean_squared_error(y_test, bagging_preds))
print(f"Begging (XGBRegressor) RMSE: {rmse:.2f}")




Bagging Regressor(XGBRegressor) MSE: 2467.222
Begging (XGBRegressor) RMSE: 49.67


In [22]:
# -------------------
# BOOSTING REGRESSOR with XGBRegressor
# -------------------
boosting_pipeline = Pipeline([
    ('preprocess', preprocessor),
    ('regressor', XGBRegressor(
        n_estimators=100,
        learning_rate=0.1,
        max_depth=3,
        objective='reg:squarederror',
        random_state=42
    ))
])
boosting_pipeline.fit(X_train, y_train)
boosting_preds = boosting_pipeline.predict(X_test)
boosting_mse = mean_squared_error(y_test, boosting_preds)
print(f"Boosting Regressor(XGBRegressor) MSE: {boosting_mse:.3f}")
rmse = np.sqrt(mean_squared_error(y_test, boosting_preds))
print(f"Begging (XGBRegressor) RMSE: {rmse:.2f}")


Boosting Regressor(XGBRegressor) MSE: 2342.433
Begging (XGBRegressor) RMSE: 48.40


In [23]:
# Pipeline
from sklearn.model_selection import GridSearchCV


boosting_pipeline = Pipeline([
    ('preprocess', preprocessor),
    ('regressor', XGBRegressor(objective='reg:squarederror', random_state=42))
])

# Hyperparameter grid
boosting_param_grid = {
    'regressor__learning_rate': [0.05, 0.1],
    'regressor__subsample': [0.8, 1.0],
    'regressor__n_estimators': [80, 100, 120],
    'regressor__max_depth': [3, 4],
}

# Grid search
boosting_search = GridSearchCV(
    boosting_pipeline,
    param_grid=boosting_param_grid,
    cv=3,
    scoring='neg_mean_squared_error',
    verbose=1
)

boosting_search.fit(X_train, y_train)
boosting_preds = boosting_search.predict(X_test)
boosting_mse = mean_squared_error(y_test, boosting_preds)
print(f"Best Boosting MSE: {boosting_mse:.2f}")
print("Best Boosting Params:", boosting_search.best_params_)
rmse = np.sqrt(mean_squared_error(y_test, boosting_preds))
print(f"Begging (XGBRegressor) RMSE: {rmse:.2f}")


Fitting 3 folds for each of 24 candidates, totalling 72 fits
Best Boosting MSE: 2292.52
Best Boosting Params: {'regressor__learning_rate': 0.1, 'regressor__max_depth': 4, 'regressor__n_estimators': 120, 'regressor__subsample': 0.8}
Begging (XGBRegressor) RMSE: 47.88


In [24]:
best_pipeline = random_search.best_estimator_

# Predict and evaluate
y_pred = best_pipeline.predict(X_test)

y_pred[y_pred < 0] = 0
y_test = y_test.clip(lower=0)

rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print(f"Best Params: {random_search.best_params_}")
print(f"Test RMSE: {rmse:.2f}")

NameError: name 'random_search' is not defined

In [None]:
# Define the categorical features
num_cols = ['duration']
cat_cols = ['departure_point', 'arrival_point', 'flight_status', 'aircraft_code']


# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', num_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
    ]
)

# Define individual regressors
rf = RandomForestRegressor(n_estimators=100, random_state=42)
xgb = XGBRegressor(objective='reg:squarederror', random_state=42)
ridge = Ridge(alpha=1.0, solver='lsqr')
knn = KNeighborsRegressor(n_neighbors=5)

# Voting Regressor
voting_regressor = VotingRegressor(
    estimators=[
        # ('rf', rf),
        ('xgb', xgb),
        ('ridge', ridge),
        ('knn', knn)
    ]
)

# Pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', voting_regressor)
])

# Train
pipeline.fit(X_train, y_train)

# Predict
y_pred = pipeline.predict(X_test)

# Evaluate
rmse = mean_squared_error(y_test, y_pred, squared=False)
print(f"Voting Regressor RMSE: {rmse:.2f}")


In [None]:
# Define individual models
ridge = Ridge(solver='lsqr')
gb = GradientBoostingRegressor(random_state=42)
xgb = XGBRegressor(objective='reg:squarederror', random_state=42)

# Create the voting regressor
voting = VotingRegressor(estimators=[
    # ('ridge', ridge),
    # ('gb', gb),
    ('xgb', xgb)
])

# Final pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', voting)
])

# Hyperparameter space
# param_distributions = {
#     # 'regressor__ridge__alpha': [0.1, 1.0, 10.0],
#     # 'regressor__gb__n_estimators': [100, 200],
#     # 'regressor__gb__max_depth': [3, 5, 7],
#     # 'regressor__gb__learning_rate': [0.01, 0.1, 0.2],
#     'regressor__xgb__n_estimators': [100, 200],
#     'regressor__xgb__max_depth': [3, 5, 7],
#     'regressor__xgb__learning_rate': [0.01, 0.1, 0.2],
#     'regressor__xgb__subsample': [0.8, 1.0]
# }

param_distributions = {
    'regressor__xgb__n_estimators': [100, 200, 300],
    'regressor__xgb__learning_rate': [0.01, 0.05, 0.1],
    'regressor__xgb__max_depth': [3, 5, 6, 8],
    'regressor__xgb__min_child_weight': [1, 3, 5],
    'regressor__xgb__gamma': [0, 0.1, 0.2],
    'regressor__xgb__subsample': [0.7, 0.8, 1.0],
    'regressor__xgb__colsample_bytree': [0.7, 0.8, 1.0],
    'regressor__xgb__reg_alpha': [0, 0.1, 0.5],
    'regressor__xgb__reg_lambda': [1.0, 1.5, 2.0]
}

# Randomized Search
search = RandomizedSearchCV(
    pipeline,
    param_distributions=param_distributions,
    n_iter=20,
    scoring='neg_root_mean_squared_error',
    cv=3,
    verbose=2,
    n_jobs=-1,
    random_state=42
)

# Fit model
search.fit(X_train, y_train)

# Best model and evaluation
print("Best parameters:\n", search.best_params_)
y_pred = search.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"Test RMSE: {rmse:.2f}")
