In [None]:
# Imports
import pandas as pd
import numpy as np
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_absolute_error

# Load the cleaned feature tables and the target files from ./data.
train_set = pd.read_csv("./data/cleaned_train.csv")
val_set = pd.read_csv("./data/cleaned_val.csv")
test_df = pd.read_csv("./data/cleaned_test.csv")
y_train = pd.read_csv("./data/y_train.csv")
y_val = pd.read_csv("./data/y_val.csv")

# Stack train and validation rows (features without "id", targets in the same
# train-then-val order) so the models below are fitted on all labelled rows.
feature_frames = [frame.drop(columns=["id"]) for frame in (train_set, val_set)]
X_full = pd.concat(feature_frames)
y_full = pd.concat([y_train, y_val])

# Cast the categorical features to pandas "category" dtype in both the
# training matrix and the test table.
categorical_cols = ["advertiser", "subtype", "energy_label", "province"]
category_dtypes = dict.fromkeys(categorical_cols, "category")
X_full = X_full.astype(category_dtypes)
test_df = test_df.astype(category_dtypes)

# Test feature matrix: same columns as test_df minus "id" (test_df itself
# still carries the id column).
X_test = test_df.drop(columns=["id"])

# Hyperparameters reused from an earlier RandomizedSearchCV run.
#
# The search code from that attempt is kept below, commented out, for reference
# only. It relies on names (`st`, `RandomizedSearchCV`, `X_train`) that are not
# imported or defined in the visible part of this cell.
# NOTE(review): if `st` is scipy.stats, `uniform(loc, scale)` samples from
# [loc, loc + scale], so e.g. `uniform(0.6, 0.9)` can exceed 1.0 — confirm
# before reusing this search space.
#
# param_dist = {
#     'learning_rate': st.uniform(0.01, 0.1),
#     'num_leaves': st.randint(20, 50),
#     'max_depth': st.randint(5, 15),
#     'n_estimators': st.randint(500, 2000),
#     'subsample': st.uniform(0.6, 0.9),
#     'colsample_bytree': st.uniform(0.6, 0.9),
#     'min_child_samples': st.randint(10, 50),
#     'reg_alpha': st.uniform(0, 10),
#     'reg_lambda': st.uniform(0, 10)
# }
# lgb_model = LGBMRegressor(objective="regression", boosting_type="gbdt", random_state=42)
# random_search = RandomizedSearchCV(lgb_model, param_dist, n_iter=50, scoring="neg_mean_absolute_error", cv=3, verbose=2, n_jobs=-1, random_state=42)
# random_search.fit(X_train, y_train)
# print("Best Parameters:", random_search.best_params_)

# Keyword arguments forwarded to every LGBMRegressor built from this dict.
best_params = dict(
    learning_rate=0.1094550510797341,
    num_leaves=47,
    max_depth=9,
    min_child_samples=15,
    subsample=0.9853657334855829,
    colsample_bytree=0.6392433945789904,
    reg_alpha=0.15456616528867428,
    reg_lambda=9.283185625877254,
    n_estimators=598,
    random_state=42,
)

# Train three quantile regressors (lower = 0.1, upper = 0.9, median = 0.5),
# all sharing the hyperparameters in best_params and fitted on X_full / y_full.
def _fit_quantile_model(quantile):
    """Build an LGBMRegressor for `quantile`, fit it on the full data, return it."""
    model = LGBMRegressor(objective='quantile', alpha=quantile, **best_params)
    model.fit(X_full, y_full)
    return model


lgb_lower = _fit_quantile_model(0.1)
lgb_upper = _fit_quantile_model(0.9)
lgb_median = _fit_quantile_model(0.5)

# Predict on test: interval bounds from the outer quantile models, point
# prediction from the median model.
lower_bound, upper_bound, y_pred_test = (
    model.predict(X_test) for model in (lgb_lower, lgb_upper, lgb_median)
)

# Fix bounds: the lower and upper quantile models are fitted independently, so
# for some rows the predicted lower bound can exceed the predicted upper bound.
# Order each pair element-wise. Both results must be computed from the raw
# predictions: overwriting lower_bound first and then taking the maximum
# against it would discard the larger value and collapse every crossed
# interval to zero width.
raw_lower, raw_upper = lower_bound, upper_bound
lower_bound = np.minimum(raw_lower, raw_upper)
upper_bound = np.maximum(raw_lower, raw_upper)

# Ensure PRED is within bounds
y_pred_test = np.clip(y_pred_test, lower_bound, upper_bound)

# Assemble the submission table: one row per test id with its interval bounds
# and point prediction.
submission_columns = {
    "ID": test_df["id"],
    "LOWER": lower_bound,
    "UPPER": upper_bound,
    "PRED": y_pred_test,
}
submission = pd.DataFrame(submission_columns)

# Sanity check: flag rows whose interval is inverted or whose prediction lies
# outside the interval. This only prints a warning; the file is written anyway.
lower_col, upper_col, pred_col = (
    submission[name] for name in ("LOWER", "UPPER", "PRED")
)
interval_inverted = lower_col > upper_col
pred_below_lower = pred_col < lower_col
pred_above_upper = pred_col > upper_col
mask_invalid = interval_inverted | pred_below_lower | pred_above_upper
if mask_invalid.any():
    print("Invalid predictions in the submission file.")

# NOTE(review): the filename is spelled "finall" — kept as-is in case anything
# downstream expects this exact name; confirm before renaming.
submission.to_csv("finall_submission.csv", index=False)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000688 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2731
[LightGBM] [Info] Number of data points in the train set: 26983, number of used features: 12
[LightGBM] [Info] Start training from score 199000.000000
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001527 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2731
[LightGBM] [Info] Number of data points in the train set: 26983, number of used features: 12
[LightGBM] [Info] Start training from score 544008.437500
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001089 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is