In [9]:
import pandas as pd
import numpy as np
import sys
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from xgboost import XGBRegressor
from sklearn.compose import TransformedTargetRegressor, ColumnTransformer
from sklearn.model_selection import TimeSeriesSplit, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


In [2]:
data = pd.read_parquet("../../Data/engineered/train_data_after_FE.parquet")
# data.info()

In [3]:
data.head()

Unnamed: 0_level_0,Store,CompetitionDistance,CompetitionDistanceMissing,CompetitionOpenMissing,SchoolHoliday,Promo,Promo2,StoreType,Assortment,Sales,...,Month_cos,IsWeekend,IsMonthEnd,IsMonthStart,OpenDuration,Promo2WeeksDuration,IsPromo2Month,DaysUntilNextStateHoliday,DaysSinceLastStateHoliday,DaysUntilClosed
Date_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2013-01-02,1,1270.0,False,False,1,False,False,c,a,8.617943,...,0.866025,False,False,False,24,0,False,14,14,7
2013-01-03,1,1270.0,False,False,1,False,False,c,a,8.37263,...,0.866025,False,False,False,24,0,False,14,14,7
2013-01-04,1,1270.0,False,False,1,False,False,c,a,8.408717,...,0.866025,False,False,False,24,0,False,14,14,7
2013-01-05,1,1270.0,False,False,1,False,False,c,a,8.516593,...,0.866025,True,False,False,24,0,False,14,14,7
2013-01-07,1,1270.0,False,False,1,True,False,c,a,8.878497,...,0.866025,False,False,False,24,0,False,14,14,7


## Choose data types

In [4]:
cat_cols = ["StoreType", "Assortment"] #2
bool_cols = [
    "CompetitionDistanceMissing",
    "CompetitionOpenMissing",
    "Promo",
    "Promo2",
    "IsLastDayOfMonth",
    "IsWeekend",
    "IsMonthEnd",
    "IsMonthStart",
    "IsPromo2Month",
] # 9 
num_cols = [c for c in data.columns if c not in cat_cols + bool_cols + ["Sales"]]

## Preprocessing pipeline

In [5]:
num_pipe = SimpleImputer(strategy="median")

# Ordinal Encode (Best for Tree based models) + Handle Unknowns
cat_pipe = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        (
            "encoder",
            OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1),
        ),
    ]
)

preprocessor = ColumnTransformer(
    [("num", num_pipe, bool_cols + num_cols), ("cat", cat_pipe, cat_cols)]
)

## Splitting the data

In [6]:
data.index = pd.to_datetime(data.index)
max_date = data.index.max()
split_date = max_date - pd.Timedelta(days=42)

print(f"Interval for testing the upcoming 6 weeks from {split_date} to {max_date}")
# X = data[cat_cols + bool_cols + num_cols]
# y = data["Sales"]
# del data

Interval for testing the upcoming 6 weeks from 2015-06-19 00:00:00 to 2015-07-31 00:00:00


In [7]:
train = data.loc[data.index <= split_date].copy()
test = data.loc[data.index > split_date].copy()

X_train = train.drop(columns = 'Sales')
y_train = train.Sales
x_test = test.drop(columns = 'Sales')
y_test = test.Sales

print(X_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(804056, 28)
(804056,)
(40282, 28)
(40282,)


## Model Building and training

In [None]:
xgb_baseline = XGBRegressor(
    n_estimators=500,
    learning_rate=0.1,
    max_depth=20,
    n_jobs=-1,
    random_state=42,
    objective="reg:squarederror",
)

main_pipeline = Pipeline([("preprocessor", preprocessor), ("model", xgb_baseline)])
main_pipeline.fit(X_train, y_train)

## Base model evaluation

In [None]:
y_test_original = np.expm1(y_test)
y_predicted_log = main_pipeline.predict(x_test)

y_predicted = np.expm1(y_predicted_log)

rmse = np.sqrt(mean_squared_error(y_test_original, y_predicted))
mae = mean_absolute_error(y_test_original, y_predicted)
r2 = r2_score(y_test_original, y_predicted)

print(f"\n--- Evaluation on Last 6 Weeks Test Set ---")
print(f"RMSE (Regression): {rmse:.4f}")
print(f"MAE (Regression):  {mae:.4f}")
print(f"RÂ² Score (Fit):    {r2:.2f}")


--- Evaluation on Last 6 Weeks Test Set ---
RMSE (Regression): 1090.1369
MAE (Regression):  721.2820
RÂ² Score (Fit):    0.87


#### Hyperparameter Tuning and Model Enhancements / Modifications (for bonus, kindly give us the full markðŸ˜Š) 


In [18]:
xgb = XGBRegressor(n_jobs=-1, random_state=42, enable_categorical=True)
main_pipeline = Pipeline([("preprocessor", preprocessor), ("model", xgb)])

param_dist = {
    "model__n_estimators": [500, 800],
    "model__learning_rate": [0.01, 0.1],
    "model__max_depth": [8, 10, 15, 20],
    "model__subsample": [0.7, 0.8, 0.9],
    "model__colsample_bytree": [0.7, 0.8],
}

tscv = TimeSeriesSplit(n_splits=5)

search = RandomizedSearchCV(
    main_pipeline,
    param_distributions=param_dist,
    n_iter=15,
    cv=tscv,  # Time series validation
    scoring="neg_root_mean_squared_error",
    n_jobs=-1,
    verbose=1,
)

search.fit(X_train, y_train)

print(f"Best Parameters: {search.best_params_}")
print(f"Best CV Score (RMSE): {-search.best_score_}")
final_model = search.best_estimator_

Fitting 5 folds for each of 15 candidates, totalling 75 fits


1 fits failed out of a total of 75.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\LENOVO\AppData\Roaming\Python\Python313\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\LENOVO\AppData\Roaming\Python\Python313\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "C:\Users\LENOVO\AppData\Roaming\Python\Python313\site-packages\sklearn\pipeline.py", line 662, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
    ~~~~~~~~~

Best Parameters: {'model__subsample': 0.7, 'model__n_estimators': 500, 'model__max_depth': 8, 'model__learning_rate': 0.01, 'model__colsample_bytree': 0.7}
Best CV Score (RMSE): 0.37976205348968506


In [None]:
xgb_baseline = XGBRegressor(
    n_estimators=500,
    learning_rate=0.1,
    max_depth=20,
    n_jobs=-1,
    random_state=42,
    subsample = 0.7,
    colsample_bytree = 0.7,
    objective="reg:squarederror",
)

main_pipeline = Pipeline([("preprocessor", preprocessor), ("model", xgb_baseline)])
main_pipeline.fit(X_train, y_train)

In [None]:
y_test_original = np.expm1(y_test)
y_predicted_log = main_pipeline.predict(x_test)

y_predicted = np.expm1(y_predicted_log)

rmse = np.sqrt(mean_squared_error(y_test_original, y_predicted))
mae = mean_absolute_error(y_test_original, y_predicted)
r2 = r2_score(y_test_original, y_predicted)

print(f"\n--- Evaluation on Last 6 Weeks Test Set ---")
print(f"RMSE (Regression): {rmse:.4f}")
print(f"MAE (Regression):  {mae:.4f}")
print(f"RÂ² Score (Fit):    {r2:.2f}")