In [110]:
import numpy as np 
import pandas as pd

In [111]:
calendar_df = pd.read_csv(r"E:\environments\wallmart_hackathon\dataset\m5-forecasting-accuracy\calendar.csv")
sales_validation_df = pd.read_csv(r"E:\environments\wallmart_hackathon\dataset\m5-forecasting-accuracy\sales_train_validation.csv")
sell_prices_df = pd.read_csv(r"E:\environments\wallmart_hackathon\dataset\m5-forecasting-accuracy\sell_prices.csv")

### Data Preprocessing

In [112]:
## melting last 120 days (form d_1789 to d_1913)
## lag = 28, rolling mean = 28

start_col = 1789
end_col = 1913

col = [f'd_{x}' for x in range(start_col, end_col + 1)]

columns = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'] + col

sub_validation_df = sales_validation_df[columns]

sales_long = sub_validation_df.melt(
    id_vars=['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'],
    var_name='d',
    value_name='sales'
)

print(sales_long.shape)
sales_long.head()

(3811250, 8)


Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sales
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_1789,0
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,d_1789,0
2,HOBBIES_1_003_CA_1_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,d_1789,1
3,HOBBIES_1_004_CA_1_validation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,d_1789,1
4,HOBBIES_1_005_CA_1_validation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,d_1789,2


In [113]:
## merge with calendar

sales_long = sales_long.merge(calendar_df, how = 'left', on = 'd')

In [114]:
## convert date to datetime
sales_long['date'] = pd.to_datetime(sales_long['date'])

In [115]:
## merge with sales_prices
sales_long = sales_long.merge(sell_prices_df, how = 'left', on = ['store_id', 'item_id', 'wm_yr_wk'])

sales_long is the final dataset

In [116]:
sales_long.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sales,date,wm_yr_wk,...,month,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_1789,0,2015-12-22,11547,...,12,2015,,,,,0,0,0,8.26
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,d_1789,0,2015-12-22,11547,...,12,2015,,,,,0,0,0,3.97
2,HOBBIES_1_003_CA_1_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,d_1789,1,2015-12-22,11547,...,12,2015,,,,,0,0,0,2.97
3,HOBBIES_1_004_CA_1_validation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,d_1789,1,2015-12-22,11547,...,12,2015,,,,,0,0,0,4.64
4,HOBBIES_1_005_CA_1_validation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,d_1789,2,2015-12-22,11547,...,12,2015,,,,,0,0,0,2.88


In [None]:
sales_long = sales_long.sort_values(['id', 'date'])

## create lag of 28
sales_long['lag_28'] = (
    sales_long
    .groupby('id')['sales']
    .shift(28)
)

## create lag of 7
sales_long['lag_7'] = (
    sales_long
    .groupby('id')['sales']
    .shift(7)
)

## rolling mean of 28
sales_long['rolling_mean_28'] = (
    sales_long
    .groupby('id')['sales']
    .transform(
        lambda x: x.shift(1).rolling(window=28).mean()
    )
)

# ## percent price change feature
sales_long["price_pct_change"] = (
    sales_long.groupby("id")["sell_price"]
    .pct_change(fill_method = None).fillna(0)
)

# ## zero streaks
sales_long["zero_streak"] = (
    sales_long.groupby("id")["sales"]
    .transform(lambda x: x.eq(0).astype(int).groupby(x.ne(0).cumsum()).cumsum())
)


## calendar features
sales_long['month'] = sales_long['date'].dt.month
sales_long['year'] = sales_long['date'].dt.year

sales_long['day_of_month'] = sales_long['date'].dt.day
sales_long['week_of_month'] = ((sales_long['day_of_month'] - 1) // 7) + 1

sales_long.head()


## target columns
## Reverse the time series so rolling looks "forward"
sales_long['sales_28_sum'] = (
    sales_long
    .iloc[::-1]                                 
    .groupby('id')['sales']
    .rolling(window=28, min_periods=28)
    .sum()
    .reset_index(level=0, drop=True)
    .iloc[::-1]                               
)


In [None]:
conditions = [
    sales_long["state_id"] == "CA",
    sales_long["state_id"] == "TX",
    sales_long["state_id"] == "WI"
]

choices = [
    sales_long["snap_CA"],
    sales_long["snap_TX"],
    sales_long["snap_WI"]
]

sales_long["snap_active"] = np.select(conditions, choices, default=0)

sales_long.drop(['snap_CA', 'snap_TX', 'snap_WI'], axis = 1)

In [119]:
sales_long.isna().sum()


id                       0
item_id                  0
dept_id                  0
cat_id                   0
store_id                 0
state_id                 0
d                        0
sales                    0
date                     0
wm_yr_wk                 0
weekday                  0
wday                     0
month                    0
year                     0
event_name_1       3445370
event_type_1       3445370
event_name_2       3811250
event_type_2       3811250
snap_CA                  0
snap_TX                  0
snap_WI                  0
sell_price             705
lag_28              853720
lag_7               213430
rolling_mean_28     853720
day_of_month             0
week_of_month            0
sales_28_sum        823230
dtype: int64

In [120]:
sales_long = sales_long.dropna(subset=['lag_28', 'lag_7', 'rolling_mean_28', 'sales_28_sum'])
print(sales_long.shape)
sales_long.head()


(2134300, 28)


Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sales,date,wm_yr_wk,...,snap_CA,snap_TX,snap_WI,sell_price,lag_28,lag_7,rolling_mean_28,day_of_month,week_of_month,sales_28_sum
855332,FOODS_1_001_CA_1_validation,FOODS_1_001,FOODS_1,FOODS,CA_1,CA,d_1817,0,2016-01-19,11551,...,0,0,0,2.24,1.0,0.0,0.464286,19,3,23.0
885822,FOODS_1_001_CA_1_validation,FOODS_1_001,FOODS_1,FOODS,CA_1,CA,d_1818,0,2016-01-20,11551,...,0,0,0,2.24,2.0,2.0,0.428571,20,3,23.0
916312,FOODS_1_001_CA_1_validation,FOODS_1_001,FOODS_1,FOODS,CA_1,CA,d_1819,1,2016-01-21,11551,...,0,0,0,2.24,0.0,0.0,0.357143,21,3,24.0
946802,FOODS_1_001_CA_1_validation,FOODS_1_001,FOODS_1,FOODS,CA_1,CA,d_1820,0,2016-01-22,11551,...,0,0,0,2.24,0.0,0.0,0.392857,22,4,23.0
977292,FOODS_1_001_CA_1_validation,FOODS_1_001,FOODS_1,FOODS,CA_1,CA,d_1821,0,2016-01-23,11552,...,0,0,0,2.24,1.0,0.0,0.392857,23,4,24.0


In [121]:
sales_long.columns

Index(['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'd',
       'sales', 'date', 'wm_yr_wk', 'weekday', 'wday', 'month', 'year',
       'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2',
       'snap_CA', 'snap_TX', 'snap_WI', 'sell_price', 'lag_28', 'lag_7',
       'rolling_mean_28', 'day_of_month', 'week_of_month', 'sales_28_sum'],
      dtype='object')

In [122]:
final_columns = ['item_id', 'dept_id', 'store_id', 'state_id', 'weekday', 'month', 'week_of_month', 'event_name_1', 'event_type_1', 'event_name_2', 
                 'event_type_2','snap_CA', 'snap_TX', 'snap_WI', 'sell_price', 'lag_28', 'lag_7', 'rolling_mean_28', 'sales_28_sum']

final_df = sales_long[final_columns]
final_df

Unnamed: 0,item_id,dept_id,store_id,state_id,weekday,month,week_of_month,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price,lag_28,lag_7,rolling_mean_28,sales_28_sum
855332,FOODS_1_001,FOODS_1,CA_1,CA,Tuesday,1,3,,,,,0,0,0,2.24,1.0,0.0,0.464286,23.0
885822,FOODS_1_001,FOODS_1,CA_1,CA,Wednesday,1,3,,,,,0,0,0,2.24,2.0,2.0,0.428571,23.0
916312,FOODS_1_001,FOODS_1,CA_1,CA,Thursday,1,3,,,,,0,0,0,2.24,0.0,0.0,0.357143,24.0
946802,FOODS_1_001,FOODS_1,CA_1,CA,Friday,1,4,,,,,0,0,0,2.24,0.0,0.0,0.392857,23.0
977292,FOODS_1_001,FOODS_1,CA_1,CA,Saturday,1,4,,,,,0,0,0,2.24,1.0,0.0,0.392857,24.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2864622,HOUSEHOLD_2_516,HOUSEHOLD_2,WI_3,WI,Thursday,3,4,Purim End,Religious,,,0,0,0,5.94,0.0,0.0,0.107143,0.0
2895112,HOUSEHOLD_2_516,HOUSEHOLD_2,WI_3,WI,Friday,3,4,,,,,0,0,0,5.94,0.0,0.0,0.107143,0.0
2925602,HOUSEHOLD_2_516,HOUSEHOLD_2,WI_3,WI,Saturday,3,4,,,,,0,0,0,5.94,0.0,0.0,0.107143,0.0
2956092,HOUSEHOLD_2_516,HOUSEHOLD_2,WI_3,WI,Sunday,3,4,Easter,Cultural,,,0,0,0,5.94,1.0,0.0,0.107143,0.0


In [123]:
final_df.columns

Index(['item_id', 'dept_id', 'store_id', 'state_id', 'weekday', 'month',
       'week_of_month', 'event_name_1', 'event_type_1', 'event_name_2',
       'event_type_2', 'snap_CA', 'snap_TX', 'snap_WI', 'sell_price', 'lag_28',
       'lag_7', 'rolling_mean_28', 'sales_28_sum'],
      dtype='object')

### Data Transformation

In [124]:
## train test split
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler


In [125]:
## data transformation 
class PreProcessor:

    def __init__(self, dataframe):
        self.X = dataframe.drop('sales_28_sum', inplace = False, axis = 1)
        self.y = dataframe['sales_28_sum']
        self.encoders = {}
        self.scalers = None

    def get_train_test_split(self):
        X_train, X_test, y_train, y_test = train_test_split(self.X, self.y, test_size = 0.2, random_state = 42)
        return X_train, X_test, y_train, y_test
    
    def get_nan_value_imputing(self, X_train, X_test):
        event_cols = ['event_name_1', 'event_type_1', 'event_name_2', 'event_type_2']
        for col in event_cols:
            X_train[col] = X_train[col].fillna('No_event')
            X_test[col] = X_test[col].fillna('No_event')

        for df in [X_train, X_test]:
            df['sell_price'] = (
            df.groupby(['store_id', 'item_id'])['sell_price']
            .transform(lambda x: x.ffill().bfill())
            )
        return X_train, X_test
    
    def get_label_encoding(self, X_train, X_test):
        label_encoding_columns = ['item_id', 'dept_id', 'store_id', 'weekday', 'state_id', 'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2']
        for col in label_encoding_columns:
            label_encoder = LabelEncoder()
            X_train[col] = label_encoder.fit_transform(X_train[col])
            X_test[col] = X_test[col].map(lambda s: label_encoder.transform([s])[0] if str(s) in label_encoder.classes_ else -1)

            self.encoders[col] = label_encoder
        return X_train, X_test
    
    def get_scaled_data(self, X_train, X_test):
        numeric_cols = ['sell_price', 'lag_28', 'lag_7', 'rolling_mean_28']
        scaler = StandardScaler()
        X_train[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
        X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])
        self.scaler = scaler
        return X_train, X_test
    
    def begin_preprocessing(self):
        X_tr, X_te, y_tr, y_te = self.get_train_test_split()
        X_tr, X_te = self.get_nan_value_imputing(X_tr, X_te)
        X_tr, X_te = self.get_label_encoding(X_tr, X_te)
        X_tr, X_te = self.get_scaled_data(X_tr, X_te)
        return X_tr, X_te, y_tr, y_te 

    

In [126]:
preprocessor = PreProcessor(final_df)
X_train, X_test, y_train, y_test = preprocessor.begin_preprocessing()

In [127]:
X_train.columns

Index(['item_id', 'dept_id', 'store_id', 'state_id', 'weekday', 'month',
       'week_of_month', 'event_name_1', 'event_type_1', 'event_name_2',
       'event_type_2', 'snap_CA', 'snap_TX', 'snap_WI', 'sell_price', 'lag_28',
       'lag_7', 'rolling_mean_28'],
      dtype='object')

In [128]:
X_test.columns

Index(['item_id', 'dept_id', 'store_id', 'state_id', 'weekday', 'month',
       'week_of_month', 'event_name_1', 'event_type_1', 'event_name_2',
       'event_type_2', 'snap_CA', 'snap_TX', 'snap_WI', 'sell_price', 'lag_28',
       'lag_7', 'rolling_mean_28'],
      dtype='object')

In [129]:
print(X_train.isna().sum())
print(X_test.isna().sum())
print(y_train.isna().sum())
print(y_test.isna().sum())

item_id            0
dept_id            0
store_id           0
state_id           0
weekday            0
month              0
week_of_month      0
event_name_1       0
event_type_1       0
event_name_2       0
event_type_2       0
snap_CA            0
snap_TX            0
snap_WI            0
sell_price         0
lag_28             0
lag_7              0
rolling_mean_28    0
dtype: int64
item_id            0
dept_id            0
store_id           0
state_id           0
weekday            0
month              0
week_of_month      0
event_name_1       0
event_type_1       0
event_name_2       0
event_type_2       0
snap_CA            0
snap_TX            0
snap_WI            0
sell_price         0
lag_28             0
lag_7              0
rolling_mean_28    0
dtype: int64
0
0


In [130]:
from sklearn.metrics import r2_score

def evaluate_model(X_train, y_train, X_test, y_test, models):
    for i in range(len(list(models))):
        model = list(models.values())[i]

        model.fit(X_train, y_train)

        y_train_pred = model.predict(X_train)
        y_test_pred = model.predict(X_test)

        train_model_score = r2_score(y_train, y_train_pred)
        test_model_score = r2_score(y_test, y_test_pred)

        report = {}

        report[list(models.keys())[i]] = test_model_score

    return report

In [131]:
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.metrics import r2_score

In [132]:
# class ModelTrainer:
#     def __init__(self, X_train, y_train, X_test, y_test):
#         self.X_train = X_train
#         self.y_train = y_train
#         self.X_test = X_test
#         self.y_test = y_test
#         self.model = None
#         self.model_name = None

#     def get_array(self):
#         return (
#             np.array(self.X_train),
#             np.array(self.X_test),
#             np.array(self.y_train),
#             np.array(self.y_test)
#         )

#     def get_best_model(self):
#         X_train, X_test, y_train, y_test = self.get_array()
#         models = {
#             'random forest': RandomForestRegressor(),
#             'adaboost': AdaBoostRegressor(),
#             'gradient boost': GradientBoostingRegressor(),
#             'cat boost': CatBoostRegressor(verbose=False),
#             'xgboost': XGBRegressor()
#         }
#         model_report = evaluate_model(X_train, y_train, X_test, y_test, models)
#         best_model_score = max(model_report.values())
#         best_model_name = list(model_report.keys())[list(model_report.values()).index(best_model_score)]
#         best_model = models[best_model_name]

#         if best_model_score < 0.8:
#             print("No suitable model found.")
#         else:
#             print(f"Best model: {best_model_name}")

#         best_model.fit(X_train, y_train)
#         self.model = best_model
#         self.model_name = best_model_name

#     def get_model_accuracy(self):
#         X_train, X_test, y_train, y_test = self.get_array()
#         y_train_pred = self.model.predict(X_train)
#         y_test_pred = self.model.predict(X_test)

#         return {
#             'training_accuracy': r2_score(y_train, y_train_pred),
#             'test_accuracy': r2_score(y_test, y_test_pred)
#         }

#     def begin_model_training(self):
#         self.get_best_model()
#         model_accuracy = self.get_model_accuracy()
#         return model_accuracy, self.model_name


In [133]:
# model_trainer = ModelTrainer(X_train, y_train, X_test, y_test)
# model_trainer.begin_model_training()

In [140]:
X_train = np.array(X_train)
X_test = np.array(X_test)
y_train = np.array(y_train)
y_test = np.array(y_test)

In [None]:
import lightgbm as lgb
import optuna
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Suppose X_train, X_test, y_train, y_test are already prepared
# Use your PreProcessor class as you did earlier.

def objective(trial):
    params = {
        'objective': 'regression',
        'metric': 'rmse',
        'verbosity': -1,
        'boosting_type': 'gbdt',
        'device': 'gpu',                 # ✅ USE GPU
        'gpu_use_dp': False,             # faster on most GPUs
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 20, 300),
        'max_depth': trial.suggest_int('max_depth', 3, 16),
        'min_child_samples': trial.suggest_int('min_child_samples', 10, 100),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
        'n_estimators': 10000
    }

    model = lgb.LGBMRegressor(**params)

    model.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    callbacks=[
        lgb.early_stopping(50),
        lgb.log_evaluation(0)  # disables printing
    ]
)

    preds = model.predict(X_test)
    preds = np.maximum(preds, 0)

    mse = mean_squared_error(y_test, preds)
    rmse = np.sqrt(mse)

    return rmse

# Optuna study with parallel jobs
study = optuna.create_study(
    direction='minimize',
    study_name='lgbm_gpu_study'
)

study.optimize(objective, n_trials=50, n_jobs=4)  # ✅ parallel trials

print("Best RMSE:", study.best_value)
print("Best parameters:", study.best_params)


[I 2025-07-02 23:23:13,326] A new study created in memory with name: lgbm_gpu_study


Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds


[W 2025-07-02 23:23:25,910] Trial 1 failed with parameters: {'learning_rate': 0.015526965272905651, 'num_leaves': 210, 'max_depth': 5, 'min_child_samples': 14, 'subsample': 0.6652129079081117, 'colsample_bytree': 0.5662058555776878, 'reg_alpha': 5.085062437832528e-07, 'reg_lambda': 2.6943412797410833} because of the following error: LightGBMError('Check failed: (best_split_info.left_count) > (0) at D:\\a\\1\\s\\lightgbm-python\\src\\treelearner\\serial_tree_learner.cpp, line 852 .\n').
Traceback (most recent call last):
  File "e:\environments\wallmart_hackathon\venv\Lib\site-packages\optuna\study\_optimize.py", line 201, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "C:\Users\mahic\AppData\Local\Temp\ipykernel_11160\536518749.py", line 31, in objective
    model.fit(
  File "e:\environments\wallmart_hackathon\venv\Lib\site-packages\lightgbm\sklearn.py", line 1398, in fit
    super().fit(
  File "e:\environments\wallmart_hackathon\venv\Lib\sit

Training until validation scores don't improve for 50 rounds


In [135]:
from lightgbm import LGBMRegressor

model = LGBMRegressor()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.041032 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 980
[LightGBM] [Info] Number of data points in the train set: 1707440, number of used features: 16
[LightGBM] [Info] Start training from score 38.491934




In [136]:
## root mean square log error

from sklearn.metrics import mean_squared_log_error
import numpy as np
y_pred_clipped = np.maximum(y_pred, 0)
y_test_clipped = np.maximum(y_test, 0)
rmsle = np.sqrt(mean_squared_log_error(y_test_clipped, y_pred_clipped))
print("RMSLE:", rmsle)

RMSLE: 0.7705896601009132


In [137]:
import numpy as np

def smape(y_true, y_pred):
    return 100 * np.mean(
        2 * np.abs(y_pred - y_true) / (np.abs(y_true) + np.abs(y_pred) + 1e-8)
    )

print("SMAPE:", smape(y_test, y_pred))

SMAPE: 50.48993594457012


In [138]:
from sklearn.metrics import r2_score

In [139]:
r2_score(y_test_clipped, y_pred_clipped)

0.9228666589939952