# Sanity check - Rolling method efficiency
Will test how well the agent preforms using different rolling methods to understand if exists any superior method.

## Final Report 

* The highest rewards are consistently when testing on rolling_median (column 2), regardless of training regime (all around -1036).

* When testing on rolling_weighted_mean or rolling_winsorized_mean, rewards are much lower (more negative), suggesting those regimes are harder or the agent doesn’t generalize as well to them.

* Training regime has less effect than testing regime—all training regimes seem to generalize similarly to testing on rolling_median.


* rolling_median is the easiest regime for the agent to perform on (highest rewards across all rows in that column).

* rolling_weighted_mean and rolling_winsorized_mean are harder regimes or less well captured by your agent (much lower rewards).

* Agents trained on any regime do about equally well when tested on the rolling_median regime.

In [1]:
"""
What to Look For
Greener cells = better agent performance

Diagonal: Generalization to same regime

Off-diagonal: Generalization across regimes
"""

'\nWhat to Look For\nGreener cells = better agent performance\n\nDiagonal: Generalization to same regime\n\nOff-diagonal: Generalization across regimes\n'

In [2]:
# Store Sales Forecasting - Real Dataset with RMSLE Optimization
import jupyter

import os
import time
import numpy as np
import pandas as pd
import xgboost as xgb
import matplotlib.pyplot as plt



from scipy.stats import mstats
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import train_test_split

from src.utils.system import boot,notify
from src.experiments.experiment_tracker import ExperimentTracker

boot()

experience_name = "sanity-check__evaluate_efficiency_of_averaging_policies"
target_date = "2025-06-01"
experiment_tracker = ExperimentTracker(experience_name)
config={
    "dataset":"kaggle-sales-prediction",
    "model": "xgb.XGBRegressor"
}

features = [
        'store_nbr', 'family', 'dayofweek', 'month', 'day', 'week', 'is_holiday',
        'transactions', 'lag_7', 'lag_14', 'trans_lag_7', 'trans_lag_14',
        'rolling_mean_7', 'rolling_std_7', 'rolling_mean_14', 'rolling_std_14',
        'trans_roll_mean_7', 'trans_roll_mean_14',
        'onpromo_lag_7', 'onpromo_mean_14'
]

run_settings={
    "n_estimators":750, 
    "learning_rate":0.05, 
    "max_depth":6, 
    "random_state":42,
    "regime": "rolling_mean",
    "features": features.copy()
}
run_settings["features"].sort()


#def save_run(
#        self,
#        config: Dict,
#        results: Dict,
#        target_date: str,
#        run_settings: Dict,
#        files: Optional[Dict] = None
#    ) -> None:
#        """

  from pandas.core import (


In [None]:
# Store Sales Forecasting - Real Dataset with RMSLE Optimization
import jupyter
import os
import time
import numpy as np
import pandas as pd
import xgboost as xgb
import matplotlib.pyplot as plt




from scipy.stats import mstats
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import train_test_split

from src.utils.system import boot,notify


boot()


class ExecutionTimeTracker:
    def __init__(self,name):
        self.name = name
        self.start = time.time()
        self.end = None
        self.signature = f"[ExecutionTimeTracker]::{self.name} -"
        print(f"{self.signature} Start.")

    def done(self):
        if self.end == None:
            self.end =time.time()
            self.duration = self.end -self.start
            
        print(f"{self.signature} Complete. Took {int(self.end-self.start)}s to complete.")
        print('')
        
# REGIME FUNCTIONS =================================================
def rolling_mean(x):
    if len(x) == 0 or np.all(np.isnan(x)):
        return np.nan
    return np.nanmean(x)

def rolling_median(x):
    if len(x) == 0 or np.all(np.isnan(x)):
        return np.nan
    return np.nanmedian(x)

def rolling_weighted_mean(x): 
    if len(x) == 0 or np.all(np.isnan(x)):
        return np.nan
    weights = np.arange(1, len(x)+1)
    valid = ~np.isnan(x)
    if np.sum(valid) == 0:
        return np.nan
    return np.average(x[valid], weights=weights[valid])

def rolling_winsorized_mean(x):
    if len(x) < 2 or np.all(np.isnan(x)):
        return np.nan
    x_clean = x[~np.isnan(x)]
    if len(x_clean) < 2:
        return np.nan
    return mstats.winsorize(x_clean, limits=(0.1, 0.1)).mean()

# REGIME FUNCTIONS REGISTRY =======================================
regimes = {
    'rolling_mean': rolling_mean,
    'rolling_median': rolling_median,
    'rolling_weighted_mean': rolling_weighted_mean,
    'rolling_winsorized_mean': rolling_winsorized_mean,
}

# Feature engineering pipeline with regime-based rolling =========
def add_lag_rolling(df, train_func, train_name, lag_days=[7, 14], rolling_windows=[7, 14],mode="train"):
    FOLDER = 'data/experiments/kaggle-sales-prediction/'
    file_path = os.path.join(FOLDER, train_name+'__'+mode+'.csv')
    exists = os.path.exists(file_path)
    
    if exists:
        cached_df = pd.read_csv(file_path, parse_dates=['date'])
        if not cached_df.empty:
            print(f"Loaded cached features from {file_path}")
            return cached_df
    
    df = df.sort_values(['store_nbr', 'family', 'date'])

    for lag in [7, 14]:
        df[f'lag_{lag}'] = df.groupby(['store_nbr', 'family'])['sales'].shift(lag)
        df[f'trans_lag_{lag}'] = df.groupby(['store_nbr'])['transactions'].shift(lag)

    for window in [7, 14]:
        df[f'rolling_mean_{window}'] = df.groupby(['store_nbr', 'family'])['sales'].transform(lambda x: x.shift(1).rolling(window, min_periods=1).apply(train_func, raw=True))#.shift(1)#.rolling(window, min_periods=1).mean()
        df[f'rolling_std_{window}'] = df.groupby(['store_nbr', 'family'])['sales'].shift(1).rolling(window, min_periods=1).std()
        df[f'trans_roll_mean_{window}'] = df.groupby(['store_nbr'])['transactions'].transform(lambda x: x.shift(1).rolling(window, min_periods=1).apply(train_func, raw=True))#.shift(1)#.rolling(window, min_periods=1).mean()

    df['onpromo_lag_7'] = df.groupby(['store_nbr', 'family'])['onpromotion'].shift(7)
    df['onpromo_mean_14'] = df.groupby(['store_nbr', 'family'])['onpromotion'].transform(lambda x: x.shift(1).rolling(14, min_periods=1).apply(train_func, raw=True))#.shift(1)#.rolling(14, min_periods=1).mean()

    df.to_csv(file_path, index=False)
    return df

# GENERATE BASE TRAIN/TEST DATASET ======================================


def preprocess_dataset():
    FOLDER = 'data/experiments/kaggle-sales-prediction/'
    _train = pd.read_csv(FOLDER+'train.csv', parse_dates=['date'])
    _test = pd.read_csv(FOLDER+'test.csv', parse_dates=['date'])
    _stores = pd.read_csv(FOLDER+'stores.csv')
    _holidays = pd.read_csv(FOLDER+'holidays_events.csv', parse_dates=['date'])
    _transactions = pd.read_csv(FOLDER+'transactions.csv', parse_dates=['date'])
    _df = pd.concat([_train,_test]).sort_values(by="date")
    _separator = _test.iloc[0]['date']
    def preprocess(df,stores,holidays,transactions):
        train = df
        
        # 2. Merge External Features
        train = train.merge(stores, on='store_nbr', how='left')
        train = train.merge(transactions, on=['date', 'store_nbr'], how='left')

        holidays = holidays[(holidays['locale'] == 'National') & (holidays['transferred'] == False)]
        holidays = holidays[['date']].drop_duplicates()
        holidays['is_holiday'] = 1
        train = train.merge(holidays, on='date', how='left')
        train['is_holiday'] = train['is_holiday'].fillna(0)

        # Merge onpromotion before encoding 'family'
        #train = train.merge(test[['date', 'store_nbr', 'family', 'onpromotion']],
        #                    on=['date', 'store_nbr', 'family'], how='left')

        # Clean up column names
        if 'onpromotion_x' in train.columns:
            train['onpromotion'] = train['onpromotion_x'].fillna(0).astype(int)
            train.drop(columns=['onpromotion_x', 'onpromotion_'], errors='ignore', inplace=True)
        else:
            train['onpromotion'] = train['onpromotion'].fillna(0).astype(int)

        # Now encode 'family'
        train['family'] = train['family'].astype('category').cat.codes
    
        train['dayofweek'] = train['date'].dt.dayofweek
        train['month'] = train['date'].dt.month
        train['day'] = train['date'].dt.day
        train['week'] = train['date'].dt.isocalendar().week.astype(int)
        train['transactions'] = train['transactions'].fillna(0)

        # 4. Lag and Rolling Features
        train = train.sort_values(['store_nbr', 'family', 'date'])
        return train
    
    processed =preprocess(_df.copy(),_stores.copy(),_holidays.copy(),_transactions.copy())
    train = processed[processed['date'] < _separator]
    test = processed[processed['date'] >= _separator]
                      #test  =preprocess(_test.copy(),_stores.copy(),_holidays.copy(),_transactions.copy())
    
    return train,test

def preprocess_feature_dataset(dataset,train_func,train_name,mode):
   
    df=add_lag_rolling(dataset.copy(),train_func,train_name,mode=mode)
    for col in ['lag_7', 'lag_14', 'rolling_mean_7', 'rolling_std_7', 'rolling_mean_14', 'rolling_std_14']:
        df[col] = np.log1p(df[col])
    for col in ['trans_lag_7', 'trans_lag_14', 'trans_roll_mean_7', 'trans_roll_mean_14']:
        df[col] = np.log1p(df[col] + 1e-5)
    features = [
        'store_nbr', 'family', 'dayofweek', 'month', 'day', 'week', 'is_holiday',
        'transactions', 'lag_7', 'lag_14', 'trans_lag_7', 'trans_lag_14',
        'rolling_mean_7', 'rolling_std_7', 'rolling_mean_14', 'rolling_std_14',
        'trans_roll_mean_7', 'trans_roll_mean_14',

        #'store_dow', 
        #'family_month', 
        'onpromo_lag_7', 'onpromo_mean_14'
    ]
                             
    
    df = df[df[features].notna().all(axis=1)].copy()     
                             
 
    df[features].dropna(inplace=True)
                      
    X = df[features]
    y = np.log1p(df['sales'])
    if mode =="train":
        X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, shuffle=False)
        return X_train,X_val,y_train,y_val
    else:
        return X,X,y,y
    
# START EXPERIENCE =====================================================
results = {}
preprocess_execution_tracker = ExecutionTimeTracker('preprocess_dataset')
train_df,test_df = preprocess_dataset()
preprocess_execution_tracker.done()                         
                             
for train_name, train_func in regimes.items():
    
    run_settings['regime'] = train_name
    if experiment_tracker.did_run(config,target_date,run_settings):
        continue
    else:
   
        print(train_name)
        # 1. Load Dataset

        print('Preprocessing features')
        preprocess_features_execution_tracker = ExecutionTimeTracker('preprocess_feature_dataset')  

        X_train,X_val,y_train,y_val= preprocess_feature_dataset(train_df.copy(),train_func,train_name,"train")

        _test_val_separator = int(len(X_val)/2)

        X_val_full = X_val.copy()
        y_val_full = y_val.copy()

        X_val = X_val_full.iloc[:_test_val_separator]
        X_test = X_val_full.iloc[_test_val_separator:]
    
        y_val = y_val_full.iloc[:_test_val_separator]
        y_test = y_val_full.iloc[_test_val_separator:]


        preprocess_features_execution_tracker.done()

        xgb_train_execution_tracker = ExecutionTimeTracker('xgb_train__1000')   

        model = xgb.XGBRegressor(
            n_estimators=run_settings["n_estimators"], 
            learning_rate=run_settings["learning_rate"], 
            max_depth=run_settings["max_depth"], 
            random_state=run_settings["random_state"])

        model.fit(X_train, y_train,
                  eval_set=[(X_val, y_val)],

                  verbose=25)

        xgb_train_execution_tracker.done()
        y_pred = np.expm1(model.predict(X_train))
        y_real = np.expm1(y_train)
        rmsle_train = np.sqrt(mean_squared_log_error(y_real, y_pred))

        # 6. Evaluation
        y_pred = np.expm1(model.predict(X_val))
        y_real = np.expm1(y_val)

        rmsle_val = np.sqrt(mean_squared_log_error(y_real, y_pred))

        y_pred = np.expm1(model.predict(X_test))
        y_real = np.expm1(y_test)

        rmsle_test = np.sqrt(mean_squared_log_error(y_real, y_pred))



        #results[train_name]=[rmsle]
        results[train_name]=[rmsle_train,rmsle_val,rmsle_test]

 
        experiment_tracker.save_run(
            config,
            {"train":rmsle_train,"val":rmsle_val,"test":rmsle_test},
            target_date,
            run_settings
        )
        print(f"RMSLE:{rmsle_train:.4f}, {rmsle_val:.4f},{rmsle_test:.4f}")


[ExecutionTimeTracker]::preprocess_dataset - Start.
[ExecutionTimeTracker]::preprocess_dataset - Complete. Took 5s to complete.

rolling_mean
Preprocessing features
[ExecutionTimeTracker]::preprocess_feature_dataset - Start.
Loaded cached features from data/experiments/kaggle-sales-prediction/rolling_mean__train.csv


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[features].dropna(inplace=True)


[ExecutionTimeTracker]::preprocess_feature_dataset - Complete. Took 13s to complete.

[ExecutionTimeTracker]::xgb_train__1000 - Start.
[0]	validation_0-rmse:2.94891
[25]	validation_0-rmse:0.96020
[50]	validation_0-rmse:0.53333
[75]	validation_0-rmse:0.47142
[100]	validation_0-rmse:0.45699
[125]	validation_0-rmse:0.44896
[150]	validation_0-rmse:0.44246
[175]	validation_0-rmse:0.43695
[200]	validation_0-rmse:0.43176
[225]	validation_0-rmse:0.42857
[250]	validation_0-rmse:0.42601
[275]	validation_0-rmse:0.42276
[300]	validation_0-rmse:0.42005
[325]	validation_0-rmse:0.41717
[350]	validation_0-rmse:0.41539
[375]	validation_0-rmse:0.41239
[400]	validation_0-rmse:0.41055
[425]	validation_0-rmse:0.40869
[450]	validation_0-rmse:0.40693
[475]	validation_0-rmse:0.40552
[500]	validation_0-rmse:0.40401
[525]	validation_0-rmse:0.40276
[550]	validation_0-rmse:0.40186
[575]	validation_0-rmse:0.40095
[600]	validation_0-rmse:0.40049
[625]	validation_0-rmse:0.39984
[650]	validation_0-rmse:0.39931
[675]	

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[features].dropna(inplace=True)


[ExecutionTimeTracker]::preprocess_feature_dataset - Complete. Took 14s to complete.

[ExecutionTimeTracker]::xgb_train__1000 - Start.
[0]	validation_0-rmse:2.94880
[25]	validation_0-rmse:0.95748
[50]	validation_0-rmse:0.52200
[75]	validation_0-rmse:0.46010
[100]	validation_0-rmse:0.44767
[125]	validation_0-rmse:0.44123
[150]	validation_0-rmse:0.43642
[175]	validation_0-rmse:0.43209
[200]	validation_0-rmse:0.42813
[225]	validation_0-rmse:0.42440
[250]	validation_0-rmse:0.42149
[275]	validation_0-rmse:0.41924
[300]	validation_0-rmse:0.41721
[325]	validation_0-rmse:0.41485
[350]	validation_0-rmse:0.41463
[375]	validation_0-rmse:0.41274
[400]	validation_0-rmse:0.41045
[425]	validation_0-rmse:0.40945
[450]	validation_0-rmse:0.40896
[475]	validation_0-rmse:0.40843
[500]	validation_0-rmse:0.40759
[525]	validation_0-rmse:0.40671
[550]	validation_0-rmse:0.40678
[575]	validation_0-rmse:0.40693
[600]	validation_0-rmse:0.40632
[625]	validation_0-rmse:0.40648
[650]	validation_0-rmse:0.40583
[675]	

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[features].dropna(inplace=True)


[ExecutionTimeTracker]::preprocess_feature_dataset - Complete. Took 611s to complete.

[ExecutionTimeTracker]::xgb_train__1000 - Start.
[0]	validation_0-rmse:2.94881
[25]	validation_0-rmse:0.95645
[50]	validation_0-rmse:0.52083
[75]	validation_0-rmse:0.45934
[100]	validation_0-rmse:0.44571
[125]	validation_0-rmse:0.43731
[150]	validation_0-rmse:0.43127
[175]	validation_0-rmse:0.42606
[200]	validation_0-rmse:0.42172
[225]	validation_0-rmse:0.41874
[250]	validation_0-rmse:0.41564
[275]	validation_0-rmse:0.41280
[300]	validation_0-rmse:0.41015
[325]	validation_0-rmse:0.40822
[350]	validation_0-rmse:0.40573
[375]	validation_0-rmse:0.40391
[400]	validation_0-rmse:0.40260
[425]	validation_0-rmse:0.40098
[450]	validation_0-rmse:0.39997
[475]	validation_0-rmse:0.39882
[500]	validation_0-rmse:0.39736
[525]	validation_0-rmse:0.39657
[550]	validation_0-rmse:0.39571
[575]	validation_0-rmse:0.39463
[600]	validation_0-rmse:0.39372
[625]	validation_0-rmse:0.39260
[650]	validation_0-rmse:0.39174
[675]

### What’s a Good Score?
For Store Sales (Kaggle):

🥉 RMSLE > 0.60 → Baseline or simple model

🥈 RMSLE ≈ 0.45 → Reasonable with lags + rolling + calendar features

🥇 RMSLE < 0.40 → Competitive (you’re doing very well)

🏆 RMSLE < 0.38 → Likely leaderboard top 10%



In [None]:
results

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
plt.figure(figsize=(8,6))

sns.heatmap(
    pd.DataFrame(results),
    #er=0,
    
    annot=True, fmt=".1f", cmap="RdYlGn", 
    linewidths=0.5, linecolor='black'
           )
#plt.title("Train/Test Regime Mean Total Reward\n(rows=train, cols=test)")
plt.xlabel("Test Regime")
plt.ylabel("Train Regime")
plt.show()


In [None]:
pd.DataFrame(results)