In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline

from sktime.forecasting.compose import ForecastingPipeline, make_reduction
from sktime.forecasting.model_selection import temporal_train_test_split
from sktime.transformations.series.summarize import WindowSummarizer
from sktime.transformations.series.date import DateTimeFeatures

In [3]:
data = pd.read_csv("global_fc/train.csv")
data['date'] = pd.to_datetime(data['date'])

In [4]:
stores = [5, 10]
items = [40, 50]

In [5]:
#### Prepare data for Global Modeling ----

subset = data.query("store in @stores and item in @items")
subset['store_str'] = ['store_' + str(i) for i in subset['store']]
subset['item_str'] = ['item_' + str(i) for i in subset['item']]
subset['time_series'] = subset[['store_str', 'item_str']].apply(lambda x: '_'.join(x), axis=1)
subset.drop(columns=["store_str", "item_str"], inplace=True)

# subset.head()

y_global = subset[["time_series", "date", "sales"]]
X_global = subset.drop(columns=["sales"])

y_global.set_index(["time_series", "date"], inplace=True)
X_global.set_index(["time_series", "date"], inplace=True)

# y_global.info(), X_global.info()
# display(y_global.head()), display(X_global.head())

In [6]:
y_train_global, y_test_global, X_train_global, X_test_global = temporal_train_test_split(y_global, X_global)
# display(y_train_global.head(5), X_train_global.head(5))
# display(y_test_global.head(5), X_test_global.head(5))

In [7]:
FH = np.arange(1, len(y_test_global.loc["store_5_item_40"])+1)
# FH 

In [8]:
regressor = make_pipeline(
    RandomForestRegressor(random_state=1),
)

def count_gt130(x):
    """Count how many observations lie above threshold."""
    return np.sum((x > 700)[::-1])


kwargs = {
    "lag_feature": {
        "lag": [1],
        "mean": [[1, 3], [3, 6]],
        "std": [[1, 4]],
        count_gt130: [[2, 3]],
    }
}


steps=[
        ("daily_season", DateTimeFeatures(ts_freq="D")),  
        ("daily_season2", DateTimeFeatures(manual_selection=["week_of_month", "day_of_quarter"])),
]  

forecaster_global = make_reduction(
    regressor,
    scitype="tabular-regressor",
    transformers=[WindowSummarizer(**kwargs, n_jobs=1, truncate="bfill")],
    window_length=None,
    strategy="recursive",
)


# Version 2: Global Forecasting
pipe_global = ForecastingPipeline(steps= steps + [("forecaster", forecaster_global)])

In [9]:
_ = pipe_global.fit(y_train_global, X_train_global, fh=FH)

In [10]:
# TODO (check): Does not recognize freq in X (so how did train work?)
y_pred_global = pipe_global.predict(X=X_test_global)

ValueError: No `freq` information available