# Headers and MLFlow

In [1]:
import pandas as pd
from main import *

# Data

In [10]:
df_main = pd.read_csv('/home/azureuser/cloudfiles/code/Users/vlavanga/data/processed/snowflake_v5_pre_ts.csv')
df_main = df_main.drop(columns=['SALES_ORG','COUNTRY','FABRIC_CONTENT_CODE_TEXT'])
df_main = df_main[df_main['SILHOUETTE'].isin(['5950','LP5950', '950', '3930','940','920'])]

In [11]:
df_main.head()

Unnamed: 0,PO_CREATED_DATE,REGION,SALES_ORG_NAME,FABRIC_TYPE,TEAM,SILHOUETTE,SPORT,DIVISION_NAME,SEASON_CONSOLIDATION,ORDERED_QUANTITY
0,20231003,Japan,Japan,Wovens,YOMIURI GIANTS,LP5950,BASEBALL,Headwear,Custom,72
1,20231003,Japan,Japan,Wovens,YOMIURI GIANTS,LP5950,BASEBALL,Headwear,Custom,72
2,20231109,Emerging Markets,New Zealand,Wovens,LAS VEGAS RAIDERS,940,FOOTBALL,Headwear,Program,80
3,20231103,North America,United States,Wovens,RANCHO CUCAMONGA QUAKES,5950,BASEBALL,Headwear,Program,1
4,20231103,North America,United States,Wovens,RENO ACES,5950,BASEBALL,Headwear,Program,1


In [12]:
# Step 1: Convert to string
df_main["PO_CREATED_DATE"] = df_main["PO_CREATED_DATE"].astype(str)

# Step 2: Convert to datetime
df_main["PO_CREATED_DATE"] = pd.to_datetime(
    df_main["PO_CREATED_DATE"],
    format="%Y%m%d",
    errors="coerce"
)

# Step 3: Check if any NaT created
print("NaT count after conversion:", df_main["PO_CREATED_DATE"].isna().sum())

# Step 4: Filter from 2009 onwards
df_main = df_main[df_main["PO_CREATED_DATE"] >= "2009-01-01"]

NaT count after conversion: 0


In [13]:
df_main['TEAM'] = df_main['TEAM'].fillna('Unknown')
df_main['SILHOUETTE'] = df_main['SILHOUETTE'].fillna('Unknown')
df_main['SPORT'] = df_main['SPORT'].fillna('Unknown')
df_main['SALES_ORG_NAME'] = df_main['SALES_ORG_NAME'].fillna('Unknown')

# Convert date â†’ MonthTimestamp 
df_main['DATE'] = df_main['PO_CREATED_DATE'].dt.to_period('M').dt.to_timestamp()

# Sort for proper lag creation
df_main = df_main.sort_values('DATE')
df_main.drop(['PO_CREATED_DATE'], axis=1, inplace=True)
df = df_main.copy()

In [14]:
df['SPORT'].unique()

array(['NONE', 'BASEBALL', 'COLLEGE', 'GOLF', 'SOCCER', 'HOCKEY',
       'BASKETBALL', 'ENTERTAINMENT', 'FOOTBALL', 'RACING',
       'COBRANDED CORE', 'RUGBY', 'CRICKET', 'LACROSSE',
       'COBRANDED BASEBALL', 'COBRANDED RACING', 'ANIMATED CHARACTER',
       'COBRANDED FOOTBALL', 'COBRANDED BASKETBALL', 'WATER SPORTS',
       'CYCLING', 'COBRANDED COLLEGE', 'VOLLEYBALL', 'BOXING',
       'WRESTLING', 'HANDBALL', 'TENNIS', 'E SPORT', 'GAELIC FOOTBALL',
       'COBRANDED NON LICENSED', 'SOFTBALL', 'COBRANDED HOCKEY',
       'COBRANDED SOCCER', 'SNOWSPORTS'], dtype=object)

In [15]:
# Inspect unique values for all relevant categorical columns
cols_to_check = [
    "REGION",
    "SALES_ORG_NAME",
    "FABRIC_TYPE",
    "TEAM",
    "SILHOUETTE",
    "SPORT",
    "DIVISION_NAME",
    "SEASON_CONSOLIDATION"
]

for col in cols_to_check:
    print(f"\n================= {col} =================")
    print("Unique Count:", df_main[col].nunique())
    print(df_main[col].unique())



Unique Count: 4
['North America' 'EMEA' 'Emerging Markets' 'Japan']

Unique Count: 14
['United States' 'EMEA' 'Canada' 'Latin America' 'Southeast Asia'
 'Australia' 'South Korea' 'Mexico' 'German' 'New Zealand' 'Japan' 'China'
 'NEC China Shanghai' 'US Retail']

Unique Count: 4
['Wovens' 'Other' 'Knits' 'Polyester']

Unique Count: 3605
['NONE' 'NEW YORK YANKEES' 'WBC CANADA' ... 'MENTOS'
 'NHL WINTER CLASSICS LOGO' 'WHITE MOUNTAINEERING']

Unique Count: 6
['5950' '940' '3930' '920' 'LP5950' '950']

Unique Count: 34
['NONE' 'BASEBALL' 'COLLEGE' 'GOLF' 'SOCCER' 'HOCKEY' 'BASKETBALL'
 'ENTERTAINMENT' 'FOOTBALL' 'RACING' 'COBRANDED CORE' 'RUGBY' 'CRICKET'
 'LACROSSE' 'COBRANDED BASEBALL' 'COBRANDED RACING' 'ANIMATED CHARACTER'
 'COBRANDED FOOTBALL' 'COBRANDED BASKETBALL' 'WATER SPORTS' 'CYCLING'
 'COBRANDED COLLEGE' 'VOLLEYBALL' 'BOXING' 'WRESTLING' 'HANDBALL' 'TENNIS'
 'E SPORT' 'GAELIC FOOTBALL' 'COBRANDED NON LICENSED' 'SOFTBALL'
 'COBRANDED HOCKEY' 'COBRANDED SOCCER' 'SNOWSPORTS']

Un

In [17]:
# Final grouping keys (define the time series)
group_keys = ["SALES_ORG_NAME", "SILHOUETTE", "SEASON_CONSOLIDATION", "DATE"]

df_grouped = (
    df.groupby(group_keys)
        .agg(
            ORDERED_QUANTITY=('ORDERED_QUANTITY', 'sum'),

            # Aggregated features we want for TFT
            TEAM_COUNT    = ('TEAM',    pd.Series.nunique),
            SPORT_COUNT   = ('SPORT',   pd.Series.nunique),
            FABRIC_COUNT  = ('FABRIC_TYPE', pd.Series.nunique),
            DIVISION_COUNT= ('DIVISION_NAME', pd.Series.nunique),

            # First-category assignment within that month (stable)
            REGION        = ('REGION', lambda x: x.iloc[0]),
            TEAM          = ('TEAM',   lambda x: x.iloc[0]),
            SPORT         = ('SPORT',  lambda x: x.iloc[0]),
            FABRIC_TYPE   = ('FABRIC_TYPE', lambda x: x.iloc[0]),
            DIVISION_NAME = ('DIVISION_NAME', lambda x: x.iloc[0])
        )
        .reset_index()
)

print("Grouped monthly shape:", df_grouped.shape)


Grouped monthly shape: (16188, 14)


In [21]:
# select only the desired columns (use cols_to_check and keep ORDERED_QUANTITY & DATE if present)
selected_cols = [c for c in cols_to_check if c in df.columns]
for extra in ["ORDERED_QUANTITY", "DATE"]:
    if extra in df.columns and extra not in selected_cols:
        selected_cols.append(extra)

df_selected = df[selected_cols].copy()

print("Selected columns:", selected_cols)
print("df_selected shape:", df_selected.shape)
df_selected.head()

Selected columns: ['REGION', 'SALES_ORG_NAME', 'FABRIC_TYPE', 'TEAM', 'SILHOUETTE', 'SPORT', 'DIVISION_NAME', 'SEASON_CONSOLIDATION', 'ORDERED_QUANTITY', 'DATE']
df_selected shape: (6076376, 10)


Unnamed: 0,REGION,SALES_ORG_NAME,FABRIC_TYPE,TEAM,SILHOUETTE,SPORT,DIVISION_NAME,SEASON_CONSOLIDATION,ORDERED_QUANTITY,DATE
614819,North America,United States,Wovens,NONE,5950,NONE,Headwear,Custom,1028,2009-01-01
1059455,North America,United States,Wovens,NEW YORK YANKEES,5950,BASEBALL,Headwear,Custom,30,2009-01-01
4665257,North America,United States,Wovens,NEW YORK YANKEES,5950,BASEBALL,Headwear,Custom,30,2009-01-01
2743571,North America,United States,Wovens,WBC CANADA,940,BASEBALL,Headwear,Custom,30,2009-01-01
1871709,North America,United States,Wovens,HOUSTON ASTROS,5950,BASEBALL,Headwear,Custom,30,2009-01-01


In [None]:
df_selected = df_selected.drop(columns=['REGION',])

In [23]:
cat_cols = ["SALES_ORG_NAME", "SILHOUETTE", "SEASON_CONSOLIDATION", "DATE"]
except_cols = ['REGION','TEAM','SPORT','DIVISION_NAME']
missing_dates_df_final, pairs_final = verify_grp_timestamps(df_selected,cat_cols,except_cols)

In [26]:
missing_dates_df_final

Unnamed: 0,Group,Year,Missing Months
0,"(Australia, 3930, Custom, 2011-08-01 00:00:00)",2011,"[1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 12]"
1,"(Australia, 3930, Custom, 2011-09-01 00:00:00)",2011,"[1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12]"
2,"(Australia, 3930, Custom, 2011-10-01 00:00:00)",2011,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12]"
3,"(Australia, 3930, Custom, 2011-11-01 00:00:00)",2011,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12]"
4,"(Australia, 3930, Custom, 2012-05-01 00:00:00)",2012,"[1, 2, 3, 4, 6, 7, 8, 9, 10, 11, 12]"
...,...,...,...
16183,"(United States, LP5950, Stock, 2025-01-01 00:0...",2025,"[2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]"
16184,"(United States, LP5950, Stock, 2025-04-01 00:0...",2025,"[1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 12]"
16185,"(United States, LP5950, Stock, 2025-05-01 00:0...",2025,"[1, 2, 3, 4, 6, 7, 8, 9, 10, 11, 12]"
16186,"(United States, LP5950, Stock, 2025-06-01 00:0...",2025,"[1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12]"


In [58]:
def fill_missing_months(df, keys):
    out = []
    for name, g in df.groupby(keys):

        g = g.sort_values("MONTH")

        full_range = pd.date_range(
            start=g["MONTH"].min(),
            end=g["MONTH"].max(),
            freq="MS"
        )

        g2 = g.set_index("MONTH").reindex(full_range)
        g2.index.name = "MONTH"

        # Numerical fill
        num_cols = ["ORDERED_QUANTITY","TEAM_COUNT","SPORT_COUNT","FABRIC_COUNT","DIVISION_COUNT"]
        g2[num_cols] = g2[num_cols].fillna(0)

        # Categorical forward-fill
        cat_cols = ["REGION","TEAM","SPORT","FABRIC_TYPE","DIVISION_NAME"]
        for c in cat_cols:
            g2[c] = g2[c].fillna(method="ffill").fillna(method="bfill")

        # Restore group_id values
        for i, k in enumerate(keys):
            g2[k] = name[i]

        out.append(g2.reset_index())

    return pd.concat(out, ignore_index=True)

df_filled = fill_missing_months(df_grouped, 
                                keys=["SALES_ORG_NAME", "SILHOUETTE", "SEASON_CONSOLIDATION"])

print("After month filling:", df_filled.shape)


After month filling: (27668, 14)


In [59]:
df_filled['year'] = df_filled['MONTH'].dt.year
df_filled['month_num'] = df_filled['MONTH'].dt.month
df_filled['quarter'] = df_filled['MONTH'].dt.quarter


In [60]:
test_start = pd.Timestamp("2024-01-01")

train_df = df_filled[df_filled["MONTH"] < test_start].copy()
test_df  = df_filled[df_filled["MONTH"] >= test_start].copy()

print("Train has time_idx:", "time_idx" in train_df.columns)
print("Test has time_idx:", "time_idx" in test_df.columns)


Train has time_idx: False
Test has time_idx: False


# Model TFT

In [91]:
import pandas as pd
import numpy as np
import random
import torch
import pytorch_lightning as pl

from pytorch_forecasting import TimeSeriesDataSet
from pytorch_forecasting.data import NaNLabelEncoder
from torch.utils.data import DataLoader

def generate_and_set_seed(seed=None):
    if seed is None:
        seed = random.randint(0, 2**32 - 1)

    print(f"\nðŸ”’ Training with SEED: {seed}\n")

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    pl.seed_everything(seed, workers=True)

    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    return seed


df_main = pd.read_csv(
    "/home/azureuser/cloudfiles/code/Users/vlavanga/data/processed/snowflake_v5_pre_ts.csv",
)

df_main = df_main.drop(columns=['SALES_ORG','COUNTRY','FABRIC_CONTENT_CODE_TEXT'], errors='ignore')

valid_sil = ['5950','LP5950', '950', '3930','940','920']
df_main = df_main[df_main['SILHOUETTE'].isin(valid_sil)]


fill_cols = ["TEAM", "SPORT", "SILHOUETTE", "SALES_ORG_NAME", 
             "REGION", "DIVISION_NAME", "FABRIC_TYPE"]

for col in fill_cols:
    df_main[col] = df_main[col].fillna("Unknown")


In [97]:
df_main['PO_CREATED_DATE'] = pd.to_datetime(df_main['PO_CREATED_DATE'], format='%Y%m%d')

df_main["MONTH"] = df_main["PO_CREATED_DATE"].dt.to_period("M").dt.to_timestamp()
df_main = df_main.drop(columns=["PO_CREATED_DATE"], errors="ignore")

In [98]:
df_main.head()

Unnamed: 0,REGION,SALES_ORG_NAME,FABRIC_TYPE,TEAM,SILHOUETTE,SPORT,DIVISION_NAME,SEASON_CONSOLIDATION,ORDERED_QUANTITY,MONTH
0,Japan,Japan,Wovens,YOMIURI GIANTS,LP5950,BASEBALL,Headwear,Custom,72,2023-10-01
1,Japan,Japan,Wovens,YOMIURI GIANTS,LP5950,BASEBALL,Headwear,Custom,72,2023-10-01
2,Emerging Markets,New Zealand,Wovens,LAS VEGAS RAIDERS,940,FOOTBALL,Headwear,Program,80,2023-11-01
3,North America,United States,Wovens,RANCHO CUCAMONGA QUAKES,5950,BASEBALL,Headwear,Program,1,2023-11-01
4,North America,United States,Wovens,RENO ACES,5950,BASEBALL,Headwear,Program,1,2023-11-01


In [99]:
group_keys = ["SALES_ORG_NAME", "SILHOUETTE", "SEASON_CONSOLIDATION", "MONTH"]

df_grouped = (
    df_main.groupby(group_keys)
        .agg(
            ORDERED_QUANTITY=('ORDERED_QUANTITY', 'sum'),

            TEAM_COUNT=('TEAM', pd.Series.nunique),
            SPORT_COUNT=('SPORT', pd.Series.nunique),
            FABRIC_COUNT=('FABRIC_TYPE', pd.Series.nunique),
            DIVISION_COUNT=('DIVISION_NAME', pd.Series.nunique),

            REGION=('REGION', lambda x: x.iloc[0]),
            TEAM=('TEAM', lambda x: x.iloc[0]),
            SPORT=('SPORT', lambda x: x.iloc[0]),
            FABRIC_TYPE=('FABRIC_TYPE', lambda x: x.iloc[0]),
            DIVISION_NAME=('DIVISION_NAME', lambda x: x.iloc[0]),
        )
        .reset_index()
)

print("Grouped:", df_grouped.shape)


Grouped: (16220, 14)


In [100]:
def fill_missing_months(df, keys):
    out = []

    for name, g in df.groupby(keys):
        g = g.sort_values("MONTH")

        full_range = pd.date_range(
            start=g["MONTH"].min(),
            end=g["MONTH"].max(),
            freq="MS"
        )

        g2 = g.set_index("MONTH").reindex(full_range)
        g2.index.name = "MONTH"

        num_cols = ["ORDERED_QUANTITY","TEAM_COUNT","SPORT_COUNT","FABRIC_COUNT","DIVISION_COUNT"]
        g2[num_cols] = g2[num_cols].fillna(0)

        cat_cols = ["REGION","TEAM","SPORT","FABRIC_TYPE","DIVISION_NAME"]
        for c in cat_cols:
            g2[c] = g2[c].fillna(method="ffill").fillna(method="bfill")

        for i, k in enumerate(keys):
            g2[k] = name[i]

        out.append(g2.reset_index())

    return pd.concat(out, ignore_index=True)

df_filled = fill_missing_months(
    df_grouped,
    keys=["SALES_ORG_NAME", "SILHOUETTE", "SEASON_CONSOLIDATION"]
)

print("After fill:", df_filled.shape)


After fill: (27707, 14)


In [101]:
df_filled["year"] = df_filled["MONTH"].dt.year
df_filled["month_num"] = df_filled["MONTH"].dt.month
df_filled["quarter"] = df_filled["MONTH"].dt.quarter


In [102]:
df_filled = df_filled.sort_values(["SALES_ORG_NAME","SILHOUETTE","SEASON_CONSOLIDATION","MONTH"])

df_filled["time_idx"] = (
    df_filled["MONTH"].dt.year * 12 + df_filled["MONTH"].dt.month
)

df_filled["time_idx"] = df_filled["time_idx"] - df_filled["time_idx"].min()


In [103]:
categorical_cols = [
    "SALES_ORG_NAME", "SILHOUETTE", "SEASON_CONSOLIDATION",
    "REGION", "SPORT", "TEAM", "FABRIC_TYPE", "DIVISION_NAME"
]

for c in categorical_cols:
    df_filled[c] = df_filled[c].astype("category")


In [104]:
test_start = pd.Timestamp("2024-01-01")

train_df = df_filled[df_filled["MONTH"] < test_start].copy()
test_df  = df_filled[df_filled["MONTH"] >= test_start].copy()

print(train_df.shape, test_df.shape)


(24143, 18) (3564, 18)


In [105]:
group_cols = ["SALES_ORG_NAME", "SILHOUETTE", "SEASON_CONSOLIDATION"]

static_categoricals = group_cols

time_varying_known_categoricals = ["REGION","FABRIC_TYPE","SPORT","TEAM","DIVISION_NAME"]
time_varying_known_reals = ["month_num", "quarter"]

time_varying_unknown_reals = [
    "ORDERED_QUANTITY",
    "TEAM_COUNT","SPORT_COUNT","FABRIC_COUNT","DIVISION_COUNT"
]

target = "ORDERED_QUANTITY"


In [106]:
check = (
    df_filled.groupby(["SALES_ORG_NAME","SILHOUETTE","SEASON_CONSOLIDATION"])
    .agg(
        start=("time_idx","min"),
        end=("time_idx","max"),
        length=("time_idx", lambda x: x.max() - x.min() + 1)
    )
    .reset_index()
    .sort_values("length")
)

print(check.head(15))  # the shortest series
print("\nMinimum length across all series:", check["length"].min())
print("Series with < 48 months:", (check["length"] < 48).sum())


         SALES_ORG_NAME SILHOUETTE SEASON_CONSOLIDATION  start    end  length
188         South Korea        920                Stock   95.0   95.0     1.0
17            Australia     LP5950                Stock   87.0   87.0     1.0
87               German     LP5950               Custom  180.0  180.0     1.0
159  NEC China Shanghai     LP5950               Custom  181.0  181.0     1.0
231           US Retail     LP5950               Custom  133.0  136.0     4.0
143              Mexico     LP5950                Stock   95.0  100.0     6.0
216           US Retail       3930               Custom  133.0  139.0     7.0
37                China       3930              Program   90.0   99.0    10.0
92                Japan       3930                Stock  194.0  206.0    13.0
83               German        940                Stock  193.0  205.0    13.0
76               German       5950              Program  191.0  205.0    15.0
50                China        950                Stock   96.0  

In [107]:
# Remove all groups shorter than 48 months
valid_groups = check[check["length"] >= 48][
    ["SALES_ORG_NAME","SILHOUETTE","SEASON_CONSOLIDATION"]
]

df_filtered = df_filled.merge(valid_groups, on=["SALES_ORG_NAME","SILHOUETTE","SEASON_CONSOLIDATION"], how="inner")

print("Remaining rows:", df_filtered.shape)
print("Remaining unique series:", df_filtered.groupby(["SALES_ORG_NAME","SILHOUETTE","SEASON_CONSOLIDATION"]).ngroups)


Remaining rows: (27099, 18)
Remaining unique series: 186


In [108]:
max_encoder_length = 36
max_prediction_length = 12

training_dataset = TimeSeriesDataSet(
    train_df,
    time_idx="time_idx",
    target=target,
    group_ids=group_cols,

    static_categoricals=static_categoricals,

    time_varying_known_categoricals=time_varying_known_categoricals,
    time_varying_known_reals=time_varying_known_reals,

    time_varying_unknown_reals=time_varying_unknown_reals,

    max_encoder_length=max_encoder_length,
    max_prediction_length=max_prediction_length,

    target_normalizer=None,

    categorical_encoders={
        col: NaNLabelEncoder(add_nan=True)
        for col in (static_categoricals + time_varying_known_categoricals)
    },
)




In [114]:
group_cols = ["SILHOUETTE", "SEASON_CONSOLIDATION"]


In [115]:
categorical_cols = [
    "SALES_ORG_NAME", "SILHOUETTE", "SEASON_CONSOLIDATION",
    "REGION", "SPORT", "TEAM", "FABRIC_TYPE", "DIVISION_NAME"
]

for c in categorical_cols:
    df_filled[c] = df_filled[c].astype("category")



In [116]:
test_start = pd.Timestamp("2024-01-01")

train_df = df_filtered[df_filtered["MONTH"] < test_start].copy()
test_df  = df_filtered[df_filtered["MONTH"] >= test_start].copy()


In [117]:
# group_id used by TFT
group_cols = ["SILHOUETTE", "SEASON_CONSOLIDATION"]

# static (one value per series)
static_categoricals = ["SILHOUETTE", "SEASON_CONSOLIDATION"]

# known categorical features at future time
time_varying_known_categoricals = [
    "SALES_ORG_NAME", "REGION", "FABRIC_TYPE", "SPORT", "DIVISION_NAME"
]

# known numeric time features (can be forecasted into future)
time_varying_known_reals = ["month_num", "quarter"]

# unknown numeric features (only available historically)
time_varying_unknown_reals = [
    "ORDERED_QUANTITY",
    "TEAM_COUNT", "SPORT_COUNT", "FABRIC_COUNT", "DIVISION_COUNT"
]

target = "ORDERED_QUANTITY"
max_encoder_length = 36
max_prediction_length = 12


In [120]:
from pytorch_forecasting import TimeSeriesDataSet
from pytorch_forecasting.data import NaNLabelEncoder

training_dataset = TimeSeriesDataSet(
    df_filtered,                       # use full filtered dataset
    time_idx="time_idx",
    target="ORDERED_QUANTITY",
    group_ids=["SILHOUETTE", "SEASON_CONSOLIDATION"],

    # categorical groups
    static_categoricals=["SILHOUETTE", "SEASON_CONSOLIDATION"],
    time_varying_known_categoricals=["SALES_ORG_NAME", "REGION", "FABRIC_TYPE", "SPORT", "DIVISION_NAME"],

    # continuous features
    time_varying_known_reals=["month_num", "quarter"],
    time_varying_unknown_reals=["ORDERED_QUANTITY", "TEAM_COUNT", "SPORT_COUNT", "FABRIC_COUNT", "DIVISION_COUNT"],

    max_encoder_length=36,
    max_prediction_length=12,

    target_normalizer=None,

    categorical_encoders={
        col: NaNLabelEncoder(add_nan=True)
        for col in (
            ["SILHOUETTE", "SEASON_CONSOLIDATION"]
            + ["SALES_ORG_NAME", "REGION", "FABRIC_TYPE", "SPORT", "DIVISION_NAME"]
        )
    },

    allow_missing_timesteps=True,
)


In [121]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(
    training_dataset,
    batch_size=64,
    shuffle=True,
    num_workers=4
)


In [122]:
import random
import numpy as np
import torch
import pytorch_lightning as pl

def generate_and_set_seed(seed=None):
    if seed is None:
        seed = random.randint(0, 2**32 - 1)

    print(f"\nðŸ”’ USING TRAINING SEED: {seed}\n")

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    pl.seed_everything(seed, workers=True)

    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    return seed

seed = generate_and_set_seed()


Global seed set to 577744747



ðŸ”’ USING TRAINING SEED: 577744747



In [123]:
from pytorch_forecasting.models import TemporalFusionTransformer
from pytorch_forecasting.metrics import QuantileLoss


In [128]:
from pytorch_forecasting.models import TemporalFusionTransformer
from pytorch_forecasting.metrics import QuantileLoss

tft = TemporalFusionTransformer.from_dataset(
    training_dataset,
    learning_rate=0.003,
    hidden_size=32,               # REQUIRED
    attention_head_size=4,
    dropout=0.1,
    hidden_continuous_size=16,

    loss=QuantileLoss([0.1, 0.5, 0.9]),

    reduce_on_plateau_patience=4,
)


In [129]:
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint

early_stop = EarlyStopping(
    monitor="train_loss",
    min_delta=1e-4,
    patience=5,
    verbose=True,
    mode="min"
)

checkpoint = ModelCheckpoint(
    dirpath="tft_checkpoints/",
    filename="tft_best",
    save_top_k=1,
    monitor="train_loss",
    mode="min"
)


In [132]:
from pytorch_lightning.loggers import CSVLogger

csv_logger = CSVLogger("tft_logs", name="tft")

trainer = pl.Trainer(
    max_epochs=40,
    accelerator="gpu" if torch.cuda.is_available() else "cpu",
    devices=1,
    gradient_clip_val=0.1,
    callbacks=[early_stop, checkpoint],
    logger=csv_logger,      # <--- IMPORTANT
    log_every_n_steps=20
)


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [133]:
trainer.fit(
    tft,
    train_dataloaders=train_dataloader
)


  rank_zero_warn(

   | Name                               | Type                            | Params
----------------------------------------------------------------------------------------
0  | loss                               | QuantileLoss                    | 0     
1  | logging_metrics                    | ModuleList                      | 0     
2  | input_embeddings                   | MultiEmbedding                  | 449   
3  | prescalers                         | ModuleDict                      | 224   
4  | static_variable_selection          | VariableSelectionNetwork        | 238   
5  | encoder_variable_selection         | VariableSelectionNetwork        | 15.8 K
6  | decoder_variable_selection         | VariableSelectionNetwork        | 5.0 K 
7  | static_context_variable_selection  | GatedResidualNetwork            | 4.3 K 
8  | static_context_initial_hidden_lstm | GatedResidualNetwork            | 4.3 K 
9  | static_context_initial_cell_lstm   | GatedResidualNetwork

Epoch 0:   0%|                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 | 0/305 [00:00<?, ?it/s]

RuntimeError: Caught RuntimeError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/anaconda/envs/nec10/lib/python3.10/site-packages/torch/utils/data/_utils/worker.py", line 302, in _worker_loop
    data = fetcher.fetch(index)
  File "/anaconda/envs/nec10/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py", line 61, in fetch
    return self.collate_fn(data)
  File "/anaconda/envs/nec10/lib/python3.10/site-packages/torch/utils/data/_utils/collate.py", line 265, in default_collate
    return collate(batch, collate_fn_map=default_collate_fn_map)
  File "/anaconda/envs/nec10/lib/python3.10/site-packages/torch/utils/data/_utils/collate.py", line 143, in collate
    return [collate(samples, collate_fn_map=collate_fn_map) for samples in transposed]  # Backwards compatibility.
  File "/anaconda/envs/nec10/lib/python3.10/site-packages/torch/utils/data/_utils/collate.py", line 143, in <listcomp>
    return [collate(samples, collate_fn_map=collate_fn_map) for samples in transposed]  # Backwards compatibility.
  File "/anaconda/envs/nec10/lib/python3.10/site-packages/torch/utils/data/_utils/collate.py", line 128, in collate
    return elem_type({key: collate([d[key] for d in batch], collate_fn_map=collate_fn_map) for key in elem})
  File "/anaconda/envs/nec10/lib/python3.10/site-packages/torch/utils/data/_utils/collate.py", line 128, in <dictcomp>
    return elem_type({key: collate([d[key] for d in batch], collate_fn_map=collate_fn_map) for key in elem})
  File "/anaconda/envs/nec10/lib/python3.10/site-packages/torch/utils/data/_utils/collate.py", line 120, in collate
    return collate_fn_map[elem_type](batch, collate_fn_map=collate_fn_map)
  File "/anaconda/envs/nec10/lib/python3.10/site-packages/torch/utils/data/_utils/collate.py", line 162, in collate_tensor_fn
    out = elem.new(storage).resize_(len(batch), *list(elem.size()))
RuntimeError: Trying to resize storage that is not resizable


# Model TFT 2

In [134]:
import pandas as pd
import numpy as np

from pytorch_forecasting import TimeSeriesDataSet
from pytorch_forecasting.metrics import QuantileLoss
from pytorch_forecasting.data.encoders import NaNLabelEncoder

from pytorch_forecasting.models import TemporalFusionTransformer
from pytorch_lightning import Trainer
from pytorch_lightning.loggers import CSVLogger

import torch


In [135]:
df = df_main.copy()

# Ensure MONTH is datetime monthly timestamp
df["MONTH"] = pd.to_datetime(df["MONTH"])

# Create integer time index
df = df.sort_values("MONTH")
df["time_idx"] = (df["MONTH"] - df["MONTH"].min()).dt.days // 30


In [136]:
group_id = ["SILHOUETTE", "SEASON_CONSOLIDATION"]
group_cols = ["SILHOUETTE", "SEASON_CONSOLIDATION"]
target = "ORDERED_QUANTITY"


In [137]:
static_categoricals = ["SILHOUETTE", "SEASON_CONSOLIDATION"]
time_varying_known_categoricals = ["REGION", "SALES_ORG_NAME", "FABRIC_TYPE", "TEAM", "SPORT", "DIVISION_NAME"]
time_varying_known_reals = ["time_idx"]
time_varying_unknown_reals = ["ORDERED_QUANTITY"]


In [138]:
min_length = 36 + 12  # 48

# Count months per group
group_lengths = df.groupby(group_cols)["time_idx"].nunique().reset_index(name="length")

# Keep only groups >= 48 months
valid_groups = group_lengths[group_lengths["length"] >= min_length][group_cols]

# Filter main dataframe
df = df.merge(valid_groups, on=group_cols, how="inner")

print("Valid groups:", valid_groups.shape[0])
print("Remaining rows:", df.shape)


Valid groups: 18
Remaining rows: (6080224, 11)


In [139]:
max_time_idx = df["time_idx"].max()

train_df = df[df["time_idx"] <= max_time_idx - 12]
test_df  = df[df["time_idx"] >  max_time_idx - 12]


In [141]:
training_dataset = TimeSeriesDataSet(
    train_df,
    time_idx="time_idx",
    target=target,
    group_ids=group_cols,

    # Static features
    static_categoricals=static_categoricals,

    # Known (future available)
    time_varying_known_categoricals=time_varying_known_categoricals,
    time_varying_known_reals=time_varying_known_reals,

    # Unknown (observed only in encoder)
    time_varying_unknown_reals=time_varying_unknown_reals,

    max_encoder_length=36,
    max_prediction_length=12,

    target_normalizer=None,
    
    categorical_encoders={
        col: NaNLabelEncoder(add_nan=True)
        for col in (static_categoricals + time_varying_known_categoricals)
    },

    allow_missing_timesteps=True

)


In [146]:
batch_size = 64

train_dataloader = training_dataset.to_dataloader(
    train=True,
    batch_size=batch_size,
    num_workers=1,
)


In [147]:
tft = TemporalFusionTransformer.from_dataset(
    training_dataset,
    learning_rate=0.003,
    hidden_size=32,
    attention_head_size=4,
    dropout=0.1,
    hidden_continuous_size=16,

    loss=QuantileLoss([0.1, 0.5, 0.9]),
    reduce_on_plateau_patience=4,
)


In [148]:
logger = CSVLogger("tft_logs", name="tft_model")

trainer = Trainer(
    max_epochs=30,
    accelerator="cpu",   # Use "gpu" if available
    logger=logger,
    enable_checkpointing=True
)


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [149]:
trainer.fit(
    tft,
    train_dataloaders=train_dataloader
)



   | Name                               | Type                            | Params
----------------------------------------------------------------------------------------
0  | loss                               | QuantileLoss                    | 0     
1  | logging_metrics                    | ModuleList                      | 0     
2  | input_embeddings                   | MultiEmbedding                  | 112 K 
3  | prescalers                         | ModuleDict                      | 64    
4  | static_variable_selection          | VariableSelectionNetwork        | 238   
5  | encoder_variable_selection         | VariableSelectionNetwork        | 5.5 K 
6  | decoder_variable_selection         | VariableSelectionNetwork        | 3.4 K 
7  | static_context_variable_selection  | GatedResidualNetwork            | 4.3 K 
8  | static_context_initial_hidden_lstm | GatedResidualNetwork            | 4.3 K 
9  | static_context_initial_cell_lstm   | GatedResidualNetwork            | 4.3 

  rank_zero_warn(


Epoch 0:   0%|                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               | 0/53713 [00:00<?, ?it/s]

ERROR: Unexpected bus error encountered in worker. This might be caused by insufficient shared memory (shm).
 

RuntimeError: DataLoader worker (pid(s) 470221) exited unexpectedly

In [None]:
def forecast_group(silhouette, season):
    df_group = df[df["SILHOUETTE"] == silhouette]
    df_group = df_group[df_group["SEASON_CONSOLIDATION"] == season]

    # Build a new prediction dataset
    pred_dataset = TimeSeriesDataSet.from_dataset(
        training_dataset,
        df_group,
        stop_randomization=True,
        predict=True,
    )

    pred_loader = pred_dataset.to_dataloader(train=False, batch_size=64)

    raw_predictions, x = tft.predict(pred_loader, mode="raw", return_x=True)

    return raw_predictions


In [None]:
pred = forecast_group("5950", "Stock")
