In [3]:
import pandas as pd
import numpy as np

# ===============================
# PATHS
# ===============================
# Input CSVs read by the loading step, followed by the CSVs written at the end.
TRAIN_PATH, TEST_PATH = "train.csv", "test.csv"
OUT_TRAIN, OUT_TEST = "train_processed.csv", "test_processed.csv"

# Tunables for the derived columns:
#   ROLLING_WINDOW - rows per product averaged into rolling_avg_sales
#   INVENTORY_DAYS - first multiplier applied to that average for inventory_level
#   SAFETY_FACTOR  - second multiplier applied on top of INVENTORY_DAYS
ROLLING_WINDOW, INVENTORY_DAYS, SAFETY_FACTOR = 7, 7, 1.2

# ===============================
# LOAD DATA
# ===============================
# Read both raw CSVs with pandas' default parsing options; every later step
# works on these two module-level frames.
train, test = (pd.read_csv(csv_path) for csv_path in (TRAIN_PATH, TEST_PATH))

# ===============================
# DATE HANDLING
# ===============================
# Replace the raw 'date' column of each frame with parsed pandas datetimes.
for frame in (train, test):
    frame['date'] = pd.to_datetime(frame['date'])

# ===============================
# PRODUCT ID
# ===============================
# Build a per-series key of the form "<store>_<item>" from the string forms
# of the two identifier columns; the later group-bys use it as the grouping key.
for frame in (train, test):
    store_label = frame['store'].astype(str)
    item_label = frame['item'].astype(str)
    frame['product_id'] = store_label + "_" + item_label

# ===============================
# SORT (CRITICAL)
# ===============================
# Order rows by product and then by date. The rolling mean, the lags and the
# per-product tail() computed further down all depend on row order within each
# product, so this must run before them.
sort_keys = ['product_id', 'date']
train = train.sort_values(sort_keys)
test = test.sort_values(sort_keys)

# ===============================
# TRAIN FEATURE ENGINEERING
# ===============================
def _trailing_mean(series):
    """Mean over the current row and up to ROLLING_WINDOW - 1 rows before it."""
    return series.rolling(ROLLING_WINDOW, min_periods=1).mean()


# One grouped view of the sales column, shared by every feature below.
sales_by_product = train.groupby('product_id')['sales']

# NOTE(review): the window includes the current row's own sales value. If
# 'sales' is the prediction target downstream, this feature (and
# inventory_level, derived from it) leaks the target - confirm intent.
train['rolling_avg_sales'] = sales_by_product.transform(_trailing_mean)

# Previous-row and seven-rows-back sales within each product; the first rows of
# each product have no such history and are left as NaN.
for column, steps in (('lag_1', 1), ('lag_7', 7)):
    train[column] = sales_by_product.shift(steps)

# Scale the rolling average by INVENTORY_DAYS, then by SAFETY_FACTOR (in that
# order, so float rounding is unchanged); missing values become 0 and the
# result is truncated to an integer.
scaled_demand = train['rolling_avg_sales'].mul(INVENTORY_DAYS).mul(SAFETY_FACTOR)
train['inventory_level'] = scaled_demand.fillna(0).astype(int)

# ===============================
# TEST FEATURE ENGINEERING
# (NO SALES, USE TRAIN HISTORY)
# ===============================
# The test frame has no sales column, so each product's rolling average is
# taken from the last ROLLING_WINDOW rows that product has in train (the most
# recent ones, given the earlier sort by product and date).
recent_rows = train.groupby('product_id').tail(ROLLING_WINDOW)
last_known_sales = recent_rows.groupby('product_id')['sales'].mean()

# Every test row of a product receives the same value; a product_id that does
# not occur in train maps to NaN.
test['rolling_avg_sales'] = test['product_id'].map(last_known_sales)

# Same scaling as on the train side: multiply by INVENTORY_DAYS, then by
# SAFETY_FACTOR, turn NaN into 0 and truncate to an integer.
projected_stock = test['rolling_avg_sales'].mul(INVENTORY_DAYS).mul(SAFETY_FACTOR)
test['inventory_level'] = projected_stock.fillna(0).astype(int)

# ===============================
# SUPPLIER DELAY (SIMULATED)
# ===============================
# Seed NumPy's global generator so the simulated delays repeat on every run.
np.random.seed(42)

# Candidate delay values and the probability of drawing each one.
delay_choices = [0, 1, 2, 3, 5]
delay_weights = [0.60, 0.20, 0.10, 0.07, 0.03]

# Draw one independent delay per row. Train is sampled before test: both draws
# consume the same seeded stream, so the order determines the values.
for frame in (train, test):
    frame['supplier_delay'] = np.random.choice(
        delay_choices,
        size=len(frame),
        p=delay_weights,
    )

# ===============================
# FINAL DATASETS
# ===============================
# Columns kept in each exported frame, in output order. Train carries the
# observed sales and the lag features; test carries the row id instead.
train_columns = [
    'date', 'product_id', 'sales', 'rolling_avg_sales',
    'lag_1', 'lag_7', 'inventory_level', 'supplier_delay',
]
test_columns = [
    'id', 'date', 'product_id', 'rolling_avg_sales',
    'inventory_level', 'supplier_delay',
]

train_final = train[train_columns]
test_final = test[test_columns]

# ===============================
# SAVE
# ===============================
# Write each final frame to its output CSV without the index column.
for frame, destination in ((train_final, OUT_TRAIN), (test_final, OUT_TEST)):
    frame.to_csv(destination, index=False)

# Report completion and the (rows, columns) shape of each exported frame.
print("✅ Train and Test preprocessing completed")
for label, frame in (("Train shape:", train_final), ("Test shape:", test_final)):
    print(label, frame.shape)



✅ Train and Test preprocessing completed
Train shape: (913000, 8)
Test shape: (45000, 6)
