# XGBoost

In [3]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
import os

In [None]:
import pandas as pd

# Loading the data 
df = pd.read_feather("/kaggle/input/df-feather/df.feather")
df['store_id'] = df['store_id'].astype(str)  

In [None]:
# separate df.feather by store
for store_id in df['store_id'].unique():
    df_store = df[df['store_id'] == store_id].reset_index(drop=True)
    df_store.to_feather(f"store_{store_id}.feather")

In [5]:
# ===== PARAMETERS =====
FORECAST_DAYS = 28
TARGET_COL = 'sold'
EXCLUDED_COLS = ['id', 'd', 'sold', 'date']
EVAL_DAYS = [f'd_{i}' for i in range(1942, 1970)]

In [6]:
# ===== LOAD STATIC FILES =====
calendar = pd.read_csv("/kaggle/input/calendar-csv/calendar.csv")
sample_submission = pd.read_csv("/kaggle/input/sample-submission-csv/sample_submission.csv")

In [7]:
# ===== EVALUATION ROW BUILDER =====
from tqdm import tqdm
import time

def generate_evaluation_rows(df_store, calendar, sample_submission):
    start_time = time.time()
    print("🔧 [Start] Generating evaluation rows...")

    sub_eval = sample_submission[sample_submission['id'].str.endswith('_evaluation')].copy()
    sub_eval['item_id'] = sub_eval['id'].apply(lambda x: "_".join(x.split("_")[:3]))
    sub_eval['store_id'] = sub_eval['id'].apply(lambda x: "_".join(x.split("_")[3:-1]))

    store_id = df_store['store_id'].iloc[0]
    sub_eval = sub_eval[sub_eval['store_id'] == store_id]

    calendar_eval = calendar[calendar['d'].isin(EVAL_DAYS)][['d', 'wm_yr_wk', 'event_name_1', 'event_type_1',
                                                             'event_name_2', 'event_type_2', 'snap_CA', 'snap_TX', 'snap_WI', 'date']]

    eval_rows = []
    for _, row in tqdm(sub_eval.iterrows(), total=len(sub_eval), desc=f"🛠 Building eval rows for {store_id}"):
        base = df_store[(df_store['item_id'] == row['item_id']) & (df_store['store_id'] == row['store_id'])]
        if base.empty:
            continue
        base = base.sort_values('d').iloc[-1:].copy()
        for d in EVAL_DAYS:
            temp = base.copy()
            temp['d'] = d
            temp['id'] = row['id']
            eval_rows.append(temp)

    df_eval = pd.concat(eval_rows, ignore_index=True)
    df_eval = df_eval.merge(calendar_eval, on='d', how='left')

    elapsed = time.time() - start_time
    print(f"✅ [Done] Eval rows for {store_id} built in {elapsed:.2f} seconds.")
    return df_eval

In [8]:
# ===== MERGE PREDICTED OUTPUT =====
def merge_preds(pred_list):
    df_merge = pred_list[0]
    for t in range(1, FORECAST_DAYS):
        df_merge = df_merge.merge(pred_list[t], on='id')
    return df_merge

In [18]:
# ===== MAIN PIPELINE LOOP =====
store_files = [f for f in os.listdir() if f.startswith("store_") and f.endswith(".feather")]

for file in store_files:
    store_id = file.replace("store_", "").replace(".feather", "")
    print(f"\n==========================")
    print(f"🛒 Starting store: {store_id}")
    print("==========================")

    df_store = pd.read_feather(file)
    df_store['d'] = df_store['d'].astype(str)

    print("🔧 Generating evaluation rows...")
    df_eval = generate_evaluation_rows(df_store, calendar, sample_submission)
    df_store = pd.concat([df_store, df_eval], ignore_index=True)

    print("🧼 Encoding categorical + scaling numeric features...")
    cat_cols = [col for col in df_store.select_dtypes(include='category') if col not in EXCLUDED_COLS]
    num_cols = [col for col in df_store.select_dtypes(include=['float', 'int']) if col not in EXCLUDED_COLS]
    encoders = {col: LabelEncoder().fit(df_store[col]) for col in cat_cols}
    for col in cat_cols:
        df_store[col] = encoders[col].transform(df_store[col])
    scaler = MinMaxScaler()
    df_store[num_cols] = scaler.fit_transform(df_store[num_cols])
    features = cat_cols + num_cols

    val_preds, eval_preds = [], []

    for t in range(1, FORECAST_DAYS + 1):
        d_val = f'd_{1913 + t}'
        d_eval = f'd_{1941 + t}'
        print(f"📅 Forecasting F{t:02d} (val: {d_val}, eval: {d_eval})")

        # Validation
        train_val = df_store[df_store['d'] < d_val]
        test_val = df_store[df_store['d'] == d_val]
        if test_val.empty:
            print("  ⚠️ Skipped validation day (no data)")
            continue
        model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, max_depth=6,
                                 learning_rate=0.1, subsample=0.8, colsample_bytree=0.8,
                                 tree_method='hist', verbosity=0)
        model.fit(train_val[features], train_val[TARGET_COL])
        y_pred_val = model.predict(test_val[features]).ravel()
        id_val = test_val['id'].str.replace("_evaluation", "_validation").values.ravel()
        val_preds.append(pd.DataFrame({'id': id_val, f'F{t}': y_pred_val}))

        # Evaluation
        train_eval = df_store[df_store['d'] < d_eval]
        test_eval = df_store[df_store['d'] == d_eval]
        if test_eval.empty:
            print("  ⚠️ Skipped evaluation day (no data)")
            continue
        model.fit(train_eval[features], train_eval[TARGET_COL])
        y_pred_eval = model.predict(test_eval[features]).ravel()
        id_eval = test_eval['id'].values.ravel()
        eval_preds.append(pd.DataFrame({'id': id_eval, f'F{t}': y_pred_eval}))

    # Save outputs
    print(f"💾 Saving outputs for store {store_id}...")
    submission_val = merge_preds(val_preds)
    submission_eval = merge_preds(eval_preds)
    submission_val.to_csv(f"val_{store_id}.csv", index=False)
    submission_eval.to_csv(f"eval_{store_id}.csv", index=False)
    print(f"✅ Done: val_{store_id}.csv & eval_{store_id}.csv")


🛒 Starting store: WI
🔧 Generating evaluation rows...
🔧 [Start] Generating evaluation rows...


🛠 Building eval rows for WI_3: 100%|██████████| 3049/3049 [15:35<00:00,  3.26it/s]


✅ [Done] Eval rows for WI_3 built in 973.97 seconds.
🧼 Encoding categorical + scaling numeric features...
📅 Forecasting F01 (val: d_1914, eval: d_1942)
📅 Forecasting F02 (val: d_1915, eval: d_1943)
📅 Forecasting F03 (val: d_1916, eval: d_1944)
📅 Forecasting F04 (val: d_1917, eval: d_1945)
📅 Forecasting F05 (val: d_1918, eval: d_1946)
📅 Forecasting F06 (val: d_1919, eval: d_1947)
📅 Forecasting F07 (val: d_1920, eval: d_1948)
📅 Forecasting F08 (val: d_1921, eval: d_1949)
📅 Forecasting F09 (val: d_1922, eval: d_1950)
📅 Forecasting F10 (val: d_1923, eval: d_1951)
📅 Forecasting F11 (val: d_1924, eval: d_1952)
📅 Forecasting F12 (val: d_1925, eval: d_1953)
📅 Forecasting F13 (val: d_1926, eval: d_1954)
📅 Forecasting F14 (val: d_1927, eval: d_1955)
📅 Forecasting F15 (val: d_1928, eval: d_1956)
📅 Forecasting F16 (val: d_1929, eval: d_1957)
📅 Forecasting F17 (val: d_1930, eval: d_1958)
📅 Forecasting F18 (val: d_1931, eval: d_1959)
📅 Forecasting F19 (val: d_1932, eval: d_1960)
📅 Forecasting F20 (v

🛠 Building eval rows for CA_2: 100%|██████████| 3049/3049 [15:28<00:00,  3.28it/s]


✅ [Done] Eval rows for CA_2 built in 966.74 seconds.
🧼 Encoding categorical + scaling numeric features...
📅 Forecasting F01 (val: d_1914, eval: d_1942)
📅 Forecasting F02 (val: d_1915, eval: d_1943)
📅 Forecasting F03 (val: d_1916, eval: d_1944)
📅 Forecasting F04 (val: d_1917, eval: d_1945)
📅 Forecasting F05 (val: d_1918, eval: d_1946)
📅 Forecasting F06 (val: d_1919, eval: d_1947)
📅 Forecasting F07 (val: d_1920, eval: d_1948)
📅 Forecasting F08 (val: d_1921, eval: d_1949)
📅 Forecasting F09 (val: d_1922, eval: d_1950)
📅 Forecasting F10 (val: d_1923, eval: d_1951)
📅 Forecasting F11 (val: d_1924, eval: d_1952)
📅 Forecasting F12 (val: d_1925, eval: d_1953)
📅 Forecasting F13 (val: d_1926, eval: d_1954)
📅 Forecasting F14 (val: d_1927, eval: d_1955)
📅 Forecasting F15 (val: d_1928, eval: d_1956)
📅 Forecasting F16 (val: d_1929, eval: d_1957)
📅 Forecasting F17 (val: d_1930, eval: d_1958)
📅 Forecasting F18 (val: d_1931, eval: d_1959)
📅 Forecasting F19 (val: d_1932, eval: d_1960)
📅 Forecasting F20 (v

🛠 Building eval rows for WI_2: 100%|██████████| 3049/3049 [15:30<00:00,  3.28it/s]


✅ [Done] Eval rows for WI_2 built in 968.94 seconds.
🧼 Encoding categorical + scaling numeric features...
📅 Forecasting F01 (val: d_1914, eval: d_1942)
📅 Forecasting F02 (val: d_1915, eval: d_1943)
📅 Forecasting F03 (val: d_1916, eval: d_1944)
📅 Forecasting F04 (val: d_1917, eval: d_1945)
📅 Forecasting F05 (val: d_1918, eval: d_1946)
📅 Forecasting F06 (val: d_1919, eval: d_1947)
📅 Forecasting F07 (val: d_1920, eval: d_1948)
📅 Forecasting F08 (val: d_1921, eval: d_1949)
📅 Forecasting F09 (val: d_1922, eval: d_1950)
📅 Forecasting F10 (val: d_1923, eval: d_1951)
📅 Forecasting F11 (val: d_1924, eval: d_1952)
📅 Forecasting F12 (val: d_1925, eval: d_1953)
📅 Forecasting F13 (val: d_1926, eval: d_1954)
📅 Forecasting F14 (val: d_1927, eval: d_1955)
📅 Forecasting F15 (val: d_1928, eval: d_1956)
📅 Forecasting F16 (val: d_1929, eval: d_1957)
📅 Forecasting F17 (val: d_1930, eval: d_1958)
📅 Forecasting F18 (val: d_1931, eval: d_1959)
📅 Forecasting F19 (val: d_1932, eval: d_1960)
📅 Forecasting F20 (v

🛠 Building eval rows for CA_1: 100%|██████████| 3049/3049 [15:30<00:00,  3.28it/s]


✅ [Done] Eval rows for CA_1 built in 968.94 seconds.
🧼 Encoding categorical + scaling numeric features...
📅 Forecasting F01 (val: d_1914, eval: d_1942)
📅 Forecasting F02 (val: d_1915, eval: d_1943)
📅 Forecasting F03 (val: d_1916, eval: d_1944)
📅 Forecasting F04 (val: d_1917, eval: d_1945)
📅 Forecasting F05 (val: d_1918, eval: d_1946)
📅 Forecasting F06 (val: d_1919, eval: d_1947)
📅 Forecasting F07 (val: d_1920, eval: d_1948)
📅 Forecasting F08 (val: d_1921, eval: d_1949)
📅 Forecasting F09 (val: d_1922, eval: d_1950)
📅 Forecasting F10 (val: d_1923, eval: d_1951)
📅 Forecasting F11 (val: d_1924, eval: d_1952)
📅 Forecasting F12 (val: d_1925, eval: d_1953)
📅 Forecasting F13 (val: d_1926, eval: d_1954)
📅 Forecasting F14 (val: d_1927, eval: d_1955)
📅 Forecasting F15 (val: d_1928, eval: d_1956)
📅 Forecasting F16 (val: d_1929, eval: d_1957)
📅 Forecasting F17 (val: d_1930, eval: d_1958)
📅 Forecasting F18 (val: d_1931, eval: d_1959)
📅 Forecasting F19 (val: d_1932, eval: d_1960)
📅 Forecasting F20 (v

🛠 Building eval rows for CA_3: 100%|██████████| 3049/3049 [15:31<00:00,  3.27it/s]


✅ [Done] Eval rows for CA_3 built in 974.85 seconds.
🧼 Encoding categorical + scaling numeric features...
📅 Forecasting F01 (val: d_1914, eval: d_1942)
📅 Forecasting F02 (val: d_1915, eval: d_1943)
📅 Forecasting F03 (val: d_1916, eval: d_1944)
📅 Forecasting F04 (val: d_1917, eval: d_1945)
📅 Forecasting F05 (val: d_1918, eval: d_1946)
📅 Forecasting F06 (val: d_1919, eval: d_1947)
📅 Forecasting F07 (val: d_1920, eval: d_1948)
📅 Forecasting F08 (val: d_1921, eval: d_1949)
📅 Forecasting F09 (val: d_1922, eval: d_1950)
📅 Forecasting F10 (val: d_1923, eval: d_1951)
📅 Forecasting F11 (val: d_1924, eval: d_1952)
📅 Forecasting F12 (val: d_1925, eval: d_1953)
📅 Forecasting F13 (val: d_1926, eval: d_1954)
📅 Forecasting F14 (val: d_1927, eval: d_1955)
📅 Forecasting F15 (val: d_1928, eval: d_1956)
📅 Forecasting F16 (val: d_1929, eval: d_1957)
📅 Forecasting F17 (val: d_1930, eval: d_1958)
📅 Forecasting F18 (val: d_1931, eval: d_1959)
📅 Forecasting F19 (val: d_1932, eval: d_1960)
📅 Forecasting F20 (v

🛠 Building eval rows for TX_3: 100%|██████████| 3049/3049 [15:28<00:00,  3.28it/s]


✅ [Done] Eval rows for TX_3 built in 967.74 seconds.
🧼 Encoding categorical + scaling numeric features...
📅 Forecasting F01 (val: d_1914, eval: d_1942)
📅 Forecasting F02 (val: d_1915, eval: d_1943)
📅 Forecasting F03 (val: d_1916, eval: d_1944)
📅 Forecasting F04 (val: d_1917, eval: d_1945)
📅 Forecasting F05 (val: d_1918, eval: d_1946)
📅 Forecasting F06 (val: d_1919, eval: d_1947)
📅 Forecasting F07 (val: d_1920, eval: d_1948)
📅 Forecasting F08 (val: d_1921, eval: d_1949)
📅 Forecasting F09 (val: d_1922, eval: d_1950)
📅 Forecasting F10 (val: d_1923, eval: d_1951)
📅 Forecasting F11 (val: d_1924, eval: d_1952)
📅 Forecasting F12 (val: d_1925, eval: d_1953)
📅 Forecasting F13 (val: d_1926, eval: d_1954)
📅 Forecasting F14 (val: d_1927, eval: d_1955)
📅 Forecasting F15 (val: d_1928, eval: d_1956)
📅 Forecasting F16 (val: d_1929, eval: d_1957)
📅 Forecasting F17 (val: d_1930, eval: d_1958)
📅 Forecasting F18 (val: d_1931, eval: d_1959)
📅 Forecasting F19 (val: d_1932, eval: d_1960)
📅 Forecasting F20 (v

🛠 Building eval rows for CA_4: 100%|██████████| 3049/3049 [15:28<00:00,  3.28it/s]


✅ [Done] Eval rows for CA_4 built in 967.06 seconds.
🧼 Encoding categorical + scaling numeric features...
📅 Forecasting F01 (val: d_1914, eval: d_1942)
📅 Forecasting F02 (val: d_1915, eval: d_1943)
📅 Forecasting F03 (val: d_1916, eval: d_1944)
📅 Forecasting F04 (val: d_1917, eval: d_1945)
📅 Forecasting F05 (val: d_1918, eval: d_1946)
📅 Forecasting F06 (val: d_1919, eval: d_1947)
📅 Forecasting F07 (val: d_1920, eval: d_1948)
📅 Forecasting F08 (val: d_1921, eval: d_1949)
📅 Forecasting F09 (val: d_1922, eval: d_1950)
📅 Forecasting F10 (val: d_1923, eval: d_1951)
📅 Forecasting F11 (val: d_1924, eval: d_1952)
📅 Forecasting F12 (val: d_1925, eval: d_1953)
📅 Forecasting F13 (val: d_1926, eval: d_1954)
📅 Forecasting F14 (val: d_1927, eval: d_1955)
📅 Forecasting F15 (val: d_1928, eval: d_1956)
📅 Forecasting F16 (val: d_1929, eval: d_1957)
📅 Forecasting F17 (val: d_1930, eval: d_1958)
📅 Forecasting F18 (val: d_1931, eval: d_1959)
📅 Forecasting F19 (val: d_1932, eval: d_1960)
📅 Forecasting F20 (v

🛠 Building eval rows for TX_2: 100%|██████████| 3049/3049 [15:28<00:00,  3.28it/s]


✅ [Done] Eval rows for TX_2 built in 967.02 seconds.
🧼 Encoding categorical + scaling numeric features...
📅 Forecasting F01 (val: d_1914, eval: d_1942)
📅 Forecasting F02 (val: d_1915, eval: d_1943)
📅 Forecasting F03 (val: d_1916, eval: d_1944)
📅 Forecasting F04 (val: d_1917, eval: d_1945)
📅 Forecasting F05 (val: d_1918, eval: d_1946)
📅 Forecasting F06 (val: d_1919, eval: d_1947)
📅 Forecasting F07 (val: d_1920, eval: d_1948)
📅 Forecasting F08 (val: d_1921, eval: d_1949)
📅 Forecasting F09 (val: d_1922, eval: d_1950)
📅 Forecasting F10 (val: d_1923, eval: d_1951)
📅 Forecasting F11 (val: d_1924, eval: d_1952)
📅 Forecasting F12 (val: d_1925, eval: d_1953)
📅 Forecasting F13 (val: d_1926, eval: d_1954)
📅 Forecasting F14 (val: d_1927, eval: d_1955)
📅 Forecasting F15 (val: d_1928, eval: d_1956)
📅 Forecasting F16 (val: d_1929, eval: d_1957)
📅 Forecasting F17 (val: d_1930, eval: d_1958)
📅 Forecasting F18 (val: d_1931, eval: d_1959)
📅 Forecasting F19 (val: d_1932, eval: d_1960)
📅 Forecasting F20 (v

🛠 Building eval rows for TX_1: 100%|██████████| 3049/3049 [15:22<00:00,  3.31it/s]


✅ [Done] Eval rows for TX_1 built in 955.93 seconds.
🧼 Encoding categorical + scaling numeric features...
📅 Forecasting F01 (val: d_1914, eval: d_1942)
📅 Forecasting F02 (val: d_1915, eval: d_1943)
📅 Forecasting F03 (val: d_1916, eval: d_1944)
📅 Forecasting F04 (val: d_1917, eval: d_1945)
📅 Forecasting F05 (val: d_1918, eval: d_1946)
📅 Forecasting F06 (val: d_1919, eval: d_1947)
📅 Forecasting F07 (val: d_1920, eval: d_1948)
📅 Forecasting F08 (val: d_1921, eval: d_1949)
📅 Forecasting F09 (val: d_1922, eval: d_1950)
📅 Forecasting F10 (val: d_1923, eval: d_1951)
📅 Forecasting F11 (val: d_1924, eval: d_1952)
📅 Forecasting F12 (val: d_1925, eval: d_1953)
📅 Forecasting F13 (val: d_1926, eval: d_1954)
📅 Forecasting F14 (val: d_1927, eval: d_1955)
📅 Forecasting F15 (val: d_1928, eval: d_1956)
📅 Forecasting F16 (val: d_1929, eval: d_1957)
📅 Forecasting F17 (val: d_1930, eval: d_1958)
📅 Forecasting F18 (val: d_1931, eval: d_1959)
📅 Forecasting F19 (val: d_1932, eval: d_1960)
📅 Forecasting F20 (v

🛠 Building eval rows for WI_1: 100%|██████████| 3049/3049 [15:35<00:00,  3.26it/s]


✅ [Done] Eval rows for WI_1 built in 972.87 seconds.
🧼 Encoding categorical + scaling numeric features...
📅 Forecasting F01 (val: d_1914, eval: d_1942)
📅 Forecasting F02 (val: d_1915, eval: d_1943)
📅 Forecasting F03 (val: d_1916, eval: d_1944)
📅 Forecasting F04 (val: d_1917, eval: d_1945)
📅 Forecasting F05 (val: d_1918, eval: d_1946)
📅 Forecasting F06 (val: d_1919, eval: d_1947)
📅 Forecasting F07 (val: d_1920, eval: d_1948)
📅 Forecasting F08 (val: d_1921, eval: d_1949)
📅 Forecasting F09 (val: d_1922, eval: d_1950)
📅 Forecasting F10 (val: d_1923, eval: d_1951)
📅 Forecasting F11 (val: d_1924, eval: d_1952)
📅 Forecasting F12 (val: d_1925, eval: d_1953)
📅 Forecasting F13 (val: d_1926, eval: d_1954)
📅 Forecasting F14 (val: d_1927, eval: d_1955)
📅 Forecasting F15 (val: d_1928, eval: d_1956)
📅 Forecasting F16 (val: d_1929, eval: d_1957)
📅 Forecasting F17 (val: d_1930, eval: d_1958)
📅 Forecasting F18 (val: d_1931, eval: d_1959)
📅 Forecasting F19 (val: d_1932, eval: d_1960)
📅 Forecasting F20 (v