# Time series forecasting

## Score: 0.2618

In [1]:
import os
import gc
import warnings
warnings.filterwarnings('ignore')
os.environ['LOKY_MAX_CPU_COUNT'] = str(os.cpu_count() or 4)

import numpy as np
import pandas as pd
import lightgbm as lgb

In [2]:
# Dataset location: use the Kaggle input mount when its train file exists,
# otherwise fall back to a relative directory.
_ON_KAGGLE = os.path.exists('/kaggle/input/ts-forecasting/train.parquet')
TRAIN_PATH, TEST_PATH = (
    (
        '/kaggle/input/ts-forecasting/train.parquet',
        '/kaggle/input/ts-forecasting/test.parquet',
    )
    if _ON_KAGGLE
    else (
        'ts-forecasting/train.parquet',
        'ts-forecasting/test.parquet',
    )
)

# Rows with ts_index at or below this value are used for fitting; rows above
# it form the hold-out set.
VAL_THRESHOLD = 3500


def weighted_rmse_score(y_target, y_pred, w):
    """Return the weighted skill score used as the competition metric.

    The score is ``sqrt(1 - clip(R, 0, 1))`` where
    ``R = sum(w * (y_target - y_pred)**2) / sum(w * y_target**2)``.
    Higher is better: 1.0 means a perfect fit, and 0.0 means the weighted
    squared error is at least as large as the weighted squared targets.

    Args:
        y_target: Array-like of true values.
        y_pred: Array-like of predictions, same length as ``y_target``.
        w: Array-like of per-row weights, same length as ``y_target``.

    Returns:
        The score as a Python ``float``; 0.0 when the weighted sum of squared
        targets is not positive.
    """
    targets = np.array(y_target)
    preds = np.array(y_pred)
    weights = np.array(w)

    # Normaliser: weighted energy of the targets themselves.
    baseline = np.sum(weights * (targets ** 2))
    if baseline <= 0:
        return 0.0

    residual = np.sum(weights * ((targets - preds) ** 2))
    clipped = np.clip(residual / baseline, 0.0, 1.0)
    return float(np.sqrt(1.0 - clipped))

In [3]:
# Target-encoding statistics, computed only from rows with
# ts_index <= VAL_THRESHOLD so the hold-out period does not leak into them.
_stats_frame = pd.read_parquet(
    TRAIN_PATH,
    columns=['sub_category', 'sub_code', 'y_target', 'ts_index'],
)
_fit_rows = _stats_frame[_stats_frame['ts_index'] <= VAL_THRESHOLD]

# Per-category mean target for each encoded column, plus the overall mean
# that serves as the fallback for categories without a stored mean.
train_stats = {
    key: _fit_rows.groupby(key)['y_target'].mean().to_dict()
    for key in ('sub_category', 'sub_code')
}
train_stats['global_mean'] = _fit_rows['y_target'].mean()

# Release the temporary frames before the heavier feature building starts.
del _stats_frame, _fit_rows
gc.collect()
print('Train stats computed.')

Train stats computed.


In [4]:
def build_context_features(data, enc_stats=None):
    """Build context-aware model features for a frame of panel data.

    The input is copied and sorted by
    ``['code', 'sub_code', 'sub_category', 'horizon', 'ts_index']`` so that
    all per-series operations see rows in time order. The returned frame keeps
    the original index labels but is in sorted row order.

    Added columns:
      * ``sub_category_enc`` / ``sub_code_enc`` (only when ``enc_stats`` is
        given): category -> mean-target mapping, with unmapped categories
        filled by ``enc_stats['global_mean']``.
      * ``d_al_am``, ``r_al_am``, ``d_cg_by``: difference / ratio interactions,
        added only when both source columns are present.
      * For each present column of ``feature_al``, ``feature_am``,
        ``feature_cg``, ``feature_by``, ``feature_s``: per-series lags
        (1, 3, 10), first difference, rolling mean and std (windows 5, 10,
        ``min_periods=1``) and an exponentially weighted mean (span 5).
      * ``t_cycle``: sine of ``ts_index`` with a period of 100.

    All added columns are ``float32``. Lags, differences and the first
    rolling std of each series are NaN where there is not enough history.

    Args:
        data: DataFrame containing the grouping columns and ``ts_index``.
            It is not modified.
        enc_stats: Optional dict with keys ``'sub_category'``, ``'sub_code'``
            (category -> value mappings) and ``'global_mean'``.

    Returns:
        A new DataFrame with the original columns plus the features above.
    """
    x = data.copy()
    group_cols = ['code', 'sub_code', 'sub_category', 'horizon']
    x = x.sort_values(group_cols + ['ts_index'])

    if enc_stats is not None:
        for c in ['sub_category', 'sub_code']:
            x[c + '_enc'] = x[c].map(enc_stats[c]).fillna(enc_stats['global_mean']).astype(np.float32)

    # Interaction features
    if 'feature_al' in x.columns and 'feature_am' in x.columns:
        x['d_al_am'] = (x['feature_al'] - x['feature_am']).astype(np.float32)
        # The small epsilon keeps the ratio finite when feature_am is zero.
        x['r_al_am'] = (x['feature_al'] / (x['feature_am'] + 1e-7)).astype(np.float32)
    if 'feature_cg' in x.columns and 'feature_by' in x.columns:
        x['d_cg_by'] = (x['feature_cg'] - x['feature_by']).astype(np.float32)

    top_features = ['feature_al', 'feature_am', 'feature_cg', 'feature_by', 'feature_s']
    for col in top_features:
        if col not in x.columns:
            continue

        # Build the per-series grouping once per column and reuse it for every
        # derived feature, instead of re-factorizing the same four keys for
        # each of the nine features.
        grouped = x.groupby(group_cols)[col]

        for lag in [1, 3, 10]:
            x[f'{col}_lag{lag}'] = grouped.shift(lag).astype(np.float32)
        x[f'{col}_diff1'] = grouped.diff(1).astype(np.float32)
        for window in [5, 10]:
            x[f'{col}_roll{window}'] = grouped.transform(
                lambda s: s.rolling(window, min_periods=1).mean()
            ).astype(np.float32)
            x[f'{col}_rollstd{window}'] = grouped.transform(
                lambda s: s.rolling(window, min_periods=1).std()
            ).astype(np.float32)
        x[f'{col}_ewm5'] = grouped.transform(
            lambda s: s.ewm(span=5, adjust=False).mean()
        ).astype(np.float32)

    x['t_cycle'] = np.sin(2 * np.pi * x['ts_index'].astype(np.float32) / 100).astype(np.float32)
    return x


# LightGBM hyper-parameters shared by every horizon/seed model.
# n_estimators is an upper bound; training uses early stopping.
LGB_CFG = {
    'objective': 'regression',
    'metric': 'rmse',
    'learning_rate': 0.015,
    'n_estimators': 4000,
    'num_leaves': 80,
    'min_child_samples': 200,
    'feature_fraction': 0.6,
    'bagging_fraction': 0.7,
    'bagging_freq': 5,
    'lambda_l1': 0.1,
    'lambda_l2': 10.0,
    'verbosity': -1,
}

# One model is trained per seed and the predictions are averaged.
SEEDS = (42, 2024, 7, 11, 999)
# Derived from SEEDS so the averaging divisor cannot drift out of sync when
# seeds are added or removed.
N_SEEDS = len(SEEDS)

In [5]:
# Train one seed-averaged LightGBM ensemble per forecast horizon, score it on
# the hold-out period (ts_index > VAL_THRESHOLD) and predict the test rows.
forecast_windows = [1, 3, 10, 25]
test_outputs = []  # one DataFrame of (id, prediction) per horizon
cv_cache = {'y': [], 'pred': [], 'wt': []}  # pooled hold-out targets / predictions / weights

# Average over the number of models actually trained in the seed loop, so the
# ensemble mean stays correct if SEEDS is edited.
n_models = len(SEEDS)

for hz in forecast_windows:
    print(f'\n>>> Training horizon = {hz}')
    tr_df = build_context_features(
        pd.read_parquet(TRAIN_PATH).query(f'horizon == {hz}'),
        train_stats
    )
    te_df = build_context_features(
        pd.read_parquet(TEST_PATH).query(f'horizon == {hz}'),
        train_stats
    )

    # Everything except identifiers, grouping keys, weight and target is a feature.
    feature_cols = [
        c for c in tr_df.columns
        if c not in {'id', 'code', 'sub_code', 'sub_category', 'horizon', 'ts_index', 'weight', 'y_target'}
    ]
    # Missing values (including the NaNs left by lag/diff features) become 0.
    for c in feature_cols:
        tr_df[c] = tr_df[c].fillna(0).astype(np.float32)
        te_df[c] = te_df[c].fillna(0).astype(np.float32)

    # Time-based split: fit on the early period, hold out the later one.
    fit_mask = tr_df['ts_index'] <= VAL_THRESHOLD
    val_mask = ~fit_mask

    X_fit = tr_df.loc[fit_mask, feature_cols]
    y_fit = tr_df.loc[fit_mask, 'y_target']
    w_fit = tr_df.loc[fit_mask, 'weight']

    X_hold = tr_df.loc[val_mask, feature_cols]
    y_hold = tr_df.loc[val_mask, 'y_target']
    w_hold = tr_df.loc[val_mask, 'weight']

    val_pred = np.zeros(len(y_hold), dtype=np.float64)
    tst_pred = np.zeros(len(te_df), dtype=np.float64)

    for seed in SEEDS:
        mdl = lgb.LGBMRegressor(**LGB_CFG, random_state=seed, n_jobs=-1)
        # NOTE: early stopping monitors the same hold-out rows that are scored
        # below, so the printed hold-out scores are likely optimistic.
        mdl.fit(
            X_fit, y_fit,
            sample_weight=w_fit,
            eval_set=[(X_hold, y_hold)],
            eval_sample_weight=[w_hold],
            callbacks=[lgb.early_stopping(200, verbose=False)]
        )
        val_pred += mdl.predict(X_hold).astype(np.float64) / n_models
        tst_pred += mdl.predict(te_df[feature_cols]).astype(np.float64) / n_models
        del mdl
        gc.collect()

    print(f'Horizon {hz} Score: {weighted_rmse_score(y_hold, val_pred, w_hold):.5f}')

    cv_cache['y'].extend(y_hold.tolist())
    cv_cache['pred'].extend(val_pred.tolist())
    cv_cache['wt'].extend(w_hold.tolist())

    # te_df was re-sorted by the feature builder; pairing ids with predictions
    # here keeps them aligned regardless of row order.
    test_outputs.append(pd.DataFrame({'id': te_df['id'].values, 'prediction': tst_pred}))

    del tr_df, te_df, X_fit, y_fit, w_fit, X_hold, y_hold, w_hold
    gc.collect()


>>> Training horizon = 1
Horizon 1 Score: 0.06690

>>> Training horizon = 3
Horizon 3 Score: 0.12382

>>> Training horizon = 10
Horizon 10 Score: 0.21332

>>> Training horizon = 25
Horizon 25 Score: 0.25960


In [6]:
# Pooled hold-out score across all horizons.
final_metric = weighted_rmse_score(cv_cache['y'], cv_cache['pred'], cv_cache['wt'])
print(f"\n{'='*40}\nFINAL AGGREGATE SCORE: {final_metric:.6f}\n{'='*40}")

# Submission: MUST be in exact test.parquet row order (row i = test row i)
test_full = pd.read_parquet(TEST_PATH)

# id -> prediction lookup, built column-wise instead of one .iloc call per row.
# Ids are normalised (str + strip) the same way on both sides of the lookup.
pred_dict = {}
for df in test_outputs:
    keys = df['id'].astype(str).str.strip().tolist()
    pred_dict.update(zip(keys, df['prediction'].astype(float).tolist()))

# Normalize test ids the same way for lookup
test_ids = test_full['id'].astype(str).str.strip()
missing = test_ids[~test_ids.isin(pred_dict)].unique()
if len(missing) > 0:
    raise RuntimeError(f"{len(missing)} test ids not in pred_dict (e.g. {list(missing[:3])})")
# Equal sizes (given no missing ids) also rules out duplicate ids.
if len(pred_dict) != len(test_full):
    raise RuntimeError(f"pred_dict has {len(pred_dict)} keys but test has {len(test_full)} rows")

# Predictions arranged in test row order; the raw (un-normalised) ids are what
# gets written out.
raw_ids = test_full['id'].tolist()
ordered_preds = [pred_dict[key] for key in test_ids.tolist()]

# Validate before opening the file so a NaN never leaves a truncated CSV behind.
nan_mask = np.isnan(np.asarray(ordered_preds, dtype=np.float64))
if nan_mask.any():
    raise RuntimeError(f"NaN prediction for id {raw_ids[int(np.argmax(nan_mask))]}")

# Write rows in test order (no reorder). UTF-8 matches pd.read_csv's default
# encoding used to read the file back below.
with open('submission.csv', 'w', newline='', encoding='utf-8') as f:
    f.write('id,prediction\n')
    f.writelines(
        f"{id_val},{pred:.10f}\n" for id_val, pred in zip(raw_ids, ordered_preds)
    )

submission = pd.read_csv('submission.csv')
print(f'Submission shape: {submission.shape}')
print('First 10 rows (must match test row order):')
print(submission.head(10))


FINAL AGGREGATE SCORE: 0.223828
Submission shape: (1447107, 2)
First 10 rows (must match test row order):
                                       id  prediction
0   W2MW3G2L__495MGHFJ__PZ9S1Z4V__3__3647   -0.053746
1  W2MW3G2L__495MGHFJ__PZ9S1Z4V__10__3647   -0.103430
2  W2MW3G2L__495MGHFJ__PZ9S1Z4V__25__3647   -0.362948
3   W2MW3G2L__495MGHFJ__PZ9S1Z4V__1__3647   -0.008905
4  W2MW3G2L__495MGHFJ__PZ9S1Z4V__10__3648   -0.112227
5  W2MW3G2L__495MGHFJ__PZ9S1Z4V__25__3648   -0.454240
6   W2MW3G2L__495MGHFJ__PZ9S1Z4V__3__3648   -0.044495
7   W2MW3G2L__495MGHFJ__PZ9S1Z4V__1__3648   -0.011438
8   W2MW3G2L__495MGHFJ__PZ9S1Z4V__3__3649   -0.044476
9  W2MW3G2L__495MGHFJ__PZ9S1Z4V__25__3649   -0.434736
