# Time series forecasting

## Score: 0.2625

In [14]:
import os
import gc
import warnings
warnings.filterwarnings('ignore')
os.environ['LOKY_MAX_CPU_COUNT'] = str(os.cpu_count() or 4)

import numpy as np
import pandas as pd
import lightgbm as lgb

In [15]:
# Dataset locations: use the Kaggle input mount when it exists, otherwise a
# relative local copy of the same two parquet files.
if os.path.exists('/kaggle/input/ts-forecasting/train.parquet'):
    TRAIN_PATH, TEST_PATH = (
        '/kaggle/input/ts-forecasting/train.parquet',
        '/kaggle/input/ts-forecasting/test.parquet',
    )
else:
    TRAIN_PATH, TEST_PATH = (
        'ts-forecasting/train.parquet',
        'ts-forecasting/test.parquet',
    )

# Rows with ts_index <= VAL_THRESHOLD are used for fitting; later rows are held out.
VAL_THRESHOLD = 3500
# Submission row order: True sorts rows by id, False keeps the parquet row
# order. Switch to False if the leaderboard scores 0 with id order.
SUBMISSION_BY_ID_ORDER = True


def weighted_rmse_score(y_target, y_pred, w):
    """Competition metric; higher is better, range [0, 1].

    Computes ``sqrt(1 - clip(sum(w * (y - p)**2) / sum(w * y**2), 0, 1))``.
    Returns 0.0 when the weighted sum of squared targets is not positive.
    """
    truth = np.array(y_target)
    guess = np.array(y_pred)
    weights = np.array(w)

    # Weighted "energy" of the targets; the residual energy is scaled by it.
    target_energy = np.sum(weights * (truth ** 2))
    if target_energy <= 0:
        return 0.0

    residual_energy = np.sum(weights * ((truth - guess) ** 2))
    # Clipping keeps the value under the square root inside [0, 1].
    explained = 1.0 - np.clip(residual_energy / target_energy, 0.0, 1.0)
    return float(np.sqrt(explained))

In [16]:
# Target-encoding statistics. Only rows with ts_index <= VAL_THRESHOLD are
# used, so the hold-out period does not contribute to the encodings.
fit_rows = pd.read_parquet(
    TRAIN_PATH, columns=['sub_category', 'sub_code', 'y_target', 'ts_index']
)
fit_rows = fit_rows[fit_rows['ts_index'] <= VAL_THRESHOLD]

# Mean target per category value, plus the overall mean used as a fallback
# for values that have no entry in the per-category maps.
train_stats = {
    key: fit_rows.groupby(key)['y_target'].mean().to_dict()
    for key in ('sub_category', 'sub_code')
}
train_stats['global_mean'] = fit_rows['y_target'].mean()

del fit_rows
gc.collect()
print('Train stats computed.')

Train stats computed.


In [17]:
# Quick look at the target distribution over the fit period
# (ts_index <= VAL_THRESHOLD), per horizon and overall.
t = pd.read_parquet(TRAIN_PATH, columns=['horizon', 'y_target', 'weight', 'ts_index'])
t_fit = t.loc[t['ts_index'] <= VAL_THRESHOLD]
fit_targets = t_fit['y_target']

for hz in (1, 3, 10, 25):
    s = t_fit.loc[t_fit['horizon'] == hz, 'y_target']
    print(f"horizon={hz} n={len(s)} mean={s.mean():.4f} std={s.std():.4f}")

ts_lo = t_fit['ts_index'].min()
ts_hi = t_fit['ts_index'].max()
print(
    f"all fit: n={len(t_fit)} mean={fit_targets.mean():.4f} std={fit_targets.std():.4f} "
    f"ts_index=[{ts_lo},{ts_hi}]"
)

del t, t_fit, fit_targets, ts_lo, ts_hi
gc.collect()

horizon=1 n=1351193 mean=-0.0841 std=11.7029
horizon=3 n=1342793 mean=-0.2593 std=19.3691
horizon=10 n=1296269 mean=-0.8023 std=33.9953
horizon=25 n=1181897 mean=-1.7543 std=53.3072
all fit: n=5172152 mean=-0.6912 std=32.7503 ts_index=[1,3500]


0

In [18]:
def build_context_features(data, enc_stats=None):
    """Return a feature-augmented, time-sorted copy of ``data``.

    The input frame is not modified. The copy is sorted by
    ``code, sub_code, sub_category, horizon, ts_index`` (original index labels
    are kept), and the following columns are appended in this order:

    * ``sub_category_enc`` / ``sub_code_enc`` -- only when ``enc_stats`` is
      given: each value is mapped through ``enc_stats[<column>]``, and values
      without an entry fall back to ``enc_stats['global_mean']``.
    * ``d_al_am``, ``r_al_am``, ``d_cg_by`` -- difference / ratio interactions,
      each added only when both source columns are present.
    * For every column of ``feature_al, feature_am, feature_cg, feature_by,
      feature_s`` that is present, computed within each
      (code, sub_code, sub_category, horizon) group: lags 1/3/10, first
      difference, rolling mean and std over 5 and 10 rows
      (``min_periods=1``), and an exponentially weighted mean (``span=5``).
    * ``t_cycle`` -- ``sin(2*pi*ts_index/100)``.

    All added columns are float32.

    Args:
        data: DataFrame containing the four grouping columns and ``ts_index``.
        enc_stats: optional dict with keys ``'sub_category'``, ``'sub_code'``
            (value -> encoding maps) and ``'global_mean'`` (fallback value).

    Returns:
        The sorted, augmented copy.
    """
    x = data.copy()
    group_cols = ['code', 'sub_code', 'sub_category', 'horizon']
    # Sorting by ts_index inside each group makes shift/diff/rolling act in time order.
    x = x.sort_values(group_cols + ['ts_index'])

    if enc_stats is not None:
        for c in ['sub_category', 'sub_code']:
            x[c + '_enc'] = x[c].map(enc_stats[c]).fillna(enc_stats['global_mean']).astype(np.float32)

    # Interaction features
    if 'feature_al' in x.columns and 'feature_am' in x.columns:
        x['d_al_am'] = (x['feature_al'] - x['feature_am']).astype(np.float32)
        # The small offset avoids division by exactly zero.
        x['r_al_am'] = (x['feature_al'] / (x['feature_am'] + 1e-7)).astype(np.float32)
    if 'feature_cg' in x.columns and 'feature_by' in x.columns:
        x['d_cg_by'] = (x['feature_cg'] - x['feature_by']).astype(np.float32)

    top_features = ['feature_al', 'feature_am', 'feature_cg', 'feature_by', 'feature_s']
    for col in top_features:
        if col not in x.columns:
            continue

        # Build the grouping once per feature and reuse it for every derived
        # column, instead of re-factorizing the four key columns each time.
        grouped = x.groupby(group_cols)[col]

        for lag in [1, 3, 10]:
            x[f'{col}_lag{lag}'] = grouped.shift(lag).astype(np.float32)
        x[f'{col}_diff1'] = grouped.diff(1).astype(np.float32)
        for window in [5, 10]:
            # `window` is bound as a default so each lambda carries its own value.
            x[f'{col}_roll{window}'] = grouped.transform(
                lambda s, w=window: s.rolling(w, min_periods=1).mean()
            ).astype(np.float32)
            x[f'{col}_rollstd{window}'] = grouped.transform(
                lambda s, w=window: s.rolling(w, min_periods=1).std()
            ).astype(np.float32)
        x[f'{col}_ewm5'] = grouped.transform(
            lambda s: s.ewm(span=5, adjust=False).mean()
        ).astype(np.float32)

    # Fixed-period (100 steps) sinusoid of the time index.
    x['t_cycle'] = np.sin(2 * np.pi * x['ts_index'].astype(np.float32) / 100).astype(np.float32)
    return x


# Base LightGBM hyper-parameters; individual horizons may override some of
# these before the regressor is constructed.
LGB_CFG = {
    'objective': 'regression',
    'metric': 'rmse',
    'learning_rate': 0.015,
    'n_estimators': 6000,
    'num_leaves': 128,
    'min_child_samples': 120,
    'feature_fraction': 0.75,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'lambda_l1': 0.1,
    'lambda_l2': 10.0,
    'verbosity': -1,
}
# One model is trained per seed and the predictions are averaged.
SEEDS = (42, 2024, 7, 11, 999, 123, 456)
# Derived from SEEDS so the averaging divisor cannot drift from the number of
# models that are actually trained.
N_SEEDS = len(SEEDS)


In [19]:
# Train one seed-averaged LightGBM ensemble per forecast horizon.
#
# For each horizon: build features for the train and test rows of that
# horizon, fit on rows with ts_index <= VAL_THRESHOLD, evaluate on the later
# rows, and predict the test rows with the same models.
forecast_windows = [1, 3, 10, 25]
test_outputs = []                            # one DataFrame(id, prediction) per horizon
cv_cache = {'y': [], 'pred': [], 'wt': []}   # pooled hold-out targets / predictions / weights

for hz in forecast_windows:
    print(f'\n>>> Training horizon = {hz}')
    tr_df = build_context_features(
        pd.read_parquet(TRAIN_PATH).query(f'horizon == {hz}'),
        train_stats
    )
    te_df = build_context_features(
        pd.read_parquet(TEST_PATH).query(f'horizon == {hz}'),
        train_stats
    )

    # Everything except identifiers, grouping keys, the weight and the target is a feature.
    feature_cols = [
        c for c in tr_df.columns
        if c not in {'id', 'code', 'sub_code', 'sub_category', 'horizon', 'ts_index', 'weight', 'y_target'}
    ]
    for c in feature_cols:
        tr_df[c] = tr_df[c].fillna(0).astype(np.float32)
        te_df[c] = te_df[c].fillna(0).astype(np.float32)

    fit_mask = tr_df['ts_index'] <= VAL_THRESHOLD
    val_mask = ~fit_mask

    X_fit = tr_df.loc[fit_mask, feature_cols]
    y_fit = tr_df.loc[fit_mask, 'y_target'].values.astype(np.float64)
    w_fit = tr_df.loc[fit_mask, 'weight'].values.astype(np.float64)
    fit_ts = tr_df.loc[fit_mask, 'ts_index'].values.astype(np.float64)
    # Recency ramp: the weight multiplier grows linearly with ts_index and
    # reaches 1.0 at the latest fit step.
    scale = 0.5 + 0.5 * (fit_ts / fit_ts.max())
    w_fit_eff = w_fit * scale
    # Clip the training target to its 0.1 / 99.9 percentiles; the hold-out
    # target below is left unclipped.
    q_low, q_high = np.percentile(y_fit, [0.1, 99.9])
    y_fit_eff = np.clip(y_fit, q_low, q_high)
    X_hold = tr_df.loc[val_mask, feature_cols]
    y_hold = tr_df.loc[val_mask, 'y_target']
    w_hold = tr_df.loc[val_mask, 'weight']

    val_pred = np.zeros(len(y_hold), dtype=np.float64)
    tst_pred = np.zeros(len(te_df), dtype=np.float64)

    cfg = dict(LGB_CFG)
    if hz in (10, 25):
        # Overrides applied to the two longest horizons only.
        cfg = {**cfg, 'learning_rate': 0.012, 'n_estimators': 7500, 'num_leaves': 160,
               'min_child_samples': 100, 'lambda_l2': 15.0}

    # Average over exactly the models trained in the loop below.
    n_models = len(SEEDS)
    for seed in SEEDS:
        mdl = lgb.LGBMRegressor(**cfg, random_state=seed, n_jobs=-1)
        # NOTE(review): early stopping monitors the same hold-out rows that are
        # scored below, so the printed scores are optimistic estimates.
        mdl.fit(
            X_fit, y_fit_eff,
            sample_weight=w_fit_eff,
            eval_set=[(X_hold, y_hold)],
            eval_sample_weight=[w_hold],
            callbacks=[lgb.early_stopping(300, verbose=False)]
        )
        val_pred += mdl.predict(X_hold).astype(np.float64) / n_models
        tst_pred += mdl.predict(te_df[feature_cols]).astype(np.float64) / n_models
        del mdl
        gc.collect()

    print(f'Horizon {hz} Score: {weighted_rmse_score(y_hold, val_pred, w_hold):.5f}')

    cv_cache['y'].extend(y_hold.tolist())
    cv_cache['pred'].extend(val_pred.tolist())
    cv_cache['wt'].extend(w_hold.tolist())

    test_outputs.append(pd.DataFrame({'id': te_df['id'].values, 'prediction': tst_pred}))

    # Release every large per-horizon object before the next horizon is loaded.
    del tr_df, te_df, X_fit, y_fit, w_fit, X_hold, y_hold, w_hold
    del y_fit_eff, w_fit_eff, fit_ts, scale, fit_mask, val_mask, val_pred, tst_pred
    gc.collect()


>>> Training horizon = 1
Horizon 1 Score: 0.08931

>>> Training horizon = 3
Horizon 3 Score: 0.12833

>>> Training horizon = 10
Horizon 10 Score: 0.22463

>>> Training horizon = 25
Horizon 25 Score: 0.24478


In [20]:
# Pooled hold-out score over all horizons.
final_metric = weighted_rmse_score(cv_cache['y'], cv_cache['pred'], cv_cache['wt'])
print(f"\n{'='*40}\nFINAL AGGREGATE SCORE: {final_metric:.6f}\n{'='*40}")

# Submission: write in deterministic ID order + UTF-8 so grader gets same format everywhere
test_full = pd.read_parquet(TEST_PATH)

# id (as stripped string) -> prediction, collected from every horizon's output.
pred_dict = {}
for df in test_outputs:
    pred_dict.update(zip(df['id'].astype(str).str.strip(), df['prediction'].tolist()))

# Every test row must have exactly one prediction.
test_ids = test_full['id'].astype(str).str.strip()
missing = test_ids[~test_ids.isin(pred_dict)].unique()
if len(missing) > 0:
    raise RuntimeError(f"{len(missing)} test ids not in pred_dict (e.g. {list(missing[:3])})")
if len(pred_dict) != len(test_full):
    raise RuntimeError(f"pred_dict has {len(pred_dict)} keys but test has {len(test_full)} rows")

# Deterministic row order. Id order works if grader merges by id or test is id-sorted.
if SUBMISSION_BY_ID_ORDER:
    test_out = test_full.sort_values(by='id', key=lambda col: col.astype(str)).reset_index(drop=True)
else:
    test_out = test_full.reset_index(drop=True)

out_keys = test_out['id'].astype(str).str.strip()
pred_vals = np.array([pred_dict[k] for k in out_keys])

# Sanity checks that catch a degenerate model before anything is written.
if np.std(pred_vals) < 1e-6:
    raise RuntimeError(f"Predictions nearly constant (std={np.std(pred_vals):.2e}). Fix training before submitting.")
if np.all(np.abs(pred_vals) < 1e-6):
    raise RuntimeError("Predictions all ~0. Fix training before submitting.")
# Checked before the file is opened so a failure never leaves a truncated CSV.
nan_mask = np.isnan(pred_vals)
if nan_mask.any():
    bad_id = test_out['id'].iloc[int(np.argmax(nan_mask))]
    raise RuntimeError(f"NaN prediction for id {bad_id}")

# newline='' keeps the '\n' terminators exactly as written on every platform.
with open('submission.csv', 'w', newline='', encoding='utf-8') as f:
    f.write('id,prediction\n')
    f.writelines(
        f"{id_val},{pred:.10f}\n"
        for id_val, pred in zip(test_out['id'].tolist(), pred_vals.tolist())
    )

# Read the file back as a final check of what was written.
submission = pd.read_csv('submission.csv', encoding='utf-8')
order_label = 'id order' if SUBMISSION_BY_ID_ORDER else 'parquet row order'
print(f'Submission shape: {submission.shape} (rows in {order_label})')
print('First 10 rows:')
print(submission.head(10))


FINAL AGGREGATE SCORE: 0.219344
Submission shape: (1447107, 2) (rows in id order)
First 10 rows:
                                       id  prediction
0  10BAVIDU__07YQ9WA4__DPPUO5X2__10__4175   -0.335566
1  10BAVIDU__07YQ9WA4__DPPUO5X2__10__4176   -0.418452
2  10BAVIDU__07YQ9WA4__DPPUO5X2__10__4177   -0.433591
3  10BAVIDU__07YQ9WA4__DPPUO5X2__10__4178   -0.505239
4  10BAVIDU__07YQ9WA4__DPPUO5X2__10__4179   -0.532263
5  10BAVIDU__07YQ9WA4__DPPUO5X2__10__4180   -0.546320
6  10BAVIDU__07YQ9WA4__DPPUO5X2__10__4182   -0.602292
7  10BAVIDU__07YQ9WA4__DPPUO5X2__10__4183   -0.574202
8  10BAVIDU__07YQ9WA4__DPPUO5X2__10__4184   -0.585841
9  10BAVIDU__07YQ9WA4__DPPUO5X2__10__4185   -0.629587
