# Time series forecasting

## Score: .2154

In [7]:
import os
import gc
import warnings
# Export the detected CPU count (falling back to 4 when os.cpu_count() returns
# None) as LOKY_MAX_CPU_COUNT.
# NOTE(review): presumably consumed by joblib/loky-based parallel code — confirm.
os.environ['LOKY_MAX_CPU_COUNT'] = str(os.cpu_count() or 4)
# Suppress every Python warning from here on.
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error

In [8]:
# Locations of the parquet files read by the cells below.
TRAIN_PATH = 'ts-forecasting/train.parquet'
TEST_PATH = 'ts-forecasting/test.parquet'
# Time-based split point: rows with ts_index <= VAL_THRESHOLD are used for
# training, all later rows form the validation set.
VAL_THRESHOLD = 3500


def reduce_mem_usage(df):
    """Downcast the numeric columns of ``df`` in place to smaller dtypes.

    * Columns whose dtype name starts with ``int`` are cast to the smallest
      of int8 / int16 / int32 whose range contains both the column minimum
      and maximum (bounds inclusive). If none fits, the column is left as is.
    * Every other numeric column (floats, unsigned ints, bools, ...) is cast
      to float32.
    * Non-numeric columns (object, string, datetime, categorical, ...) are
      left untouched instead of being forced through ``min``/``astype``.

    Args:
        df: DataFrame to shrink; it is modified in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    for col in df.columns:
        col_type = df[col].dtype
        if not pd.api.types.is_numeric_dtype(col_type):
            # Nothing sensible to downcast; casting these to float32 would
            # either raise or destroy the data.
            continue
        if str(col_type)[:3] == 'int':
            c_min, c_max = df[col].min(), df[col].max()
            # Try the narrowest type first. An empty column yields NaN
            # bounds, every comparison is False and the dtype is kept.
            for candidate in (np.int8, np.int16, np.int32):
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            df[col] = df[col].astype(np.float32)
    return df


def weighted_rmse_score(y_target, y_pred, w):
    """Return sqrt(1 - clip(sum(w * (y - p)^2) / sum(w * y^2), 0, 1)).

    All three inputs are converted to float64 arrays first. A result of 1.0
    means the weighted squared error is zero; 0.0 means the error is at least
    as large as the weighted squared target.

    Args:
        y_target: true target values.
        y_pred: predicted values.
        w: per-sample weights.

    Returns:
        The score as a Python float, or 0.0 when the weighted squared target
        sum is zero or NaN.
    """
    truth = np.array(y_target, dtype=np.float64)
    guess = np.array(y_pred, dtype=np.float64)
    weights = np.array(w, dtype=np.float64)

    target_energy = np.sum(weights * truth ** 2)
    # A degenerate denominator would make the ratio meaningless.
    if np.isnan(target_energy) or target_energy == 0:
        return 0.0

    error_energy = np.sum(weights * (truth - guess) ** 2)
    explained = 1.0 - np.clip(error_energy / target_energy, 0.0, 1.0)
    return float(np.sqrt(explained))

In [9]:
print('Loading train (minimal columns)...')
# Only the categorical keys, the target and a few bookkeeping columns are
# loaded here; the wide feature columns are not needed to build encodings.
train_cols = ['code', 'sub_code', 'sub_category', 'horizon', 'ts_index', 'y_target', 'weight']
train_df = pd.read_parquet(TRAIN_PATH, columns=train_cols)
print(f'Train shape: {train_df.shape}')

# Both values are identical for every encoded column, so compute them once.
alpha = 10  # smoothing strength: pseudo-count of rows pulled towards the global mean
global_mean = train_df['y_target'].mean()

# For each categorical column this stores, under '<col>_<suffix>' keys:
#   _freq         category -> relative frequency in train
#   _le           LabelEncoder fitted on the string form of the column
#   _target_mean  category -> smoothed mean of y_target
#   _global_mean  fallback value for categories unseen in train
global_encodings = {}
for col in ['code', 'sub_code', 'sub_category']:
    freq_map = train_df[col].value_counts(normalize=True).to_dict()
    global_encodings[f'{col}_freq'] = freq_map

    le = LabelEncoder()
    le.fit(train_df[col].astype(str))
    global_encodings[f'{col}_le'] = le

    counts = train_df[col].value_counts()
    # Categories without a target sum contribute 0, matching a `.get(cat, 0)` lookup.
    target_sum = train_df.groupby(col)['y_target'].sum().reindex(counts.index, fill_value=0)
    # Smoothed mean: (sum + alpha * global_mean) / (n + alpha), vectorised over categories.
    target_mean = ((target_sum + alpha * global_mean) / (counts + alpha)).to_dict()
    global_encodings[f'{col}_target_mean'] = target_mean
    global_encodings[f'{col}_global_mean'] = global_mean

print('Encodings computed.')
# The slim training frame is no longer needed once the encodings exist.
del train_df
gc.collect()

Loading train (minimal columns)...
Train shape: (5337414, 7)
Encodings computed.


771

In [10]:
def create_features_minimal(df, global_encodings):
    """Build the model feature frame for one horizon slice.

    Works on a copy of ``df`` and adds:

    * ``<col>_freq``, ``<col>_encoded`` and ``<col>_target_mean`` for each of
      ``code``, ``sub_code`` and ``sub_category``. Categories that are missing
      from the encodings get 0.0, -1 and the stored global mean respectively.
    * ``horizon_numeric``, ``position`` (row rank inside its
      code/sub_code/sub_category/horizon group after sorting by ``ts_index``)
      and ``ts_mod_7``.
    * For each of the hard-coded top features that is present: lags 1/3/10,
      first difference, rolling mean/std over 5 and 10 rows, and an
      exponentially weighted mean (span 5), all computed within the group.
    * Two pairwise products when their inputs are present.

    Args:
        df: frame containing at least the three categorical columns,
            ``horizon`` and ``ts_index``.
        global_encodings: dict with ``<col>_freq``, ``<col>_le`` (an object
            exposing ``classes_``), ``<col>_target_mean`` and
            ``<col>_global_mean`` entries for each categorical column.

    Returns:
        A new DataFrame sorted by the group columns and ``ts_index`` (original
        index labels are kept), with every ``feature_*`` column as float32.
    """
    df = df.copy()
    group_cols = ['code', 'sub_code', 'sub_category', 'horizon']
    for col in ['code', 'sub_code', 'sub_category']:
        df[f'{col}_freq'] = df[col].map(global_encodings[f'{col}_freq']).fillna(0.0).astype(np.float32)

        # Label-encode through a single dict lookup per row instead of a
        # linear scan of classes_ plus an encoder call per row.
        le = global_encodings[f'{col}_le']
        class_to_code = {str(label): code for code, label in enumerate(le.classes_)}
        # int16 is kept whenever it can hold every code; widen only if needed.
        code_dtype = np.int16 if len(class_to_code) <= np.iinfo(np.int16).max else np.int32
        mapped = df[col].astype(str).map(class_to_code)
        # Unknown categories become -1. Assign positionally via to_numpy().
        df[f'{col}_encoded'] = mapped.fillna(-1).astype(code_dtype).to_numpy()

        gm = global_encodings[f'{col}_global_mean']
        df[f'{col}_target_mean'] = df[col].map(global_encodings[f'{col}_target_mean']).fillna(gm).astype(np.float32)

    df['horizon_numeric'] = df['horizon'].astype(np.int8)
    # Sorting first makes every group-wise shift/rolling below run in time order.
    df = df.sort_values(group_cols + ['ts_index'])
    df['position'] = df.groupby(group_cols).cumcount().astype(np.int16)
    df['ts_mod_7'] = (df['ts_index'] % 7).astype(np.int8)

    top_features = ['feature_al', 'feature_am', 'feature_cg', 'feature_by', 'feature_s']
    for col in top_features:
        if col not in df.columns:
            continue
        # One group-by per source column, reused for all derived features.
        grouped = df.groupby(group_cols)[col]
        for lag in [1, 3, 10]:
            df[f'{col}_lag{lag}'] = grouped.shift(lag).astype(np.float32)
        df[f'{col}_diff1'] = grouped.diff(1).astype(np.float32)
        for window in [5, 10]:
            df[f'{col}_roll{window}'] = grouped.transform(
                lambda x: x.rolling(window, min_periods=1).mean()
            ).astype(np.float32)
            # With a single observation the rolling std is NaN.
            df[f'{col}_rollstd{window}'] = grouped.transform(
                lambda x: x.rolling(window, min_periods=1).std()
            ).astype(np.float32)
        df[f'{col}_ewm5'] = grouped.transform(
            lambda x: x.ewm(span=5, adjust=False).mean()
        ).astype(np.float32)

    if 'feature_al' in df.columns and 'feature_am' in df.columns:
        df['al_x_am'] = (df['feature_al'] * df['feature_am']).astype(np.float32)
    if 'feature_by' in df.columns and 'feature_s' in df.columns:
        df['by_x_s'] = (df['feature_by'] * df['feature_s']).astype(np.float32)

    # Make sure every raw and derived feature_* column ends up as float32.
    for c in df.columns:
        if c.startswith('feature_') and df[c].dtype != np.float32:
            df[c] = df[c].astype(np.float32)
    return df


# LightGBM hyper-parameters keyed by forecast horizon. Tree depth and the
# minimum leaf size are shared; the number of rounds and leaves shrink and the
# learning rate grows as the horizon gets longer.
horizon_params = {
    1: dict(
        n_estimators=3000,
        learning_rate=0.015,
        max_depth=9,
        num_leaves=200,
        min_child_samples=5,
    ),
    3: dict(
        n_estimators=3000,
        learning_rate=0.015,
        max_depth=9,
        num_leaves=200,
        min_child_samples=5,
    ),
    10: dict(
        n_estimators=2500,
        learning_rate=0.02,
        max_depth=9,
        num_leaves=150,
        min_child_samples=5,
    ),
    25: dict(
        n_estimators=2000,
        learning_rate=0.025,
        max_depth=9,
        num_leaves=120,
        min_child_samples=5,
    ),
}

In [11]:
# Train one seed-averaged LightGBM ensemble per horizon and collect, for each
# horizon, the validation targets/predictions/weights and the test predictions.
all_val_results = []
all_test_predictions = []

for horizon in [1, 3, 10, 25]:
    print(f'Horizon {horizon}...')
    # The full files are re-read on every iteration and filtered down to the
    # current horizon; the slices are deleted again at the end of the loop body.
    train_h = pd.read_parquet(TRAIN_PATH)
    train_h = train_h[train_h['horizon'] == horizon].copy()
    train_h = reduce_mem_usage(train_h)
    test_h = pd.read_parquet(TEST_PATH)
    test_h = test_h[test_h['horizon'] == horizon].copy()
    test_h = reduce_mem_usage(test_h)

    # create_features_minimal returns a re-sorted copy, so everything taken
    # from train_h / test_h below shares that row order.
    train_h = create_features_minimal(train_h, global_encodings)
    test_h = create_features_minimal(test_h, global_encodings)
    # Identifiers, grouping keys, the sample weight and the target are not model inputs.
    exclude = ['id', 'code', 'sub_code', 'sub_category', 'horizon', 'ts_index', 'weight', 'y_target']
    feature_cols = [c for c in train_h.columns if c not in exclude]

    # Missing values (e.g. the first rows of lag/rolling features) become 0.
    for c in feature_cols:
        train_h[c] = train_h[c].fillna(0).astype(np.float32)
        test_h[c] = test_h[c].fillna(0).astype(np.float32)

    # Time-based split: everything up to VAL_THRESHOLD trains, the rest validates.
    train_mask = train_h['ts_index'] <= VAL_THRESHOLD
    X_train = train_h.loc[train_mask, feature_cols].values.astype(np.float32)
    y_train = train_h.loc[train_mask, 'y_target'].values.astype(np.float32)
    w_train = train_h.loc[train_mask, 'weight'].values.astype(np.float64)
    X_val = train_h.loc[~train_mask, feature_cols].values.astype(np.float32)
    y_val = train_h.loc[~train_mask, 'y_target'].values.astype(np.float32)
    w_val = train_h.loc[~train_mask, 'weight'].values.astype(np.float64)
    X_test = test_h[feature_cols].values.astype(np.float32)
    test_ids = test_h['id'].values

    params = horizon_params[horizon]
    val_preds_list = []
    test_preds_list = []
    # Same configuration fitted with three seeds; predictions are averaged below.
    for seed in [42, 420, 80085]:
        # NOTE(review): LightGBM only applies `subsample` when `subsample_freq`
        # is > 0 and none is passed here, so row bagging is likely inactive —
        # confirm against the LightGBM parameter docs.
        model = lgb.LGBMRegressor(
            n_estimators=params['n_estimators'], learning_rate=params['learning_rate'],
            max_depth=params['max_depth'], num_leaves=params['num_leaves'],
            min_child_samples=params['min_child_samples'], subsample=0.85, colsample_bytree=0.85,
            reg_alpha=0.1, reg_lambda=0.1, random_state=seed, verbose=-1, n_jobs=-1
        )
        # NOTE(review): early stopping monitors the same validation set that
        # is scored below, so the reported validation score may be optimistic.
        model.fit(X_train, y_train, sample_weight=w_train,
                  eval_set=[(X_val, y_val)], eval_sample_weight=[w_val],
                  callbacks=[lgb.early_stopping(300, verbose=False)])
        val_preds_list.append(model.predict(X_val))
        test_preds_list.append(model.predict(X_test))
        del model
        gc.collect()

    # Average the per-seed predictions element-wise.
    val_pred_avg = np.mean(val_preds_list, axis=0)
    test_pred_avg = np.mean(test_preds_list, axis=0)
    h_score = weighted_rmse_score(y_val, val_pred_avg, w_val)
    print(f'  Val score: {h_score:.6f}')
    all_val_results.append({'horizon': horizon, 'score': h_score, 'y_val': y_val, 'pred_val': val_pred_avg, 'w_val': w_val})
    all_test_predictions.append({'ids': test_ids, 'preds': test_pred_avg})
    # Drop the per-horizon frames and matrices before the next iteration loads its own.
    del train_h, test_h, X_train, y_train, w_train, X_val, y_val, w_val, X_test
    gc.collect()

Horizon 1...
  Val score: 0.045735
Horizon 3...
  Val score: 0.099050
Horizon 10...
  Val score: 0.173886
Horizon 25...
  Val score: 0.242699


In [12]:
# Pool the validation rows of all horizons and score them together, then
# repeat the per-horizon scores for reference.
all_y_val = np.concatenate([r['y_val'] for r in all_val_results])
all_pred_val = np.concatenate([r['pred_val'] for r in all_val_results])
all_w_val = np.concatenate([r['w_val'] for r in all_val_results])
overall_score = weighted_rmse_score(all_y_val, all_pred_val, all_w_val)
print(f'Overall validation score: {overall_score:.6f}')
for r in all_val_results:
    print(f"  H{r['horizon']}: {r['score']:.6f}")

# Stitch the per-horizon test predictions into one id -> prediction table.
all_ids = np.concatenate([p['ids'] for p in all_test_predictions])
all_preds = np.concatenate([p['preds'] for p in all_test_predictions])
submission = pd.DataFrame({'id': all_ids, 'prediction': all_preds})
# Only the id column is needed to restore the test file's row order, so the
# remaining columns are not loaded.
test_full = pd.read_parquet(TEST_PATH, columns=['id'])
submission = submission.set_index('id').reindex(test_full['id']).reset_index()
# Ids without a prediction (e.g. a horizon that was not modelled) fall back
# to the mean prediction.
submission['prediction'] = submission['prediction'].fillna(all_preds.mean())
submission.to_csv('submission.csv', index=False)
print(f'Submission shape: {submission.shape}')
print(submission.head(10))

Overall validation score: 0.200962
  H1: 0.045735
  H3: 0.099050
  H10: 0.173886
  H25: 0.242699
Submission shape: (1447107, 2)
                                       id  prediction
0   W2MW3G2L__495MGHFJ__PZ9S1Z4V__3__3647   -0.005317
1  W2MW3G2L__495MGHFJ__PZ9S1Z4V__10__3647   -0.039086
2  W2MW3G2L__495MGHFJ__PZ9S1Z4V__25__3647   -0.273217
3   W2MW3G2L__495MGHFJ__PZ9S1Z4V__1__3647   -0.007185
4  W2MW3G2L__495MGHFJ__PZ9S1Z4V__10__3648   -0.073280
5  W2MW3G2L__495MGHFJ__PZ9S1Z4V__25__3648   -0.261707
6   W2MW3G2L__495MGHFJ__PZ9S1Z4V__3__3648   -0.007679
7   W2MW3G2L__495MGHFJ__PZ9S1Z4V__1__3648   -0.007010
8   W2MW3G2L__495MGHFJ__PZ9S1Z4V__3__3649   -0.018594
9  W2MW3G2L__495MGHFJ__PZ9S1Z4V__25__3649   -0.248694
