# 00 - Data cleaning and fold generation

This notebook ingests raw Kaggle data from `data/01_raw`, applies light cleaning, and materializes fold definitions for all downstream model notebooks.

In [None]:
from pathlib import Path

import pandas as pd

from utils import (
    TimeSeriesPreprocessor,
    create_chronological_folds,
    default_catalog,
    save_folds_to_disk,
    save_table,
)

# The catalog is looked up by dataset name; every location used in this
# notebook is read from it rather than hard-coded in individual cells.
catalog = default_catalog()
train_path, test_path = catalog["train_raw"], catalog["test_raw"]

# One shared preprocessor instance handles both loading and feature creation.
preprocessor = TimeSeriesPreprocessor()

## Load raw data

A Kedro-style catalog keeps all file paths centralized.

In [None]:
# Handle the missing-file case first: later cells test `train_df is None`
# to decide whether to skip their work.
if not train_path.exists():
    train_df, test_df = None, None
    print(f"No raw data found at {train_path}. Add files to data/01_raw.")
else:
    train_df, test_df = preprocessor.load_data(train_path, test_path)
    # Quick look at the first rows of the loaded training frame.
    print(train_df.head())

## Clean and engineer features

Use the preprocessor to generate lag and rolling-window features, then forward-fill missing values and drop any rows that are still incomplete.

In [None]:
if train_df is not None:
    # Derive features from the target column using the given lag offsets
    # and rolling-window sizes.
    base_features = preprocessor.create_all_features(
        train_df,
        target_col='target',
        lags=[1, 2, 3, 5, 7, 14],
        windows=[7, 14, 30],
    )
    # DataFrame.ffill() is the supported spelling of forward-fill;
    # fillna(method='ffill') is deprecated since pandas 2.1 and emits a
    # FutureWarning. Rows that still contain NaN after forward-filling
    # (no earlier value to fill from) are dropped.
    clean_df = base_features.ffill().dropna()
    save_table(clean_df, catalog['clean_train'])
    print(f"Saved cleaned training data -> {catalog['clean_train']}")
else:
    print('Skipping feature generation because training data is missing.')

## Chronological folds

Create ten splits to be reused across all model notebooks.

In [None]:
# `clean_df` only exists when the training data was loaded, so the guard is
# on `train_df`, matching the feature-generation cell.
if train_df is None:
    print('Cannot create folds without cleaned data.')
else:
    folds_dir = catalog['folds_dir']
    # Ten splits built from the cleaned frame, using its 'date' column.
    folds = create_chronological_folds(clean_df, n_splits=10, date_col='date')
    save_folds_to_disk(folds, folds_dir)