In [1]:
# ---------------------------------------------
# Prepare time series data for PyTorch models. 
# ---------------------------------------------
import sys
import os
sys.path.append(os.path.abspath('../scripts'))

from functions import time_split, scale_series, create_windows, to_tensor, load_data
from imports import *
from config import *

# ---------------------------------------------
# Load data
# ---------------------------------------------
# `load_data` is imported from the local `functions` module above.
# DATA_PATH is not defined in this notebook; presumably it is supplied by one
# of the star-imports (`imports` / `config`) — TODO confirm.
# The result is passed to preprocess_data() below, which annotates it as a
# pandas DataFrame.
df = load_data(DATA_PATH)

# -------------------------------
# Main Processing Pipeline
# -------------------------------

def preprocess_data(df: pd.DataFrame, WINDOW_SIZE: int, STEP_AHEAD: int) -> tuple[dict[str, tuple[torch.Tensor, torch.Tensor]], object]:
    """
    Complete preprocessing pipeline:
    Splits data → Scales → Windows → Converts to PyTorch Tensors

    Args:
        df: Full dataset; handed to ``time_split`` unchanged.
        WINDOW_SIZE: Forwarded to ``create_windows`` as its second argument
            (presumably the look-back length of each input window — confirm
            against ``functions.create_windows``).
        STEP_AHEAD: Forwarded to ``create_windows`` as its third argument
            (presumably the forecast horizon — confirm against
            ``functions.create_windows``).

    Returns:
        A ``(splits, scaler)`` pair:

        * ``splits`` maps ``'train'``, ``'valid'`` and ``'test'`` (in that
          order) to the ``(X, y)`` pair produced by ``to_tensor``.
        * ``scaler`` is the fourth value returned by ``scale_series``
          (presumably the fitted scaler object — verify in ``functions``).

    Note:
        ``TRAIN_MONTHS``, ``VALID_MONTHS`` and ``TARGET_COL`` are read from
        module-level globals rather than passed in as arguments.
    """
    train, valid, test = time_split(df, TRAIN_MONTHS, VALID_MONTHS)
    train_scaled, valid_scaled, test_scaled, scaler = scale_series(train, valid, test, TARGET_COL)

    # One table drives both remaining stages, so the three splits cannot drift
    # apart through copy-paste edits. Insertion order fixes the output order.
    scaled_splits = {
        'train': train_scaled,
        'valid': valid_scaled,
        'test': test_scaled,
    }

    # Window every split first, then convert: helpers are called in the same
    # sequence as the original hand-unrolled code.
    windowed = {
        name: create_windows(series, WINDOW_SIZE, STEP_AHEAD)
        for name, series in scaled_splits.items()
    }

    tensors = {}
    for name, (X, y) in windowed.items():
        X_t, y_t = to_tensor(X, y)
        tensors[name] = (X_t, y_t)

    return tensors, scaler
    
# Run the full pipeline on the loaded frame; the scaler comes back alongside
# the per-split tensors.
processed_dict, scaler = preprocess_data(df, WINDOW_SIZE, STEP_AHEAD)

# Sanity check: report the tensor shapes of every split
for split in processed_dict:
    X, y = processed_dict[split]
    print(f"{split.upper()} -> X: {X.shape}, y: {y.shape}")

TRAIN -> X: torch.Size([39294, 18, 1]), y: torch.Size([39294, 1])
VALID -> X: torch.Size([8766, 18, 1]), y: torch.Size([8766, 1])
TEST -> X: torch.Size([4446, 18, 1]), y: torch.Size([4446, 1])


In [2]:
# Make sure the output directory exists before anything is written
os.makedirs(DATA_DIR, exist_ok=True)

# Bundle the per-split tensors with the scaler so both land in a single file
clean_prepped_dataset = dict(dataset=processed_dict, scaler=scaler)

# Serialize the bundle with pickle
with open(f'{DATA_DIR}/clean_prepped_dataset.pkl', mode='wb') as f:
    pickle.dump(clean_prepped_dataset, f)