In [None]:
import modin.pandas as mpd
import pandas as pd
from statsmodels.datasets import macrodata

from temporalscope.core.temporal_data_loader import TimeFrame as tf
from temporalscope.core.utils import print_divider
from temporalscope.partition.sliding_window import SlidingWindowPartitioner as SWP


def load_macrodata(target_col: str = "realgdp"):
    """Load the statsmodels ``macrodata`` dataset and add a shifted target column.

    The ``year`` and ``quarter`` columns are merged into a single datetime
    column ``ds`` (the first day of each quarter), which is placed first in the
    frame. The target column is then shifted up by one row, so every row is
    paired with the value of the following row, and stored in a new column
    named ``target_<target_col>``.

    :param target_col: The column to be used as the target for prediction.
                       Defaults to 'realgdp'.
    :type target_col: str, optional

    :returns: The preprocessed DataFrame and the name of the shifted target
              column.
    :rtype: tuple[pd.DataFrame, str]
    :raises KeyError: If ``target_col`` is not a column of the preprocessed
                      frame (``year`` and ``quarter`` are consumed to build
                      ``ds`` and are therefore not valid targets).
    """
    print_divider()
    print("Loading the 'macrodata' dataset from the open-license statsmodels package.")
    print(f"Using '{target_col}' as the target column for future prediction.")
    print_divider()

    # Work on a copy so the frame handed out by statsmodels is never mutated.
    macro_df = macrodata.load_pandas().data.copy()

    # Quarter q starts in month 3 * (q - 1) + 1; build "<year>-<month>-01"
    # strings and let pandas parse them into timestamps.
    year_str = macro_df["year"].astype(int).astype(str)
    month_str = ((macro_df["quarter"] - 1) * 3 + 1).astype(int).astype(str)
    quarter_start = pd.to_datetime(year_str + "-" + month_str + "-01")

    # Replace 'year'/'quarter' with the combined 'ds' column, placed first.
    macro_df = macro_df.drop(columns=["year", "quarter"])
    macro_df.insert(0, "ds", quarter_start)

    # Fail with an explanatory message instead of a bare KeyError further down.
    if target_col not in macro_df.columns:
        raise KeyError(
            f"Target column '{target_col}' not found; "
            f"available columns: {list(macro_df.columns)}"
        )

    # Shift the target up by one row so each row carries the next row's value.
    shifted_target_col = f"target_{target_col}"
    macro_df[shifted_target_col] = macro_df[target_col].shift(-1)

    # The shift leaves a NaN in the last row's target; dropna() removes every
    # row that contains a NaN in any column. The copy detaches the result so
    # callers can modify it freely.
    macro_df = macro_df.dropna().copy()

    print(f"Loaded DataFrame shape: {macro_df.shape}")

    print_divider()
    # Adjacent f-strings keep the message on one output line (a triple-quoted
    # literal would embed the source line break and indentation).
    print(
        f"Shifted '{target_col}' to create a new target column "
        f"'{shifted_target_col}' for future prediction."
    )
    print_divider()

    return macro_df, shifted_target_col


if __name__ == "__main__":
    # Fetch the preprocessed pandas frame together with the shifted target's name.
    macro_df, shifted_target_col = load_macrodata()

    # Wrap the data in a TimeFrame that uses the Modin backend.
    print_divider()
    print("Using Modin backend for Sliding Window Partitioning:")
    macro_modin_df = mpd.DataFrame(macro_df)
    macro_modin_tf = tf(
        macro_modin_df,
        time_col="ds",
        target_col=shifted_target_col,
        backend="mpd",
    )

    print("Preview of the Modin DataFrame (macrodata):")
    modin_preview = macro_modin_tf.get_data().head()
    print(modin_preview)
    print_divider()

    # Configure the sliding-window partitioner on top of the TimeFrame.
    print("Applying Sliding Window Partitioner:")

    swp_options = {
        "window_size": 20,  # fixed window size
        "stride": 10,  # step size between consecutive windows
        "truncate": True,  # skip the last partition if it is shorter than the window
        "expand_last": False,  # do not expand the last partition to the window size
        "enable_warnings": True,  # warn about uneven partitions
    }
    partitioner = SWP(tf=macro_modin_tf, **swp_options)

    # Request index ranges for a 60% / 20% / 20% train / test / validation split.
    split_fractions = {"train_pct": 0.6, "test_pct": 0.2, "val_pct": 0.2}
    partitions_dict = partitioner.get_partitions_indices_dict(**split_fractions)

    # Show the index ranges of every partition.
    print("Partitioned Indices with 60% train, 20% test, and 20% validation split:")
    for partition_name, partition_indices in partitions_dict.items():
        print(f"{partition_name}: {partition_indices}")
    print_divider()

    # Sanity check: materialize the training slice of the first partition.
    print("Training data of the first partition:")
    first_partition_indices = partitions_dict["partition_1"]
    first_partition = partitioner.apply_partition(first_partition_indices["train"])
    print(first_partition)
    print_divider()

    # Sanity check: materialize the test slice of the first partition.
    print("Test data of the first partition:")
    test_partition = partitioner.apply_partition(first_partition_indices["test"])
    print(test_partition)
    print_divider()

    # Sanity check: materialize the validation slice of the first partition.
    print("Validation data of the first partition:")
    validation_partition = partitioner.apply_partition(
        first_partition_indices["validation"]
    )
    print(validation_partition)
    print_divider()