In [61]:
import pandas as pd
import numpy as np

def create_time_series_splits(data, train_size_days, test_size_days, num_splits, window_size_steps, exclude_columns, target_column, prediction_horizon_steps, shifting_steps=None, steps_per_day=96):
    """
    Creates consecutive train/test splits for a multivariate time series dataset, with
    lag-suffixed feature columns and a datetime index marking each sample's prediction time.

    The data is cut, by row position, into `num_splits` back-to-back blocks of
    `train_size_days + test_size_days` days. Within a block the training partition comes
    first and the test partition follows immediately. Samples are produced by
    `create_samples_with_datetime_index`.

    Boundary behaviour: the slice handed to the sample builder for each partition extends
    `window_size_steps + prediction_horizon_steps - 1` rows past the partition's nominal
    end, so that (with `shifting_steps=1`) a partition of N steps yields exactly N samples.
    As a consequence the feature windows and target horizons of the last samples of a
    partition read rows that belong to the following partition (train -> test, and
    test -> next split's train). The sample timestamps of different partitions never
    coincide, but the underlying rows do overlap near the boundaries.

    Parameters:
    - data (pd.DataFrame): The input time series data, one row per time step, oldest first.
                           Must have a DateTimeIndex (checked by the sample builder).
                           Rows are addressed by position, so the spacing between
                           consecutive rows is not verified.
    - train_size_days (int): Number of days for the training set in each split.
    - test_size_days (int): Number of days for the testing set in each split.
    - num_splits (int): Number of consecutive train/test splits to generate.
    - window_size_steps (int): Number of rows used as lagged values (window size).
    - exclude_columns (list): List of columns to exclude from the training features.
    - target_column (str): The name of the target column. It is also used as a feature
                           unless it is listed in `exclude_columns`.
    - prediction_horizon_steps (int): Number of rows ahead to predict (prediction horizon).
    - shifting_steps (int or None): Distance, in rows, between consecutive samples.
                                    If None, defaults to 1 (every row is a sample).
    - steps_per_day (int): Number of rows that make up one day. The default of 96
                           corresponds to 15-minute granularity.

    Returns:
    - splits (list): A list containing dictionaries with keys 'X_train', 'Y_train', 'X_test', 'Y_test' for each split.
                     Each DataFrame has its index set to the datetime right before prediction.

    Raises:
    - ValueError: If `data` has fewer rows than the requested splits require.
    """

    if shifting_steps is None:
        shifting_steps = 1

    train_size_steps = train_size_days * steps_per_day
    test_size_steps = test_size_days * steps_per_day
    split_size_steps = train_size_steps + test_size_steps

    # Extra rows a partition's slice needs beyond its nominal length: the first sample
    # needs (window - 1) rows of history and the last one needs a full horizon ahead.
    overhang_steps = window_size_steps + prediction_horizon_steps - 1

    # The last test partition is the one that reaches furthest into the data.
    total_data_steps_needed = split_size_steps * num_splits + overhang_steps

    if len(data) < total_data_steps_needed:
        raise ValueError('Not enough data for the specified number of splits and sizes.')

    splits = []

    for split_index in range(num_splits):
        # Positional boundaries of this split: [train_start, test_start) is the nominal
        # training partition and [test_start, next_split_start) the nominal test partition.
        train_start = split_index * split_size_steps
        test_start = train_start + train_size_steps
        next_split_start = test_start + test_size_steps

        # Slices include the overhang described in the docstring.
        train_data = data.iloc[train_start : test_start + overhang_steps]
        test_data = data.iloc[test_start : next_split_start + overhang_steps]

        X_train, Y_train = create_samples_with_datetime_index(train_data, window_size_steps, exclude_columns, target_column, prediction_horizon_steps, shifting_steps)
        X_test, Y_test = create_samples_with_datetime_index(test_data, window_size_steps, exclude_columns, target_column, prediction_horizon_steps, shifting_steps)

        splits.append({
            'X_train': X_train,
            'Y_train': Y_train,
            'X_test': X_test,
            'Y_test': Y_test
        })

    return splits

def create_samples_with_datetime_index(data, window_size_steps, exclude_columns, target_column, prediction_horizon_steps, shifting_steps):
    """
    Generates lagged-feature samples (X) and multi-step targets (Y) from a time-indexed frame.

    A sample is anchored at row position t. Its features are the values of every feature
    column at rows t - window_size_steps + 1 .. t; the column names get a "_t-<lag>" suffix,
    "_t-0" being row t itself, and are ordered oldest lag first. Its targets are
    `target_column` at rows t + 1 .. t + prediction_horizon_steps, in columns
    "<target_column>_t+1", "<target_column>_t+2", ...
    Anchors start at the first row that has a full window behind it and advance by
    `shifting_steps` for as long as a full horizon still fits inside `data`.

    Windows are taken by row position, not by timestamp: rows are expected to be in
    chronological order, and gaps between consecutive rows are not detected.

    Parameters:
    - data (pd.DataFrame): The subset of data to create samples from.
                           Must have a DateTimeIndex.
    - window_size_steps (int): Number of rows in each feature window (>= 1).
    - exclude_columns (list or None): Columns to exclude from features. None excludes nothing.
                                      The target column stays a feature unless it is listed here.
    - target_column (str): The name of the target column.
    - prediction_horizon_steps (int): Number of rows ahead to predict (>= 1).
    - shifting_steps (int or None): Distance in rows between consecutive anchors (>= 1).
                                    None is treated as 1.

    Returns:
    - X (pd.DataFrame): Feature DataFrame, one row per sample, indexed by the anchor's datetime.
                        Each lagged column keeps the dtype of its source column.
    - Y (pd.DataFrame): Target DataFrame with the same index as X.
      Both frames are empty (but carry their column names) when `data` is too short
      to hold a single full window plus horizon.

    Raises:
    - ValueError: If `data` has no DateTimeIndex, or if the window size, prediction
                  horizon or shifting step is smaller than 1.
    - KeyError: If `target_column` is not a column of `data`.
    """

    # Ensure that the data index is a DateTimeIndex
    if not isinstance(data.index, pd.DatetimeIndex):
        raise ValueError("Data must have a DateTimeIndex.")

    if shifting_steps is None:
        shifting_steps = 1

    # A non-positive window would anchor samples at negative (wrapping) positions and a
    # non-positive step cannot advance, so reject them up front.
    for label, value in (
        ("window_size_steps", window_size_steps),
        ("prediction_horizon_steps", prediction_horizon_steps),
        ("shifting_steps", shifting_steps),
    ):
        if value < 1:
            raise ValueError(f"{label} must be at least 1, got {value!r}.")

    # Feature columns are all columns not explicitly excluded; the target column is
    # NOT removed implicitly, so its past values serve as features by default.
    excluded = [] if exclude_columns is None else exclude_columns
    feature_columns = [col for col in data.columns if col not in excluded]

    # Row positions of the anchors: the first has a full window behind it, the last
    # still has `prediction_horizon_steps` rows after it (the stop value is exclusive).
    positions = np.arange(window_size_steps - 1, len(data) - prediction_horizon_steps, shifting_steps)

    # Datetime of each anchor (right before the first predicted step). The result
    # index is deliberately unnamed, whatever the name of the input index.
    sample_index = data.index[positions].rename(None)

    # Pull each feature column out once, then gather all samples of a given lag with a
    # single integer-array lookup instead of one row lookup per sample.
    column_values = {col: data[col].to_numpy() for col in feature_columns}
    lagged = {}
    for lag in range(window_size_steps - 1, -1, -1):
        rows = positions - lag  # always >= 0 because positions >= window_size_steps - 1
        for col in feature_columns:
            lagged[f"{col}_t-{lag}"] = column_values[col][rows]
    X = pd.DataFrame(lagged, index=sample_index)

    # Broadcasting (n_samples, 1) + (horizon,) gives, per sample, the row positions
    # t+1 .. t+horizon of its targets.
    target_values = data[target_column].to_numpy()
    horizon_offsets = np.arange(1, prediction_horizon_steps + 1)
    Y = pd.DataFrame(
        target_values[positions[:, None] + horizon_offsets],
        index=sample_index,
        columns=[f"{target_column}_t+{i+1}" for i in range(prediction_horizon_steps)],
    )

    return X, Y


# Usage

In [39]:
# Load the raw CSV; the 'Datetime' column is parsed and used as the row index.
df = pd.read_csv(
    'ods001.csv',
    sep=';',
    index_col='Datetime',
    parse_dates=['Datetime'],
)

# Convert the index to timezone-aware UTC timestamps.
df.index = pd.to_datetime(df.index, utc=True)

# Flip the row order (presumably the file is stored newest-first; verify against
# the data) and discard every row that contains a missing value.
# NOTE(review): dropping rows can leave gaps in the time grid, while the sample
# builder forms windows by row position; confirm no rows are actually removed.
df = df.iloc[::-1].dropna()

# Every column except the three named here is excluded from the features.
to_remove = list(set(df.columns) - {'Total Load', 'Most recent forecast', 'Day-ahead 6PM P10'})

In [71]:
# Build 3 consecutive splits, each made of 5 training days followed by 2 test days.
splits = create_time_series_splits(
    data=df,
    train_size_days=5,
    test_size_days=2,
    num_splits=3,
    window_size_steps=3,  # Each sample uses the 3 most recent rows (t-2, t-1, t-0)
    exclude_columns=to_remove,
    target_column='Total Load',
    prediction_horizon_steps=4,  # Predicting 1 hour ahead (4 * 15 minutes)
    shifting_steps=4  # Consecutive samples are 4 steps (1 hour) apart
)

In [69]:
# Pull the four frames of the first split into individual names
first_split = splits[0]
X_train, Y_train, X_test, Y_test = (
    first_split[key] for key in ('X_train', 'Y_train', 'X_test', 'Y_test')
)


In [82]:
# Check that consecutive splits do not share sample timestamps.
# This compares only the index (prediction time) of the last test sample of
# split 0 with the first training sample of split 1; it does not compare the
# rows read by each sample's feature window or target horizon.
print('Split 0 ends at', splits[0]['X_test'].index[-1], 'while split 1 starts at', splits[1]['X_train'].index[0])

Split 0 ends at 2015-01-07 22:30:00+00:00 while split 1 starts at 2015-01-07 23:30:00+00:00


In [84]:
# Preview the first rows of the training features (one column per feature and lag).
X_train.head()

Unnamed: 0,Total Load_t-2,Most recent forecast_t-2,Day-ahead 6PM P10_t-2,Total Load_t-1,Most recent forecast_t-1,Day-ahead 6PM P10_t-1,Total Load_t-0,Most recent forecast_t-0,Day-ahead 6PM P10_t-0
2014-12-31 23:30:00+00:00,10142.19,9496.05,9076.64,10051.28,9329.17,8916.55,9952.87,9174.72,8767.84
2014-12-31 23:45:00+00:00,10051.28,9329.17,8916.55,9952.87,9174.72,8767.84,9821.78,9025.46,8625.97
2015-01-01 00:00:00+00:00,9952.87,9174.72,8767.84,9821.78,9025.46,8625.97,9755.0,9222.33,9777.87
2015-01-01 00:15:00+00:00,9821.78,9025.46,8625.97,9755.0,9222.33,9777.87,9575.73,9130.24,9679.61
2015-01-01 00:30:00+00:00,9755.0,9222.33,9777.87,9575.73,9130.24,9679.61,9494.09,8993.31,9534.23


In [86]:
# Preview the first rows of the training targets (one column per step of the horizon).
Y_train.head()

Unnamed: 0,Total Load_t+1,Total Load_t+2,Total Load_t+3,Total Load_t+4
2014-12-31 23:30:00+00:00,9821.78,9755.0,9575.73,9494.09
2014-12-31 23:45:00+00:00,9755.0,9575.73,9494.09,9382.5
2015-01-01 00:00:00+00:00,9575.73,9494.09,9382.5,9349.16
2015-01-01 00:15:00+00:00,9494.09,9382.5,9349.16,9213.16
2015-01-01 00:30:00+00:00,9382.5,9349.16,9213.16,9069.42


In [85]:
# Preview the first rows of the test features (one column per feature and lag).
X_test.head()

Unnamed: 0,Total Load_t-2,Most recent forecast_t-2,Day-ahead 6PM P10_t-2,Total Load_t-1,Most recent forecast_t-1,Day-ahead 6PM P10_t-1,Total Load_t-0,Most recent forecast_t-0,Day-ahead 6PM P10_t-0
2015-01-05 23:30:00+00:00,11193.52,10878.35,10194.09,11118.0,10680.51,10008.69,11071.16,10514.3,9852.94
2015-01-05 23:45:00+00:00,11118.0,10680.51,10008.69,11071.16,10514.3,9852.94,10890.92,10333.23,9683.26
2015-01-06 00:00:00+00:00,11071.16,10514.3,9852.94,10890.92,10333.23,9683.26,10798.69,10250.93,9216.03
2015-01-06 00:15:00+00:00,10890.92,10333.23,9683.26,10798.69,10250.93,9216.03,10732.07,10079.16,9060.71
2015-01-06 00:30:00+00:00,10798.69,10250.93,9216.03,10732.07,10079.16,9060.71,10651.88,9902.39,8901.42


In [87]:
# Preview the first rows of the test targets (one column per step of the horizon).
Y_test.head()

Unnamed: 0,Total Load_t+1,Total Load_t+2,Total Load_t+3,Total Load_t+4
2015-01-05 23:30:00+00:00,10890.92,10798.69,10732.07,10651.88
2015-01-05 23:45:00+00:00,10798.69,10732.07,10651.88,10443.55
2015-01-06 00:00:00+00:00,10732.07,10651.88,10443.55,10397.17
2015-01-06 00:15:00+00:00,10651.88,10443.55,10397.17,10322.57
2015-01-06 00:30:00+00:00,10443.55,10397.17,10322.57,10185.42
