# TimeSeriesHoldout Example

This notebook demonstrates how to use the `TimeSeriesHoldout` class to split time series data into train and test sets, preserving time order.

In [1]:
import pandas as pd
from ThreeWToolkit.holdout import TimeSeriesHoldout

# Toy dataset: five consecutive daily readings from two sensors.
df = pd.DataFrame(
    dict(
        date=pd.date_range(start="2024-01-01", periods=5, freq="D"),
        sensor1=list(range(10, 60, 10)),
        sensor2=list(range(100, 50, -10)),
    )
)

df


Unnamed: 0,date,sensor1,sensor2
0,2024-01-01,10,100
1,2024-01-02,20,90
2,2024-01-03,30,80
3,2024-01-04,40,70
4,2024-01-05,50,60


In [2]:
# Empty configuration: no split options are set at construction time.
config = dict()

ts_holdout = TimeSeriesHoldout(pip_config=config, data=df)


### Split the data in time order (no shuffling) with `test_size=0.3`

In [17]:

train_df, test_df = ts_holdout.train_test_split(test_size=0.3)

# Show each partition under its heading, train first and then test.
for heading, part in (("Train Set:", train_df), ("Test Set:", test_df)):
    print(heading)
    display(part)


Train Set:


Unnamed: 0,date,sensor1,sensor2
0,2024-01-01,10,100
1,2024-01-02,20,90
2,2024-01-03,30,80


Test Set:


Unnamed: 0,date,sensor1,sensor2
3,2024-01-04,40,70
4,2024-01-05,50,60


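### Shuffle example

With `shuffle=True` the rows are assigned to the train and test sets at random, so the split no longer preserves time order (compare the row indices in the output below).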

In [18]:

train_df, test_df = ts_holdout.train_test_split(test_size=0.3, shuffle=True)

# Show each partition under its heading, train first and then test.
for heading, part in (("Train Set:", train_df), ("Test Set:", test_df)):
    print(heading)
    display(part)


Train Set:


Unnamed: 0,date,sensor1,sensor2
2,2024-01-03,30,80
0,2024-01-01,10,100
4,2024-01-05,50,60


Test Set:


Unnamed: 0,date,sensor1,sensor2
1,2024-01-02,20,90
3,2024-01-04,40,70


### Split a single column (`sensor1`) passed explicitly as a Series

In [8]:
# Pass a single column (a Series) instead of the frame given at construction.
train_series, test_series = ts_holdout.train_test_split(df["sensor1"], test_size=0.4)

for heading, part in (("Train Series:", train_series), ("Test Series:", test_series)):
    print(heading)
    display(part)


Train Series:


0    10
1    20
2    30
Name: sensor1, dtype: int64

Test Series:


3    40
4    50
Name: sensor1, dtype: int64

### Demonstrate stratify function

With stratification, both the train and test sets should keep the same class proportions as the original data.

In [4]:
# Imbalanced toy dataset: 65 rows labelled 0 followed by 35 rows labelled 1.
df = pd.DataFrame({'feature': range(100), 'label': [0] * 65 + [1] * 35})

# Split options handed to the splitter through its configuration,
# including the label column to stratify on.
config = dict(
    test_size=0.2,
    shuffle=True,
    random_state=42,
    stratify=df['label'],
)

splitter = TimeSeriesHoldout(pip_config=config, data=df)
train_df, test_df = splitter.train_test_split(df)

In [7]:
def print_label_percentages(data, title):
    """Print the share of each class in ``data['label']`` as a percentage.

    Args:
        data: Object whose ``'label'`` item supports
            ``value_counts(normalize=True)`` (e.g. a pandas DataFrame).
        title: Heading printed above the per-class lines.

    Returns:
        None. The report is written to stdout, one line per class,
        ordered by class label.
    """
    # normalize=True yields fractions; scale to percent and sort by label
    # so the classes are always listed in the same order.
    percentages = data['label'].value_counts(normalize=True).sort_index() * 100
    print(f'\n{title}:')
    for label, pct in percentages.items():
        print(f'Class {label}: {pct:.2f}%')

# Report the class balance of the full data and of each split, in that order.
for frame, heading in ((df, 'Original'), (train_df, 'Train'), (test_df, 'Test')):
    print_label_percentages(frame, heading)



Original:
Class 0: 65.00%
Class 1: 35.00%

Train:
Class 0: 65.00%
Class 1: 35.00%

Test:
Class 0: 65.00%
Class 1: 35.00%
