In [5]:
import pandas as pd
from types import GeneratorType

class NestedCV:
    def __init__(self, k):
        self.k = k

    def split(self, data, date_column):
        # Sort data by date
        data = data.sort_values(by=date_column)
        
        # Calculate fold size
        fold_size = len(data) // self.k
        
        # Generate train-validate splits
        for i in range(self.k):
            start_idx = i * fold_size
            end_idx = min(start_idx + fold_size, len(data))
            
            validate = data.iloc[start_idx:end_idx]
            train = pd.concat([data.iloc[:start_idx], data.iloc[end_idx:]], ignore_index=True)

            
            yield train, validate

if __name__ == "__main__":
    # load dataset
    data = pd.read_csv("path to dataset")
    data["date"] = pd.to_datetime(data["date"])

    # nested cv
    k = 3
    cv = NestedCV(k)
    splits = cv.split(data, "date")

    # check return type
    assert isinstance(splits, GeneratorType)

    # check return types, shapes, and data leaks
    count = 0
    for train, validate in splits:
        # types
        assert isinstance(train, pd.DataFrame)
        assert isinstance(validate, pd.DataFrame)

        # shape
        assert train.shape[1] == validate.shape[1]

        # data leak
        print(train["date"].max() <= validate["date"].min())

        count += 1

    # check number of splits returned
    assert count == k


False
False
True
