In [1]:
import os
import pandas as pd
from datetime import timedelta

def filter_date_range(file_path, start_date, end_date):
    data = pd.read_csv(file_path, index_col=0, parse_dates=True)
    return data.loc[start_date:end_date]

def save_filtered_data(source_dir, target_dir, start_date, end_date):
    os.makedirs(target_dir, exist_ok=True)
    for filename in os.listdir(source_dir):
        if filename.endswith('.csv'):
            file_path = os.path.join(source_dir, filename)
            selected_data = filter_date_range(file_path, start_date, end_date)
            selected_data.to_csv(os.path.join(target_dir, filename))

def generate_subsets(min_date, max_date, n_subsets=14):
    """Rolling 2-year train + 4-month val + 4-month test windows."""
    subsets = []
    min_date = pd.Timestamp(min_date)
    max_date = pd.Timestamp(max_date)

    total_days = (max_date - min_date).days
    step = total_days // n_subsets  # forward shift per subset

    for i in range(n_subsets):
        train_start = min_date + timedelta(days=i * step)
        train_end = train_start + timedelta(days=730)  # ≈ 2 years train
        val_start = train_end + timedelta(days=1)
        val_end = val_start + timedelta(days=120)      # ≈ 4 months val
        test_start = val_end + timedelta(days=1)
        test_end = test_start + timedelta(days=120)    # ≈ 4 months test

        if test_end > max_date:
            break

        subsets.append({
            'Subset': i + 1,
            'Train Start': train_start.strftime('%Y-%m-%d'),
            'Train End': train_end.strftime('%Y-%m-%d'),
            'Val Start': val_start.strftime('%Y-%m-%d'),
            'Val End': val_end.strftime('%Y-%m-%d'),
            'Test Start': test_start.strftime('%Y-%m-%d'),
            'Test End': test_end.strftime('%Y-%m-%d')
        })
    return pd.DataFrame(subsets)

def main_dynamic():
    label_source_path = r"C:\Users\ns243\Documents\Academic\AI Master\Internship\Data\LABEL0.csv"
    alpha_source_dir = r"C:\Users\ns243\Documents\Academic\AI Master\Internship\Data\Alpha360"
    target_base_dir = r"C:\Users\ns243\Documents\Academic\AI Master\Internship\Data\splits_SP500"

    min_date = '2015-10-05'
    max_date = '2025-10-03'

    subsets = generate_subsets(min_date, max_date)
    print(subsets)

    for _, row in subsets.iterrows():
        start_date = row['Train Start']
        end_date = row['Test End']
        subset_id = row['Subset']

        target_folder_name = f"SP500_{start_date}_{end_date}"
        target_dir = os.path.join(target_base_dir, target_folder_name)
        os.makedirs(target_dir, exist_ok=True)

        label_data = filter_date_range(label_source_path, start_date, end_date)
        label_data.to_csv(os.path.join(target_dir, 'label.csv'))

        alpha_target_dir = os.path.join(target_dir, f"Alpha_360_{start_date}_{end_date}")
        save_filtered_data(alpha_source_dir, alpha_target_dir, start_date, end_date)

        print(f"Subset {subset_id} saved → {target_dir}")

if __name__ == "__main__":
    main_dynamic()


    Subset Train Start   Train End   Val Start     Val End  Test Start  \
0        1  2015-10-05  2017-10-04  2017-10-05  2018-02-02  2018-02-03   
1        2  2016-06-21  2018-06-21  2018-06-22  2018-10-20  2018-10-21   
2        3  2017-03-08  2019-03-08  2019-03-09  2019-07-07  2019-07-08   
3        4  2017-11-23  2019-11-23  2019-11-24  2020-03-23  2020-03-24   
4        5  2018-08-10  2020-08-09  2020-08-10  2020-12-08  2020-12-09   
5        6  2019-04-27  2021-04-26  2021-04-27  2021-08-25  2021-08-26   
6        7  2020-01-12  2022-01-11  2022-01-12  2022-05-12  2022-05-13   
7        8  2020-09-28  2022-09-28  2022-09-29  2023-01-27  2023-01-28   
8        9  2021-06-15  2023-06-15  2023-06-16  2023-10-14  2023-10-15   
9       10  2022-03-02  2024-03-01  2024-03-02  2024-06-30  2024-07-01   
10      11  2022-11-17  2024-11-16  2024-11-17  2025-03-17  2025-03-18   

      Test End  
0   2018-06-03  
1   2019-02-18  
2   2019-11-05  
3   2020-07-22  
4   2021-04-08  
5   2021-