In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.0.2-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.2-py3-none-any.whl (472 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m472.7/472.7 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading xx

In [None]:
import pickle
import datasets
import numpy as np
import pandas as pd
from statsmodels.tsa.seasonal import STL

In [None]:
# Seasonal period (number of observations per cycle) handed to STL for each
# dataset name.
# The sampling frequencies named in the per-entry comments are carried over
# from the original annotations and are not checked here -- TODO confirm
# against the dataset cards.
# NOTE(review): some of the Monash sets are presumably sampled at a different
# rate than annotated (e.g. half-hourly or monthly); verify before relying on
# the chosen period.
period_map = {
    "ercot": 24,  # hourly data -> daily cycle
    "m4_daily": 7,  # daily data -> weekly cycle
    "m4_hourly": 24,  # hourly data -> daily cycle
    "m4_monthly": 12,  # monthly data -> yearly cycle
    "m4_quarterly": 4,  # quarterly data -> yearly cycle
    "m4_yearly": 2,  # yearly data; 2 is the smallest period STL accepts
    "m4_weekly": 52,  # weekly data -> yearly cycle
    "mexico_city_bikes": 24,  # hourly bike usage -> daily cycle
    "monash_australian_electricity": 24,  # annotated as hourly -> daily cycle
    "monash_car_parts": 12,  # likely monthly data -> yearly cycle
    "monash_cif_2016": 7,  # annotated as daily -> weekly cycle
    "monash_covid_deaths": 7,  # daily data -> weekly cycle
    "monash_electricity_hourly": 24,  # hourly data -> daily cycle
    "monash_electricity_weekly": 52,  # weekly data -> yearly cycle
    "monash_fred_md": 12,  # monthly data -> yearly cycle
    "monash_hospital": 7,  # annotated as daily -> weekly cycle
    "monash_kdd_cup_2018": 24,  # hourly data -> daily cycle
    "monash_london_smart_meters": 24,  # annotated as hourly -> daily cycle
    "monash_m1_monthly": 12,  # monthly data -> yearly cycle
    "monash_m1_quarterly": 4,  # quarterly data -> yearly cycle
    "monash_m1_yearly": 2,  # yearly data; 2 is the smallest period STL accepts
    "monash_m3_monthly": 12,  # monthly data -> yearly cycle
    "monash_m3_quarterly": 4,  # quarterly data -> yearly cycle
    "monash_m3_yearly": 2,  # yearly data; 2 is the smallest period STL accepts
    "monash_nn5_weekly": 52,  # weekly data -> yearly cycle
    "monash_pedestrian_counts": 7,  # annotated as daily -> weekly cycle
    "monash_rideshare": 24,  # hourly data -> daily cycle
    "monash_saugeenday": 7,  # daily data -> weekly cycle
    "monash_temperature_rain": 30,  # daily data; 30 is roughly a monthly cycle, not a yearly one
    "monash_tourism_monthly": 12,  # monthly data -> yearly cycle
    "monash_tourism_quarterly": 4,  # quarterly data -> yearly cycle
    "monash_tourism_yearly": 2,  # yearly data; 2 is the smallest period STL accepts
    "monash_weather": 30,  # daily data; 30 is roughly a monthly cycle, not a yearly one
    "nn5": 7,  # daily data -> weekly cycle
    "ushcn_daily": 30,  # daily data; 30 is roughly a monthly cycle, not a yearly one
    "wind_farms_daily": 7,  # daily data -> weekly cycle
    "wind_farms_hourly": 24,  # hourly data -> daily cycle
}


In [None]:
# Dataset configuration names to process, in processing order.
all_names = [
    "ercot",
    "m4_daily",
    "m4_hourly",
    "m4_monthly",
    "m4_quarterly",
    "m4_yearly",
    "m4_weekly",
    "mexico_city_bikes",
    "monash_australian_electricity",
    "monash_car_parts",
    "monash_cif_2016",
    "monash_covid_deaths",
    "monash_electricity_hourly",
    "monash_electricity_weekly",
    "monash_fred_md",
    "monash_hospital",
    "monash_kdd_cup_2018",
    "monash_london_smart_meters",
    "monash_m1_monthly",
    "monash_m1_quarterly",
    "monash_m1_yearly",
    "monash_m3_monthly",
    "monash_m3_quarterly",
    "monash_m3_yearly",
    "monash_nn5_weekly",
    "monash_pedestrian_counts",
    "monash_rideshare",
    "monash_saugeenday",
    "monash_temperature_rain",
    "monash_tourism_monthly",
    "monash_tourism_quarterly",
    "monash_tourism_yearly",
    "monash_weather",
    "nn5",
    "ushcn_daily",
    "wind_farms_daily",
    "wind_farms_hourly",
]

In [None]:

datasets_dict = {}


def read_dataset(dataset_name):
    """Load the "train" split of one configuration of autogluon/chronos_datasets.

    A progress line is printed before loading. The loaded dataset is switched
    to the "numpy" output format before it is returned.

    Args:
        dataset_name: Configuration name passed to ``datasets.load_dataset``.

    Returns:
        The loaded dataset, or ``None`` when loading or formatting raised any
        ``Exception`` (the failure is printed rather than propagated).
    """
    print(f"Loading dataset: {dataset_name}")
    try:
        loaded = datasets.load_dataset(
            "autogluon/chronos_datasets", dataset_name, split="train"
        )
        loaded.set_format("numpy")
    except Exception as err:
        # Best-effort load: report the problem and let the caller skip it.
        print(f"Failed to load dataset {dataset_name}: {err}")
        return None
    return loaded

In [None]:
def perform_stl_decomposition(series, dataset_name):
    """Run an STL decomposition on ``series`` and return its three components.

    The seasonal period is looked up in ``period_map`` by ``dataset_name``;
    names missing from the map fall back to a period of 24.

    Args:
        series: The values to decompose, passed straight to ``STL``.
        dataset_name: Key used to select the seasonal period.

    Returns:
        A ``(trend, seasonal, resid)`` tuple taken from the fitted result.
    """
    fitted = STL(series, period=period_map.get(dataset_name, 24)).fit()
    return fitted.trend, fitted.seasonal, fitted.resid

In [None]:
def create_sliding_windows(dataset_name, ds, window_size=70, x_size=60, y_size=10):
    """Cut every series in ``ds`` into windows and STL-decompose each half.

    Windows are taken back to back: the start index advances by
    ``window_size``, so consecutive windows do not overlap and a trailing
    remainder shorter than ``window_size`` is dropped. Within each window the
    first ``x_size`` points form ``x`` and the following ``y_size`` points
    form ``y``. With the defaults (60 + 10 == 70) the two parts cover the
    whole window.

    Args:
        dataset_name: Forwarded to ``perform_stl_decomposition`` to choose the
            seasonal period.
        ds: Iterable of records, each providing ``'id'``, ``'timestamp'`` and
            ``'target'``; the last two must support ``len``/slicing.
        window_size: Number of points per window; also the step between
            window starts.
        x_size: Number of leading points of a window used as ``x``.
        y_size: Number of points following ``x`` used as ``y``. If
            ``x_size + y_size`` exceeds ``window_size`` the slice is simply
            capped at the window end.

    Returns:
        Dict mapping each record's ``'id'`` to a list of window dicts with the
        keys ``'x'``, ``'y'`` (each ``{'timestamp': ..., 'target': ...}``) and
        ``'x_trend'``, ``'x_seasonal'``, ``'x_resid'``, ``'y_trend'``,
        ``'y_seasonal'``, ``'y_resid'``. Series shorter than ``window_size``
        map to an empty list.
    """
    # End of the y part inside a window. Previously ``y_size`` was ignored and
    # y always ran to the end of the window.
    y_end = x_size + y_size

    windows_dict = {}
    for entry in ds:
        data_id = entry['id']
        timestamps = entry['timestamp']
        targets = entry['target']

        windows_list = []

        for start in range(0, len(targets) - window_size + 1, window_size):
            stop = start + window_size
            window_timestamps = timestamps[start:stop]
            window_targets = targets[start:stop]

            x = {
                'timestamp': window_timestamps[:x_size],
                'target': window_targets[:x_size],
            }
            y = {
                'timestamp': window_timestamps[x_size:y_end],
                'target': window_targets[x_size:y_end],
            }

            # x and y are decomposed independently of each other.
            x_trend, x_seasonal, x_resid = perform_stl_decomposition(x['target'], dataset_name)
            y_trend, y_seasonal, y_resid = perform_stl_decomposition(y['target'], dataset_name)

            windows_list.append({
                'x': x, 'y': y,
                'x_trend': x_trend, 'x_seasonal': x_seasonal, 'x_resid': x_resid,
                'y_trend': y_trend, 'y_seasonal': y_seasonal, 'y_resid': y_resid
            })

        windows_dict[data_id] = windows_list

    return windows_dict

In [None]:
# Windowed, STL-decomposed data for every dataset that loads, keyed by
# dataset name.
all_data_windows = {}

for dataset_name in all_names:
    ds = read_dataset(dataset_name)
    if ds is None:
        # Nothing was loaded for this name; skip it and carry on.
        continue
    sliding_windows = create_sliding_windows(dataset_name, ds)
    all_data_windows[dataset_name] = sliding_windows


# Persist the complete result as one pickle file in the working directory.
with open("all_datasets_windows.pkl", "wb") as f:
    pickle.dump(all_data_windows, f)

print("All datasets processed and saved to all_datasets_windows.pkl")

Loading dataset: ercot
Loading dataset: m4_daily
Loading dataset: m4_hourly
Loading dataset: m4_monthly
Loading dataset: m4_quarterly
Loading dataset: m4_yearly
Loading dataset: m4_weekly
Loading dataset: mexico_city_bikes


train-00000-of-00001.parquet:   0%|          | 0.00/103M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/494 [00:00<?, ? examples/s]

Loading dataset: monash_australian_electricity


train-00000-of-00001.parquet:   0%|          | 0.00/16.9M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5 [00:00<?, ? examples/s]

Loading dataset: monash_car_parts


train-00000-of-00001.parquet:   0%|          | 0.00/70.3k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2674 [00:00<?, ? examples/s]

Loading dataset: monash_cif_2016


train-00000-of-00001.parquet:   0%|          | 0.00/70.9k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/72 [00:00<?, ? examples/s]

Loading dataset: monash_covid_deaths


train-00000-of-00001.parquet:   0%|          | 0.00/59.0k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/266 [00:00<?, ? examples/s]

Loading dataset: monash_electricity_hourly


train-00000-of-00001.parquet:   0%|          | 0.00/31.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/321 [00:00<?, ? examples/s]

Loading dataset: monash_electricity_weekly


train-00000-of-00001.parquet:   0%|          | 0.00/334k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/321 [00:00<?, ? examples/s]

Loading dataset: monash_fred_md


train-00000-of-00001.parquet:   0%|          | 0.00/412k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/107 [00:00<?, ? examples/s]

Loading dataset: monash_hospital


train-00000-of-00001.parquet:   0%|          | 0.00/117k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/767 [00:00<?, ? examples/s]

Loading dataset: monash_kdd_cup_2018


train-00000-of-00001.parquet:   0%|          | 0.00/8.78M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/270 [00:00<?, ? examples/s]

Loading dataset: monash_london_smart_meters


train-00000-of-00003.parquet:   0%|          | 0.00/204M [00:00<?, ?B/s]

train-00001-of-00003.parquet:   0%|          | 0.00/198M [00:00<?, ?B/s]

train-00002-of-00003.parquet:   0%|          | 0.00/195M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5560 [00:00<?, ? examples/s]