In [2]:
from gift_eval.data import Dataset

import os
from dotenv import load_dotenv
from pathlib import Path

# Load environment variables
load_dotenv()

# Get the GIFT_EVAL path from environment variables
gift_eval_path = os.getenv("GIFT_EVAL")

if gift_eval_path:
    # Convert to Path object for easier manipulation
    gift_eval_path = Path(gift_eval_path)

    # Get all subdirectories (dataset names) in the GIFT_EVAL path
    dataset_names = []
    for dataset_dir in gift_eval_path.iterdir():
        if dataset_dir.name.startswith("."):
            continue
        if dataset_dir.is_dir():
            freq_dirs = [d for d in dataset_dir.iterdir() if d.is_dir()]
            if freq_dirs:
                for freq_dir in freq_dirs:
                    dataset_names.append(f"{dataset_dir.name}/{freq_dir.name}")
            else:
                dataset_names.append(dataset_dir.name)

    print("Available datasets in GIFT_EVAL:")
    for name in sorted(dataset_names):
        print(f"- {name}")
else:
    print(
        "GIFT_EVAL path not found in environment variables. Please check your .env file."
    )

Available datasets in GIFT_EVAL:
- LOOP_SEATTLE/5T
- LOOP_SEATTLE/D
- LOOP_SEATTLE/H
- M_DENSE/D
- M_DENSE/H
- SZ_TAXI/15T
- SZ_TAXI/H
- bitbrains_fast_storage/5T
- bitbrains_fast_storage/H
- bitbrains_rnd/5T
- bitbrains_rnd/H
- bizitobs_application
- bizitobs_l2c/5T
- bizitobs_l2c/H
- bizitobs_service
- car_parts_with_missing
- covid_deaths
- electricity/15T
- electricity/D
- electricity/H
- electricity/W
- ett1/15T
- ett1/D
- ett1/H
- ett1/W
- ett2/15T
- ett2/D
- ett2/H
- ett2/W
- hierarchical_sales/D
- hierarchical_sales/W
- hospital
- jena_weather/10T
- jena_weather/D
- jena_weather/H
- kdd_cup_2018_with_missing/D
- kdd_cup_2018_with_missing/H
- m4_daily
- m4_hourly
- m4_monthly
- m4_quarterly
- m4_weekly
- m4_yearly
- restaurant
- saugeenday/D
- saugeenday/M
- saugeenday/W
- solar/10T
- solar/D
- solar/H
- solar/W
- temperature_rain_with_missing
- us_births/D
- us_births/M
- us_births/W


In [3]:
from gluonts.dataset.util import to_pandas
import matplotlib.pyplot as plt

ds_name = "m4_monthly"  # Name of the dataset
to_univariate = False  # Whether to convert the data to univariate
term = "short"  # Term of the dataset

dataset = Dataset(name=ds_name, term=term, to_univariate=to_univariate)
print("Dataset frequency: ", dataset.freq)
print("Prediction length: ", dataset.prediction_length)
print("Number of windows in the rolling evaluation: ", dataset.windows)

Dataset frequency:  M
Prediction length:  18
Number of windows in the rolling evaluation:  1


In [29]:
import json


short_datasets = "m4_yearly m4_quarterly m4_monthly m4_weekly m4_daily m4_hourly electricity/15T electricity/H electricity/D electricity/W solar/10T solar/H solar/D solar/W hospital covid_deaths us_births/D us_births/M us_births/W saugeenday/D saugeenday/M saugeenday/W temperature_rain_with_missing kdd_cup_2018_with_missing/H kdd_cup_2018_with_missing/D car_parts_with_missing restaurant hierarchical_sales/D hierarchical_sales/W LOOP_SEATTLE/5T LOOP_SEATTLE/H LOOP_SEATTLE/D SZ_TAXI/15T SZ_TAXI/H M_DENSE/H M_DENSE/D ett1/15T ett1/H ett1/D ett1/W ett2/15T ett2/H ett2/D ett2/W jena_weather/10T jena_weather/H jena_weather/D bitbrains_fast_storage/5T bitbrains_fast_storage/H bitbrains_rnd/5T bitbrains_rnd/H bizitobs_application bizitobs_service bizitobs_l2c/5T bizitobs_l2c/H"
#short_datasets = "m4_weekly"

med_long_datasets = "electricity/15T electricity/H solar/10T solar/H kdd_cup_2018_with_missing/H LOOP_SEATTLE/5T LOOP_SEATTLE/H SZ_TAXI/15T M_DENSE/H ett1/15T ett1/H ett2/15T ett2/H jena_weather/10T jena_weather/H bitbrains_fast_storage/5T bitbrains_rnd/5T bizitobs_application bizitobs_service bizitobs_l2c/5T bizitobs_l2c/H"
# med_long_datasets = "bizitobs_l2c/H"

# Get union of short and med_long datasets
all_datasets = list(set(short_datasets.split() + med_long_datasets.split()))

dataset_properties_map = json.load(open("notebooks/dataset_properties.json"))

In [None]:
all_freqs = ['10S',
 '10T',
 '15T',
 '5T',
 'A-DEC',
 'D',
 'H',
 'M',
 'Q-DEC',
 'W-FRI',
 'W-SUN',
 'W-THU',
 'W-TUE',
 'W-WED'
]

prediction_len = {
    "15min": 48 * 4,
    "30min": 48 * 2,
    "5min": 48 * 12,
    "h": 48,
    "D": 14,
    "W-SUN": 13,
    "ME": 12,
    "QE-OCT": 8,
    "YE-DEC": 6,
}

freqs_sesonal_forecast_lag = {
    "D": 7,
    "W-SUN": 1,
    "ME": 1,
    "QE-OCT": 4,
    "YE-DEC": 1,
    "15min": 4 * 24 * 7,
    "30min": 2 * 24 * 7,
    "5min": 12 * 24 * 7,
    "h": 24 * 7,
}

In [32]:
ds_freqs = set()
from collections import defaultdict
ds_freqs_pred_len = defaultdict(list)
for ds_name in all_datasets:
    
    dataset = Dataset(name=ds_name, term="short", to_univariate=False)
    
    ds_freqs.add(dataset.freq)
    ds_freqs_pred_len[dataset.freq].append(dataset.prediction_length)


In [55]:
freq_x_horizon = {'10T': 48,
'H': 48,
'5T': 48,
'W-SUN': 13,
'10S': 60,
'D': 14,
'W-FRI': 8,
'M': 12,
'Q-DEC': 8,
'15T': 48,
'W-THU': 8,
'A-DEC': 6,
'W-WED': 8,
'W-TUE': 8
}

freqs_sesonal_forecast_lag = {
    '10T': 24 * 6,
    'H': 24,
    '5T': 24 * 2,
    'W-SUN': 1,
    '10S': 60,
    'D': 7,
    'W-FRI': 1,
    'M': 1,
    'Q-DEC': 4,
    '15T': 24 * 4,
    'W-THU': 1,
    'A-DEC': 1,
    'W-WED': 1,
    'W-TUE': 1
}

In [88]:
import pandas as pd
import hashlib

for dataset in all_datasets:
    dataset_nm = dataset
    dataset = Dataset(name=dataset, term="short", to_univariate=False)
    freq = dataset.freq
    ds = dataset.hf_dataset
    ds = ds.filter(lambda x: len(x['target']) > max(freqs_sesonal_forecast_lag[freq] * 3, 3 * freq_x_horizon[freq]))
    ds = ds.filter(lambda x: not pd.isnull(x['target']).any())
    try:
        splits_ = ds.train_test_split(test_size=0.5, train_size=0.5)
    except ValueError:
        continue
    public = splits_['train'].to_pandas()
    private = splits_['test'].to_pandas()
    if len(public) == 0 or len(private) == 0:
        continue
    public["id"] = public["item_id"].apply(
        lambda x: hashlib.md5(f"{dataset_nm}_{x}".encode()).hexdigest()
    )
    private["id"] = private["item_id"].apply(
        lambda x: hashlib.md5(f"{dataset_nm}_{x}".encode()).hexdigest()
    )
    
    public["target"] = public["target"] / 42
    private["target"] = private["target"] / 42
    public["forecast"] = public.apply(lambda x: x["target"][freq_x_horizon[x["freq"]]:], axis=1)
    private["forecast"] = private.apply(lambda x: x["target"][freq_x_horizon[x["freq"]]:], axis=1)
    public["target"] = public.apply(lambda x: x["target"][:freq_x_horizon[x["freq"]]], axis=1)
    private["target"] = private.apply(lambda x: x["target"][:freq_x_horizon[x["freq"]]], axis=1)
    
    public.drop(columns=["item_id"], inplace=True)
    private.drop(columns=["item_id"], inplace=True)
    public["Usage"] = "Public"
    private["Usage"] = "Private"
    
    public.to_parquet(f"data/{dataset_nm.replace('/', '_')}_public.parquet.gz", index=False, compression="gzip")
    private.to_parquet(f"data/{dataset_nm.replace('/', '_')}_private.parquet.gz", index=False, compression="gzip")

In [85]:
public = splits_['train'].to_pandas()

In [87]:
splits_

DatasetDict({
    train: Dataset({
        features: ['item_id', 'start', 'freq', 'target'],
        num_rows: 0
    })
    test: Dataset({
        features: ['item_id', 'start', 'freq', 'target'],
        num_rows: 0
    })
})

In [89]:
import pandas as pd    
df = pd.read_parquet('data')

In [90]:
df.head()

Unnamed: 0,start,freq,target,id,forecast,Usage
0,2015-01-01,5T,"[1.5849482, 1.4035485, 1.3493197, 1.3783904, 1...",8494d7b9e6065bb3f5bcf0e318120e77,"[1.5783525, 1.5205361, 1.4161543, 1.5990915, 1...",Private
1,2015-01-01,5T,"[1.4048808, 1.4585025, 1.4644703, 1.4722879, 1...",82e777a3a69ea6e8f2ab01a5a1ab866b,"[1.4978822, 1.4666873, 1.2250957, 1.3937348, 1...",Private
2,2015-01-01,5T,"[1.5539262, 1.5621126, 1.5718371, 1.5651293, 1...",15040ed886a9e57e84a561abf740f300,"[1.587172, 1.6263217, 1.5464014, 1.5335662, 1....",Private
3,2015-01-01,5T,"[1.5233979, 1.4773768, 1.5138217, 1.4794972, 1...",77650a25b0b71a7ebee355976e336a16,"[1.501787, 1.4406147, 1.420885, 1.4769669, 1.5...",Private
4,2015-01-01,5T,"[1.3978602, 1.2957004, 1.4033904, 1.4324802, 1...",816c8f8cb3d6bf21b18e70bebc75971e,"[1.4514503, 1.3340079, 1.2038977, 1.1347616, 1...",Private


In [91]:
df[["id", "freq", "start", "target"]].to_parquet("data/train.parquet.gz", index=False, compression="gzip")

In [92]:
df[["id", "freq", "start", "target", "forecast", "Usage"]].to_parquet("data/private.parquet.gz", index=False, compression="gzip")