In [2]:
import sys
import os

# Go up one level to project root
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
sys.path.append(project_root)

In [10]:
import numpy as np
import pandas as pd
from datetime import datetime

def read_tsf(
    file_path,
    parse_timestamps=False,
    timestamp_format="%Y-%m-%d"
):
    header = {}
    data_started = False
    series_list = []
    labels = []
    timestamps_list = [] if parse_timestamps else None

    with open(file_path, 'r') as f:
        for raw in f:
            line = raw.strip()
            if not line:
                continue

            if line.startswith('@'):
                key_val = line[1:].split(None, 1)
                key = key_val[0]
                val = key_val[1] if len(key_val) > 1 else ''
                header[key] = val
                if key.lower() == 'data':
                    data_started = True
                continue

            if not data_started:
                continue

            data_str = line
            if header.get('classLabel','false').lower() == 'true':
                lbl, data_str = line.split(':', 1)
                labels.append(lbl.strip())
                data_str = data_str.strip()

            vals = []
            ts_row = [] if parse_timestamps else None

            for tok in data_str.split(','):
                tok = tok.strip()
                if not tok:
                    continue

                if ':' in tok:
                    meta, val_str = tok.rsplit(':', 1)
                else:
                    meta, val_str = None, tok

                v = float(val_str)
                vals.append(v)

                if parse_timestamps:
                    if meta and ':' in meta:
                        _, ts_str = meta.split(':', 1)
                        try:
                            dt = datetime.strptime(ts_str, timestamp_format)
                        except ValueError:
                            dt = ts_str
                    else:
                        dt = None
                    ts_row.append(dt)

            series_list.append(vals)
            if parse_timestamps:
                timestamps_list.append(ts_row)

    X = np.array(series_list, dtype=float)
    y = np.array(labels) if labels else None
    return X, y, header, timestamps_list

# Load the NN5 daily dataset with timestamps
file_path = "../data/nn5_daily_dataset_without_missing_values.tsf"
X, y, header, ts = read_tsf(file_path, parse_timestamps=True, timestamp_format="%Y-%m-%d")

# Show metadata and shape
print("Header metadata:", header)
print("Data shape:", X.shape)
if y is not None:
    print("Unique labels:", np.unique(y))

# Preview the first series with timestamps
df0 = pd.DataFrame({
    "timestamp": ts[0],
    "value": X[0]
})
print("NN5 Daily - First Series Preview", df0.head(10))


Header metadata: {'relation': 'NN5', 'attribute': 'start_timestamp date', 'frequency': 'daily', 'horizon': '56', 'missing': 'false', 'equallength': 'true', 'data': ''}
Data shape: (111, 791)
NN5 Daily - First Series Preview              timestamp      value
0  1996-03-18 00-00-00  13.407029
1                 None  14.725057
2                 None  20.564059
3                 None  34.708050
4                 None  26.629819
5                 None  16.609977
6                 None  15.320295
7                 None  11.607143
8                 None  19.883787
9                 None  23.767007
