In [4]:
import os
import numpy as np
import pandas as pd

In [None]:
dataset_dir = '/path/to/LTSF-Benchmark' 
# This folder needs to contain folders "electricity", "traffic", 
# "weather", and "ETT-small".
output_dir = '/path/to/the/output/folder'

dataset_paths = [
    os.path.join(dataset_dir, 'electricity', 'electricity.csv'),
    os.path.join(dataset_dir, 'traffic', 'traffic.csv'),
    os.path.join(dataset_dir, 'weather', 'weather.csv'),
    os.path.join(dataset_dir, 'ETT-small', 'TTh1.csv'),
    os.path.join(dataset_dir, 'ETT-small', 'TTh2.csv'),
    os.path.join(dataset_dir, 'ETT-small', 'TTm1.csv'),
    os.path.join(dataset_dir, 'ETT-small', 'TTm2.csv'),
]

output_paths = [
    os.path.join(output_dir, 'Electricity.npz'),
    os.path.join(output_dir, 'Traffic.npz'),
    os.path.join(output_dir, 'Weather.npz'),
    os.path.join(output_dir, 'ETTh1.npz'),
    os.path.join(output_dir, 'ETTh2.npz'),
    os.path.join(output_dir, 'ETTm1.npz'),
    os.path.join(output_dir, 'ETTm2.npz'),
]

for dataset_path, output_path in zip(dataset_paths, output_paths):
    if os.path.isfile(output_path):
        npz = np.load(output_path)
        data = npz['data']
        print(output_path, data.shape)
        continue
    data = []
    with open(dataset_path, 'r') as f:
        for line in f:
            line = line.rstrip('\n')
            line = line.split(',')
            if line[0] == 'date':
                continue
            line = line[1:]
            line = np.array(line, dtype=float)
            data.append(line)
    data = np.array(data)
    print(output_path, data.shape)
    np.savez_compressed(output_path, data=data)


In [2]:
def process_dataset(dataset_dir, output_dir, dataset_names, suffix, is_agg):
    subset_names = ['ca', 'gba', 'gla', 'sd', ]
    subset_districts = [None, [4, ], [7, 8, 12, ], [11, ], ]
    direction_map = {'E': 0, 'W': 1, 'S': 2, 'N': 3, }

    meta_path = os.path.join(
        dataset_dir, 'ca_meta.csv')

    for subset_name, subset_district in zip(subset_names, subset_districts):
        output_path = os.path.join(
            output_dir, f'{subset_name}_his_{suffix}.npz')
        if os.path.isfile(output_path):
            npz = np.load(output_path)
            data = npz['data']
            print(output_path, data.shape)
            continue

        subset_id = []
        lat = []
        lng = []
        direction = []
        with open(meta_path, 'r') as f:
            for line in f:
                line = line.rstrip('\n')
                line = line.split(',')
                if line[0] == 'ID':
                    continue

                district = int(line[3])
                if subset_district is None or district in subset_district:
                    subset_id.append(line[0])
                    lat.append(float(line[1]))
                    lng.append(float(line[2]))
                    direction.append(direction_map[line[8]])

        df = pd.DataFrame()
        tod = []
        dow = []

        for dataset_name in dataset_names:
            dataset_path = os.path.join(dataset_dir, dataset_name)
            df_tmp = pd.read_hdf(dataset_path)
            if subset_district is not None:
                df_tmp = df_tmp[subset_id]
            if is_agg:
                df_tmp = df_tmp.resample('15T').mean().round(0)
            df_tmp = df_tmp.fillna(0)
            tod_ = (df_tmp.index.values - df_tmp.index.values.astype('datetime64[D]')) / np.timedelta64(1, 'D')
            dow_ = df_tmp.index.dayofweek / 7

            tod.append(tod_)
            dow.append(dow_)

            df = pd.concat([df, df_tmp, ], ignore_index=True)

        data = df.values

        tod = np.concatenate(tod, axis=0)
        dow = np.concatenate(dow, axis=0)

        lat = np.array(lat)
        lng = np.array(lng)
        direction = np.array(direction)
        print(output_path, data.shape)
        np.savez_compressed(
            output_path, data=data, tod=tod, dow=dow, lat=lat, lng=lng, 
            direction=direction)
        

In [6]:
dataset_dir = 'data' 
# This folder needs to contain "ca_his_raw_2019.h5" and "ca_meta.csv".
output_dir = 'process_data'

dataset_names = [
    'ca_his_raw_2019.h5',
]

suffix = '2019_agg'
process_dataset(dataset_dir, output_dir, dataset_names, suffix, True)


  df_tmp = df_tmp.resample('15T').mean().round(0)


process_data\ca_his_2019_agg.npz (35040, 8600)


  df_tmp = df_tmp.resample('15T').mean().round(0)


process_data\gba_his_2019_agg.npz (35040, 2352)


  df_tmp = df_tmp.resample('15T').mean().round(0)


process_data\gla_his_2019_agg.npz (35040, 3834)


  df_tmp = df_tmp.resample('15T').mean().round(0)


process_data\sd_his_2019_agg.npz (35040, 716)
