In [None]:
import os
import pickle

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

# Notebook-wide pandas display settings: show up to 30 columns and do not
# truncate the number of displayed rows.
for option_name, option_value in (
    ('display.max_columns', 30),
    ('display.max_rows', None),
):
    pd.set_option(option_name, option_value)

In [None]:
# List the dataset directory, then print the first 5 lines of ts_meta.csv.
%ls /TimeseriesDatasets/forecasting/fred
!head -n 5 /TimeseriesDatasets/forecasting/fred/ts_meta.csv

In [None]:
# Root directory of the FRED forecasting dataset. The trailing slash matters:
# other cells build file paths by plain string concatenation (PATH + name).
PATH = '/TimeseriesDatasets/forecasting/fred/'
# Folder that receives the files written by this notebook.
OUTPUT_PATH = PATH + 'preprocessed/'
# exist_ok=True creates the folder (and missing parents) only when needed and
# avoids the check-then-create race of a separate os.path.exists() test.
os.makedirs(OUTPUT_PATH, exist_ok=True)

In [None]:
# Load the per-series info table, then print its row count next to the number
# of distinct FREDid values so duplicates are easy to spot.
df_info = pd.read_csv(f'{PATH}FREDInfo.csv')
print(df_info.shape[0])
print(df_info['FREDid'].nunique())
df_info.head()

In [None]:
# Distinct FRED ids from the info table; print how many there are and show
# the first one.
unique_ids = df_info['FREDid'].unique()
print(unique_ids.size)
unique_ids[0]

In [None]:
# Distinct values of the 'SP' and 'Horizon' columns, displayed as a 2-tuple.
tuple(df_info[column].unique() for column in ('SP', 'Horizon'))

In [None]:
# Load the time-series metadata table, then print its row count next to the
# number of distinct ids.
df_meta = pd.read_csv(f'{PATH}ts_meta.csv')
print(df_meta.shape[0])
print(df_meta['id'].nunique())
df_meta.head()

In [None]:
# Share of ids that appear in both tables, expressed first relative to the
# metadata ids and then relative to the info ids.
ids_meta = set(df_meta['id'].unique())
ids_info = set(df_info['FREDid'].unique())
common_elements = ids_meta.intersection(ids_info)
for id_pool in (ids_meta, ids_info):
    print(len(common_elements) / len(id_pool))

In [None]:
# First metadata row whose id equals the first FRED id of the info table.
matches_first_id = df_meta['id'] == unique_ids[0]
result = df_meta.loc[matches_first_id].iloc[0]
result

In [None]:
# import h5py

# with h5py.File(PATH+'raw_observations.h5', 'r') as file:
#     print(file.keys())

In [None]:
# Load the pickled categories object and show how many entries it has.
# SECURITY NOTE: pickle.load can execute arbitrary code while deserialising;
# only run this cell on a categories.pickle obtained from a trusted source.
with open(PATH + 'categories.pickle', 'rb') as file:
    categories = pickle.load(file)
len(categories)

In [None]:
# Load the complete / train / test series files. allow_pickle=True permits
# NumPy to unpickle object arrays, so these files must come from a trusted source.
data_complete, data_train, data_test = (
    np.load(PATH + f'fred-{split}.npz', allow_pickle=True)
    for split in ('complete', 'train', 'test')
)

In [None]:
# For each split print its label, the shape of the container and the shape of
# its first element.
# NOTE(review): this relies on np.load having returned array-like objects with
# a .shape attribute for the '.npz'-named files -- confirm the on-disk format.
for split_label, split_data in (
    ('Complete', data_complete),
    ('Train', data_train),
    ('Test', data_test),
):
    print(split_label)
    print(split_data.shape)
    print(split_data[0].shape)

In [None]:
# Build df_final (info table enriched with the metadata columns, one row per
# series) and bucket the NaN-free series by seasonality ('SP').

# Rows of df_info and entries of data_complete are matched purely by position.
# zip() would silently drop the surplus on a length mismatch, so fail loudly.
if len(df_info) != len(data_complete):
    raise ValueError(
        f'df_info has {len(df_info)} rows but data_complete has '
        f'{len(data_complete)} series; they must align one-to-one'
    )

# Every series needs a metadata row; report the missing ids up front instead
# of failing with a bare IndexError somewhere in the middle of the run.
missing_ids = df_info.loc[~df_info['FREDid'].isin(df_meta['id']), 'FREDid']
if not missing_ids.empty:
    raise KeyError(
        f'{len(missing_ids)} FREDid value(s) have no row in df_meta, '
        f'e.g. {missing_ids.head().tolist()}'
    )

# Align the metadata with df_info in one vectorised lookup instead of one
# full-table scan per row: keep the first metadata row per id, then reorder
# those rows to follow df_info['FREDid'].
meta_aligned = (
    df_meta
    .drop_duplicates(subset='id', keep='first')
    .set_index('id')
    .reindex(df_info['FREDid'])
)
meta_aligned.index = df_info.index  # same row labels -> column assignment aligns 1:1

# Work on a copy so df_info stays untouched and this cell can be re-run.
# 'id' became the lookup index above, so it is not copied over as a column.
df_final = df_info.copy()
for column in meta_aligned.columns:
    df_final[column] = meta_aligned[column]

# Seasonality recorded on the first row of each FREDid.
sp_by_id = (
    df_final
    .drop_duplicates(subset='FREDid', keep='first')
    .set_index('FREDid')['SP']
    .to_dict()
)

# seasonalities[SP][FREDid] -> 1-D array of the observed (non-NaN) values
seasonalities = {sp: {} for sp in df_final['SP'].unique()}
for fred_id, series_complete in tqdm(
        zip(df_final['FREDid'], data_complete),
        total=len(df_final)
    ):
    series_complete_clean = series_complete[~np.isnan(series_complete)]  # remove nans
    seasonalities[sp_by_id[fred_id]][fred_id] = series_complete_clean

In [None]:
# Persist the merged metadata table and one {FREDid: series} dictionary per
# seasonality.
# NOTE(review): the "Load preprocessed files" cells read 'FRED_meta.csv' and
# 'FRED_Yearly.npy' (no '_test' suffix) -- confirm which naming is intended.
df_final.to_csv(OUTPUT_PATH + 'FRED_meta_test.csv', index=False)

for seasonality, series in seasonalities.items():
    print(f'{seasonality}: {len(series)}')
    # np.save stores the dict as a pickled zero-dimensional object array under
    # exactly this file name. np.savez would append '.npz' and write a zip
    # archive, which cannot be unwrapped with the `data[()]` pattern.
    np.save(OUTPUT_PATH + f'FRED_{seasonality}_test.npy', series)

## Load preprocessed files

In [None]:
# Read the preprocessed metadata table back from disk and preview its first row.
meta_csv_path = OUTPUT_PATH + 'FRED_meta.csv'
df_meta = pd.read_csv(meta_csv_path, low_memory=False)
df_meta.head(1)

In [None]:
# The file is expected to hold a zero-dimensional object array whose single
# element is the series dictionary; indexing with an empty tuple unwraps it.
yearly_file = OUTPUT_PATH + 'FRED_Yearly.npy'
data = np.load(yearly_file, allow_pickle=True)[()]

In [None]:
# Preview only the first entry of the dictionary: print its id, shape and
# seasonality, then plot the series.
for key, value in data.items():
    meta = df_meta.loc[df_meta['FREDid'] == key]
    print(key, value.shape)
    print(meta['SP'].iloc[0])
    plt.plot(value)
    plt.show()
    break  # one example is enough