In [25]:
%reset

Once deleted, variables cannot be recovered. Proceed (y/[n])? y


# NOTEBOOK DESCRIPTION
The goal of this notebook is to create the matrices X & y for each time series separately and save them to .npy files.

# LIBS

In [1]:
import pandas as pd
import numpy as np
import os
import pickle

# 0. INPUT DATA

In [2]:
# FOLDERS PATHS
# NOTE: despite its name, `cwd` is the PARENT of the current working directory
# (presumably the project root, with this notebook living in a sub-folder -- confirm).
cwd = os.path.dirname(os.getcwd())
folder_in = cwd + "/input_data/"
folder_gen = cwd + "/generated_data/"
folder_data = folder_gen + "dataset_separated_by_ts/"

# DATA
# Rows are indexed by the "timestamp" column; column labels are cast to integers
# and are used as time series ids (ts_id) in the rest of the notebook.
data = pd.read_csv(folder_in + "data.csv", index_col="timestamp").astype(np.float32)
data.index = pd.to_datetime(data.index)
data.columns = data.columns.astype(int)

# PARAMS
freq_str = "30min"
freq = 30      # not referenced in the visible cells; kept for compatibility
periods = 48   # 48 steps of 30 minutes = one day

# Half-open range [start, end): the right endpoint is dropped with an explicit mask
# instead of `closed="left"`, because that keyword was deprecated in pandas 1.4 and
# removed in pandas 2.0. The mask behaves the same on every pandas version.
idx_end = pd.Timestamp("2010-12-27")
idx = pd.date_range("2009-07-20", idx_end, freq=freq_str)
idx = idx[idx < idx_end]

In [3]:
data.head(2)

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,990,991,992,993,994,995,996,997,998,999
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2009-07-14 00:00:00,0.76,0.592,1.614,0.358,0.286,0.14,0.46,0.41,1.384,1.958,...,101.822006,168.178009,262.874023,151.40799,279.238007,125.795998,331.89801,109.206001,114.315994,278.312012
2009-07-14 00:30:00,0.808,0.588,0.566,0.206,0.288,0.252,0.498,0.368,1.372,3.26,...,94.399994,142.209991,214.033997,130.778,225.395996,102.136002,314.873962,90.020004,91.412003,216.598007


# 1. CREATE X & y

In [3]:
# CALENDAR LABELS
# One label per timestamp of `idx`: month ("m1".."m12"), day of week ("d0".."d6")
# and time of day ("HH:MM").
month = pd.Series(idx.month.astype(str), index=idx, name="month").map("m{}".format)
day = pd.Series(idx.dayofweek.astype(str), index=idx, name="day").map("d{}".format)
hour = pd.Series(idx.strftime("%H:%M"), index=idx, name="hour")

# TARGET COLUMN ORDER
# get_dummies orders its columns by label (so "m10" lands before "m2"); these lists
# define the natural order the dummy columns are put into below.
month_dummies_cols = ["m{}".format(m) for m in range(1, 13)]
day_dummies_cols = ["d{}".format(d) for d in range(7)]
hour_dummies_cols = (
    pd.date_range("2017-1-1", periods=periods, freq="30min")
    .strftime("%H:%M")
    .tolist()
)
exog_cols = month_dummies_cols + day_dummies_cols + hour_dummies_cols
y_cols = ["H_{}".format(step) for step in range(1, periods + 1)]

# ONE-HOT ENCODE, restore chronological row order, then apply the column order above
month_dummies = pd.get_dummies(month.sort_values()).reindex(idx).loc[:, month_dummies_cols]
day_dummies = pd.get_dummies(day.sort_values()).reindex(idx).loc[:, day_dummies_cols]
hour_dummies = pd.get_dummies(hour.sort_values()).reindex(idx).loc[:, hour_dummies_cols]


def get_lags(ts, lag_start, lag_end):
    """Build a DataFrame of shifted copies of ``ts``, one column per shift.

    For every integer ``shift`` from ``lag_end`` up to and including
    ``lag_start``, the column named ``"lag_<shift>"`` holds ``ts.shift(-shift)``.
    A negative ``shift`` therefore exposes past values at each row and a
    positive one exposes future values. Columns are ordered by increasing
    ``shift`` and the result keeps the index of ``ts``.

    Note that ``lag_end`` is the LOWER bound and ``lag_start`` the UPPER bound
    of the shift range.
    """
    shifted_columns = [
        ts.shift(-offset).rename("lag_{}".format(offset))
        for offset in range(lag_end, lag_start + 1)
    ]
    return pd.concat(shifted_columns, axis=1)


def get_dataset(ts, idx_list):
    """Build lag features, calendar features and multi-step targets for one series.

    Parameters
    ----------
    ts : pandas.Series
        A single time series.
    idx_list : pair of index-likes
        ``(first_idx, second_idx)``: the row labels of the two splits to
        extract (e.g. train and val, or trainval and test).

    Returns
    -------
    tuple of six DataFrames, in this order:
        lag features of the first split, calendar (``exog_cols``) features of
        the first split, lag features of the second split, calendar features
        of the second split, targets of the first split, targets of the
        second split.

    Lag features are the ``7*periods`` observations preceding each row (shifts
    ``-7*periods`` .. ``-1``); targets are the observation at the row itself
    and the following ``periods-1`` ones (shifts ``0`` .. ``periods-1``), with
    columns renamed to ``y_cols``.
    """
    first_idx, second_idx = idx_list

    # targets: one column per forecast step
    targets = get_lags(ts, lag_start=periods - 1, lag_end=0)
    targets.columns = y_cols

    # features: calendar dummies side by side with the lagged values
    past_values = get_lags(ts, lag_start=-1, lag_end=-7 * periods)
    features = pd.concat(
        [month_dummies, day_dummies, hour_dummies, past_values],
        axis=1,
    )
    feature_names = features.columns
    lag_cols = feature_names[feature_names.str.contains("lag")]

    def _lags_and_exog(rows):
        # select the rows of one split, then separate lag from calendar columns
        split = features.loc[rows]
        return split.loc[:, lag_cols], split.loc[:, exog_cols]

    first_lags, first_exog = _lags_and_exog(first_idx)
    second_lags, second_exog = _lags_and_exog(second_idx)

    return (
        first_lags,
        first_exog,
        second_lags,
        second_exog,
        targets.loc[first_idx],
        targets.loc[second_idx],
    )

## 1.1. Create indices for sampling
- As mentioned in TABLE II: <i>Impact of training dataset size on performance</i>, the train & val sets are subsampled by a factor of approx. 12 to reduce training time while still producing good performance.
- Sampling of the train & val sets is done by first creating train_dict & val_dict, whose keys are time series ids. E.g. train_dict[0] returns the positional indices of the train-set samples for ts_id = 0, i.e. it records which samples of that particular time series will be used during training.

In [30]:
# SPLIT WINDOWS (30-minute resolution)
# train is the half-open range [2009-07-27, 2010-07-12). The right endpoint is removed
# with an explicit mask instead of `closed="left"`, which was deprecated in pandas 1.4
# and removed in pandas 2.0.
train_end = pd.Timestamp("2010-07-12")
train_idx = pd.date_range("2009-07-27", train_end, freq="30min", name="timestamp")
train_idx = train_idx[train_idx < train_end]
# val and test each span periods*7*12 steps
val_idx = pd.date_range("2010-07-12", freq="30min", periods=periods*7*12, name="timestamp")
test_idx = pd.date_range("2010-10-04", freq="30min", periods=periods*7*12, name="timestamp")
trainval_idx = train_idx.append(val_idx)

# Positional row numbers within each split; these are what gets sampled.
indices_train = np.arange(len(train_idx))
indices_val = np.arange(len(val_idx))

# Fixed seed so the same subsets are drawn on every run.
np.random.seed(0)

# For every series draw, without replacement, periods*7*4 train positions and
# periods*7 val positions. The draw order (train, then val, per series) is part of
# the reproducible random stream and must not be changed.
train_dict, val_dict = {}, {}
for ts_id in data.columns:
    train_dict[ts_id] = np.random.choice(indices_train, size=periods*7*4, replace=False)
    val_dict[ts_id] = np.random.choice(indices_val, size=periods*7, replace=False)

# Persist the sampled positions; `with` guarantees the files are flushed and closed.
with open(folder_gen + "train_dict.p", "wb") as train_file:
    pickle.dump(train_dict, train_file)
with open(folder_gen + "val_dict.p", "wb") as val_file:
    pickle.dump(val_dict, val_file)

In [37]:
train_dict[0][:5]

array([14983,  4430, 11323, 16261,  2252])

## 1.2. Create X & y for each time series separately & save

In [None]:
# Per-series accumulators; each list is concatenated into one DataFrame after the loop.
y_val_all, y_test_all = [], []
y_val_pred_naive_all, y_test_pred_naive_all = [], []

for ts_id in data.columns:
    
    # progress message for every series id divisible by 50
    if ts_id % 50 == 0: print("writing, ts_id:", ts_id)
    
    # GET DATA FOR ONE TIME SERIES
    ts = data.loc[:, ts_id]
    
    # CALCULATE NAIVE MODEL PREDS
    # The series is first shifted by 7*periods rows, so each H_k column of ts_naive holds
    # the value observed 7*periods steps before the corresponding target step.
    ts_naive = get_lags(ts.shift(7*periods), lag_start=periods-1, lag_end=0)  # naive model = values of previous week
    ts_naive.columns = y_cols
    y_val_pred_naive = ts_naive.loc[val_idx]
    y_test_pred_naive = ts_naive.loc[test_idx]
    

    # CREATE X & y
    ## train/val
    (X_train_lags, X_train_exog, 
     X_val_lags, X_val_exog, 
     y_train, y_val
     ) = get_dataset(ts, (train_idx, val_idx))

    ## trainval/test
    # Only the test outputs of this call are used below; the X_trainval_* / y_trainval
    # frames are not written to disk in this cell.
    (X_trainval_lags, X_trainval_exog, 
     X_test_lags, X_test_exog, 
     y_trainval, y_test
     ) = get_dataset(ts, (trainval_idx, test_idx))
    
    
    # USE ONLY A SUBSET OF ORIGINAL SAMPLES FOR TRAIN & VAL
    # train_dict / val_dict hold positional row numbers, hence .iloc; the same positions
    # are applied to features and targets so rows stay aligned. Test is not subsampled.
    ## train
    X_train_lags = X_train_lags.iloc[train_dict[ts_id]]
    X_train_exog = X_train_exog.iloc[train_dict[ts_id]]
    y_train = y_train.iloc[train_dict[ts_id]]
    
    ## val
    X_val_lags = X_val_lags.iloc[val_dict[ts_id]]
    X_val_exog = X_val_exog.iloc[val_dict[ts_id]]
    y_val = y_val.iloc[val_dict[ts_id]]
    # the naive val predictions get the same subsampling so they line up with y_val
    y_val_pred_naive = y_val_pred_naive.iloc[val_dict[ts_id]]

    # SAVE
    # One .npy file per series and per matrix. np.save stores the values as a plain
    # array, so index and column labels are not written to these files.
    ## train
    np.save(folder_data + "X_train_ts_id={}, lags.npy".format(ts_id), X_train_lags)        
    np.save(folder_data + "X_train_ts_id={}, exog.npy".format(ts_id), X_train_exog)        
    np.save(folder_data + "y_train_ts_id={}.npy".format(ts_id), y_train)

    ## val
    np.save(folder_data + "X_val_ts_id={}, lags.npy".format(ts_id), X_val_lags)         
    np.save(folder_data + "X_val_ts_id={}, exog.npy".format(ts_id), X_val_exog)      
    np.save(folder_data + "y_val_ts_id={}.npy".format(ts_id), y_val)        

    ## test
    np.save(folder_data + "X_test_ts_id={}, lags.npy".format(ts_id), X_test_lags)   
    np.save(folder_data + "X_test_ts_id={}, exog.npy".format(ts_id), X_test_exog)        
    np.save(folder_data + "y_test_ts_id={}.npy".format(ts_id), y_test)
    
    # Tag every frame with its series id and collect it for the combined outputs.
    y_val_pred_naive = y_val_pred_naive.assign(ts_id=ts_id)
    y_val_pred_naive_all.append(y_val_pred_naive)
    
    y_test_pred_naive = y_test_pred_naive.assign(ts_id=ts_id)
    y_test_pred_naive_all.append(y_test_pred_naive)
    
    y_val_ts = y_val.assign(ts_id=ts_id)
    y_val_all.append(y_val_ts)

    y_test_ts = y_test.assign(ts_id=ts_id)
    y_test_all.append(y_test_ts)
    

# Stack the per-series frames and move ts_id to the first column.
y_val_pred_naive_all = pd.concat(y_val_pred_naive_all).loc[:, ["ts_id"] + y_cols]
y_test_pred_naive_all = pd.concat(y_test_pred_naive_all).loc[:, ["ts_id"] + y_cols]

y_val_all = pd.concat(y_val_all).loc[:, ["ts_id"] + y_cols]
y_test_all = pd.concat(y_test_all).loc[:, ["ts_id"] + y_cols]

# SAVE
# The combined targets and naive predictions are pickled as DataFrames (labels kept).
y_val_pred_naive_all.to_pickle(folder_gen + "y_val_pred_naive.p")
y_test_pred_naive_all.to_pickle(folder_gen + "y_test_pred_naive.p")

y_val_all.to_pickle(folder_gen + "y_val.p")
y_test_all.to_pickle(folder_gen + "y_test.p")