# Imports

In [1]:
import pickle
from preprocess_config import *

# Read and process data

In [2]:
# Load the raw dataset dictionary from the pickle at `dataset_dict_path`.
# `dataset_dict_path` is not defined in this notebook (presumably it comes from
# the star import of preprocess_config — confirm).
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open(dataset_dict_path, 'rb') as f:
    raw_dataset_dict = pickle.load(f)

In [3]:
def train_test_split(dic: dict, year: int) -> tuple:
    """Split every per-key DataFrame into a train part and a test part by year.

    Parameters
    ----------
    dic : dict
        Maps a key to a DataFrame that has a 'Year' column.
    year : int
        Cutoff year. Rows with ``Year <= year`` go to train and rows with
        ``Year > year`` go to test.

    Returns
    -------
    tuple
        ``(train, test)``: two dicts with the same keys as ``dic``, each
        holding the filtered rows of the corresponding frame.
    """
    train = {}
    test = {}
    for key, category_df in dic.items():
        # Two explicit comparisons (not a negated mask), so a row whose Year
        # compares False both ways ends up in neither part.
        train[key] = category_df[category_df['Year'] <= year]
        test[key] = category_df[category_df['Year'] > year]
    return train, test

# Split every frame at `Year`, which is not defined in this notebook
# (presumably provided by the star import of preprocess_config — confirm).
raw_train_data_dict, raw_test_data_dict = train_test_split(raw_dataset_dict, year=Year)

In [4]:
def get_relevant_features(dic: dict, drop_columns: list) -> dict:
    """Build a new dict of frames with `drop_columns` removed and NaN rows dropped.

    Parameters
    ----------
    dic : dict
        Maps a key to a DataFrame containing every column in `drop_columns`.
    drop_columns : list
        Column labels to remove from each frame.

    Returns
    -------
    dict
        Same keys as `dic`; each value is a new frame without the dropped
        columns and without any row that contains a missing value. The
        frames stored in `dic` are not modified.
    """
    return {
        name: frame.drop(columns=drop_columns).dropna()
        for name, frame in dic.items()
    }

# Columns stripped from every frame before the train/test dicts are saved.
drop_columns = [
    'Category_id',
    'Category',
    'Year',
    'Date',
    'Price',
    'Indent',
    'Weight',
    'Parent',
    'Parent_ID',
]

# Apply the same column removal and NaN-row filtering to both halves of the split.
train_data_dict = get_relevant_features(raw_train_data_dict, drop_columns)

test_data_dict = get_relevant_features(raw_test_data_dict, drop_columns)

In [5]:
def get_coefficients(dic: dict) -> dict:
    """Collect, for every key, the 'Weight' recorded at the frame's latest 'Date'.

    Parameters
    ----------
    dic : dict
        Maps a key to a DataFrame with 'Date' and 'Weight' columns.

    Returns
    -------
    dict
        Maps each key to the 'Weight' value of the first row whose 'Date'
        equals that frame's maximum 'Date'.
    """
    def _latest_weight(frame):
        # First row (in frame order) carrying the maximum date.
        newest = frame['Date'].max()
        return frame.loc[frame['Date'] == newest, 'Weight'].values[0]

    return {name: _latest_weight(frame) for name, frame in dic.items()}

# Weights are read from the unprocessed training frames because 'Date' and
# 'Weight' are among the columns dropped from train_data_dict.
weight_dict = get_coefficients(raw_train_data_dict)

In [6]:
def get_sons(dic: dict) -> dict:
    """Return, for every key, the 'Weight' recorded at the frame's latest 'Date'.

    NOTE(review): despite its name, this body performs the same latest-weight
    lookup as get_coefficients; it looks like an unfinished copy — confirm the
    intended behaviour before relying on it.

    Parameters
    ----------
    dic : dict
        Maps a key to a DataFrame with 'Date' and 'Weight' columns.

    Returns
    -------
    dict
        Maps each key of `dic` to the 'Weight' value of the first row whose
        'Date' equals that frame's maximum 'Date'.
    """
    # Use a local accumulator: previously the function wrote into (and
    # returned) the notebook-global `weight_dict`, which fails with NameError
    # on a fresh kernel and silently mutates shared state otherwise.
    weight_dict = {}
    for key, frame in dic.items():
        max_dt = frame['Date'].max()
        weight_dict[key] = frame.loc[frame['Date'] == max_dt, 'Weight'].values[0]
    return weight_dict

In [7]:
# Display the processed training frame stored under the 'All-items' key.
train_data_dict['All-items']

Unnamed: 0,Inflation t-12,Inflation t-11,Inflation t-10,Inflation t-9,Inflation t-8,Inflation t-7,Inflation t-6,Inflation t-5,Inflation t-4,Inflation t-3,Inflation t-2,Inflation t-1,Inflation t,Inflation t+1
13,1.202858,0.110072,0.330943,0.332042,-0.221484,0.777350,-0.222717,-0.554633,0.554633,0.222717,0.335009,0.000000,0.786079,0.542007
14,0.542007,1.202858,0.110072,0.330943,0.332042,-0.221484,0.777350,-0.222717,-0.554633,0.554633,0.222717,0.335009,0.000000,0.215983
15,0.215983,0.542007,1.202858,0.110072,0.330943,0.332042,-0.221484,0.777350,-0.222717,-0.554633,0.554633,0.222717,0.335009,-0.540834
16,-0.540834,0.215983,0.542007,1.202858,0.110072,0.330943,0.332042,-0.221484,0.777350,-0.222717,-0.554633,0.554633,0.222717,-0.108519
17,-0.108519,-0.540834,0.215983,0.542007,1.202858,0.110072,0.330943,0.332042,-0.221484,0.777350,-0.222717,-0.554633,0.554633,-0.544367
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
139,-0.354925,0.711114,0.178571,0.178891,0.448632,0.000000,-0.089888,0.000000,-0.269179,0.269179,0.179856,0.451061,-0.720724,0.354925
140,0.354925,-0.354925,0.711114,0.178571,0.178891,0.448632,0.000000,-0.089888,0.000000,-0.269179,0.269179,0.179856,0.451061,0.265369
141,0.265369,0.354925,-0.354925,0.711114,0.178571,0.178891,0.448632,0.000000,-0.089888,0.000000,-0.269179,0.269179,0.179856,-0.709223
142,-0.709223,0.265369,0.354925,-0.354925,0.711114,0.178571,0.178891,0.448632,0.000000,-0.089888,0.000000,-0.269179,0.269179,0.443853


In [8]:
# Number of keys (one frame per key) in the processed training dict.
len(train_data_dict.keys())

52

In [9]:
# Persist the processed train/test dicts and the per-key weights.
# A single loop replaces three copy-pasted `with` blocks; the files are written
# in the same order and with the same pickle protocol as before.
# The output paths are not defined in this notebook (presumably they come from
# the star import of preprocess_config — confirm).
for _out_path, _payload in (
    (train_dataset_path, train_data_dict),
    (test_dataset_path, test_data_dict),
    (coefficient_path, weight_dict),
):
    with open(_out_path, 'wb') as handle:
        pickle.dump(_payload, handle, protocol=pickle.HIGHEST_PROTOCOL)

------------

# Horizon Test Sets:

In [10]:
def get_relevant_features_horizon(dic: dict, drop_columns: list) -> dict:
    """Build a new dict of frames with `drop_columns` removed.

    Unlike get_relevant_features, no dropna is applied here, so rows that
    contain missing values are kept.

    Parameters
    ----------
    dic : dict
        Maps a key to a DataFrame containing every column in `drop_columns`.
    drop_columns : list
        Column labels to remove from each frame.

    Returns
    -------
    dict
        Same keys as `dic`; each value is a new frame without the dropped
        columns. The frames stored in `dic` are not modified.
    """
    return {
        name: frame.drop(columns=drop_columns)
        for name, frame in dic.items()
    }

# Columns stripped from every horizon test frame; this re-declares the same
# list that is assigned earlier in the notebook.
drop_columns = [
    'Category_id',
    'Category',
    'Year',
    'Date',
    'Price',
    'Indent',
    'Weight',
    'Parent',
    'Parent_ID',
]

In [11]:
# Horizon-1 test set.
# NOTE(review): this reads the same `dataset_dict_path` pickle that was already
# loaded into raw_dataset_dict at the top of the notebook.
with open(dataset_dict_path, 'rb') as f:
    hor_1_dataset_dict = pickle.load(f)

# Only the test half is processed further in this cell; NaN rows are kept
# because get_relevant_features_horizon does not call dropna.
hor1_train_data_dict, hor1_test_data_dict = train_test_split(hor_1_dataset_dict, year=Year)
hor1_test_data_dict = get_relevant_features_horizon(hor1_test_data_dict, drop_columns)

# Sanity check: remaining columns and shape of the 'All-items' frame.
print(f'columns: {hor1_test_data_dict["All-items"].columns}')
print(f'shape: {hor1_test_data_dict["All-items"].shape}')


columns: Index(['Inflation t-12', 'Inflation t-11', 'Inflation t-10', 'Inflation t-9',
       'Inflation t-8', 'Inflation t-7', 'Inflation t-6', 'Inflation t-5',
       'Inflation t-4', 'Inflation t-3', 'Inflation t-2', 'Inflation t-1',
       'Inflation t', 'Inflation t+1'],
      dtype='object')
shape: (29, 14)


In [12]:
# Horizon-2 test set.
# NOTE(review): hard-coded absolute path into one user's local folder; this
# cell will not run on another machine. Consider a configurable path variable,
# as is done with `dataset_dict_path` for the horizon-1 data.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open('/Users/mvilenko/Library/CloudStorage/OneDrive-PayPal/CPI_HRNN - version 2.0/pickle files/bi_directional_norway_2_period_dataset_dict.pickle', 'rb') as f:
    hor_2_dataset_dict = pickle.load(f)

# Only the test half is processed further in this cell.
hor2_train_data_dict, hor2_test_data_dict = train_test_split(hor_2_dataset_dict, year=Year)
hor2_test_data_dict = get_relevant_features_horizon(hor2_test_data_dict, drop_columns)

# Sanity check: remaining columns and shape of the 'All-items' frame.
print(f'columns: {hor2_test_data_dict["All-items"].columns}')
print(f'shape: {hor2_test_data_dict["All-items"].shape}')


columns: Index(['Inflation t-12', 'Inflation t-11', 'Inflation t-10', 'Inflation t-9',
       'Inflation t-8', 'Inflation t-7', 'Inflation t-6', 'Inflation t-5',
       'Inflation t-4', 'Inflation t-3', 'Inflation t-2', 'Inflation t-1',
       'Inflation t', 'Inflation t+1', 'Inflation t+2'],
      dtype='object')
shape: (29, 15)


In [13]:
# Horizon-3 test set.
# NOTE(review): hard-coded absolute path into one user's local folder; this
# cell will not run on another machine. Consider a configurable path variable,
# as is done with `dataset_dict_path` for the horizon-1 data.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open('/Users/mvilenko/Library/CloudStorage/OneDrive-PayPal/CPI_HRNN - version 2.0/pickle files/bi_directional_norway_3_period_dataset_dict.pickle', 'rb') as f:
    hor_3_dataset_dict = pickle.load(f)

# Only the test half is processed further in this cell.
hor3_train_data_dict, hor3_test_data_dict = train_test_split(hor_3_dataset_dict, year=Year)
hor3_test_data_dict = get_relevant_features_horizon(hor3_test_data_dict, drop_columns)

# Sanity check: remaining columns and shape of the 'All-items' frame.
print(f'columns: {hor3_test_data_dict["All-items"].columns}')
print(f'shape: {hor3_test_data_dict["All-items"].shape}')


columns: Index(['Inflation t-12', 'Inflation t-11', 'Inflation t-10', 'Inflation t-9',
       'Inflation t-8', 'Inflation t-7', 'Inflation t-6', 'Inflation t-5',
       'Inflation t-4', 'Inflation t-3', 'Inflation t-2', 'Inflation t-1',
       'Inflation t', 'Inflation t+1', 'Inflation t+2', 'Inflation t+3'],
      dtype='object')
shape: (29, 16)


In [14]:
# Horizon-4 test set.
# NOTE(review): hard-coded absolute path into one user's local folder; this
# cell will not run on another machine. Consider a configurable path variable,
# as is done with `dataset_dict_path` for the horizon-1 data.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open('/Users/mvilenko/Library/CloudStorage/OneDrive-PayPal/CPI_HRNN - version 2.0/pickle files/bi_directional_norway_4_period_dataset_dict.pickle', 'rb') as f:
    hor_4_dataset_dict = pickle.load(f)

# Only the test half is processed further in this cell.
hor4_train_data_dict, hor4_test_data_dict = train_test_split(hor_4_dataset_dict, year=Year)
hor4_test_data_dict = get_relevant_features_horizon(hor4_test_data_dict, drop_columns)


# Sanity check: remaining columns and shape of the 'All-items' frame.
print(f'columns: {hor4_test_data_dict["All-items"].columns}')
print(f'shape: {hor4_test_data_dict["All-items"].shape}')


columns: Index(['Inflation t-12', 'Inflation t-11', 'Inflation t-10', 'Inflation t-9',
       'Inflation t-8', 'Inflation t-7', 'Inflation t-6', 'Inflation t-5',
       'Inflation t-4', 'Inflation t-3', 'Inflation t-2', 'Inflation t-1',
       'Inflation t', 'Inflation t+1', 'Inflation t+2', 'Inflation t+3',
       'Inflation t+4'],
      dtype='object')
shape: (29, 17)


In [15]:
# Horizon-8 test set.
# NOTE(review): hard-coded absolute path into one user's local folder; this
# cell will not run on another machine. Consider a configurable path variable,
# as is done with `dataset_dict_path` for the horizon-1 data.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open('/Users/mvilenko/Library/CloudStorage/OneDrive-PayPal/CPI_HRNN - version 2.0/pickle files/bi_directional_norway_8_period_dataset_dict.pickle', 'rb') as f:
    hor_8_dataset_dict = pickle.load(f)

# Only the test half is processed further in this cell.
hor8_train_data_dict, hor8_test_data_dict = train_test_split(hor_8_dataset_dict, year=Year)
hor8_test_data_dict = get_relevant_features_horizon(hor8_test_data_dict, drop_columns)


# Sanity check: remaining columns and shape of the 'All-items' frame.
print(f'columns: {hor8_test_data_dict["All-items"].columns}')
print(f'shape: {hor8_test_data_dict["All-items"].shape}')


columns: Index(['Inflation t-12', 'Inflation t-11', 'Inflation t-10', 'Inflation t-9',
       'Inflation t-8', 'Inflation t-7', 'Inflation t-6', 'Inflation t-5',
       'Inflation t-4', 'Inflation t-3', 'Inflation t-2', 'Inflation t-1',
       'Inflation t', 'Inflation t+1', 'Inflation t+2', 'Inflation t+3',
       'Inflation t+4', 'Inflation t+5', 'Inflation t+6', 'Inflation t+7',
       'Inflation t+8'],
      dtype='object')
shape: (29, 21)


In [16]:
# Persist the horizon 1/2/3/4/8 test dicts.
# A single loop replaces five copy-pasted `with` blocks; the files are written
# in the same order and with the same pickle protocol as before.
# The output paths are not defined in this notebook (presumably they come from
# the star import of preprocess_config — confirm).
for _out_path, _payload in (
    (hor1_test_dataset_path, hor1_test_data_dict),
    (hor2_test_dataset_path, hor2_test_data_dict),
    (hor3_test_dataset_path, hor3_test_data_dict),
    (hor4_test_dataset_path, hor4_test_data_dict),
    (hor8_test_dataset_path, hor8_test_data_dict),
):
    with open(_out_path, 'wb') as handle:
        pickle.dump(_payload, handle, protocol=pickle.HIGHEST_PROTOCOL)