In [2]:
import pickle
from preprocess_config import *

# Read and Process Data:

In [3]:
# Load the raw dataset dictionary (one entry per category key) from the pickle
# file at `dataset_dict_path`. That name is not defined in this notebook;
# presumably it is provided by the star-import from preprocess_config — confirm.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# only point this at files from a trusted source.
with open(dataset_dict_path, 'rb') as f:
    raw_dataset_dict = pickle.load(f)

In [4]:
# NOTE(review): displaying this leaves an absolute, user-specific path in the
# saved cell output; clear the output (or drop this cell) before sharing.
dataset_dict_path

'/Users/mvilenko/Library/CloudStorage/OneDrive-PayPal/hgru_clean/pickle_files/bi_directional_norway_dataset_dict.pickle'

In [3]:
# Inspect the raw frame stored under the 'All-items' key.
raw_dataset_dict['All-items']

Unnamed: 0,Category_id,Category,Year,Date,Price,Inflation t-12,Inflation t-11,Inflation t-10,Inflation t-9,Inflation t-8,...,Inflation t-4,Inflation t-3,Inflation t-2,Inflation t-1,Inflation t,Inflation t+1,Indent,Weight,Parent,Parent_ID
0,0,All-items,2009,2009-01-15,88.7,,,,,,...,,,,,,0.786079,0,1000.0,,
1,0,All-items,2009,2009-02-15,89.4,0.786079,,,,,...,,,,,,0.000000,0,1000.0,,
2,0,All-items,2009,2009-03-15,89.4,0.000000,0.786079,,,,...,,,,,,0.335009,0,1000.0,,
3,0,All-items,2009,2009-04-15,89.7,0.335009,0.000000,0.786079,,,...,,,,,,0.222717,0,1000.0,,
4,0,All-items,2009,2009-05-15,89.9,0.222717,0.335009,0.000000,0.786079,,...,,,,,,0.554633,0,1000.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
168,0,All-items,2023,2023-01-15,126.1,0.158730,0.079460,-0.158856,0.317965,1.362747,...,0.247219,1.161839,0.586021,1.097521,-0.929453,0.395727,0,1000.0,,
169,0,All-items,2023,2023-02-15,126.6,0.395727,0.158730,0.079460,-0.158856,0.317965,...,0.901276,0.247219,1.161839,0.586021,1.097521,0.786786,0,1000.0,,
170,0,All-items,2023,2023-03-15,127.6,0.786786,0.395727,0.158730,0.079460,-0.158856,...,1.296615,0.901276,0.247219,1.161839,0.586021,1.091203,0,1000.0,,
171,0,All-items,2023,2023-04-15,129.0,1.091203,0.786786,0.395727,0.158730,0.079460,...,-0.241838,1.296615,0.901276,0.247219,1.161839,0.464038,0,1000.0,,


In [4]:
def train_test_split(dic: dict, year: int) -> tuple:
    """Split every category frame into train and test parts by calendar year.

    Args:
        dic: Maps a category key to a DataFrame that has a 'Year' column.
        year: Cutoff year. Rows with ``Year <= year`` go to the train part,
            rows with ``Year > year`` go to the test part.

    Returns:
        A ``(train, test)`` tuple of dicts. Both dicts have the same keys as
        ``dic``; a side with no matching rows holds an empty frame.
    """
    train = {}
    test = {}
    for key, category_df in dic.items():
        train[key] = category_df[category_df['Year'] <= year]
        # Explicit `>` (not the negated train mask) so that rows whose Year is
        # missing stay out of both parts, exactly as two separate comparisons do.
        test[key] = category_df[category_df['Year'] > year]
    return train, test

# Split every category at the cutoff `Year`: rows with Year <= cutoff go to
# train, later rows go to test. `Year` is not defined in this notebook;
# presumably it comes from the star-import of preprocess_config — confirm.
raw_train_data_dict, raw_test_data_dict = train_test_split(raw_dataset_dict, year=Year)

In [5]:
def get_relevant_features(dic: dict, drop_columns: list) -> dict:
    """Return a new dict of frames reduced to the columns that are kept.

    Args:
        dic: Maps a category key to a DataFrame.
        drop_columns: Column names to remove from every frame.

    Returns:
        A dict with the same keys as ``dic``. Each value is a new frame with
        ``drop_columns`` removed and every row that still contains a missing
        value discarded. The input dict and its frames are left untouched.
    """
    return {
        name: frame.drop(columns=drop_columns).dropna()
        for name, frame in dic.items()
    }

# Columns removed before modelling; what remains are the columns that are not
# listed here.
drop_columns = [
    'Category_id',
    'Category',
    'Year',
    'Date',
    'Price',
    'Indent',
    'Weight',
    'Parent',
    'Parent_ID',
]

# Apply the same column/NaN filtering to both sides of the split.
train_data_dict = get_relevant_features(dic=raw_train_data_dict, drop_columns=drop_columns)
test_data_dict = get_relevant_features(dic=raw_test_data_dict, drop_columns=drop_columns)

In [6]:
def get_coefficients(dic: dict) -> dict:
    """Collect, for every category, the weight recorded at its latest date.

    Args:
        dic: Maps a category key to a DataFrame that has 'Date' and 'Weight'
            columns.

    Returns:
        A dict mapping each key of ``dic`` to the 'Weight' value of the first
        row whose 'Date' equals that frame's maximum 'Date'.

    Raises:
        IndexError: If a frame has no row at its maximum date (for example an
            empty frame). The message names the offending category.
    """
    weights = {}
    for key, frame in dic.items():
        latest_date = frame['Date'].max()
        latest_weights = frame.loc[frame['Date'] == latest_date, 'Weight']
        if latest_weights.empty:
            # Same exception type the bare `.values[0]` would raise, but with
            # enough context to find the category that has no usable rows.
            raise IndexError(
                f"category {key!r} has no rows at its latest date; cannot read a weight"
            )
        weights[key] = latest_weights.values[0]
    return weights

# Weights are read from the raw training split: the processed frames no longer
# carry the 'Date' and 'Weight' columns that get_coefficients reads.
weight_dict = get_coefficients(raw_train_data_dict)

# Sanity Tests:

In [7]:
# Inspect the processed training frame for 'All-items' (listed columns dropped,
# rows containing NaN removed).
train_data_dict['All-items']

Unnamed: 0,Inflation t-12,Inflation t-11,Inflation t-10,Inflation t-9,Inflation t-8,Inflation t-7,Inflation t-6,Inflation t-5,Inflation t-4,Inflation t-3,Inflation t-2,Inflation t-1,Inflation t,Inflation t+1
13,1.202858,0.110072,0.330943,0.332042,-0.221484,0.777350,-0.222717,-0.554633,0.554633,0.222717,0.335009,0.000000,0.786079,0.542007
14,0.542007,1.202858,0.110072,0.330943,0.332042,-0.221484,0.777350,-0.222717,-0.554633,0.554633,0.222717,0.335009,0.000000,0.215983
15,0.215983,0.542007,1.202858,0.110072,0.330943,0.332042,-0.221484,0.777350,-0.222717,-0.554633,0.554633,0.222717,0.335009,-0.540834
16,-0.540834,0.215983,0.542007,1.202858,0.110072,0.330943,0.332042,-0.221484,0.777350,-0.222717,-0.554633,0.554633,0.222717,-0.108519
17,-0.108519,-0.540834,0.215983,0.542007,1.202858,0.110072,0.330943,0.332042,-0.221484,0.777350,-0.222717,-0.554633,0.554633,-0.544367
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
139,-0.354925,0.711114,0.178571,0.178891,0.448632,0.000000,-0.089888,0.000000,-0.269179,0.269179,0.179856,0.451061,-0.720724,0.354925
140,0.354925,-0.354925,0.711114,0.178571,0.178891,0.448632,0.000000,-0.089888,0.000000,-0.269179,0.269179,0.179856,0.451061,0.265369
141,0.265369,0.354925,-0.354925,0.711114,0.178571,0.178891,0.448632,0.000000,-0.089888,0.000000,-0.269179,0.269179,0.179856,-0.709223
142,-0.709223,0.265369,0.354925,-0.354925,0.711114,0.178571,0.178891,0.448632,0.000000,-0.089888,0.000000,-0.269179,0.269179,0.443853


In [11]:
# Spot-check the weight recorded for 'All-items'.
weight_dict['All-items']

1000.0

In [9]:
# Number of category keys in the processed training dict.
len(train_data_dict)

52

# Save Data:

In [12]:
# Write the three artifacts in order (train frames, test frames, category
# weights), each to its own pickle file using the highest pickle protocol.
for output_path, payload in (
    (train_dataset_path, train_data_dict),
    (test_dataset_path, test_data_dict),
    (coefficient_path, weight_dict),
):
    with open(output_path, 'wb') as handle:
        pickle.dump(payload, handle, protocol=pickle.HIGHEST_PROTOCOL)