In [1]:
import pickle
from preprocess_config import *

# Read and Process Data:

In [2]:
# Load the raw dataset dictionary from disk. `dataset_dict_path` is not assigned in
# this notebook; presumably it is provided by the star import of preprocess_config.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so the file
# must come from a trusted source.
with open(dataset_dict_path, 'rb') as f:
    raw_dataset_dict = pickle.load(f)

In [3]:
# Echo the input path that was just read, as a quick visual check.
dataset_dict_path

'/Users/mvilenko/Library/CloudStorage/OneDrive-PayPal/hgru_clean/pickle_files/bi_directional_canada_dataset_dict.pickle'

In [4]:
# Inspect the raw frame stored under the 'All-items' key.
raw_dataset_dict['All-items']

Unnamed: 0,Category_id,Category,Year,Date,Price,Inflation t-12,Inflation t-11,Inflation t-10,Inflation t-9,Inflation t-8,...,Inflation t-4,Inflation t-3,Inflation t-2,Inflation t-1,Inflation t,Inflation t+1,Indent,Weight,Parent,Parent_ID
0,2,All-items,2013,2013-01-15,121.3,,,,,,...,,,,,,1.147554,0.0,1.0,,
1,2,All-items,2013,2013-02-15,122.7,1.147554,,,,,...,,,,,,0.162866,0.0,1.0,,
2,2,All-items,2013,2013-03-15,122.9,0.162866,1.147554,,,,...,,,,,,-0.162866,0.0,1.0,,
3,2,All-items,2013,2013-04-15,122.7,-0.162866,0.162866,1.147554,,,...,,,,,,0.244200,0.0,1.0,,
4,2,All-items,2013,2013-05-15,123.0,0.244200,-0.162866,0.162866,1.147554,,...,,,,,,0.000000,0.0,1.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
117,2,All-items,2022,2022-10-15,153.8,0.717784,0.065509,-0.327118,0.130719,0.656170,...,1.027055,0.898727,-0.138793,0.208261,0.697353,0.129955,0.0,1.0,,
118,2,All-items,2022,2022-11-15,154.0,0.129955,0.717784,0.065509,-0.327118,0.130719,...,1.420382,1.027055,0.898727,-0.138793,0.208261,-0.586130,0.0,1.0,,
119,2,All-items,2022,2022-12-15,153.1,-0.586130,0.129955,0.717784,0.065509,-0.327118,...,0.602613,1.420382,1.027055,0.898727,-0.138793,0.521174,0.0,1.0,,
120,2,All-items,2023,2023-01-15,153.9,0.521174,-0.586130,0.129955,0.717784,0.065509,...,1.392134,0.602613,1.420382,1.027055,0.898727,0.389106,0.0,1.0,,


In [5]:
def train_test_split(dic: dict, year: int) -> tuple:
    """Split every frame in `dic` into a train part and a test part by year.

    Args:
        dic: mapping of key -> DataFrame; every frame must have a 'Year' column.
        year: cutoff year. Rows with 'Year' <= year go to train, rows with
            'Year' > year go to test.

    Returns:
        A `(train, test)` tuple of dictionaries. Both have the same keys as `dic`;
        each value is the row subset of the corresponding input frame.
    """
    train = {}
    test = {}
    # Iterate over items so each frame is looked up only once.
    for key, category_df in dic.items():
        years = category_df['Year']
        train[key] = category_df[years <= year]
        test[key] = category_df[years > year]
    return train, test

# Split every category at the cutoff year. `Year` is not assigned in this notebook;
# presumably it is a constant provided by the star import of preprocess_config.
raw_train_data_dict, raw_test_data_dict = train_test_split(raw_dataset_dict, year=Year)

In [6]:
def get_relevant_features(dic: dict, drop_columns: list) -> dict:
    """Build a new dict whose frames have `drop_columns` removed and NaN rows dropped.

    The input dictionary and its frames are left untouched: `drop` produces a new
    frame for every key, and only that new frame has its incomplete rows removed.

    Args:
        dic: mapping of key -> DataFrame.
        drop_columns: column labels to remove from every frame.

    Returns:
        A dictionary with the same keys as `dic`; each value is the reduced frame
        without any row that contains a missing value.
    """
    return {
        name: frame.drop(columns=drop_columns).dropna()
        for name, frame in dic.items()
    }

# Metadata columns removed before saving; judging by the frames displayed in this
# notebook, what remains are the 'Inflation t-12' ... 'Inflation t+1' columns.
drop_columns = ['Category_id', 'Category', 'Year', 'Date', 'Price', 'Indent', 'Weight', 'Parent', 'Parent_ID']

# Train and test partitions go through the same column drop and NaN-row removal.
train_data_dict = get_relevant_features(raw_train_data_dict, drop_columns)

test_data_dict = get_relevant_features(raw_test_data_dict, drop_columns)

In [7]:
def get_coefficients(dic: dict) -> dict:
    """Collect, for every key, the 'Weight' found at the latest 'Date' of its frame.

    Args:
        dic: mapping of key -> DataFrame; every frame must have 'Date' and
            'Weight' columns.

    Returns:
        A dictionary key -> weight, where the weight is taken from the first row
        whose 'Date' equals the maximum 'Date' of that frame.
    """
    coefficients = {}
    for category, frame in dic.items():
        latest_date = frame['Date'].max()
        latest_rows = frame.loc[frame['Date'] == latest_date]
        # Several rows may share the latest date; the first one is used.
        coefficients[category] = latest_rows['Weight'].values[0]
    return coefficients

# Weights are read from the raw training frames because 'Date' and 'Weight' are
# among the columns dropped from train_data_dict.
weight_dict = get_coefficients(raw_train_data_dict)

# Sanity Tests:

In [8]:
# Spot-check the processed training frame stored under 'All-items'.
train_data_dict['All-items']

Unnamed: 0,Inflation t-12,Inflation t-11,Inflation t-10,Inflation t-9,Inflation t-8,Inflation t-7,Inflation t-6,Inflation t-5,Inflation t-4,Inflation t-3,Inflation t-2,Inflation t-1,Inflation t,Inflation t+1
13,0.809066,0.325468,-0.244200,0.000000,-0.243605,0.162338,0.000000,0.081268,0.000000,0.244200,-0.162866,0.162866,1.147554,0.562476
14,0.562476,0.809066,0.325468,-0.244200,0.000000,-0.243605,0.162338,0.000000,0.081268,0.000000,0.244200,-0.162866,0.162866,0.320000
15,0.320000,0.562476,0.809066,0.325468,-0.244200,0.000000,-0.243605,0.162338,0.000000,0.081268,0.000000,0.244200,-0.162866,0.478089
16,0.478089,0.320000,0.562476,0.809066,0.325468,-0.244200,0.000000,-0.243605,0.162338,0.000000,0.081268,0.000000,0.244200,0.079460
17,0.079460,0.478089,0.320000,0.562476,0.809066,0.325468,-0.244200,0.000000,-0.243605,0.162338,0.000000,0.081268,0.000000,-0.158983
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91,-0.145879,0.000000,0.804981,0.294334,-0.661038,-0.583943,0.437637,0.292826,0.000000,-0.146520,0.293255,-0.439561,-0.146092,-0.073019
92,-0.073019,-0.145879,0.000000,0.804981,0.294334,-0.661038,-0.583943,0.437637,0.292826,0.000000,-0.146520,0.293255,-0.439561,0.437318
93,0.437318,-0.073019,-0.145879,0.000000,0.804981,0.294334,-0.661038,-0.583943,0.437637,0.292826,0.000000,-0.146520,0.293255,0.145349
94,0.145349,0.437318,-0.073019,-0.145879,0.000000,0.804981,0.294334,-0.661038,-0.583943,0.437637,0.292826,0.000000,-0.146520,-0.218103


In [9]:
# Spot-check the weight recorded for 'All-items'.
weight_dict['All-items']

1.0

In [10]:
# Count how many keys the processed training dictionary holds.
len(train_data_dict)

293

# Save Data:

In [13]:
# Persist the three artefacts in the same order as before: train frames, test
# frames, then the per-key weights. Each file is written with the highest pickle
# protocol available.
for out_path, payload in (
    (train_dataset_path, train_data_dict),
    (test_dataset_path, test_data_dict),
    (coefficient_path, weight_dict),
):
    with open(out_path, 'wb') as handle:
        pickle.dump(payload, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [12]:
# Echo the path the weight dictionary is written to, as a quick visual check.
coefficient_path

'/Users/mvilenko/Library/CloudStorage/OneDrive-PayPal/hgru_clean/Canada/data/coefficient_dict.pickle'