In [1]:
import pickle
from preprocess_config import *

# Read and Process Data:

In [2]:
# Load the raw per-category dataset dictionary from disk.
# NOTE: unpickling can execute arbitrary code, so only load files from a trusted source.
# `dataset_dict_path` is not defined in this notebook; presumably it comes from
# the star import of preprocess_config — confirm.
with open(dataset_dict_path, 'rb') as f:
    raw_dataset_dict = pickle.load(f)

In [3]:
# Inspect one category's raw frame: metadata columns (Category, Year, Date, Price,
# Weight, ...) next to the lagged 'Inflation t-k' columns and the 'Inflation t+1' column.
raw_dataset_dict['All items']

Unnamed: 0,Category_id,Category,Year,Date,Price,Inflation t-12,Inflation t-11,Inflation t-10,Inflation t-9,Inflation t-8,...,Inflation t-4,Inflation t-3,Inflation t-2,Inflation t-1,Inflation t,Inflation t+1,Indent,Weight,Parent,Parent_ID
53849,8106.0,All items,1984,1984-01-15,101.900,,,,,,...,,,,,,0.489477,0.0,100.0,,
53850,8106.0,All items,1984,1984-02-15,102.400,0.489477,,,,,...,,,,,,0.195122,0.0,100.0,,
53851,8106.0,All items,1984,1984-03-15,102.600,0.195122,0.489477,,,,...,,,,,,0.486146,0.0,100.0,,
53852,8106.0,All items,1984,1984-04-15,103.100,0.486146,0.195122,0.489477,,,...,,,,,,0.290557,0.0,100.0,,
53853,8106.0,All items,1984,1984-05-15,103.400,0.290557,0.486146,0.195122,0.489477,,...,,,,,,0.289715,0.0,100.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54321,8106.0,All items,2023,2023-05-15,303.294,0.124049,0.367109,0.053028,0.369323,0.809829,...,0.385226,0.118127,-0.019302,1.313580,0.969089,0.180191,0.0,100.0,,
54322,8106.0,All items,2023,2023-06-15,303.841,0.180191,0.124049,0.367109,0.053028,0.369323,...,0.437442,0.385226,0.118127,-0.019302,1.313580,0.166725,0.0,100.0,,
54323,8106.0,All items,2023,2023-07-15,304.348,0.166725,0.180191,0.124049,0.367109,0.053028,...,0.096242,0.437442,0.385226,0.118127,-0.019302,0.629202,0.0,100.0,,
54324,8106.0,All items,2023,2023-08-15,306.269,0.629202,0.166725,0.180191,0.124049,0.367109,...,-0.079469,0.096242,0.437442,0.385226,0.118127,0.394950,0.0,100.0,,


In [4]:
def train_test_split(dic: dict, year: int) -> tuple:
    """Split every category frame into a training part and a test part by year.

    Args:
        dic: Mapping of category name to a DataFrame that has a 'Year' column.
        year: Cutoff year. Rows with ``Year <= year`` go to the training split,
            rows with ``Year > year`` go to the test split.

    Returns:
        A ``(train, test)`` tuple of dicts, each keyed like ``dic`` and holding
        the filtered rows of the corresponding frame.

    Note:
        A row whose 'Year' is NaN satisfies neither comparison, so it ends up
        in neither split.
    """
    train = {}
    test = {}
    for key, category_df in dic.items():
        # Look the column up once and reuse it for both masks.
        years = category_df['Year']
        train[key] = category_df[years <= year]
        test[key] = category_df[years > year]
    return train, test

# Split every category at the cutoff year. `Year` is not defined in this notebook;
# presumably it comes from the star import of preprocess_config — confirm.
raw_train_data_dict, raw_test_data_dict = train_test_split(raw_dataset_dict, year=Year)

In [5]:
def get_relevant_features(dic: dict, drop_columns: list) -> dict:
    """Return a copy of ``dic`` whose frames keep only the wanted, complete rows.

    Args:
        dic: Mapping of category name to DataFrame.
        drop_columns: Column labels to remove from every frame.

    Returns:
        A shallow copy of ``dic`` in which each value is replaced by a new
        frame without ``drop_columns`` and without any row containing a
        missing value. The frames in ``dic`` themselves are left untouched.
    """
    cleaned = dic.copy()
    for name, frame in dic.items():
        # Both calls return new frames, so the caller's data is never mutated.
        cleaned[name] = frame.drop(columns=drop_columns).dropna()
    return cleaned

# Metadata columns removed from every frame; for 'All items' this leaves only the
# 'Inflation t-12' ... 'Inflation t+1' columns.
drop_columns = ['Category_id', 'Category', 'Year', 'Date', 'Price', 'Indent', 'Weight', 'Parent', 'Parent_ID']

# Apply the same column drop and NaN-row removal to the training split ...
train_data_dict = get_relevant_features(raw_train_data_dict, drop_columns)

# ... and to the test split.
test_data_dict = get_relevant_features(raw_test_data_dict, drop_columns)

In [6]:
def get_coefficients(dic: dict) -> dict:
    """Collect, per category, the 'Weight' found on the latest 'Date'.

    Args:
        dic: Mapping of category name to a DataFrame with 'Date' and
            'Weight' columns.

    Returns:
        Dict mapping each category name to the 'Weight' of the first row
        whose 'Date' equals that frame's maximum 'Date'.

    Raises:
        IndexError: If a frame has no row matching its maximum 'Date'
            (for example, an empty frame).
    """
    latest_weights = {}
    for name, frame in dic.items():
        is_latest = frame['Date'] == frame['Date'].max()
        latest_weights[name] = frame.loc[is_latest, 'Weight'].values[0]
    return latest_weights

# Weights are read from the raw training split: the processed frames no longer
# carry the 'Date' and 'Weight' columns (both are listed in drop_columns).
weight_dict = get_coefficients(raw_train_data_dict)

# Sanity Tests:

In [7]:
# Sanity check: only the 'Inflation ...' columns should remain, and rows with any
# missing lag value should be gone.
train_data_dict['All items']

Unnamed: 0,Inflation t-12,Inflation t-11,Inflation t-10,Inflation t-9,Inflation t-8,Inflation t-7,Inflation t-6,Inflation t-5,Inflation t-4,Inflation t-3,Inflation t-2,Inflation t-1,Inflation t,Inflation t+1
53862,0.472814,0.189753,0.000000,0.000000,0.285307,0.477328,0.383510,0.384986,0.289715,0.290557,0.486146,0.195122,0.489477,0.376648
53863,0.376648,0.472814,0.189753,0.000000,0.000000,0.285307,0.477328,0.383510,0.384986,0.289715,0.290557,0.486146,0.195122,0.468824
53864,0.468824,0.376648,0.472814,0.189753,0.000000,0.000000,0.285307,0.477328,0.383510,0.384986,0.289715,0.290557,0.486146,0.373483
53865,0.373483,0.468824,0.376648,0.472814,0.189753,0.000000,0.000000,0.285307,0.477328,0.383510,0.384986,0.289715,0.290557,0.279200
53866,0.279200,0.373483,0.468824,0.376648,0.472814,0.189753,0.000000,0.000000,0.285307,0.477328,0.383510,0.384986,0.289715,0.185701
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54288,0.369596,0.584957,0.563764,-0.052378,-0.798284,-0.424370,0.088825,0.123328,0.218807,0.258149,0.355510,0.022627,0.054248,0.203120
54289,0.203120,0.369596,0.584957,0.563764,-0.052378,-0.798284,-0.424370,0.088825,0.123328,0.218807,0.258149,0.355510,0.022627,0.044570
54290,0.044570,0.203120,0.369596,0.584957,0.563764,-0.052378,-0.798284,-0.424370,0.088825,0.123328,0.218807,0.258149,0.355510,0.188816
54291,0.188816,0.044570,0.203120,0.369596,0.584957,0.563764,-0.052378,-0.798284,-0.424370,0.088825,0.123328,0.218807,0.258149,0.366634


In [8]:
# Sanity check: one weight per category name.
weight_dict

{'Admission to movies, theaters, and concerts': 0.331,
 'Admissions': 0.647,
 'Airline fare': 0.684,
 'Alcoholic beverages': 0.972,
 'Alcoholic beverages at home': 0.607,
 'Alcoholic beverages away from home': 0.365,
 'All items': 100.0,
 'All items less energy': 92.81,
 'All items less food': 86.636,
 'All items less food and energy': 79.446,
 'All items less food and shelter': 53.343,
 'All items less food, shelter, and energy': 46.153,
 'All items less food, shelter, energy, and used cars and trucks': 43.756,
 'All items less medical care': 91.304,
 'All items less shelter': 66.707,
 'Apparel': 3.065,
 'Apparel less footwear': 2.392,
 'Apparel services other than laundry and dry cleaning': 0.029,
 'Apples': 0.076,
 'Appliances': 0.219,
 'Audio equipment': 0.044,
 'Bacon and related products': 0.065,
 'Bacon, breakfast sausage, and related products': 0.132,
 'Bakery products': 0.654,
 'Bananas': 0.079,
 'Bedroom furniture': 0.324,
 'Beef and veal': 0.426,
 'Beer, ale, and other malt 

In [9]:
# Number of categories in the processed training split.
len(train_data_dict)

350

# Save Data:

In [10]:
# Persist the processed training split.
# The three output paths used in this cell are not defined in this notebook;
# presumably they come from the star import of preprocess_config — confirm.
with open(train_dataset_path, 'wb') as handle:
    pickle.dump(train_data_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Persist the processed test split.
with open(test_dataset_path, 'wb') as handle:
    pickle.dump(test_data_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Persist the per-category weights taken from the raw training split.
with open(coefficient_path, 'wb') as handle:
    pickle.dump(weight_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)