# Imports

In [1]:
import sys
sys.path.insert(0, '/Users/mvilenko/Library/CloudStorage/OneDrive-PayPal/CPI_HRNN - version 2.0/mayas_project/hgru_model_norway/model/')

In [2]:
import pandas as pd
import numpy as np
import pickle
import itertools
import random
import torch
from model.utils import *

2023-06-18 20:42:05.449763: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
from pipeline_config import *

# Seeds for Comparisons:

In [4]:
# Pin the three independent random number generators used in this notebook
# (PyTorch, NumPy's legacy global state and Python's `random`) so that runs
# can be compared with each other. Each generator gets its own seed value.
for _seed_fn, _seed in (
    (torch.manual_seed, 1),
    (np.random.seed, 2),
    (random.seed, 3),
):
    _seed_fn(_seed)

# Read Data

In [5]:
def _read_pickle(path):
    """Return the object deserialized from the pickle file at `path`."""
    # NOTE: unpickling can execute arbitrary code, so these paths must only
    # point at files produced by a trusted source.
    with open(path, 'rb') as source:
        return pickle.load(source)


# Load the five pickled inputs; the path variables are not defined in this
# notebook (presumably they come from `pipeline_config` — confirm).
son_parent_dict = _read_pickle(son_parent_path)
train_dataset_dict = _read_pickle(train_dataset_dict_path)
test_dataset_dict = _read_pickle(test_dataset_dict_path)
category_id_to_name_dict = _read_pickle(category_id_to_category_name_path)
categories_per_indent_dict = _read_pickle(categories_per_indent_path)

In [6]:
# Display the learning rate currently in scope. `Lr` is not defined in this
# notebook, so it presumably comes from one of the star imports — confirm.
Lr

0.04329823452489634

In [7]:
# Number of entries (one key per category name) in `test_dataset_dict`.
len(test_dataset_dict)

52

# Hierarchical GRU

In [8]:
def hgru_model(son_parent_dict, train_dataset_dict, test_dataset_dict, categories_per_indent_dict, category_id_to_name_dict, weights_path, parent_loss_coef=0.03881843050613352):
    """Train one model per category, walking the hierarchy level by level.

    Levels (the keys of ``categories_per_indent_dict``) are visited in sorted
    order. A category whose parent sits on the previous level is trained with
    the parent's unified weights and ``parent_loss_coef``; every other category
    (level 0, unknown parent, or parent not on the previous level) is trained
    with ``parent_weights=0`` and ``loss_coef=0``.

    Args:
        son_parent_dict: Maps a category id to its parent category id.
        train_dataset_dict: Maps a category name to its training dataset.
        test_dataset_dict: Maps a category name to its test dataset.
        categories_per_indent_dict: Maps a hierarchy level to the category ids
            on that level.
        category_id_to_name_dict: Maps a category id to its name.
        weights_path: Prefix the checkpoint file names are appended to
            (``weights_path + name + '.pt'``), so it must end with a separator.
        parent_loss_coef: Loss coefficient passed to ``training_and_evaluation``
            when a parent checkpoint is used. Defaults to the value that was
            previously hard-coded.

    Returns:
        dict: category name -> checkpoint path handed to
        ``training_and_evaluation`` for that category, in training order.
    """
    hgru_models = {}

    num_categories = 0
    for indent in sorted(categories_per_indent_dict.keys()):
        # Categories one level up. Empty for level 0 or when the previous
        # level is missing, so such categories are trained without a parent
        # instead of failing with a KeyError.
        if int(indent) == 0:
            previous_level = ()
        else:
            previous_level = categories_per_indent_dict.get(indent - 1, ())

        for category in categories_per_indent_dict[indent]:
            num_categories += 1
            print(f'num categories: {num_categories}')
            category_name = category_id_to_name_dict[category]
            print(f'category id|name: {category}|{category_name}')

            parent = son_parent_dict.get(category)
            if parent not in previous_level:
                loss_coef = 0
                parent_weights = 0
            else:
                parent_name = category_id_to_name_dict[parent]
                loss_coef = parent_loss_coef
                # Only the restored model is needed; the other returned values
                # were never used.
                parent_model, _, _, _ = load_checkpoint(weights_path+parent_name+'.pt', Model, Optimizer)
                parent_weights = unify_model_weights(parent_model)

            train_dataloader, test_dataloader = create_dataloader(train_dataset_dict[category_name], test_dataset_dict[category_name])
            # NOTE(review): `Model` and `Optimizer` are single shared objects,
            # used for every category and for the parent-checkpoint load above.
            # Unless the helpers copy them, state may carry over from one
            # category (or from the parent) into the next training run —
            # confirm this is intended.
            model = Model
            optimizer = Optimizer
            model.to(Device)
            saving_param_path = weights_path+category_name+'.pt'
            training_and_evaluation(model, train_dataloader, test_dataloader, optimizer, category_name, parent_weights, loss_coef, path=saving_param_path)
            hgru_models[category_name] = saving_param_path

    return hgru_models


In [9]:
# Run the per-category training loop over every hierarchy level.
hgru_models = hgru_model(
    son_parent_dict=son_parent_dict,
    train_dataset_dict=train_dataset_dict,
    test_dataset_dict=test_dataset_dict,
    categories_per_indent_dict=categories_per_indent_dict,
    category_id_to_name_dict=category_id_to_name_dict,
    weights_path=weightspath,
)

num categories: 1
category id|name: 0|All-items
num categories: 2
category id|name: 2|Alcoholic beverages and tobacco
num categories: 3
category id|name: 3|Clothing and footwear
num categories: 4
category id|name: 8|Communications
num categories: 5
category id|name: 10|Education
num categories: 6
category id|name: 1|Food and non-alcoholic beverages
num categories: 7
category id|name: 5|Furnishings, household equipment and routine maintenance
num categories: 8
category id|name: 6|Health
num categories: 9
category id|name: 4|Housing, water, electricity, gas and other fuels
num categories: 10
category id|name: 12|Miscellaneous goods and services
num categories: 11
category id|name: 9|Recreation and culture
num categories: 12
category id|name: 11|Restaurants and hotels
num categories: 13
category id|name: 7|Transport
num categories: 14
category id|name: 45|Accommodation services
num categories: 15
category id|name: 19|Actual rentals for housing
num categories: 16
category id|name: 15|Alcoh

In [10]:
def get_results_on_test_set(weights_path, train_dataset_dict, test_dataset_dict, categories = None):
    """Collect test-set predictions from each category's saved checkpoint.

    Args:
        weights_path: Prefix of the checkpoint files; each file is read from
            ``weights_path + category + '.pt'``.
        train_dataset_dict: Maps a category name to its training dataset. It
            is only passed to ``create_dataloader``; the resulting training
            loader is not used here.
        test_dataset_dict: Maps a category name to its test dataset.
        categories: Category names to evaluate. When ``None``, every key of
            ``test_dataset_dict`` is evaluated.

    Returns:
        dict: category name -> value returned by
        ``get_predictions_on_test_set`` for that category.
    """
    selected = list(test_dataset_dict.keys()) if categories is None else categories

    results = {}
    for name in selected:
        print(name)
        _, test_loader = create_dataloader(train_dataset_dict[name], test_dataset_dict[name])
        # Only the restored model is needed from the checkpoint.
        restored_model, _, _, _ = load_checkpoint(weights_path + name + '.pt', Model, Optimizer)
        results[name] = get_predictions_on_test_set(restored_model, test_loader)
    return results

In [11]:
# Flatten the per-level id lists into one list (keeping the dictionary's
# iteration order), then translate each id into its category name.
categories_lists = list(categories_per_indent_dict.values())
categories_id = [cat_id for level_ids in categories_lists for cat_id in level_ids]
categories = [category_id_to_name_dict[cat_id] for cat_id in categories_id]

In [12]:
# Predict on the test set for every category collected in `categories`.
predictions_dict = get_results_on_test_set(
    weights_path=weightspath,
    train_dataset_dict=train_dataset_dict,
    test_dataset_dict=test_dataset_dict,
    categories=categories,
)

Accommodation services
Actual rentals for housing
Alcoholic beverages
Audio-visual, photographic and information processing equipment
Clothing
Electricity, gas and other fuels
Financial services n.e.c.
Food
Footwear
Furniture and furnishings, carpets and other floor coverings
Glassware, tableware and household utensils
Goods and services for routine household maintenance
Household appliances
Household textiles
Imputed rentals for housing
Insurance
Maintenance and repair of the dwelling
Medical products, appliances and equipment
Newspapers, books and stationery
Non-alcoholic beverages
Operation of personal transport equipment
Other major durables for recreation and culture
Other recreational items and equipment, gardens and pets
Other services n.e.c.
Out-patient services
Package holidays
Personal care
Personal effects n.e.c.
Postal services
Purchase of vehicles
Recreational and cultural services
Restaurant services
Social protection
Telephone equipment
Telephone services
Tobacco
Tools a

In [13]:
# Write the per-category predictions to `test_predictions_path` as a pickle.
# HIGHEST_PROTOCOL is the newest format this interpreter supports, so the
# file may not be readable by older Python versions.
with open(test_predictions_path, 'wb') as handle:
    pickle.dump(predictions_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Get Best Model Weights:

In [14]:
def get_weights_per_category(category_id_list, dir_path):
    """Load each category's checkpoint and return its unified weights.

    Category names are looked up in the notebook-level
    ``category_id_to_name_dict``; it is not a parameter of this function.

    Args:
        category_id_list: Category ids to load, in the desired output order.
        dir_path: Prefix of the checkpoint files; each file is read from
            ``dir_path + category_name + '.pt'``.

    Returns:
        dict: category id -> value returned by ``unify_model_weights`` for the
        model restored from that category's checkpoint.
    """
    # One template model/optimizer pair is built once and handed to every
    # load_checkpoint call; the model is not moved to another device here.
    # NOTE(review): if load_checkpoint restores in place and
    # unify_model_weights returns references rather than copies, the entries
    # below could share storage — confirm against those helpers.
    template_model = GRUModel(
        input_dim=Features,
        hidden_dim=HiddenSize,
        layer_dim=LayersDim,
        output_dim=OutputDim,
        dropout_prob=DropoutProb,
    )
    template_optimizer = torch.optim.AdamW(template_model.parameters(), lr=Lr)

    def _weights_for(cat_id):
        # Resolve the checkpoint file from the category name, then restore it.
        name = category_id_to_name_dict[cat_id]
        restored_model, _, _, _ = load_checkpoint(dir_path + name + '.pt', template_model, template_optimizer)
        return unify_model_weights(restored_model)

    return {cat_id: _weights_for(cat_id) for cat_id in category_id_list}

In [15]:
# Prefix for the per-category '.pt' checkpoint files; it is concatenated
# directly with the category name, so the trailing '/' is required.
# NOTE(review): machine-specific absolute path — it may duplicate the
# `weightspath` setting used for training; confirm and consider reusing it.
dir_path = '/Users/mvilenko/Library/CloudStorage/OneDrive-PayPal/CPI_HRNN - version 2.0/mayas_project/hgru_model_norway/models_weights/'

In [16]:
# Translate every category name back into its id by position: the i-th key
# and the i-th value of `category_id_to_name_dict` belong together. If a name
# occurs more than once the first matching id is used, and a name that is
# missing from the dictionary raises ValueError.
key_list = list(category_id_to_name_dict)
val_list = list(category_id_to_name_dict.values())
category_id_list = [key_list[val_list.index(cat_name)] for cat_name in categories]


In [17]:
# Number of category names gathered from `categories_per_indent_dict`
# (displayed as the cell output).
len(categories)

52

In [18]:
# Number of ids resolved from the category names; displayed so it can be
# compared with len(categories).
len(category_id_list)

52

In [19]:
# Gather the unified weights of every category's saved checkpoint, keyed by id.
weights_dict = get_weights_per_category(
    category_id_list=category_id_list,
    dir_path=dir_path,
)

In [20]:
# Write the id -> weights dictionary to disk as a pickle.
# NOTE(review): the destination is a machine-specific absolute path; the other
# output in this notebook uses a path variable (`test_predictions_path`) —
# consider doing the same here.
with open('/Users/mvilenko/Library/CloudStorage/OneDrive-PayPal/CPI_HRNN - version 2.0/pickle files/hgru_model_weights_norway.pickle', 'wb') as handle:
    pickle.dump(weights_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)