In [1]:
import sys
sys.path.insert(0, '/Users/mvilenko/Library/CloudStorage/OneDrive-PayPal/hgru_clean/US/')

In [2]:
import torch
import pandas as pd
import numpy as np
import statistics
import torch
import random
import time
import numpy as np
#from transformers import AdamW
from torch.utils.tensorboard import SummaryWriter
import pickle
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
import torch.optim as optim
import shutil
import itertools
import os
from pathlib import Path

from model.GRU_model import *
from pipeline_config import *
from utils import *

In [3]:
# Seeds for comparisons:

torch.manual_seed(1)
np.random.seed(2)
random.seed(3)
torch.use_deterministic_algorithms(True)


In [4]:
with open(train_dataset_path, 'rb') as f:
    train_dataset_dict = pickle.load(f)
    
with open(test_dataset_path, 'rb') as f:
    test_dataset_dict = pickle.load(f)

with open(category_id_to_category_name_path, 'rb') as f:
    category_id_to_name_dict = pickle.load(f)
    
with open(categories_per_indent_path, 'rb') as f:
    categories_per_indent_dict = pickle.load(f)

with open(son_parent_path, 'rb') as f:
    son_parent_dict = pickle.load(f)

with open(parent_to_son_list_path, 'rb') as f:
    parent_to_son_list_dict = pickle.load(f)

with open(hgru_model_weights_path, 'rb') as f:
    hgru_weight_dict = pickle.load(f)

with open(coefficient_dict_path, 'rb') as f:
    coefficient_dict = pickle.load(f)

In [5]:
print(f'Features: {Features}')
print(f'HiddenSize: {HiddenSize}')
print(f'LayersDim: {LayersDim}')
print(f'OutputDim: {OutputDim}')
print(f'DropoutProb: {DropoutProb}')
print(f'Lr : {Lr}')
print(f'Epochs: {Epochs}')

Features: 1
HiddenSize: 64
LayersDim: 1
OutputDim: 1
DropoutProb: 0.0
Lr : 0.125
Epochs: 100


In [6]:
def get_results_on_test_set(weights_path, train_dataset_dict, test_dataset_dict, categories = None):
    predictions_dict = {}
    if categories is None:
        categories = list(test_dataset_dict.keys())
        
    for category in categories:
        if category == 'Food and beverages':
            continue
        print(category)
        train_dataloader, test_dataloader = create_dataloader(train_dataset_dict[category], test_dataset_dict[category])
        basic_model =GRUModel(input_dim=Features, hidden_dim=HiddenSize, layer_dim=LayersDim, output_dim=OutputDim, dropout_prob=DropoutProb, seed=0)
        basic_optimizer = torch.optim.AdamW(basic_model.parameters(), lr=Lr)
        ckp_path = weights_path+category+'.pt'
        model, optimizer, checkpoint, valid_loss_min = load_checkpoint(ckp_path, basic_model, basic_optimizer)
        predictions = get_predictions_on_test_set(model, test_dataloader)
        predictions_dict[category] = predictions
    return predictions_dict

In [7]:
def rmdir(dirc):
    dirc = Path(dirc)
    for itm in dirc.iterdir():
        if itm.is_dir():
            rmdir(itm)
            print("Deleting", itm, ".... successful.")
        else:
            itm.unlink()
    dirc.rmdir()

# Loss Analysis 1:

No Loss Term 1:

In [8]:
# Sanity Check:
weightspath_1

'/Users/mvilenko/Library/CloudStorage/OneDrive-PayPal/hgru_clean/US/bidirectional/models_weights_1/'

In [9]:
def bi_directional_model_1(son_parent_dict, parent_to_son_list_dict, train_dataset_dict, test_dataset_dict, categories_per_indent_dict, category_id_to_name_dict, weights_path, hgru_weight_dict, coefficient_dict, loss_coef_1, loss_coef_2, loss_coef_3, alpha):
    # deleting files in models_weights folder
    path = os.getcwd()
    desired_path = path + '/models_weights_1/'
    is_empty = not any(Path(desired_path).iterdir())

    if is_empty==False:
        rmdir(Path(desired_path))
    
    # deleting empty folder
    try:
        os.rmdir(desired_path)
    except OSError:
        print ("Deletion of the directory %s failed" % desired_path)
    else:
        print ("Successfully deleted the directory %s" % desired_path)

    # creating new folder
    try:
        os.mkdir(desired_path)
    except OSError:
        print ("Creation of the directory %s failed" % desired_path)
    else:
        print ("Successfully created the directory %s" % desired_path)
    #----------------------------------------------------------------------------------------------------------------------------------------------#
    loss_coef_1= 0
    loss_coef_2= 0.0001
    loss_coef_3= 2.2324747384839943*np.exp(-7)

    num_categories = 0
    bidrectional_models = {}


    for indent in sorted(list(categories_per_indent_dict.keys()),reverse=True):
        for category in categories_per_indent_dict[indent]:
            num_categories +=1
            print(f'num categories: {num_categories}')
            category_name = category_id_to_name_dict[category]
            print(f'category id|name: {category}|{category_name}')

            if (category not in list(parent_to_son_list_dict.keys())) or (set(parent_to_son_list_dict[category]['sons']).issubset(set(categories_per_indent_dict[indent+1]))):
                loss_coef_2 = 0   

            
            print('------------------------------------------------------------------')
            
            train_dataloader, test_dataloader = create_dataloader(train_dataset_dict[category_name], test_dataset_dict[category_name])
            model = Model
            optimizer = Optimizer
            model.to(Device)
            saving_param_path = weights_path+category_name+'.pt'
            
            bidrectional_models[category_name] = training_and_evaluation(model, indent, train_dataloader, test_dataloader, optimizer, category, hgru_weight_dict, coefficient_dict, son_parent_dict, parent_to_son_list_dict, category_id_to_name_dict, loss_coef_1, loss_coef_2, loss_coef_3, alpha, saving_param_path)
            
    return bidrectional_models

In [10]:
bidrectional_models_1 = bi_directional_model_1(son_parent_dict, parent_to_son_list_dict, train_dataset_dict, test_dataset_dict, categories_per_indent_dict, category_id_to_name_dict, weightspath_1, hgru_weight_dict, coefficient_dict, 0, loss_coef_2, loss_coef_3, alpha)

Successfully deleted the directory /Users/mvilenko/Library/CloudStorage/OneDrive-PayPal/hgru_clean/US/bidirectional/models_weights_1/
Successfully created the directory /Users/mvilenko/Library/CloudStorage/OneDrive-PayPal/hgru_clean/US/bidirectional/models_weights_1/
num categories: 1
category id|name: 9540.0|Bacon and related products
------------------------------------------------------------------
num categories: 2
category id|name: 8302.0|Breakfast sausage and related products
------------------------------------------------------------------
num categories: 3
category id|name: 2135.0|Fresh and frozen chicken parts
------------------------------------------------------------------
num categories: 4
category id|name: 3415.0|Fresh whole chicken
------------------------------------------------------------------
num categories: 5
category id|name: 4850.0|Frozen fish and seafood
------------------------------------------------------------------
num categories: 6
category id|name: 9972.

Save prediction dict:

In [11]:
categories_lists = list(categories_per_indent_dict.values())
categories_id = list(itertools.chain.from_iterable(categories_lists))
categories = []
for category_id in categories_id:
    categories.append(category_id_to_name_dict[category_id])

In [12]:
predictions_dict_1 = get_results_on_test_set(weightspath_1, train_dataset_dict, test_dataset_dict, categories = categories)
with open(test_predictions_path_1, 'wb') as handle:
    pickle.dump(predictions_dict_1, handle, protocol=pickle.HIGHEST_PROTOCOL)

Admission to movies, theaters, and concerts
Bacon, breakfast sausage, and related products
Bread other than white
Canned fruits
Canned vegetables
Checking account and other bank services
Chicken
Cookies
Crackers, bread, and cracker products
Dried beans, peas, and lentils
Frankfurters
Fresh cakes and cupcakes
Fresh fish and seafood
Fresh sweetrolls, coffeecakes, doughnuts
Frozen and refrigerated bakery products, pies, tarts, turnovers
Frozen vegetables
Ham
Intracity mass transit
Lunchmeats
Oranges, including tangerines
Other condiments
Other pork including roasts and picnics
Other uncooked poultry including turkey
Parking fees and tolls
Pork chops
Prepared salads
Processed fish and seafood
Salt and other seasonings and spices
Sauces and gravies
Ship fare
Uncooked beef roasts
Uncooked beef steaks
Uncooked ground beef
Uncooked other beef and veal
Video discs and other media
White bread
Admissions
Airline fare
Apparel services other than laundry and dry cleaning
Apples
Bananas
Beef and vea

# Loss Analysis 2:

No Loss Term 2:

In [13]:
# Sanity Check:
weightspath_2

'/Users/mvilenko/Library/CloudStorage/OneDrive-PayPal/hgru_clean/US/bidirectional/models_weights_2/'

In [14]:
def bi_directional_model_2(son_parent_dict, parent_to_son_list_dict, train_dataset_dict, test_dataset_dict, categories_per_indent_dict, category_id_to_name_dict, weights_path, hgru_weight_dict, coefficient_dict, loss_coef_1, loss_coef_2, loss_coef_3, alpha):
    # deleting files in models_weights folder
    path = os.getcwd()
    desired_path = path + '/models_weights_2/'
    is_empty = not any(Path(desired_path).iterdir())

    if is_empty==False:
        rmdir(Path(desired_path))
    
    # deleting empty folder
    try:
        os.rmdir(desired_path)
    except OSError:
        print ("Deletion of the directory %s failed" % desired_path)
    else:
        print ("Successfully deleted the directory %s" % desired_path)

    # creating new folder
    try:
        os.mkdir(desired_path)
    except OSError:
        print ("Creation of the directory %s failed" % desired_path)
    else:
        print ("Successfully created the directory %s" % desired_path)
    #----------------------------------------------------------------------------------------------------------------------------------------------#
    loss_coef_1= 8.092430201147471*np.exp(-7)
    loss_coef_2= 0
    loss_coef_3= 2.2324747384839943*np.exp(-7)
    
    num_categories = 0
    bidrectional_models = {}

    for indent in sorted(list(categories_per_indent_dict.keys()),reverse=True):
        for category in categories_per_indent_dict[indent]:
            num_categories +=1
            print(f'num categories: {num_categories}')
            category_name = category_id_to_name_dict[category]
            print(f'category id|name: {category}|{category_name}')

            if int(indent) == 0 or son_parent_dict[category] not in categories_per_indent_dict[indent-1]: 
                loss_coef_1 = 0
            
            print('------------------------------------------------------------------')
            
            train_dataloader, test_dataloader = create_dataloader(train_dataset_dict[category_name], test_dataset_dict[category_name])
            model = Model
            optimizer = Optimizer
            model.to(Device)
            saving_param_path = weights_path+category_name+'.pt'
            
            bidrectional_models[category_name] = training_and_evaluation(model, indent, train_dataloader, test_dataloader, optimizer, category, hgru_weight_dict, coefficient_dict, son_parent_dict, parent_to_son_list_dict, category_id_to_name_dict, loss_coef_1, loss_coef_2, loss_coef_3, alpha, saving_param_path)
            
    return bidrectional_models

In [15]:
bidrectional_models_2 = bi_directional_model_2(son_parent_dict, parent_to_son_list_dict, train_dataset_dict, test_dataset_dict, categories_per_indent_dict, category_id_to_name_dict, weightspath_2, hgru_weight_dict, coefficient_dict, loss_coef_1, 0, loss_coef_3, alpha)

Deletion of the directory /Users/mvilenko/Library/CloudStorage/OneDrive-PayPal/hgru_clean/US/bidirectional/models_weights_2/ failed
Successfully created the directory /Users/mvilenko/Library/CloudStorage/OneDrive-PayPal/hgru_clean/US/bidirectional/models_weights_2/
num categories: 1
category id|name: 9540.0|Bacon and related products
------------------------------------------------------------------
num categories: 2
category id|name: 8302.0|Breakfast sausage and related products
------------------------------------------------------------------
num categories: 3
category id|name: 2135.0|Fresh and frozen chicken parts
------------------------------------------------------------------
num categories: 4
category id|name: 3415.0|Fresh whole chicken
------------------------------------------------------------------
num categories: 5
category id|name: 4850.0|Frozen fish and seafood
------------------------------------------------------------------
num categories: 6
category id|name: 9972.0|

In [16]:
predictions_dict_2 = get_results_on_test_set(weightspath_2, train_dataset_dict, test_dataset_dict, categories = categories)
with open(test_predictions_path_2, 'wb') as handle:
    pickle.dump(predictions_dict_2, handle, protocol=pickle.HIGHEST_PROTOCOL)

Admission to movies, theaters, and concerts
Bacon, breakfast sausage, and related products
Bread other than white
Canned fruits
Canned vegetables
Checking account and other bank services
Chicken
Cookies
Crackers, bread, and cracker products
Dried beans, peas, and lentils
Frankfurters
Fresh cakes and cupcakes
Fresh fish and seafood
Fresh sweetrolls, coffeecakes, doughnuts
Frozen and refrigerated bakery products, pies, tarts, turnovers
Frozen vegetables
Ham
Intracity mass transit
Lunchmeats
Oranges, including tangerines
Other condiments
Other pork including roasts and picnics
Other uncooked poultry including turkey
Parking fees and tolls
Pork chops
Prepared salads
Processed fish and seafood
Salt and other seasonings and spices
Sauces and gravies
Ship fare
Uncooked beef roasts
Uncooked beef steaks
Uncooked ground beef
Uncooked other beef and veal
Video discs and other media
White bread
Admissions
Airline fare
Apparel services other than laundry and dry cleaning
Apples
Bananas
Beef and vea

# Loss Analysis 3:

No Loss Term 3:

In [17]:
weightspath_3

'/Users/mvilenko/Library/CloudStorage/OneDrive-PayPal/hgru_clean/US/bidirectional/models_weights_3/'

In [18]:
def bi_directional_model_3(son_parent_dict, parent_to_son_list_dict, train_dataset_dict, test_dataset_dict, categories_per_indent_dict, category_id_to_name_dict, weights_path, hgru_weight_dict, coefficient_dict, loss_coef_1, loss_coef_2, loss_coef_3, alpha):
    # deleting files in models_weights folder
    path = os.getcwd()
    desired_path = path + '/models_weights_3/'
    is_empty = not any(Path(desired_path).iterdir())

    if is_empty==False:
        rmdir(Path(desired_path))
    
    # deleting empty folder
    try:
        os.rmdir(desired_path)
    except OSError:
        print ("Deletion of the directory %s failed" % desired_path)
    else:
        print ("Successfully deleted the directory %s" % desired_path)

    # creating new folder
    try:
        os.mkdir(desired_path)
    except OSError:
        print ("Creation of the directory %s failed" % desired_path)
    else:
        print ("Successfully created the directory %s" % desired_path)
    #----------------------------------------------------------------------------------------------------------------------------------------------#
    loss_coef_1= 8.092430201147471*np.exp(-7)
    loss_coef_2= 0.0001
    loss_coef_3= 0
    num_categories = 0
    bidrectional_models = {}

    for indent in sorted(list(categories_per_indent_dict.keys()),reverse=True):
        for category in categories_per_indent_dict[indent]:
            num_categories +=1
            print(f'num categories: {num_categories}')
            category_name = category_id_to_name_dict[category]
            print(f'category id|name: {category}|{category_name}')

            if int(indent) == 0 or son_parent_dict[category] not in categories_per_indent_dict[indent-1]: 
                loss_coef_1 = 0


            if (category not in list(parent_to_son_list_dict.keys())) or (set(parent_to_son_list_dict[category]['sons']).issubset(set(categories_per_indent_dict[indent+1]))):
                loss_coef_2 = 0

            
            print('------------------------------------------------------------------')
            
            train_dataloader, test_dataloader = create_dataloader(train_dataset_dict[category_name], test_dataset_dict[category_name])
            model = Model
            optimizer = Optimizer
            model.to(Device)
            saving_param_path = weights_path+category_name+'.pt'
            
            bidrectional_models[category_name] = training_and_evaluation(model, indent, train_dataloader, test_dataloader, optimizer, category, hgru_weight_dict, coefficient_dict, son_parent_dict, parent_to_son_list_dict, category_id_to_name_dict, loss_coef_1, loss_coef_2, loss_coef_3, alpha, saving_param_path)
            
    return bidrectional_models

In [19]:
bidrectional_models_3 = bi_directional_model_3(son_parent_dict, parent_to_son_list_dict, train_dataset_dict, test_dataset_dict, categories_per_indent_dict, category_id_to_name_dict, weightspath_3, hgru_weight_dict, coefficient_dict, loss_coef_1, loss_coef_2, 0, alpha)

Deletion of the directory /Users/mvilenko/Library/CloudStorage/OneDrive-PayPal/hgru_clean/US/bidirectional/models_weights_3/ failed
Successfully created the directory /Users/mvilenko/Library/CloudStorage/OneDrive-PayPal/hgru_clean/US/bidirectional/models_weights_3/
num categories: 1
category id|name: 9540.0|Bacon and related products
------------------------------------------------------------------
num categories: 2
category id|name: 8302.0|Breakfast sausage and related products
------------------------------------------------------------------
num categories: 3
category id|name: 2135.0|Fresh and frozen chicken parts
------------------------------------------------------------------
num categories: 4
category id|name: 3415.0|Fresh whole chicken
------------------------------------------------------------------
num categories: 5
category id|name: 4850.0|Frozen fish and seafood
------------------------------------------------------------------
num categories: 6
category id|name: 9972.0|

In [20]:
predictions_dict_3 = get_results_on_test_set(weightspath_3, train_dataset_dict, test_dataset_dict, categories = categories)
with open(test_predictions_path_3, 'wb') as handle:
    pickle.dump(predictions_dict_3, handle, protocol=pickle.HIGHEST_PROTOCOL)

Admission to movies, theaters, and concerts
Bacon, breakfast sausage, and related products
Bread other than white
Canned fruits
Canned vegetables
Checking account and other bank services
Chicken
Cookies
Crackers, bread, and cracker products
Dried beans, peas, and lentils
Frankfurters
Fresh cakes and cupcakes
Fresh fish and seafood
Fresh sweetrolls, coffeecakes, doughnuts
Frozen and refrigerated bakery products, pies, tarts, turnovers
Frozen vegetables
Ham
Intracity mass transit
Lunchmeats
Oranges, including tangerines
Other condiments
Other pork including roasts and picnics
Other uncooked poultry including turkey
Parking fees and tolls
Pork chops
Prepared salads
Processed fish and seafood
Salt and other seasonings and spices
Sauces and gravies
Ship fare
Uncooked beef roasts
Uncooked beef steaks
Uncooked ground beef
Uncooked other beef and veal
Video discs and other media
White bread
Admissions
Airline fare
Apparel services other than laundry and dry cleaning
Apples
Bananas
Beef and vea

In [21]:
# Sanity Tests:

In [22]:
predictions_dict_1['All items']

tensor([[0.3900, 0.4615, 0.6161, 0.6152, 0.4307, 0.4092, 0.3538, 0.3373, 0.4595,
         0.5726, 0.4603, 0.3940, 0.4518, 0.4131, 0.4598, 0.2798, 0.4238, 0.3821,
         0.2732, 0.3220, 0.3191, 0.3148, 0.2882, 0.2025, 0.4224, 0.2795, 0.3088,
         0.2965, 0.1510, 0.2575, 0.4136, 0.4764]])

In [23]:
predictions_dict_2['All items']

tensor([[ 0.3638,  0.3932,  0.4990,  0.6711,  0.6605,  0.6865,  0.4675,  0.4362,
          0.5180,  0.6245,  0.5234,  0.1004,  0.5921,  0.6707,  0.7617,  0.1017,
          0.6314,  0.6481, -0.2398,  0.4161,  0.4824,  0.2463,  0.0879,  0.3401,
          0.5862,  0.5806,  0.1641,  0.5058,  0.3834,  0.4272,  0.5588,  0.6183]])

In [24]:
predictions_dict_3['All items']

tensor([[0.1597, 0.2372, 0.3777, 0.4963, 0.5269, 0.5272, 0.3068, 0.1679, 0.2422,
         0.5261, 0.4435, 0.3552, 0.4830, 0.5401, 0.6028, 0.3517, 0.5528, 0.5039,
         0.1447, 0.3184, 0.4203, 0.3995, 0.2600, 0.2175, 0.5232, 0.3305, 0.1543,
         0.4007, 0.1918, 0.0839, 0.2242, 0.4889]])