In [1]:
import sys
sys.path.insert(0, '/Users/mvilenko/Library/CloudStorage/OneDrive-PayPal/hgru_clean/US/')

In [2]:
import torch
import pandas as pd
import numpy as np
import statistics
import torch
import random
import time
import numpy as np
#from transformers import AdamW
from torch.utils.tensorboard import SummaryWriter
import pickle
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
import torch.optim as optim
import shutil
import itertools

from model.GRU_model import *
from pipeline_config import *
from utils import *

In [3]:
# Pin every random number generator used in this notebook (each with its own
# fixed seed) so that separate runs can be compared.

np.random.seed(2)     # NumPy global RNG
torch.manual_seed(1)  # PyTorch RNG
random.seed(3)        # Python stdlib RNG (kept last: returns None, so the cell shows no output)

In [4]:
def _load_pickle(path):
    """Return the object unpickled from the file at ``path``."""
    # NOTE(review): pickle.load can execute arbitrary code embedded in the
    # file; only point these paths at trusted artifacts.
    with open(path, 'rb') as pickle_file:
        return pickle.load(pickle_file)


# Per-category train / test datasets (indexed by category name in the pipeline below).
train_dataset_dict = _load_pickle(train_dataset_path)
test_dataset_dict = _load_pickle(test_dataset_path)

# Lookup tables: category id -> category name, and the category ids grouped per indent.
category_id_to_name_dict = _load_pickle(category_id_to_category_name_path)
categories_per_indent_dict = _load_pickle(categories_per_indent_path)

In [5]:
# Each value of categories_per_indent_dict is an iterable of category ids:
# flatten them into one id list, then translate every id to its name.
categories_lists = list(categories_per_indent_dict.values())
categories_id = [single_id for id_group in categories_lists for single_id in id_group]
categories = [category_id_to_name_dict[single_id] for single_id in categories_id]

In [6]:
# Sanity check: number of category names collected (one per id in categories_id).
len(categories)

350

# Pipeline:

In [7]:
def pipline(train_dataset_dict, test_dataset_dict, category_names=None):
    """Train and evaluate one GRU model per category.

    For every category a fresh ``GRUModel`` and ``AdamW`` optimizer are built
    and handed to ``training_and_evaluation`` together with that category's
    dataloaders; whatever that call returns is stored under the category name.

    Args:
        train_dataset_dict: Mapping from category name to that category's
            training data (passed as-is to ``create_dataloader``).
        test_dataset_dict: Mapping from category name to that category's
            test data (passed as-is to ``create_dataloader``).
        category_names: Optional iterable of category names to process. When
            omitted, the notebook-level ``categories`` list is used, which is
            the original two-argument behaviour.

    Returns:
        dict: ``{category: value returned by training_and_evaluation}``, in
        processing order.

    Raises:
        KeyError: If a requested category is missing from either dataset
            dict. Raised up front, before any model is trained.
    """
    # Fall back to the notebook-level list so existing two-argument calls
    # behave exactly as before.
    names = categories if category_names is None else list(category_names)

    # Fail fast: training runs sequentially over all categories, so a missing
    # key should surface before any training time is spent, not mid-run.
    missing = [
        name for name in names
        if name not in train_dataset_dict or name not in test_dataset_dict
    ]
    if missing:
        raise KeyError(
            "categories missing from the train/test dataset dicts: {}".format(missing)
        )

    results = {}
    for category in names:
        train_dataloader, test_dataloader = create_dataloader(
            train_dataset_dict[category], test_dataset_dict[category]
        )

        # Fresh model and optimizer per category: nothing is shared between categories.
        model = GRUModel(
            input_dim=Features,
            hidden_dim=HiddenSize,
            layer_dim=LayersDim,
            output_dim=OutputDim,
            dropout_prob=DropoutProb,
        )
        model.to(device)

        optimizer = torch.optim.AdamW(model.parameters(), lr=Lr)

        # Both checkpoint files are named after the category.
        parameters_file_name = category + '.pt'

        results[category] = training_and_evaluation(
            model=model,
            optim=optimizer,
            train_dataloader=train_dataloader,
            test_dataloader=test_dataloader,
            category=category,
            checkpoint_path=CheckpointPath + parameters_file_name,
            best_checkpoint_path=BestcheckpointPath + parameters_file_name,
        )

    return results

In [8]:
# Run training + evaluation for every category; `results` maps each category
# name to the value returned by training_and_evaluation for it.
results = pipline(train_dataset_dict, test_dataset_dict)

y test is: tensor([ 0.6391,  1.0857, -0.0336, -0.6105,  0.9351,  1.7115, -0.6477, -0.0623,
         0.1235,  0.7583,  0.6077,  0.8134, -0.1959,  1.1259,  1.0698,  0.0103,
         0.8903,  1.4829, -0.5711, -0.5831,  0.9011,  1.5884, -0.2634,  0.8523,
        -0.1651,  1.0952,  1.4112, -0.2530,  0.4848,  0.0835, -0.1169, -0.3758])
y test is: tensor([-0.1851,  1.2816,  2.8335,  0.8918,  1.9928,  0.6837,  2.8696,  1.7859,
         2.3512,  2.2472,  0.0295, -1.7791,  1.2989,  0.9089,  2.1129,  0.2981,
        -1.7279,  1.0111,  0.1046,  0.5841, -0.9158, -0.9560, -0.4712, -0.3472,
        -0.5278,  0.2695, -2.1167, -0.9136, -1.2713, -0.3369,  1.8595,  2.6905])
y test is: tensor([-0.4559, -0.3370,  0.7200,  0.9577, -0.8949,  1.0335,  0.1873,  1.1615,
         1.1231,  0.9191,  0.5698,  1.1466,  0.4159,  0.7830,  1.9174,  0.4281,
         1.3073,  3.4226,  1.6978,  0.4899,  0.0441,  2.2267, -0.7486,  0.7856,
         1.6568, -0.3154, -0.7768,  0.8193,  0.9314,  0.9311, -1.2193,  0.0214])
y te

Save pipeline results:

In [9]:
# Persist the per-category results dict produced by the pipeline.
# NOTE(review): absolute, user-specific path - consider moving it into pipeline_config.
with open(
    '/Users/mvilenko/Library/CloudStorage/OneDrive-PayPal/hgru_clean/US/data/model_results.pickle',
    'wb',
) as results_file:
    pickle.dump(results, results_file, protocol=pickle.HIGHEST_PROTOCOL)

Save best model per category:

In [10]:
# Relative path: resolved against the notebook's current working directory.
dir_path = "checkpoints/best_checkpoints/"
categories_list = categories

# Build best_models_dict from the category names and the best-checkpoint directory.
best_models_dict = create_dict_of_best_model_per_category(categories_list, dir_path)

Save best predictions per category:

In [11]:
# Compute predictions from the best model of each category, using the same
# train/test dataset dicts the pipeline was run on.
best_predictions_dict = get_best_predictions_for_each_category(
    best_models_dict,
    train_dataset_dict,
    test_dataset_dict,
)

# Persist the predictions dict.
# NOTE(review): absolute, user-specific path - consider moving it into pipeline_config.
with open(
    '/Users/mvilenko/Library/CloudStorage/OneDrive-PayPal/hgru_clean/US/basic/predictions_dict.pickle',
    'wb',
) as predictions_file:
    pickle.dump(best_predictions_dict, predictions_file, protocol=pickle.HIGHEST_PROTOCOL)

y test is: tensor([ 0.6391,  1.0857, -0.0336, -0.6105,  0.9351,  1.7115, -0.6477, -0.0623,
         0.1235,  0.7583,  0.6077,  0.8134, -0.1959,  1.1259,  1.0698,  0.0103,
         0.8903,  1.4829, -0.5711, -0.5831,  0.9011,  1.5884, -0.2634,  0.8523,
        -0.1651,  1.0952,  1.4112, -0.2530,  0.4848,  0.0835, -0.1169, -0.3758])
y test is: tensor([-0.1851,  1.2816,  2.8335,  0.8918,  1.9928,  0.6837,  2.8696,  1.7859,
         2.3512,  2.2472,  0.0295, -1.7791,  1.2989,  0.9089,  2.1129,  0.2981,
        -1.7279,  1.0111,  0.1046,  0.5841, -0.9158, -0.9560, -0.4712, -0.3472,
        -0.5278,  0.2695, -2.1167, -0.9136, -1.2713, -0.3369,  1.8595,  2.6905])
y test is: tensor([-0.4559, -0.3370,  0.7200,  0.9577, -0.8949,  1.0335,  0.1873,  1.1615,
         1.1231,  0.9191,  0.5698,  1.1466,  0.4159,  0.7830,  1.9174,  0.4281,
         1.3073,  3.4226,  1.6978,  0.4899,  0.0441,  2.2267, -0.7486,  0.7856,
         1.6568, -0.3154, -0.7768,  0.8193,  0.9314,  0.9311, -1.2193,  0.0214])
y te

Save best weights per category:

In [12]:
# Directory holding the best checkpoint of every category.
# NOTE(review): absolute, user-specific path - consider moving it into pipeline_config.
dir_path = '/Users/mvilenko/Library/CloudStorage/OneDrive-PayPal/hgru_clean/US/basic/checkpoints/best_checkpoints/'

# Invert the id -> name table once instead of scanning the value list for
# every category. setdefault keeps the FIRST id seen for a name, which matches
# what list.index() returned in the original per-name scan.
name_to_category_id = {}
for cat_id, cat_name in category_id_to_name_dict.items():
    name_to_category_id.setdefault(cat_name, cat_id)

# Category ids in the same order as `categories`.
category_id_list = [name_to_category_id[cat_name] for cat_name in categories]


In [13]:
# Build weights_dict from the category ids, the id -> name table and the
# best-checkpoint directory (presumably loads each category's saved weights
# from dir_path - confirm in utils.get_weights_per_category).
weights_dict = get_weights_per_category(category_id_list, category_id_to_name_dict, dir_path)

In [14]:
# Persist the per-category weights dict at the path configured in pipeline_config.
with open(sgru_model_weights_path, 'wb') as weights_file:
    pickle.dump(weights_dict, weights_file, protocol=pickle.HIGHEST_PROTOCOL)