In [1]:
import sys
sys.path.insert(0, '/Users/mvilenko/Desktop/CPI_HRNN - version 2.0')

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import pickle
import random
import plotly.express as px
from sklearn.metrics import mean_squared_error

from pipeline_config import *
from hgru_data_preprocess.preprocess_config import *

from Sibling_investigation.basic_predictions_dict import *
from Sibling_investigation.hrnn_predictions_dict import *
from Prices.prices_over_time import *


In [3]:
torch.manual_seed(1)
np.random.seed(2)
random.seed(3)

In [4]:
categories_per_indent_dict ={key: categories_per_indent_dict[key] for key in categories_per_indent_dict.keys() if key in [0,1,2,3]}
cat_ids = list(categories_per_indent_dict.values())

flat_cat_id = [item for sublist in cat_ids for item in sublist]

all_categories = []
for cat_id in flat_cat_id:
    all_categories.append(category_id_to_name_dict[cat_id])

In [5]:
slope_list = set(slope_cats).intersection(all_categories)
slope_list

{'All items less food and shelter',
 'Apparel',
 'Apparel less footwear',
 'Commodities',
 'Commodities less food',
 'Commodities less food and beverages',
 'Commodities less food and energy commodities',
 'Durables',
 'Energy',
 'Energy commodities',
 'Energy services',
 'Fuel oil and other fuels',
 'Fuels and utilities',
 'Household energy',
 'Motor fuel',
 'New and used motor vehicles',
 'Nondurables',
 'Nondurables less food',
 'Nondurables less food and apparel',
 'Nondurables less food and beverages',
 'Nondurables less food, beverages, and apparel',
 'Private transportation',
 'Transportation',
 'Transportation commodities less motor fuel',
 'Utilities and public transportation'}

In [6]:
with open('/Users/mvilenko/Desktop/CPI_HRNN - version 2.0/pickle files/hrnn_dataset_dict_new.pickle', 'rb') as f:
    raw_dataset_dict = pickle.load(f)

In [7]:
def create_test_dataframe(raw_dataset_dict: dict):
    test_dict = {}
    for key in raw_dataset_dict.keys():
        df = raw_dataset_dict[key][['Category', 'Date', 'Year', 'Inflation t+1']]
        df.dropna(inplace=True)
        df.rename(columns={'Inflation t+1': 'Actual'}, inplace=True)
        target_df = df[df['Year'] > 2019]
        test_dict[key] = target_df
    return test_dict

test_dict = create_test_dataframe(raw_dataset_dict)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.rename(columns={'Inflation t+1': 'Actual'}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.rename(columns={'Inflation t+1': 'Actual'}, inplace=True

In [8]:
def get_df_with_predictions(prediction_dic: dict, dict_of_categories_df:dict) -> dict:
    all_data_dict = {}
    for key in list(prediction_dic.keys()):
        predictions = prediction_dic[key]
        prediction_df =  pd.DataFrame(predictions.flatten().detach().numpy())
        prediction_df.rename(columns = {0: 'Prediction'}, inplace=True)
        dict_of_categories_df[key] = dict_of_categories_df[key].reset_index(drop=True)
        all_data_dict[key] = pd.concat([dict_of_categories_df[key], prediction_df], axis=1)
    return all_data_dict

basic_data_dict=get_df_with_predictions(basic_prediction_dic, test_dict)
hrnn_data_dict=get_df_with_predictions(hrnn_prediction_dic, test_dict)

In [9]:
def create_comparison_dict(basic_data_dict, hrnn_data_dict):
    comparison_dict = {}
    for key in list(basic_data_dict.keys()):
        basic_df = basic_data_dict[key]
        basic_df.rename(columns={'Prediction':'Basic_Prediction'}, inplace = True)
        hrnn_df = hrnn_data_dict[key]
        hrnn_df.rename(columns={'Prediction':'HRNN_Prediction'}, inplace = True)
        final_df = pd.concat([basic_df,hrnn_df], axis=1)
        final_df = final_df.loc[:,~final_df.columns.duplicated()]
        comparison_dict[key] = final_df
        
    return comparison_dict

In [10]:
final_dict = create_comparison_dict(basic_data_dict, hrnn_data_dict)

In [11]:
def plot_results(final_dict, slope_list):
    for category in slope_list:
        category_df = final_dict[category]
        basic_y_pred = category_df['Basic_Prediction'].values
        hrnn_y_pred = category_df['HRNN_Prediction'].values
        y_actual = category_df['Actual'].values
        
        basic_mse = mean_squared_error(basic_y_pred, y_actual)
        hrnn_mse = mean_squared_error(hrnn_y_pred, y_actual)
        
        print(f'Category is: {category}')
        print(f'RMSE difference is: {np.sqrt(basic_mse)-np.sqrt(hrnn_mse)}')

        fig = px.line(category_df, x="Date", y=["Actual", "Basic_Prediction", "HRNN_Prediction"], title=f'{category} - Actual VS HRNN Prediction and Basic Prediction')
        fig.show()

In [12]:
plot_results(final_dict, slope_list)

Category is: Transportation
RMSE difference is: 0.2123170874672966


Category is: Apparel less footwear
RMSE difference is: -0.1764580220308689


Category is: Commodities less food and beverages
RMSE difference is: 0.026863968089585466


Category is: Nondurables less food, beverages, and apparel
RMSE difference is: 0.2812347569877085


Category is: Commodities
RMSE difference is: 0.09725190861786437


Category is: New and used motor vehicles
RMSE difference is: 0.12727260567198906


Category is: Utilities and public transportation
RMSE difference is: 0.021652589422547996


Category is: Nondurables less food and beverages
RMSE difference is: 0.10316839780415155


Category is: Commodities less food
RMSE difference is: 0.03394357328678921


Category is: Motor fuel
RMSE difference is: 1.0284373664270774


Category is: Household energy
RMSE difference is: 0.30924126015973585


Category is: Energy commodities
RMSE difference is: 1.012230397301927


Category is: Fuels and utilities
RMSE difference is: 0.2453442520136092


Category is: Nondurables
RMSE difference is: 0.17978671709908345


Category is: Nondurables less food
RMSE difference is: 0.09501568831265761


Category is: All items less food and shelter
RMSE difference is: 0.22311830508134334


Category is: Transportation commodities less motor fuel
RMSE difference is: -0.0385432686998306


Category is: Fuel oil and other fuels
RMSE difference is: 1.1812522273645518


Category is: Private transportation
RMSE difference is: 0.16013723210379593


Category is: Energy services
RMSE difference is: -0.7221667609220412


Category is: Apparel
RMSE difference is: 0.029516659232938558


Category is: Nondurables less food and apparel
RMSE difference is: 0.19037518544694265


Category is: Commodities less food and energy commodities
RMSE difference is: 0.03945757480891321


Category is: Energy
RMSE difference is: 0.37238126364397095


Category is: Durables
RMSE difference is: -0.08303058018395859


------------------

# We'll start by investigating visually:

In [13]:
with open(son_parent_path, 'rb') as f:
    son_parent_dict = pickle.load(f)

with open(category_id_to_category_name_path, 'rb') as f:
    category_id_to_name_dict = pickle.load(f)


In [14]:
def find_siblings(categories, son_parent_dict, category_id_to_name_dict):
    siblings_dict = {}
    for sibling in categories:
        siblings = []
        for cat_id in list(category_id_to_name_dict.keys()):
            if category_id_to_name_dict[cat_id] == sibling:
                catergory_id = cat_id
                continue
       
        for son in list(son_parent_dict.keys()):
            if son_parent_dict[son] == son_parent_dict[catergory_id]:
                siblings.append(category_id_to_name_dict[son])
                
        siblings.remove(category_id_to_name_dict[catergory_id])
        
        siblings_dict[category_id_to_name_dict[catergory_id]] = siblings
    
    return siblings_dict
            

In [15]:
siblings_dict = find_siblings(slope_list, son_parent_dict, category_id_to_name_dict)

In [16]:
def plot_siblings(final_dict, siblings_dict, category):
    my_list = [category] + siblings_dict[category]
    plot_results(final_dict, my_list)

In [17]:
plot_siblings(final_dict, siblings_dict, 'Apparel')

Category is: Apparel
RMSE difference is: 0.029516659232938558


Category is: Alcoholic beverages
RMSE difference is: 0.08887015538551313


Category is: Education and communication commodities
RMSE difference is: 0.06796223056792172


Category is: Household furnishings and supplies
RMSE difference is: 0.010154515674599796


Category is: Medical care commodities
RMSE difference is: 0.03450337195174058


Category is: Other goods
RMSE difference is: 0.03756164907593684


Category is: Recreation commodities
RMSE difference is: -0.04587917792470142


Category is: Transportation commodities less motor fuel
RMSE difference is: -0.0385432686998306


In [20]:
plot_siblings(final_dict, siblings_dict, 'All items less food and shelter')

Category is: All items less food and shelter
RMSE difference is: 0.22311830508134334


Category is: All items less energy
RMSE difference is: 0.148538981343026


Category is: All items less food
RMSE difference is: 0.11329487947013583


Category is: All items less food and energy
RMSE difference is: 0.10095018187073052


Category is: All items less food, shelter, and energy
RMSE difference is: 0.12993019940711598


Category is: All items less food, shelter, energy, and used cars and trucks
RMSE difference is: 0.11129427884150894


Category is: All items less medical care
RMSE difference is: 0.029041258750601706


Category is: All items less shelter
RMSE difference is: 0.09112121598894118


Category is: Apparel less footwear
RMSE difference is: -0.1764580220308689


Category is: Commodities
RMSE difference is: 0.09725190861786437


Category is: Durables
RMSE difference is: -0.08303058018395859


Category is: Education and communication
RMSE difference is: -0.008222794847855458


Category is: Energy
RMSE difference is: 0.37238126364397095


Category is: Food
RMSE difference is: 0.2632445022308818


Category is: Food and beverages
RMSE difference is: 0.2176153565903352


Category is: Fuels and utilities
RMSE difference is: 0.2453442520136092


Category is: Household furnishings and operations
RMSE difference is: -0.03345192946421316


Category is: Housing
RMSE difference is: 0.21055076515709203


Category is: Medical care
RMSE difference is: 0.0830735787169512


Category is: Nondurables
RMSE difference is: 0.17978671709908345


Category is: Other goods and services
RMSE difference is: 0.13817502871076032


Category is: Other services
RMSE difference is: 0.024389810201163153


Category is: Recreation
RMSE difference is: -0.017612706968251568


Category is: Services
RMSE difference is: 0.08682935249821289


Category is: Transportation
RMSE difference is: 0.2123170874672966


Category is: Utilities and public transportation
RMSE difference is: 0.021652589422547996


In [21]:
plot_siblings(final_dict, siblings_dict, 'Commodities less food and energy commodities')

Category is: Commodities less food and energy commodities
RMSE difference is: 0.03945757480891321


Category is: Services less energy services
RMSE difference is: 0.11520670096557828


In [22]:
plot_siblings(final_dict, siblings_dict, 'Energy')

Category is: Energy
RMSE difference is: 0.37238126364397095


Category is: All items less energy
RMSE difference is: 0.148538981343026


Category is: All items less food
RMSE difference is: 0.11329487947013583


Category is: All items less food and energy
RMSE difference is: 0.10095018187073052


Category is: All items less food and shelter
RMSE difference is: 0.22311830508134334


Category is: All items less food, shelter, and energy
RMSE difference is: 0.12993019940711598


Category is: All items less food, shelter, energy, and used cars and trucks
RMSE difference is: 0.11129427884150894


Category is: All items less medical care
RMSE difference is: 0.029041258750601706


Category is: All items less shelter
RMSE difference is: 0.09112121598894118


Category is: Apparel less footwear
RMSE difference is: -0.1764580220308689


Category is: Commodities
RMSE difference is: 0.09725190861786437


Category is: Durables
RMSE difference is: -0.08303058018395859


Category is: Education and communication
RMSE difference is: -0.008222794847855458


Category is: Food
RMSE difference is: 0.2632445022308818


Category is: Food and beverages
RMSE difference is: 0.2176153565903352


Category is: Fuels and utilities
RMSE difference is: 0.2453442520136092


Category is: Household furnishings and operations
RMSE difference is: -0.03345192946421316


Category is: Housing
RMSE difference is: 0.21055076515709203


Category is: Medical care
RMSE difference is: 0.0830735787169512


Category is: Nondurables
RMSE difference is: 0.17978671709908345


Category is: Other goods and services
RMSE difference is: 0.13817502871076032


Category is: Other services
RMSE difference is: 0.024389810201163153


Category is: Recreation
RMSE difference is: -0.017612706968251568


Category is: Services
RMSE difference is: 0.08682935249821289


Category is: Transportation
RMSE difference is: 0.2123170874672966


Category is: Utilities and public transportation
RMSE difference is: 0.021652589422547996


Category is: Fuel oil and other fuels
RMSE difference is: 1.1812522273645518
