In [1]:
import sys
sys.path.insert(0, '/Users/mvilenko/Library/CloudStorage/OneDrive-PayPal/hgru_clean/Norway/')

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import pickle
import random
from scipy import stats
import plotly.express as px
from sklearn.metrics import mean_squared_error, r2_score

import os
from pipeline_config import *


In [3]:
#Seeds for reproducability:

torch.manual_seed(1)
np.random.seed(2)
random.seed(3)

In [4]:
with open('/Users/mvilenko/Library/CloudStorage/OneDrive-PayPal/hgru_clean/Norway/bidirectional/test_predictions.pickle', 'rb') as f:
    prediction_dic = pickle.load(f)

with open('/Users/mvilenko/Library/CloudStorage/OneDrive-PayPal/hgru_clean/pickle_files/bi_directional_norway_dataset_dict.pickle', 'rb') as f:
    raw_dataset_dict = pickle.load(f)
    
with open(category_id_to_category_name_path, 'rb') as f:
    category_id_to_name_dict = pickle.load(f)
    
with open(categories_per_indent_path, 'rb') as f:
    categories_per_indent_dict = pickle.load(f)

In [5]:
def create_test_dataframe(raw_dataset_dict: dict):
    test_dict = {}
    for key in raw_dataset_dict.keys():
    #for key in ['All items']:
        df = raw_dataset_dict[key][['Category', 'Date', 'Year', 'Indent', 'Inflation t+1']]
        df.dropna(inplace=True)
        df.rename(columns={'Inflation t+1': 'Actual'}, inplace=True)
        target_df = df[df['Year'] > Year]
        test_dict[key] = target_df
    return test_dict

test_dict = create_test_dataframe(raw_dataset_dict)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.rename(columns={'Inflation t+1': 'Actual'}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.rename(columns={'Inflation t+1': 'Actual'}, inplace=True

In [6]:
def get_df_with_predictions(prediction_dic: dict, dict_of_categories_df:dict) -> dict:
    all_data_dict = {}
    for key in list(prediction_dic.keys()):
        predictions = prediction_dic[key]
        prediction_df =  pd.DataFrame(predictions.flatten().detach().numpy())
        prediction_df.rename(columns = {0: 'Prediction'}, inplace=True)
        dict_of_categories_df[key] = dict_of_categories_df[key].reset_index(drop=True)
        all_data_dict[key] = pd.concat([dict_of_categories_df[key], prediction_df], axis=1)
    return all_data_dict

all_data_test_dict=get_df_with_predictions(prediction_dic, test_dict)

In [7]:
# sanity check
all_categories = list(all_data_test_dict.keys())
len(all_categories)

52

Average RMSE:

In [8]:
def avg_rmse(all_data_test_dict):
    mse_lst = []
    for key in all_categories:
        df_predictions = all_data_test_dict[key]
        y_pred = df_predictions['Prediction'].values
        y_actual = df_predictions['Actual'].values
        mse = mean_squared_error(y_pred, y_actual)
        mse_lst.append(mse)
    
    rmse_list = list(map(np.sqrt,mse_lst))
    avg_rmse = np.mean(rmse_list)
    rmse_std = np.std(rmse_list)
    
    print(f'RMSE:  {avg_rmse}')
    print(f'MSE std:  {rmse_std}')
    print(f'interval: {[avg_rmse-rmse_std, avg_rmse+rmse_std]}')

    return avg_rmse,rmse_std

avg_rmse(all_data_test_dict)

RMSE:  1.3860573530789315
MSE std:  1.293176771848149
interval: [0.09288058123078247, 2.6792341249270804]


(1.3860573530789315, 1.293176771848149)

Headline RMSE:

In [9]:
df_predictions = all_data_test_dict['All-items']
y_pred = df_predictions['Prediction'].values
y_actual = df_predictions['Actual'].values
mse = mean_squared_error(y_pred, y_actual)
rmse = np.sqrt(mse)


print(f'rmse: {rmse}')

rmse: 0.5165681676632432


In [10]:
print(f'y_pred: {y_pred}')
print(f'y_actual: {y_actual}')

y_pred: [ 0.43124837 -0.27186507  0.6230327   0.4417052   0.13567042  0.5905134
 -0.19573092  0.69192314  0.5770978  -0.4867415   0.27441767  0.12192511
  0.6873449   0.26860255  0.96623015  0.26226836  0.50665796  0.4940247
 -0.05639026  0.88330305  0.5065304   0.58559     0.4026984  -0.08660972
  0.25545537  0.42464894  0.2760346  -0.09068987]
y_actual: [ 0.6986928  -0.26143806  0.34843241 -0.08699435  0.34752424  0.86356322
  0.          1.02652741 -0.25564564  0.76498461  0.67510805 -0.92945325
  1.09752051  0.58602093  1.1618388   0.24721891  0.90127607  1.2966146
 -0.24183809  1.36274654  0.31796529 -0.15885627  0.07945968  0.15873019
  0.39572667  0.78678612  1.09120335  0.46403796]


Total Correlation:

In [11]:
def total_corr(all_data_test_dict):
    corr_dict = {}
    for key in all_categories:
        df_predictions = all_data_test_dict[key]
        y_pred = df_predictions['Prediction'].values
        y_actual = df_predictions['Actual'].values
        corr = stats.pearsonr(y_pred,y_actual)[0]
        corr_dict[key] =  corr
    
    total_corr = sum(corr_dict.values())
    
    num_high_corr = 0
    for category in corr_dict:
        if corr_dict[category] >= 0.5:
            num_high_corr +=1
    
    print(f'Number of categories with High Correlation: {num_high_corr}')
    
    return total_corr

total_corr(all_data_test_dict)

Number of categories with High Correlation: 23


23.041972415761112

In [12]:
def total_corr_all_items(all_data_test_dict):
    corr_dict = {}
    for key in ['All-items']:
        df_predictions = all_data_test_dict[key]
        y_pred = df_predictions['Prediction'].values
        y_actual = df_predictions['Actual'].values
        corr = stats.pearsonr(y_pred,y_actual)[0]
        corr_dict[key] =  corr
    
    total_corr = sum(corr_dict.values())
    
    num_high_corr = 0
    for category in corr_dict:
        if corr_dict[category] >= 0.5:
            num_high_corr +=1
    
    print(f'Number of categories with High Correlation: {num_high_corr}')
    
    return total_corr

total_corr_all_items(all_data_test_dict)

Number of categories with High Correlation: 0


0.44916427360074396

Average R2:

In [13]:
def avg_r_squared(all_data_test_dict):
    r_squared_lst = []

    for key in all_categories:
        #print(f'category: {key}')
        df_predictions = all_data_test_dict[key]
        y_pred = df_predictions['Prediction'].values
        y_actual = df_predictions['Actual'].values
        r2 = r2_score(y_actual, y_pred)
        r_squared_lst.append(r2)

        if key =='All-items':
            headline_r2 = r2
    
    avg_r2_score = np.mean(r_squared_lst)
    r2_std = np.std(r_squared_lst)
    
    print(f'Average R Squared:  {avg_r2_score}')
    print(f'R Squared std:  {r2_std}')
    print('--------------------------------------------------------')
    print(f'Headline R2: {headline_r2}')
    print(f'R2 list percentiles:\n[10: {np.percentile(r_squared_lst, 10)}, 25: {np.percentile(r_squared_lst, 25)}, 50: {np.percentile(r_squared_lst, 50)}, 75: {np.percentile(r_squared_lst, 75)}, 90: {np.percentile(r_squared_lst, 90)}]')

    return headline_r2, avg_r2_score

headline_r2, avg_r2_score = avg_r_squared(all_data_test_dict)

Average R Squared:  0.19047272360754824
R Squared std:  0.3070668534013856
--------------------------------------------------------
Headline R2: 0.11983815332823555
R2 list percentiles:
[10: -0.15972507972179048, 25: -0.022984213309078516, 50: 0.11544527126582604, 75: 0.47401048364482035, 90: 0.6860526585987026]


Plot Results:

In [14]:
def plot_results(all_data_dict, categories):
    category_samples = ['All-items']+random.sample(categories, 10)
    for category in category_samples:
        category_df = all_data_dict[category]
        fig = px.line(category_df, x="Date", y=["Actual", "Prediction"], title=f'{category} - Actual VS Prediction')
        fig.show()

plot_results(all_data_test_dict, all_categories)