In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
from seminartools.models.four_quarter_average_model import FourQuarterAverageModel
from seminartools.data import read_merged

# 1. Set up cross-validation and read data

In [3]:
# Expanding-window cross-validation splitter; the same object is passed to
# every get_stats call in this notebook.
# NOTE(review): start_date presumably marks where out-of-sample evaluation
# begins (the table captions say 2000-2023) — confirm against ExpandingWindowSplit.
from seminartools.time_series_split import ExpandingWindowSplit

tss = ExpandingWindowSplit(start_date="2000-01-01")
tss

<seminartools.time_series_split.ExpandingWindowSplit at 0x7efc2b07bdd0>

In [4]:
# Load the merged panel with coreInf=True and move its index into ordinary
# columns, then show which countries the panel contains.
df = read_merged(coreInf=True).reset_index()
df["country"].unique()

array(['United States', 'Japan', 'Canada', 'Switzerland', 'France',
       'Italy', 'Portugal', 'Sweden', 'United Kingdom', 'Denmark',
       'Germany', 'Netherlands'], dtype=object)

# 2. Get summary statistics for all models

In [5]:
from seminartools.models.utils import get_stats
from seminartools.models.four_quarter_average_model import FourQuarterAverageModel
from seminartools.models.pca_var_model import PCAVARModel
from seminartools.models.armax_model import ARMAXModel
from seminartools.models.uc_sv_ss_model import UCSVSSModel
from seminartools.models.random_effects_model import RandomEffectsModel
from seminartools.models.distance_model import DistanceModel
from seminartools.models.mucsvss_model import MUCSVSSModel

# Build a MUCSVSS model object and populate it from a file on disk.
# NOTE(review): the ".pkl" extension suggests a pickle file — only load files
# from a trusted source. In this notebook `muc_model` is referenced only from a
# commented-out get_stats entry.
muc_model = MUCSVSSModel(num_particles=100008, stochastic_seasonality=True)
muc_model.load_from_disk(
    "../../models/mucsvss_coreinf_model_intra_continent_100008_stochastic_all.pkl"
)
muc_model

I0000 00:00:1713117577.243373 3008191 tfrt_cpu_pjrt_client.cc:349] TfrtCpuClient created.


<seminartools.models.mucsvss_model.MUCSVSSModel at 0x7efbe0205fd0>

In [6]:
# Columns passed as `exogenous_columns` to the *_x / ARMAX model variants:
# three non-commodity indicators followed by the five commodity_* series.
indicator_columns = ["gdp_growth", "interest_rate", "unemployment_rate"]
commodity_columns = [
    "commodity_CRUDE_PETRO",
    "commodity_iNATGAS",
    "commodity_iAGRICULTURE",
    "commodity_iMETMIN",
    "commodity_iPRECIOUSMET",
]
exog_columns = indicator_columns + commodity_columns

# Compare PCA-VAR specifications that differ only in the number of principal
# components. Each entry is (label, zero-argument factory returning a model).
pca_var_factories = [
    ("pca_var_2", lambda: PCAVARModel(num_pcs=2)),
    # NOTE(review): relies on the constructor default for num_pcs — presumably 3,
    # as the label implies; verify against PCAVARModel.
    ("pca_var_3", lambda: PCAVARModel()),
    ("pca_var_4", lambda: PCAVARModel(num_pcs=4)),
    ("pca_var_5", lambda: PCAVARModel(num_pcs=5)),
    ("pca_var_6", lambda: PCAVARModel(num_pcs=6)),
]
# Entries that were commented out for this run: 4qa, pca_var, ar, arx, arma,
# armax, uc_sv, uc_sv_ss, random_effects, random_effects_x, distance,
# distance_x and mucsvss (the latter reusing the already loaded `muc_model`).

stats = get_stats(
    pca_var_factories,
    df,
    tss,
    # only 1 model at once because models use joblib internally and nested
    # joblib gives issues
    num_cores_parallel_models=1,
)
stats



Unnamed: 0,mse,mae,r2,mz_intercept,mz_slope,mz_r2,mz_wald_test_statistic,p-value
pca_var_3,2.9e-05,0.004109,0.138627,0.002249,0.696019,0.258225,81.42243,1.643641233966227e-33
pca_var_2,3e-05,0.00429,0.11844,0.002378,0.82981,0.256863,94.06559,3.448168236774377e-38
pca_var_4,0.000267,0.006869,-6.961754,0.003891,0.042604,0.012813,3567.87,0.0
pca_var_5,0.158756,0.091706,-4726.554906,0.003891,-0.000781,0.002875,2393793.0,0.0
pca_var_6,0.182765,0.113306,-5441.540931,0.003843,-0.000608,0.001964,2753387.0,0.0


In [7]:
# Build a display-ready copy of `stats`:
#   * round the R^2 and Mincer-Zarnowitz estimates to three decimals;
#   * merge the Wald statistic and its p-value into one "statistic(p-value)" string.
format_stats = stats.copy()

round_cols = ["r2", "mz_intercept", "mz_slope", "mz_r2"]
format_stats[round_cols] = format_stats[round_cols].astype(float).round(3)

# Format both numbers with a fixed number of decimals: str(float) drops trailing
# zeros, which produced ragged entries such as "3567.87(0.0)" next to "81.422(0.0)".
wald_statistics = format_stats["mz_wald_test_statistic"].astype(float)
p_values = format_stats["p-value"].astype(float)  # cast first: the column may be stored as object
format_stats["mz_wald_test_statistic"] = [
    f"{wald:.3f}({p_value:.3f})" for wald, p_value in zip(wald_statistics, p_values)
]

# Reassign instead of inplace=True so the cell is a plain chain of new values.
format_stats = format_stats.drop(columns=["p-value"])
format_stats

Unnamed: 0,mse,mae,r2,mz_intercept,mz_slope,mz_r2,mz_wald_test_statistic
pca_var_3,2.9e-05,0.004109,0.139,0.002,0.696,0.258,81.422(0.0)
pca_var_2,3e-05,0.00429,0.118,0.002,0.83,0.257,94.066(0.0)
pca_var_4,0.000267,0.006869,-6.962,0.004,0.043,0.013,3567.87(0.0)
pca_var_5,0.158756,0.091706,-4726.555,0.004,-0.001,0.003,2393793.066(0.0)
pca_var_6,0.182765,0.113306,-5441.541,0.004,-0.001,0.002,2753386.552(0.0)


In [8]:
# generate latex code for the stats table
print(
    # replace _ by \_ in columns and index
    format_stats.pipe(lambda df: df.rename(columns=lambda x: x.replace("_", "\\_")))
    .pipe(lambda df: df.rename(index=lambda x: x.replace("_", "\\_")))
    .to_latex(
        index=True,
        caption="MSPE, MAE, $R^2$, and Mincer-Zarnowitz statistics, out-of-sample, 2000-2023",
        label="tab:model_eval",
        float_format="%.4f",
    )
)

\begin{table}
\caption{MSPE, MAE, $R^2$, and Mincer-Zarnowitz statistics, out-of-sample, 2000-2023}
\label{tab:model_eval}
\begin{tabular}{lrrrrrrl}
\toprule
 & mse & mae & r2 & mz\_intercept & mz\_slope & mz\_r2 & mz\_wald\_test\_statistic \\
\midrule
pca\_var\_3 & 0.0000 & 0.0041 & 0.1390 & 0.0020 & 0.6960 & 0.2580 & 81.422(0.0) \\
pca\_var\_2 & 0.0000 & 0.0043 & 0.1180 & 0.0020 & 0.8300 & 0.2570 & 94.066(0.0) \\
pca\_var\_4 & 0.0003 & 0.0069 & -6.9620 & 0.0040 & 0.0430 & 0.0130 & 3567.87(0.0) \\
pca\_var\_5 & 0.1588 & 0.0917 & -4726.5550 & 0.0040 & -0.0010 & 0.0030 & 2393793.066(0.0) \\
pca\_var\_6 & 0.1828 & 0.1133 & -5441.5410 & 0.0040 & -0.0010 & 0.0020 & 2753386.552(0.0) \\
\bottomrule
\end{tabular}
\end{table}



# 3. HCPI table using the 12 countries

In [9]:
# Show again which countries are present in `df`.
df["country"].unique()

array(['United States', 'Japan', 'Canada', 'Switzerland', 'France',
       'Italy', 'Portugal', 'Sweden', 'United Kingdom', 'Denmark',
       'Germany', 'Netherlands'], dtype=object)

In [10]:
# The twelve countries to keep, in the same order as the
# df['country'].unique() output shown in this notebook.
countries = [
    "United States",
    "Japan",
    "Canada",
    "Switzerland",
    "France",
    "Italy",
    "Portugal",
    "Sweden",
    "United Kingdom",
    "Denmark",
    "Germany",
    "Netherlands",
]

In [11]:
# Read the merged panel again, this time without coreInf=True and restricted to
# `countries`; the index is moved into ordinary columns as for `df`.
dfs = read_merged(only_countries=countries).reset_index()
dfs

Unnamed: 0,index,country,date,inflation,gdp_growth,interest_rate,unemployment_rate,commodity_CRUDE_PETRO,commodity_iNATGAS,commodity_iAGRICULTURE,commodity_iMETMIN,commodity_iPRECIOUSMET
0,0,Japan,1977-01-01,0.022951,2.187257,6.333333,2.3,0.00000,0.052425,0.144757,0.041940,0.112740
1,1,Canada,1977-01-01,0.023904,1.533830,8.166667,8.8,0.00000,0.052425,0.144757,0.041940,0.112740
2,2,United States,1977-01-01,0.017231,1.186233,4.700000,8.2,0.00000,0.052425,0.144757,0.041940,0.112740
3,3,Japan,1977-04-01,0.027244,0.703788,5.000000,2.0,-0.00400,0.043586,-0.066688,-0.045412,-0.060029
4,4,Canada,1977-04-01,0.023346,0.510302,7.666667,8.0,-0.00400,0.043586,-0.066688,-0.045412,-0.060029
...,...,...,...,...,...,...,...,...,...,...,...,...
1317,2164,Netherlands,2023-01-01,-0.013667,-0.494664,3.000000,3.7,-0.02041,-0.585458,0.002037,0.007418,0.043183
1318,2165,Switzerland,2023-01-01,0.010084,0.309543,1.166667,4.4,-0.02041,-0.585458,0.002037,0.007418,0.043183
1319,2166,Italy,2023-01-01,0.003938,0.552944,3.000000,8.3,-0.02041,-0.585458,0.002037,0.007418,0.043183
1320,2167,Canada,2023-01-01,0.006080,0.610026,4.500000,5.4,-0.02041,-0.585458,0.002037,0.007418,0.043183


In [12]:

# Columns passed as `exogenous_columns` to the *_x / ARMAX model variants:
# three non-commodity indicators followed by the five commodity_* series.
indicator_columns = ["gdp_growth", "interest_rate", "unemployment_rate"]
commodity_columns = [
    "commodity_CRUDE_PETRO",
    "commodity_iNATGAS",
    "commodity_iAGRICULTURE",
    "commodity_iMETMIN",
    "commodity_iPRECIOUSMET",
]
exog_columns = indicator_columns + commodity_columns

# Evaluate the benchmark models on `dfs`. Each entry is
# (label, zero-argument factory returning a model); the *_x / ARMAX-with-x
# variants additionally receive `exog_columns`.
# NOTE: this overwrites the `stats` frame produced by the earlier get_stats cell.
benchmark_factories = [
    ("4qa", lambda: FourQuarterAverageModel()),
    ("ar", lambda: ARMAXModel(max_p=2, max_q=0)),
    ("arx", lambda: ARMAXModel(max_p=2, max_q=0, exogenous_columns=exog_columns)),
    ("arma", lambda: ARMAXModel(max_p=2, max_q=2)),
    ("armax", lambda: ARMAXModel(max_p=2, max_q=2, exogenous_columns=exog_columns)),
    ("uc_sv", lambda: UCSVSSModel(num_particles=10000, stochastic_seasonality=False)),
    ("uc_sv_ss", lambda: UCSVSSModel(num_particles=10000, stochastic_seasonality=True)),
    ("random_effects", lambda: RandomEffectsModel()),
    ("random_effects_x", lambda: RandomEffectsModel(exogenous_columns=exog_columns)),
    ("distance", lambda: DistanceModel()),
    ("distance_x", lambda: DistanceModel(exogenous_columns=exog_columns)),
]

stats = get_stats(
    benchmark_factories,
    dfs,
    tss,
    # only 1 model at once because models use joblib internally and nested
    # joblib gives issues
    num_cores_parallel_models=1,
)
stats

  forecast += exog_coefs[j] * exogData[i-1]
  forecast += exog_coefs[j] * exogData[i-1]
  forecast += exog_coefs[j] * exogData[i-1]
  forecast += exog_coefs[j] * exogData[i-1]
  forecast += exog_coefs[j] * exogData[i-1]
  forecast += exog_coefs[j] * exogData[i-1]
  forecast += exog_coefs[j] * exogData[i-1]
  forecast += exog_coefs[j] * exogData[i-1]
  forecast += exog_coefs[j] * exogData[i-1]
  forecast += exog_coefs[j] * exogData[i-1]
  forecast += exog_coefs[j] * exogData[i-1]
  forecast += exog_coefs[j] * exogData[i-1]
  forecast += exog_coefs[j] * exogData[i-1]
  forecast += exog_coefs[j] * exogData[i-1]
  forecast += exog_coefs[j] * exogData[i-1]
  forecast += exog_coefs[j] * exogData[i-1]
  forecast += exog_coefs[j] * exogData[i-1]
  forecast += exog_coefs[j] * exogData[i-1]
  forecast += exog_coefs[j] * exogData[i-1]
  forecast += exog_coefs[j] * exogData[i-1]
  forecast += exog_coefs[j] * exogData[i-1]
  forecast += exog_coefs[j] * exogData[i-1]
  forecast += exog_coefs[j] * ex

Fitting model on the full dataset... (model=<seminartools.models.uc_sv_ss_model.UCSVSSModel object at 0x7efbee88ddd0>)


100%|██████████| 12/12 [00:00<00:00, 29.39it/s]


In [None]:
# Build a display-ready copy of `stats`:
#   * round the R^2 and Mincer-Zarnowitz estimates to three decimals;
#   * merge the Wald statistic and its p-value into one "statistic(p-value)" string.
format_stats = stats.copy()

round_cols = ["r2", "mz_intercept", "mz_slope", "mz_r2"]
format_stats[round_cols] = format_stats[round_cols].astype(float).round(3)

# Format both numbers with a fixed number of decimals: str(float) drops trailing
# zeros, which produced ragged entries such as "24.37(0.0)" next to "29.828(0.0)".
wald_statistics = format_stats["mz_wald_test_statistic"].astype(float)
p_values = format_stats["p-value"].astype(float)  # cast first: the column may be stored as object
format_stats["mz_wald_test_statistic"] = [
    f"{wald:.3f}({p_value:.3f})" for wald, p_value in zip(wald_statistics, p_values)
]

# Reassign instead of inplace=True so the cell is a plain chain of new values.
format_stats = format_stats.drop(columns=["p-value"])
format_stats

Unnamed: 0,mse,mae,r2,mz_intercept,mz_slope,mz_r2,mz_wald_test_statistic
4qa,4.2e-05,0.004714,0.126,0.002,0.672,0.166,24.37(0.0)
ar,5.2e-05,0.00522,-0.002,0.002,0.501,0.064,29.828(0.0)
arx,0.000181,0.008683,-2.458,0.004,0.02,0.001,1048.73(0.0)
arma,0.000276,0.006805,-4.292,0.004,0.059,0.017,1867.059(0.0)
armax,0.001863,0.023783,-34.686,0.005,-0.003,0.0,14782.553(0.0)


In [None]:
# generate latex code for the stats table
print(
    # replace _ by \_ in columns and index
    format_stats.pipe(lambda df: df.rename(columns=lambda x: x.replace("_", "\\_")))
    .pipe(lambda df: df.rename(index=lambda x: x.replace("_", "\\_")))
    .to_latex(
        index=True,
        caption="MSPE, MAE, $R^2$, and Mincer-Zarnowitz statistics, out-of-sample, 2000-2023",
        label="tab:model_eval",
        float_format="%.4f",
    )
)

\begin{table}
\caption{MSPE, MAE, $R^2$, and Mincer-Zarnowitz statistics, out-of-sample, 2000-2023}
\label{tab:model_eval}
\begin{tabular}{lrrrrrrl}
\toprule
 & mse & mae & r2 & mz\_intercept & mz\_slope & mz\_r2 & mz\_wald\_test\_statistic \\
\midrule
4qa & 0.0000 & 0.0047 & 0.1260 & 0.0020 & 0.6720 & 0.1660 & 24.37(0.0) \\
ar & 0.0001 & 0.0052 & -0.0020 & 0.0020 & 0.5010 & 0.0640 & 29.828(0.0) \\
arx & 0.0002 & 0.0087 & -2.4580 & 0.0040 & 0.0200 & 0.0010 & 1048.73(0.0) \\
arma & 0.0003 & 0.0068 & -4.2920 & 0.0040 & 0.0590 & 0.0170 & 1867.059(0.0) \\
armax & 0.0019 & 0.0238 & -34.6860 & 0.0050 & -0.0030 & 0.0000 & 14782.553(0.0) \\
\bottomrule
\end{tabular}
\end{table}

