In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
from seminartools.models.four_quarter_average_model import FourQuarterAverageModel
from seminartools.data import read_merged

# 1. Read data 

In [3]:
# expanding window crossval
from seminartools.time_series_split import ExpandingWindowSplit

tss = ExpandingWindowSplit(start_date = "2000-01-01")
tss

<seminartools.time_series_split.ExpandingWindowSplit at 0x7f49f607b650>

In [4]:
df = read_merged(coreInf=True).reset_index()
df['country'].unique()

array(['United States', 'Japan', 'Canada', 'Switzerland', 'France',
       'Italy', 'Portugal', 'Sweden', 'United Kingdom', 'Denmark',
       'Germany', 'Netherlands'], dtype=object)

# 2. get summary stats for all models

In [5]:
from seminartools.models.utils import get_stats
from seminartools.models.four_quarter_average_model import FourQuarterAverageModel
from seminartools.models.pca_var_model import PCAVARModel
from seminartools.models.armax_model import ARMAXModel
from seminartools.models.uc_sv_ss_model import UCSVSSModel
from seminartools.models.random_effects_model import RandomEffectsModel
from seminartools.models.distance_model import DistanceModel
from seminartools.models.mucsvss_model import MUCSVSSModel

muc_model = MUCSVSSModel(num_particles=100008, stochastic_seasonality=True)
muc_model.load_from_disk("../../models/mucsvss_coreinf_model_intra_continent_100008_stochastic_all.pkl")
muc_model

I0000 00:00:1713170801.725945 3731640 tfrt_cpu_pjrt_client.cc:349] TfrtCpuClient created.


<seminartools.models.mucsvss_model.MUCSVSSModel at 0x7f49a535b950>

In [6]:
exog_columns = [
    "gdp_growth",
    "interest_rate",
    "unemployment_rate",
    "commodity_CRUDE_PETRO",
    "commodity_iNATGAS",
    "commodity_iAGRICULTURE",
    "commodity_iMETMIN",
    "commodity_iPRECIOUSMET",
]

stats = get_stats(
    [
        ("pca_var_2", lambda: PCAVARModel(num_pcs = 2)),
        ("pca_var_3", lambda: PCAVARModel()),
        ("pca_var_4", lambda: PCAVARModel(num_pcs = 4)),
        ("pca_var_5", lambda: PCAVARModel(num_pcs = 5)),
        ("pca_var_6", lambda: PCAVARModel(num_pcs = 6)),
        # ("4qa", lambda: FourQuarterAverageModel()),
        # #("pca_var", lambda: PCAVARModel()),
        # ("ar", lambda: ARMAXModel(max_p=2, max_q=0)),
        # (
        #     "arx",
        #     lambda: ARMAXModel(max_p=2, max_q=0, exogenous_columns=exog_columns),
        # ),
        # ("arma", lambda: ARMAXModel(max_p=2, max_q=2)),
        # (
        #     "armax",
        #     lambda: ARMAXModel(max_p=2, max_q=2, exogenous_columns=exog_columns),
        # ),
        # (
        #     "uc_sv",
        #     lambda: UCSVSSModel(num_particles=10000, stochastic_seasonality=False),
        # ),
        # (
        #     "uc_sv_ss",
        #     lambda: UCSVSSModel(num_particles=10000, stochastic_seasonality=True),
        # ),
        # ("random_effects", lambda: RandomEffectsModel()),
        # (
        #     "random_effects_x",
        #     lambda: RandomEffectsModel(exogenous_columns=exog_columns),
        # ),
        # ("distance", lambda: DistanceModel()),
        # ("distance_x", lambda: DistanceModel(exogenous_columns=exog_columns)),
        # ("mucsvss", lambda: muc_model),
    ],
    df,
    tss,
    num_cores_parallel_models=1,  # only 1 model at once because models use joblib internally and nested joblib gives issues
)
stats



Unnamed: 0,mse,mae,r2,mz_intercept,mz_slope,mz_r2,mz_wald_test_statistic,p-value
pca_var_4,2.7e-05,0.003998,0.188737,0.002267,0.729971,0.314961,93.050613,8.118694387151563e-38
pca_var_3,2.9e-05,0.004109,0.138627,0.002249,0.696019,0.258225,81.422429,1.643641233966227e-33
pca_var_2,3e-05,0.00429,0.11844,0.002378,0.82981,0.256863,94.065585,3.448168236774573e-38
pca_var_5,0.000155,0.006612,-3.614142,0.00359,0.135283,0.088432,2051.191216,0.0
pca_var_6,0.001061,0.011458,-30.581731,0.00386,0.009026,0.002531,15484.244939,0.0


In [7]:
# format r2 column to 2 
format_stats = stats.copy()
format_stats['p-value'] =format_stats['p-value'].astype(float)
round_cols = ["r2", "mz_intercept", "mz_slope", "mz_r2", "mz_wald_test_statistic", "p-value"]
for col in round_cols:
    format_stats[col] = format_stats[col].apply(lambda x: round(x, 3))

format_stats['mz_wald_test_statistic'] = format_stats['mz_wald_test_statistic'].astype(str) + '(' + format_stats['p-value'].astype(str) + ')'
format_stats.drop(columns = ['p-value'], inplace = True)
format_stats

Unnamed: 0,mse,mae,r2,mz_intercept,mz_slope,mz_r2,mz_wald_test_statistic
pca_var_4,2.7e-05,0.003998,0.189,0.002,0.73,0.315,93.051(0.0)
pca_var_3,2.9e-05,0.004109,0.139,0.002,0.696,0.258,81.422(0.0)
pca_var_2,3e-05,0.00429,0.118,0.002,0.83,0.257,94.066(0.0)
pca_var_5,0.000155,0.006612,-3.614,0.004,0.135,0.088,2051.191(0.0)
pca_var_6,0.001061,0.011458,-30.582,0.004,0.009,0.003,15484.245(0.0)


In [8]:
# generate latex code for the stats table
print(
    # replace _ by \_ in columns and index
    format_stats.pipe(lambda df: df.rename(columns=lambda x: x.replace("_", "\\_")))
    .pipe(lambda df: df.rename(index=lambda x: x.replace("_", "\\_")))
    .to_latex(
        index=True,
        caption="MSPE, MAE, $R^2$, and Mincer-Zarnowitz statistics, out-of-sample, 2000-2023",
        label="tab:model_eval",
        float_format="%.4f",
    )
)

\begin{table}
\caption{MSPE, MAE, $R^2$, and Mincer-Zarnowitz statistics, out-of-sample, 2000-2023}
\label{tab:model_eval}
\begin{tabular}{lrrrrrrl}
\toprule
 & mse & mae & r2 & mz\_intercept & mz\_slope & mz\_r2 & mz\_wald\_test\_statistic \\
\midrule
pca\_var\_4 & 0.0000 & 0.0040 & 0.1890 & 0.0020 & 0.7300 & 0.3150 & 93.051(0.0) \\
pca\_var\_3 & 0.0000 & 0.0041 & 0.1390 & 0.0020 & 0.6960 & 0.2580 & 81.422(0.0) \\
pca\_var\_2 & 0.0000 & 0.0043 & 0.1180 & 0.0020 & 0.8300 & 0.2570 & 94.066(0.0) \\
pca\_var\_5 & 0.0002 & 0.0066 & -3.6140 & 0.0040 & 0.1350 & 0.0880 & 2051.191(0.0) \\
pca\_var\_6 & 0.0011 & 0.0115 & -30.5820 & 0.0040 & 0.0090 & 0.0030 & 15484.245(0.0) \\
\bottomrule
\end{tabular}
\end{table}



# 3. table HCPI using the 12 countries

In [9]:
df['country'].unique()

array(['United States', 'Japan', 'Canada', 'Switzerland', 'France',
       'Italy', 'Portugal', 'Sweden', 'United Kingdom', 'Denmark',
       'Germany', 'Netherlands'], dtype=object)

In [10]:
countries = ['United States', 'Japan', 'Canada', 'Switzerland', 'France',
       'Italy', 'Portugal', 'Sweden', 'United Kingdom', 'Denmark',
       'Germany', 'Netherlands']

In [11]:
dfs = read_merged(only_countries= countries).reset_index()
dfs

Unnamed: 0,index,country,date,inflation,gdp_growth,interest_rate,unemployment_rate,commodity_CRUDE_PETRO,commodity_iNATGAS,commodity_iAGRICULTURE,commodity_iMETMIN,commodity_iPRECIOUSMET
0,0,Japan,1977-01-01,0.022951,2.187257,6.333333,2.3,0.00000,0.052425,0.144757,0.041940,0.112740
1,1,Canada,1977-01-01,0.023904,1.533830,8.166667,8.8,0.00000,0.052425,0.144757,0.041940,0.112740
2,2,United States,1977-01-01,0.017231,1.186233,4.700000,8.2,0.00000,0.052425,0.144757,0.041940,0.112740
3,3,Japan,1977-04-01,0.027244,0.703788,5.000000,2.0,-0.00400,0.043586,-0.066688,-0.045412,-0.060029
4,4,Canada,1977-04-01,0.023346,0.510302,7.666667,8.0,-0.00400,0.043586,-0.066688,-0.045412,-0.060029
...,...,...,...,...,...,...,...,...,...,...,...,...
1317,2164,Netherlands,2023-01-01,-0.013667,-0.494664,3.000000,3.7,-0.02041,-0.585458,0.002037,0.007418,0.043183
1318,2165,Switzerland,2023-01-01,0.010084,0.309543,1.166667,4.4,-0.02041,-0.585458,0.002037,0.007418,0.043183
1319,2166,Italy,2023-01-01,0.003938,0.552944,3.000000,8.3,-0.02041,-0.585458,0.002037,0.007418,0.043183
1320,2167,Canada,2023-01-01,0.006080,0.610026,4.500000,5.4,-0.02041,-0.585458,0.002037,0.007418,0.043183


In [12]:

exog_columns = [
    "gdp_growth",
    "interest_rate",
    "unemployment_rate",
    "commodity_CRUDE_PETRO",
    "commodity_iNATGAS",
    "commodity_iAGRICULTURE",
    "commodity_iMETMIN",
    "commodity_iPRECIOUSMET",
]

stats = get_stats(
    [
        ("4qa", lambda: FourQuarterAverageModel()),
        ("ar", lambda: ARMAXModel(max_p=2, max_q=0)),
        (
            "arx",
            lambda: ARMAXModel(max_p=2, max_q=0, exogenous_columns=exog_columns),
        ),
        ("arma", lambda: ARMAXModel(max_p=2, max_q=2)),
        (
            "armax",
            lambda: ARMAXModel(max_p=2, max_q=2, exogenous_columns=exog_columns),
        ),
        (
            "uc_sv",
            lambda: UCSVSSModel(num_particles=10000, stochastic_seasonality=False),
        ),
        (
            "uc_sv_ss",
            lambda: UCSVSSModel(num_particles=10000, stochastic_seasonality=True),
        ),
        ("random_effects", lambda: RandomEffectsModel()),
        (
            "random_effects_x",
            lambda: RandomEffectsModel(exogenous_columns=exog_columns),
        ),
        ("distance", lambda: DistanceModel()),
        ("distance_x", lambda: DistanceModel(exogenous_columns=exog_columns)),
    ],
    dfs,
    tss,
    num_cores_parallel_models=1,  # only 1 model at once because models use joblib internally and nested joblib gives issues
)
stats

  forecast += exog_coefs[j] * exogData[i-1]
  forecast += exog_coefs[j] * exogData[i-1]
  forecast += exog_coefs[j] * exogData[i-1]
  forecast += exog_coefs[j] * exogData[i-1]
  forecast += exog_coefs[j] * exogData[i-1]
  forecast += exog_coefs[j] * exogData[i-1]
  forecast += exog_coefs[j] * exogData[i-1]
  forecast += exog_coefs[j] * exogData[i-1]
  forecast += exog_coefs[j] * exogData[i-1]
  forecast += exog_coefs[j] * exogData[i-1]
  forecast += exog_coefs[j] * exogData[i-1]
  forecast += exog_coefs[j] * exogData[i-1]
  forecast += exog_coefs[j] * exogData[i-1]
  forecast += exog_coefs[j] * exogData[i-1]
  forecast += exog_coefs[j] * exogData[i-1]
  forecast += exog_coefs[j] * exogData[i-1]
  forecast += exog_coefs[j] * exogData[i-1]
  forecast += exog_coefs[j] * exogData[i-1]
  forecast += exog_coefs[j] * exogData[i-1]
  forecast += exog_coefs[j] * exogData[i-1]
  forecast += exog_coefs[j] * exogData[i-1]
  forecast += exog_coefs[j] * exogData[i-1]
  forecast += exog_coefs[j] * ex

Fitting model on the full dataset... (model=<seminartools.models.uc_sv_ss_model.UCSVSSModel object at 0x7efbee88ddd0>)


100%|██████████| 12/12 [00:00<00:00, 29.39it/s]


Fitted!




Fitting model on the full dataset... (model=<seminartools.models.uc_sv_ss_model.UCSVSSModel object at 0x7efbeb5e55d0>)


100%|██████████| 12/12 [00:00<00:00, 62.04it/s]


Fitted!




 |████████████████████████████| 100.00% [8000/8000 00:00<00:00  Chains in warmup: 0, Divergences: 0]



 |████████████████████████████| 100.00% [8000/8000 00:01<00:00  Chains in warmup: 0, Divergences: 0]

Creating regression coefficients: 100%|██████████| 1/1 [00:00<00:00,  3.84it/s]
Creating regression coefficients: 100%|██████████| 1/1 [00:00<00:00,  3.86it/s]
Creating regression coefficients: 100%|██████████| 1/1 [00:00<00:00,  3.84it/s]
Creating regression coefficients: 100%|██████████| 1/1 [00:00<00:00,  3.74it/s]
Creating regression coefficients: 100%|██████████| 1/1 [00:00<00:00,  3.81it/s]


 |████████████████████████████| 100.00% [8000/8000 00:01<00:00  Chains in warmup: 0, Divergences: 0]

Creating regression coefficients: 100%|██████████| 9/9 [00:02<00:00,  3.85it/s]
Creating regression coefficients: 100%|██████████| 9/9 [00:02<00:00,  3.83it/s]
Creating regression coefficients: 100%|██████████| 9/9 [00:02<00:00,  3.82it/s]
Creating regression coefficients: 100%|██████████| 9/9 [00:02<00:00,  3.79it/s]
Creating regression coefficients: 100%|██████████| 9/9 [00:02<00:00,  3.80it/s]


 |████████████████████████████| 100.00% [8000/8000 00:09<00:00  Chains in warmup: 0, Divergences: 0]



Unnamed: 0,mse,mae,r2,mz_intercept,mz_slope,mz_r2,mz_wald_test_statistic,p-value
uc_sv_ss,3.3e-05,0.003971,0.318768,0.001059,0.814733,0.337382,13.79243,1.237689071716356e-06
4qa,4.2e-05,0.004714,0.125978,0.001671,0.671936,0.166367,24.37011,4.6194904290365215e-11
uc_sv,4.7e-05,0.004977,0.029068,0.002137,0.568927,0.070518,21.896154,4.972290200914971e-10
ar,5.2e-05,0.00522,-0.001561,0.002097,0.501266,0.063979,29.828206,3.014255382768282e-13
random_effects,6e-05,0.005601,-0.241196,0.005846,-0.223648,0.007983,122.200818,4.501047092013103e-48
random_effects_x,6.1e-05,0.005577,-0.260044,0.004536,0.030753,0.000243,126.660062,1.2915797098649395e-49
distance,6.2e-05,0.005657,-0.274499,0.005589,-0.181915,0.006526,137.616627,2.3389188145146147e-53
distance_x,7.8e-05,0.00644,-0.609887,0.004706,-0.018445,0.000184,296.854253,2.26535763782382e-101
arx,0.000181,0.008683,-2.458343,0.004481,0.020182,0.001028,1048.769823,1.786331214115679e-230
arma,0.000276,0.006805,-4.291627,0.004273,0.058948,0.016906,1866.997323,3.94982513137e-312


In [13]:
# format r2 column to 2 
format_stats = stats.copy()
format_stats['p-value'] =format_stats['p-value'].astype(float)
round_cols = ["r2", "mz_intercept", "mz_slope", "mz_r2", "mz_wald_test_statistic", "p-value"]
for col in round_cols:
    format_stats[col] = format_stats[col].apply(lambda x: round(x, 3))

format_stats['mz_wald_test_statistic'] = format_stats['mz_wald_test_statistic'].astype(str) + '(' + format_stats['p-value'].astype(str) + ')'
format_stats.drop(columns = ['p-value'], inplace = True)

# get relative to UCSV
format_stats.loc[:,["mse","mae"]] = format_stats.loc[:,["mse","mae"]].div(format_stats.loc["uc_sv",["mse","mae"]])
format_stats

Unnamed: 0,mse,mae,r2,mz_intercept,mz_slope,mz_r2,mz_wald_test_statistic
uc_sv_ss,3.3e-05,0.003971,0.319,0.001,0.815,0.337,13.792(0.0)
4qa,4.2e-05,0.004714,0.126,0.002,0.672,0.166,24.37(0.0)
uc_sv,4.7e-05,0.004977,0.029,0.002,0.569,0.071,21.896(0.0)
ar,5.2e-05,0.00522,-0.002,0.002,0.501,0.064,29.828(0.0)
random_effects,6e-05,0.005601,-0.241,0.006,-0.224,0.008,122.201(0.0)
random_effects_x,6.1e-05,0.005577,-0.26,0.005,0.031,0.0,126.66(0.0)
distance,6.2e-05,0.005657,-0.274,0.006,-0.182,0.007,137.617(0.0)
distance_x,7.8e-05,0.00644,-0.61,0.005,-0.018,0.0,296.854(0.0)
arx,0.000181,0.008683,-2.458,0.004,0.02,0.001,1048.77(0.0)
arma,0.000276,0.006805,-4.292,0.004,0.059,0.017,1866.997(0.0)


In [14]:
# generate latex code for the stats table
print(
    # replace _ by \_ in columns and index
    format_stats.pipe(lambda df: df.rename(columns=lambda x: x.replace("_", "\\_")))
    .pipe(lambda df: df.rename(index=lambda x: x.replace("_", "\\_")))
    .to_latex(
        index=True,
        caption="MSPE, MAE, $R^2$, and Mincer-Zarnowitz statistics, out-of-sample, 2000-2023",
        label="tab:model_eval",
        float_format="%.4f",
    )
)

\begin{table}
\caption{MSPE, MAE, $R^2$, and Mincer-Zarnowitz statistics, out-of-sample, 2000-2023}
\label{tab:model_eval}
\begin{tabular}{lrrrrrrl}
\toprule
 & mse & mae & r2 & mz\_intercept & mz\_slope & mz\_r2 & mz\_wald\_test\_statistic \\
\midrule
uc\_sv\_ss & 0.0000 & 0.0040 & 0.3190 & 0.0010 & 0.8150 & 0.3370 & 13.792(0.0) \\
4qa & 0.0000 & 0.0047 & 0.1260 & 0.0020 & 0.6720 & 0.1660 & 24.37(0.0) \\
uc\_sv & 0.0000 & 0.0050 & 0.0290 & 0.0020 & 0.5690 & 0.0710 & 21.896(0.0) \\
ar & 0.0001 & 0.0052 & -0.0020 & 0.0020 & 0.5010 & 0.0640 & 29.828(0.0) \\
random\_effects & 0.0001 & 0.0056 & -0.2410 & 0.0060 & -0.2240 & 0.0080 & 122.201(0.0) \\
random\_effects\_x & 0.0001 & 0.0056 & -0.2600 & 0.0050 & 0.0310 & 0.0000 & 126.66(0.0) \\
distance & 0.0001 & 0.0057 & -0.2740 & 0.0060 & -0.1820 & 0.0070 & 137.617(0.0) \\
distance\_x & 0.0001 & 0.0064 & -0.6100 & 0.0050 & -0.0180 & 0.0000 & 296.854(0.0) \\
arx & 0.0002 & 0.0087 & -2.4580 & 0.0040 & 0.0200 & 0.0010 & 1048.77(0.0) \\
arma & 0.00