In [60]:
# Import libraries
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns

# Notebook-wide display settings: lift pandas' limits on shown columns and on
# output width by setting both options to None.
for _display_option in ('display.max_columns', 'display.width'):
    pd.set_option(_display_option, None)
print("Libraries imported successfully")

Libraries imported successfully


# Part 1: Replication - Contemporaneous Levels (Y(t) on X(t))

**Objective**: Replicate the original paper's approach of identifying real-time (contemporaneous) associations between beef prices and fundamentals.

**Model**: Y(t) = β₀ + β₁X(t) + ε(t)

**Limitation**: Non-stationarity in levels risks spurious correlation.

In [61]:
# Read the levels dataset; the 'index' column is used as the row index and
# parsed as dates.
df_clean = pd.read_csv(
    '../data/df_clean.csv',
    index_col='index',
    parse_dates=True,
)

# Quick sanity summary: dimensions, covered period, and a preview of the rows.
first_obs, last_obs = df_clean.index.min(), df_clean.index.max()
print(f"Shape: {df_clean.shape}")
print(f"Date range: {first_obs} to {last_obs}")
print(f"\nFirst few rows:")
df_clean.head()

Shape: (242, 18)
Date range: 2002-11-01 00:00:00 to 2022-12-01 00:00:00

First few rows:


Unnamed: 0_level_0,PMAIZMTUSDM,PBEEFUSDM,PPORKUSDM,PLAMBUSDM,PPOULTUSDM,POILBREUSDM,fao_food_index,bioethanol_production,retail_china,fnbretail_USA,D0_USA,D1_USA,D2_USA,D3_USA,D4_USA,cdd_BR,precip_BR,enso_anomaly
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2002-11-01,108.646429,81.5625,40.602105,154.961183,61.904762,24.250476,56.6,166.0,383110.0,27058.0,13.67,13.34,10.76,10.09,0.74,7.33,140.41,1.41
2002-12-01,107.012648,84.0,43.672,157.93638,61.488636,28.317273,56.0,176.0,427020.0,27440.0,11.42,9.53,12.85,10.44,0.84,5.91,218.97,1.31
2003-01-01,105.844256,85.35,46.5835,162.690205,62.253623,30.808696,56.2,177.0,408710.0,27263.0,14.97,10.42,8.95,15.26,2.0,4.71,219.03,0.7
2003-02-01,106.14108,83.75,48.164211,159.704332,63.0625,32.6625,56.9,169.0,385660.0,27165.0,14.75,10.33,10.24,14.99,1.73,4.51,207.76,0.74
2003-03-01,105.131879,83.875,48.8385,153.104803,63.84127,30.168095,55.9,175.0,364520.0,27611.0,17.11,11.86,11.82,12.34,0.46,3.88,236.61,0.45


## Step 1: Full Model

In [62]:
# Dependent variable (levels)
y_clean = df_clean['PBEEFUSDM']

# Full candidate regressor list
regressor_vars = [
    'PPORKUSDM', 'PLAMBUSDM', 'PPOULTUSDM', 'POILBREUSDM',
    'fao_food_index', 'bioethanol_production', 'retail_china', 'fnbretail_USA',
    'D0_USA', 'D1_USA', 'D2_USA', 'D3_USA', 'D4_USA',
    'cdd_BR', 'precip_BR', 'enso_anomaly',
]

# Contemporaneous design matrix: X(t) explains Y(t); add_constant supplies the intercept.
X_clean = sm.add_constant(df_clean[regressor_vars])

# Positional split (no shuffling): first 200 rows train, the remainder test.
n_train = 200
X_train, X_test = X_clean.iloc[:n_train], X_clean.iloc[n_train:]
y_train, y_test = y_clean.iloc[:n_train], y_clean.iloc[n_train:]

# Fit OLS on the training window and report the summary table.
model1_full = sm.OLS(y_train, X_train).fit()
banner = "=" * 80
print(banner)
print("MODEL 1: CONTEMPORANEOUS LEVELS (Y(t) on X(t)) - FULL MODEL")
print(banner)
print(model1_full.summary())

MODEL 1: CONTEMPORANEOUS LEVELS (Y(t) on X(t)) - FULL MODEL
                            OLS Regression Results                            
Dep. Variable:              PBEEFUSDM   R-squared:                       0.885
Model:                            OLS   Adj. R-squared:                  0.875
Method:                 Least Squares   F-statistic:                     87.97
Date:                Tue, 02 Dec 2025   Prob (F-statistic):           7.28e-77
Time:                        15:17:26   Log-Likelihood:                -812.59
No. Observations:                 200   AIC:                             1659.
Df Residuals:                     183   BIC:                             1715.
Df Model:                          16                                         
Covariance Type:            nonrobust                                         
                            coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------

## Step 2: Multicollinearity Diagnostics

In [63]:
# Variance inflation factor for every regressor except the intercept
# (the constant sits in column 0 of the design matrix, so start at 1).
design_values = X_train.values
vif_data = pd.DataFrame({
    "Variable": X_train.columns[1:],
    "VIF": [
        variance_inflation_factor(design_values, col_pos)
        for col_pos in range(1, X_train.shape[1])
    ],
})
vif_data = vif_data.sort_values('VIF', ascending=False)

print("VIF (>10 = multicollinearity)")
print(vif_data)
print(f"\nCondition number: {np.linalg.cond(X_train):.2f}")

VIF (>10 = multicollinearity)
                 Variable        VIF
6            retail_china  57.774775
7           fnbretail_USA  53.307153
5   bioethanol_production  25.459849
4          fao_food_index  16.324934
2              PPOULTUSDM  12.321875
14              precip_BR  10.784678
13                 cdd_BR  10.735751
3             POILBREUSDM  10.135622
10                 D2_USA   5.226190
1               PLAMBUSDM   4.704289
11                 D3_USA   4.255431
9                  D1_USA   3.400054
0               PPORKUSDM   2.405659
12                 D4_USA   2.233180
8                  D0_USA   2.136614
15           enso_anomaly   1.787406

Condition number: 50271876.73


In [64]:
# Pairwise correlations among the training regressors; collect every pair whose
# absolute correlation exceeds 0.7.
corr_matrix = X_train.corr()
col_names = corr_matrix.columns
n_cols = len(col_names)
high_corr = [
    (col_names[row], col_names[col], corr_matrix.iloc[row, col])
    for row in range(n_cols)
    for col in range(row + 1, n_cols)  # upper triangle only: each pair once
    if abs(corr_matrix.iloc[row, col]) > 0.7
]

print("High Correlations (|r| > 0.7)")
# Strongest relationships first, regardless of sign.
ranked_pairs = sorted(high_corr, key=lambda pair: abs(pair[2]), reverse=True)
for left_var, right_var, r_value in ranked_pairs:
    print(f"{left_var:25} <-> {right_var:25} : {r_value:7.3f}")

High Correlations (|r| > 0.7)
retail_china              <-> fnbretail_USA             :   0.983
cdd_BR                    <-> precip_BR                 :  -0.949
PPOULTUSDM                <-> fnbretail_USA             :   0.945
PPOULTUSDM                <-> retail_china              :   0.930
bioethanol_production     <-> retail_china              :   0.893
POILBREUSDM               <-> fao_food_index            :   0.884
bioethanol_production     <-> fnbretail_USA             :   0.872
PPOULTUSDM                <-> bioethanol_production     :   0.845
PLAMBUSDM                 <-> retail_china              :  -0.807
PLAMBUSDM                 <-> fnbretail_USA             :  -0.774
D2_USA                    <-> D3_USA                    :   0.755
PLAMBUSDM                 <-> bioethanol_production     :  -0.728
PLAMBUSDM                 <-> PPOULTUSDM                :  -0.723


## Step 3: Reduced Model (Remove Collinear Variables)

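**Note**: The VIF and correlation diagnostics above confirm severe multicollinearity in the full model: VIF exceeds 10 for the retail variables (`retail_china`, `fnbretail_USA`), `bioethanol_production`, `fao_food_index`, `PPOULTUSDM`, `POILBREUSDM`, and the Brazilian climate variables (`cdd_BR`, `precip_BR`). The drought categories all have VIF below 6, although `D2_USA` and `D3_USA` are highly correlated with each other (r = 0.755).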

**Strategy**: Remove variables with VIF > 10 and high pairwise correlations (|r| > 0.7), prioritizing retention of theoretically important variables.

In [78]:
# Reduced regressor set, derived from the correlation analysis (|r| > 0.7).
# Dropped:
# - retail_china (corr=0.983 with fnbretail_USA; US retail is kept)
# - PPOULTUSDM (corr=0.945 with fnbretail_USA, 0.930 with retail_china)
# - bioethanol_production (corr=0.893 with retail_china, 0.872 with fnbretail_USA)
# - fao_food_index (corr=0.884 with POILBREUSDM; oil kept as more fundamental)
# - cdd_BR (corr=-0.949 with precip_BR; precipitation is kept)
# - D2_USA and D3_USA (corr=0.755 with each other)
# Retained drought severity indicators: D0, D1, D4.
# NOTE(review): this comment previously said D2_USA was kept, but D2_USA is not
# in the list below and the saved fit reports Df Model = 9 - confirm whether
# dropping D2_USA is intended.
regressor_vars_reduced = [
    'PPORKUSDM', 'PLAMBUSDM', 'POILBREUSDM', 'fnbretail_USA',
    'D0_USA', 'D1_USA', 'D4_USA',
    'precip_BR', 'enso_anomaly',
]

# Design matrix with intercept, split at the same row position as the full model.
X_clean_reduced = sm.add_constant(df_clean[regressor_vars_reduced])

n_train = 200
X_train_reduced = X_clean_reduced.iloc[:n_train]
X_test_reduced = X_clean_reduced.iloc[n_train:]

# Fit the reduced specification on the training window.
model1_reduced = sm.OLS(y_train, X_train_reduced).fit()
banner = "=" * 80
print(banner)
print("MODEL 1: REDUCED MODEL (REMOVED COLLINEAR VARIABLES)")
print(banner)
print(model1_reduced.summary())

MODEL 1: REDUCED MODEL (REMOVED COLLINEAR VARIABLES)
                            OLS Regression Results                            
Dep. Variable:              PBEEFUSDM   R-squared:                       0.860
Model:                            OLS   Adj. R-squared:                  0.853
Method:                 Least Squares   F-statistic:                     129.3
Date:                Tue, 02 Dec 2025   Prob (F-statistic):           4.42e-76
Time:                        15:29:47   Log-Likelihood:                -832.45
No. Observations:                 200   AIC:                             1685.
Df Residuals:                     190   BIC:                             1718.
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------

In [79]:
# Re-run the VIF check on the reduced design matrix; the intercept (column 0)
# is skipped.
reduced_values = X_train_reduced.values
vif_reduced = pd.DataFrame({
    "Variable": X_train_reduced.columns[1:],
    "VIF": [
        variance_inflation_factor(reduced_values, col_pos)
        for col_pos in range(1, X_train_reduced.shape[1])
    ],
})
vif_reduced = vif_reduced.sort_values('VIF', ascending=False)

print("VIF - Reduced Model")
print(vif_reduced)
all_below_threshold = (vif_reduced['VIF'] < 10).all()
print(f"\nAll VIF < 10? {all_below_threshold}")
print(f"Condition number: {np.linalg.cond(X_train_reduced):.2f}")

VIF - Reduced Model
        Variable       VIF
1      PLAMBUSDM  3.435111
3  fnbretail_USA  3.181669
2    POILBREUSDM  2.430780
0      PPORKUSDM  2.023928
8   enso_anomaly  1.522462
6         D4_USA  1.505573
5         D1_USA  1.475424
4         D0_USA  1.451506
7      precip_BR  1.152923

All VIF < 10? True
Condition number: 864339.43


## Step 4: Discussion of Results

### Comparison: Full Model vs. Reduced Model

**Full Model Issues:**
- Severe multicollinearity (VIF >50 for retail variables)
- Unstable coefficients due to high condition number
- Unreliable standard errors and p-values
- Risk of spurious significance due to collinear variables

**Reduced Model Improvements:**
- All VIF < 10 (multicollinearity resolved)
- Stable, interpretable coefficients
- More reliable hypothesis tests

### Economic Interpretation of Significant Predictors

**Important Caveat**: These are **associations in non-stationary levels data** - coefficients show contemporaneous correlations, not causal effects. Non-stationarity means variables may appear significant due to common trends (e.g., inflation, economic growth) rather than true economic relationships. First-difference models (Part 2) address this limitation.



# Part 2: First-Difference Models (Addressing Non-Stationarity)

**Motivation**: Transform to first-differences to remove trends and achieve stationarity.

- **Model 2**: ΔY(t) = β₀ + β₁ΔX(t) + ε(t) — contemporaneous changes
- **Model 3**: ΔY(t) = β₀ + β₁ΔX(t-1) + ε(t) — lagged changes (prediction)

**Advantage**: Eliminates spurious correlation from common trends.

In [68]:
# Read the first-differenced dataset; the 'index' column is used as the row
# index and parsed as dates.
# NOTE(review): in the saved preview, PPORKUSDM, POILBREUSDM, D0-D4_USA, cdd_BR,
# precip_BR and enso_anomaly show the same values as the levels file (e.g.
# PPORKUSDM = 43.672 on 2002-12-01), so these columns appear NOT to be
# differenced - confirm this is intentional before reading their coefficients
# as effects of changes. The transformed columns look like log-differences
# (ln(84.0 / 81.5625) ~= 0.0294 for PBEEFUSDM) - TODO confirm.
df_firstdiff = pd.read_csv(
    '../data/df_firstdiff.csv',
    index_col='index',
    parse_dates=True,
)

# Quick sanity summary: dimensions, covered period, and a preview of the rows.
first_obs, last_obs = df_firstdiff.index.min(), df_firstdiff.index.max()
print(f"Shape: {df_firstdiff.shape}")
print(f"Date range: {first_obs} to {last_obs}")
print(f"\nFirst few rows:")
df_firstdiff.head()

Shape: (241, 18)
Date range: 2002-12-01 00:00:00 to 2022-12-01 00:00:00

First few rows:


Unnamed: 0_level_0,PMAIZMTUSDM,PBEEFUSDM,PPORKUSDM,PLAMBUSDM,PPOULTUSDM,POILBREUSDM,fao_food_index,bioethanol_production,retail_china,fnbretail_USA,D0_USA,D1_USA,D2_USA,D3_USA,D4_USA,cdd_BR,precip_BR,enso_anomaly
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2002-12-01,-0.015152,0.029447,43.672,0.019018,-0.006745,28.317273,-0.010657,0.058496,0.108509,0.014019,11.42,9.53,12.85,10.44,0.84,5.91,218.97,1.31
2003-01-01,-0.010978,0.015944,46.5835,0.029656,0.012364,30.808696,0.003565,0.005666,-0.043825,-0.006471,14.97,10.42,8.95,15.26,2.0,4.71,219.03,0.7
2003-02-01,0.0028,-0.018924,48.164211,-0.018524,0.01291,32.6625,0.012379,-0.046251,-0.05805,-0.003601,14.75,10.33,10.24,14.99,1.73,4.51,207.76,0.74
2003-03-01,-0.009554,0.001491,48.8385,-0.042202,0.012274,30.168095,-0.017731,0.034887,-0.056375,0.016285,17.11,11.86,11.82,12.34,0.46,3.88,236.61,0.45
2003-04-01,0.00196,-0.015015,49.207143,-0.026875,-0.004764,25.035,0.0,0.0226,-0.030641,0.002424,21.05,13.13,12.53,9.09,0.2,6.7,188.09,-0.05


## Model 2: Contemporaneous Changes (ΔY(t) on ΔX(t))

Tests whether current changes in fundamentals correlate with current beef price changes.

In [69]:
# Model 2, full specification: the target from the first-difference file
# regressed on same-period regressors from that file.
y_diff = df_firstdiff['PBEEFUSDM']

regressor_vars_diff = [
    'PPORKUSDM', 'PLAMBUSDM', 'PPOULTUSDM', 'POILBREUSDM',
    'fao_food_index', 'bioethanol_production', 'retail_china', 'fnbretail_USA',
    'D0_USA', 'D1_USA', 'D2_USA', 'D3_USA', 'D4_USA',
    'cdd_BR', 'precip_BR', 'enso_anomaly',
]

# Design matrix with intercept
X_diff_contemp = sm.add_constant(df_firstdiff[regressor_vars_diff])

# Positional split (no shuffling): first 200 rows train, the remainder test.
n_train = 200
X_train_contemp, X_test_contemp = X_diff_contemp.iloc[:n_train], X_diff_contemp.iloc[n_train:]
y_train_diff, y_test_diff = y_diff.iloc[:n_train], y_diff.iloc[n_train:]

# Fit OLS on the training window and report the summary table.
model2_full = sm.OLS(y_train_diff, X_train_contemp).fit()
banner = "=" * 80
print(banner)
print("MODEL 2: CONTEMPORANEOUS FIRST-DIFFERENCES (ΔY(t) on ΔX(t)) - FULL MODEL")
print(banner)
print(model2_full.summary())

MODEL 2: CONTEMPORANEOUS FIRST-DIFFERENCES (ΔY(t) on ΔX(t)) - FULL MODEL
                            OLS Regression Results                            
Dep. Variable:              PBEEFUSDM   R-squared:                       0.111
Model:                            OLS   Adj. R-squared:                  0.033
Method:                 Least Squares   F-statistic:                     1.426
Date:                Tue, 02 Dec 2025   Prob (F-statistic):              0.133
Time:                        15:23:09   Log-Likelihood:                 356.90
No. Observations:                 200   AIC:                            -679.8
Df Residuals:                     183   BIC:                            -623.7
Df Model:                          16                                         
Covariance Type:            nonrobust                                         
                            coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------

In [70]:
# Variance inflation factors for the Model 2 design matrix; the intercept
# (column 0) is skipped.
contemp_values = X_train_contemp.values
vif_diff = pd.DataFrame({
    "Variable": X_train_contemp.columns[1:],
    "VIF": [
        variance_inflation_factor(contemp_values, col_pos)
        for col_pos in range(1, X_train_contemp.shape[1])
    ],
})
vif_diff = vif_diff.sort_values('VIF', ascending=False)

print("VIF - Model 2")
print(vif_diff)

VIF - Model 2
                 Variable        VIF
14              precip_BR  11.638633
13                 cdd_BR  11.523958
10                 D2_USA   4.885769
11                 D3_USA   4.117065
9                  D1_USA   3.223181
3             POILBREUSDM   2.632517
0               PPORKUSDM   2.160816
12                 D4_USA   1.988864
8                  D0_USA   1.694004
15           enso_anomaly   1.477739
1               PLAMBUSDM   1.322915
4          fao_food_index   1.312493
6            retail_china   1.209796
2              PPOULTUSDM   1.193681
5   bioethanol_production   1.090431
7           fnbretail_USA   1.049536


In [71]:
# Pairwise correlations among the Model 2 training regressors; collect every
# pair whose absolute correlation exceeds 0.7.
corr_diff = X_train_contemp.corr()
col_names = corr_diff.columns
n_cols = len(col_names)
high_corr_diff = [
    (col_names[row], col_names[col], corr_diff.iloc[row, col])
    for row in range(n_cols)
    for col in range(row + 1, n_cols)  # upper triangle only: each pair once
    if abs(corr_diff.iloc[row, col]) > 0.7
]

print("High Correlations (|r| > 0.7)")
if not high_corr_diff:
    print("None found")
else:
    # Strongest relationships first, regardless of sign.
    ranked_pairs = sorted(high_corr_diff, key=lambda pair: abs(pair[2]), reverse=True)
    for left_var, right_var, r_value in ranked_pairs:
        print(f"{left_var:25} <-> {right_var:25} : {r_value:7.3f}")

High Correlations (|r| > 0.7)
cdd_BR                    <-> precip_BR                 :  -0.950
D2_USA                    <-> D3_USA                    :   0.758


### Model 2 Multicollinearity Assessment

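Only `cdd_BR` and `precip_BR` have VIF > 10 in the first-difference data (about 11.5 and 11.6), driven by their pairwise correlation of -0.950. `D2_USA` and `D3_USA` are also correlated (r = 0.758), but their VIFs stay below 5, so they are retained. The reduced model therefore removes only the two Brazilian climate variables.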

### Reduced Model 2

In [73]:
# Reduced Model 2: drop only the two climate variables that are highly
# correlated with each other.
# Removed: cdd_BR and precip_BR (corr=-0.950 in the first-difference training data)
# Everything else is kept, including all drought indicators, retail, and food variables.
regressor_vars_diff_reduced = [
    'PPORKUSDM', 'PLAMBUSDM', 'PPOULTUSDM', 'POILBREUSDM',
    'fao_food_index', 'bioethanol_production', 'retail_china', 'fnbretail_USA',
    'D0_USA', 'D1_USA', 'D2_USA', 'D3_USA', 'D4_USA',
    'enso_anomaly',
]

# Design matrix with intercept, split at the same row position as the full Model 2.
X_diff_contemp_reduced = sm.add_constant(df_firstdiff[regressor_vars_diff_reduced])

n_train = 200
X_train_contemp_reduced = X_diff_contemp_reduced.iloc[:n_train]
X_test_contemp_reduced = X_diff_contemp_reduced.iloc[n_train:]

model2_reduced = sm.OLS(y_train_diff, X_train_contemp_reduced).fit()
banner = "=" * 80
print(banner)
print("MODEL 2: REDUCED MODEL (REMOVED COLLINEAR VARIABLES)")
print(banner)
print(model2_reduced.summary())

MODEL 2: REDUCED MODEL (REMOVED COLLINEAR VARIABLES)
                            OLS Regression Results                            
Dep. Variable:              PBEEFUSDM   R-squared:                       0.107
Model:                            OLS   Adj. R-squared:                  0.040
Method:                 Least Squares   F-statistic:                     1.591
Date:                Tue, 02 Dec 2025   Prob (F-statistic):             0.0848
Time:                        15:24:58   Log-Likelihood:                 356.52
No. Observations:                 200   AIC:                            -683.0
Df Residuals:                     185   BIC:                            -633.6
Df Model:                          14                                         
Covariance Type:            nonrobust                                         
                            coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------

## Model 3: Lagged Changes (ΔY(t) on ΔX(t-1))

**Key Question**: Can past changes predict current changes? This tests true predictive power.

In [74]:
# Model 3, full specification: shift every regressor down one row so that each
# row pairs the current target with the previous row's regressor values, then
# add the intercept.
X_diff_lagged = sm.add_constant(df_firstdiff[regressor_vars_diff].shift(1))

# Shifting introduces NaNs; keep only the complete rows and align the target
# to the same index labels.
valid_idx = X_diff_lagged.dropna().index
X_diff_lagged_clean = X_diff_lagged.loc[valid_idx]
y_diff_lagged_clean = y_diff.loc[valid_idx]

# Positional split (no shuffling): first 200 rows train, the remainder test.
n_train = 200
X_train_lagged, X_test_lagged = X_diff_lagged_clean.iloc[:n_train], X_diff_lagged_clean.iloc[n_train:]
y_train_lagged, y_test_lagged = y_diff_lagged_clean.iloc[:n_train], y_diff_lagged_clean.iloc[n_train:]

# Fit OLS on the training window and report the summary table.
model3_full = sm.OLS(y_train_lagged, X_train_lagged).fit()
banner = "=" * 80
print(banner)
print("MODEL 3: LAGGED FIRST-DIFFERENCES (ΔY(t) on ΔX(t-1)) - FULL MODEL")
print(banner)
print(model3_full.summary())

MODEL 3: LAGGED FIRST-DIFFERENCES (ΔY(t) on ΔX(t-1)) - FULL MODEL
                            OLS Regression Results                            
Dep. Variable:              PBEEFUSDM   R-squared:                       0.054
Model:                            OLS   Adj. R-squared:                 -0.029
Method:                 Least Squares   F-statistic:                    0.6512
Date:                Tue, 02 Dec 2025   Prob (F-statistic):              0.838
Time:                        15:25:16   Log-Likelihood:                 350.84
No. Observations:                 200   AIC:                            -667.7
Df Residuals:                     183   BIC:                            -611.6
Df Model:                          16                                         
Covariance Type:            nonrobust                                         
                            coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------

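**Note**: The reduced Model 3 uses the same reduced variable set as Model 2 (`regressor_vars_diff_reduced`, i.e. all regressors except `cdd_BR` and `precip_BR`) so that the contemporaneous and lagged first-difference models are directly comparable. This set differs from Model 1's reduced set, which was chosen from the levels correlation analysis.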

## Reduced Model 3

In [75]:
# Reduced Model 3: same variable list as the reduced Model 2
# (regressor_vars_diff_reduced), shifted down one row so each row pairs the
# current target with the previous row's regressor values.
X_diff_lagged_reduced = sm.add_constant(df_firstdiff[regressor_vars_diff_reduced].shift(1))

# Shifting introduces NaNs; keep only the complete rows and align the target
# to the same index labels.
valid_idx_reduced = X_diff_lagged_reduced.dropna().index
X_diff_lagged_reduced_clean = X_diff_lagged_reduced.loc[valid_idx_reduced]
y_diff_lagged_reduced_clean = y_diff.loc[valid_idx_reduced]

# Positional split (no shuffling): first 200 rows train, the remainder test.
n_train = 200
X_train_lagged_reduced = X_diff_lagged_reduced_clean.iloc[:n_train]
X_test_lagged_reduced = X_diff_lagged_reduced_clean.iloc[n_train:]
y_train_lagged_reduced = y_diff_lagged_reduced_clean.iloc[:n_train]
y_test_lagged_reduced = y_diff_lagged_reduced_clean.iloc[n_train:]

model3_reduced = sm.OLS(y_train_lagged_reduced, X_train_lagged_reduced).fit()
banner = "=" * 80
print(banner)
print("MODEL 3: REDUCED MODEL (LAGGED FIRST-DIFFERENCES - ΔY(t) on ΔX(t-1))")
print(banner)
print(model3_reduced.summary())

MODEL 3: REDUCED MODEL (LAGGED FIRST-DIFFERENCES - ΔY(t) on ΔX(t-1))
                            OLS Regression Results                            
Dep. Variable:              PBEEFUSDM   R-squared:                       0.053
Model:                            OLS   Adj. R-squared:                 -0.018
Method:                 Least Squares   F-statistic:                    0.7425
Date:                Tue, 02 Dec 2025   Prob (F-statistic):              0.730
Time:                        15:25:20   Log-Likelihood:                 350.77
No. Observations:                 200   AIC:                            -671.5
Df Residuals:                     185   BIC:                            -622.1
Df Model:                          14                                         
Covariance Type:            nonrobust                                         
                            coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------

# Summary: Model Comparison

Compare all three models to understand:
- What correlates with beef prices (Model 1)
- What correlates with beef price changes (Model 2)
- What predicts beef price changes (Model 3)

# Model Evaluation on Test Set

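Evaluate all three reduced models on held-out test data (the observations after the first 200 rows: 42, 41 and 40 rows for Models 1, 2 and 3 respectively, roughly 17% of each sample) to assess out-of-sample performance.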

In [76]:
# Test set predictions and evaluation
# The metric functions (mean_squared_error, mean_absolute_error, r2_score) are
# imported once in the notebook's import cell, so they are not re-imported here.


def _score_forecast(y_true, y_pred):
    """Return ``(mse, rmse, mae, r2)`` for predictions ``y_pred`` against ``y_true``."""
    mse = mean_squared_error(y_true, y_pred)
    return mse, np.sqrt(mse), mean_absolute_error(y_true, y_pred), r2_score(y_true, y_pred)


# Model 1: Levels
y_pred_m1 = model1_reduced.predict(X_test_reduced)
mse_m1, rmse_m1, mae_m1, r2_m1 = _score_forecast(y_test, y_pred_m1)

# Model 2: Contemporaneous Changes
y_pred_m2 = model2_reduced.predict(X_test_contemp_reduced)
mse_m2, rmse_m2, mae_m2, r2_m2 = _score_forecast(y_test_diff, y_pred_m2)

# Model 3: Lagged Changes
y_pred_m3 = model3_reduced.predict(X_test_lagged_reduced)
mse_m3, rmse_m3, mae_m3, r2_m3 = _score_forecast(y_test_lagged_reduced, y_pred_m3)

# Display results
print("="*80)
print("TEST SET PERFORMANCE EVALUATION")
print("="*80)
print(f"\n{'Model':<45} {'RMSE':<12} {'MAE':<12} {'R²':<10}")
print("-"*80)
print(f"{'Model 1 (Levels, Y(t) on X(t))':<45} {rmse_m1:>10.4f}  {mae_m1:>10.4f}  {r2_m1:>8.4f}")
print(f"{'Model 2 (Changes, ΔY(t) on ΔX(t))':<45} {rmse_m2:>10.4f}  {mae_m2:>10.4f}  {r2_m2:>8.4f}")
print(f"{'Model 3 (Lagged, ΔY(t) on ΔX(t-1))':<45} {rmse_m3:>10.4f}  {mae_m3:>10.4f}  {r2_m3:>8.4f}")

# Naive baseline comparison (for first-difference models)
# Baseline: predict no change (ΔY = 0)
# The baseline R² is computed rather than hard-coded: r2_score measures skill
# relative to the test-sample mean, so a zero forecast scores exactly 0 only
# when that mean is exactly 0 (otherwise it is slightly negative).
_, baseline_rmse_m2, baseline_mae_m2, baseline_r2_m2 = _score_forecast(
    y_test_diff, np.zeros(len(y_test_diff))
)
_, baseline_rmse_m3, baseline_mae_m3, baseline_r2_m3 = _score_forecast(
    y_test_lagged_reduced, np.zeros(len(y_test_lagged_reduced))
)

print(f"\n{'NAIVE BASELINE (predict ΔY = 0):':<45}")
print(f"{'  Model 2 baseline':<45} {baseline_rmse_m2:>10.4f}  {baseline_mae_m2:>10.4f}  {baseline_r2_m2:>8.4f}")
print(f"{'  Model 3 baseline':<45} {baseline_rmse_m3:>10.4f}  {baseline_mae_m3:>10.4f}  {baseline_r2_m3:>8.4f}")

# Positive = the model's RMSE is lower than the baseline's; negative = worse.
print(f"\n{'IMPROVEMENT OVER BASELINE:':<45}")
print(f"{'  Model 2 RMSE reduction':<45} {(1 - rmse_m2/baseline_rmse_m2)*100:>9.2f}%")
print(f"{'  Model 3 RMSE reduction':<45} {(1 - rmse_m3/baseline_rmse_m3)*100:>9.2f}%")

TEST SET PERFORMANCE EVALUATION

Model                                         RMSE         MAE          R²        
--------------------------------------------------------------------------------
Model 1 (Levels, Y(t) on X(t))                   37.7609     30.3551   -0.8839
Model 2 (Changes, ΔY(t) on ΔX(t))                 0.0596      0.0403   -0.2153
Model 3 (Lagged, ΔY(t) on ΔX(t-1))                0.0682      0.0451   -0.5536

NAIVE BASELINE (predict ΔY = 0):             
  Model 2 baseline                                0.0540      0.0351    0.0000
  Model 3 baseline                                0.0547      0.0359    0.0000

IMPROVEMENT OVER BASELINE:                   
  Model 2 RMSE reduction                         -10.22%
  Model 3 RMSE reduction                         -24.62%


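## Interpretation of Test Set Results

None of the three reduced models generalizes out of sample: test-set R² is negative for every model (-0.88, -0.22 and -0.55 for Models 1, 2 and 3), meaning each predicts worse than simply using the test-sample mean. The two first-difference models also have a higher RMSE than the naive no-change forecast (10.22% and 24.62% worse for Models 2 and 3 respectively), so on this hold-out period the regressors add no predictive value over the no-change benchmark.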


In [77]:
# Side-by-side in-sample fit statistics for the three reduced models.
rule = "=" * 80
print(rule)
print("MODEL COMPARISON SUMMARY")
print(rule)

# Display label -> fitted statsmodels result
models = {
    'Model 1 (Levels, Y(t) on X(t))': model1_reduced,
    'Model 2 (Changes, ΔY(t) on ΔX(t))': model2_reduced,
    'Model 3 (Lagged, ΔY(t) on ΔX(t-1))': model3_reduced
}

print(f"\n{'Model':<40} {'R²':<10} {'Adj R²':<10} {'F p-val':<10}")
print("-"*80)
for label, fitted in models.items():
    print(f"{label:<40} {fitted.rsquared:>8.4f} {fitted.rsquared_adj:>10.4f} {fitted.f_pvalue:>10.4e}")

print("\n" + rule)
print("KEY INSIGHT")
print(rule)
for insight_line in (
    "Compare R² and coefficient changes across models to assess:",
    "  - Impact of non-stationarity (Model 1 vs Models 2/3)",
    "  - Contemporaneous vs. predictive relationships (Model 2 vs Model 3)",
    "  - Economic interpretation of results",
):
    print(insight_line)

MODEL COMPARISON SUMMARY

Model                                    R²         Adj R²     F p-val   
--------------------------------------------------------------------------------
Model 1 (Levels, Y(t) on X(t))             0.8620     0.8547 1.0740e-75
Model 2 (Changes, ΔY(t) on ΔX(t))          0.1075     0.0399 8.4796e-02
Model 3 (Lagged, ΔY(t) on ΔX(t-1))         0.0532    -0.0185 7.2955e-01

KEY INSIGHT
Compare R² and coefficient changes across models to assess:
  - Impact of non-stationarity (Model 1 vs Models 2/3)
  - Contemporaneous vs. predictive relationships (Model 2 vs Model 3)
  - Economic interpretation of results
