In [None]:
# imports
import polars as pl
import pandas as pd
import numpy as np
import os
from sklearn.linear_model import LinearRegression
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf
import statsmodels.api as sm
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error, r2_score
import patsy

In [2]:
# importing the oj dataset
os.chdir('/Users/noaht/OneDrive/Desktop/ECON_487')
data_oj = pl.read_csv('oj.csv')

In [3]:
data_oj = data_oj.with_columns(
    pl.col('price').log().alias('log_price')
)

# Question 1

In [None]:
data_oj.describe()

In [10]:
# OLS summary stuff
model_formula = 'logmove ~ log_price*C(brand)*feat + AGE60 + EDUC + ETHNIC + ' \
'INCOME + HHLARGE + WORKWOM + HVAL150 '
pd_oj = data_oj.to_pandas()


model = smf.ols(formula=model_formula, data=pd_oj).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                logmove   R-squared:                       0.571
Model:                            OLS   Adj. R-squared:                  0.571
Method:                 Least Squares   F-statistic:                     2139.
Date:                Sat, 31 Jan 2026   Prob (F-statistic):               0.00
Time:                        11:00:25   Log-Likelihood:                -29381.
No. Observations:               28947   AIC:                         5.880e+04
Df Residuals:                   28928   BIC:                         5.896e+04
Df Model:                          18                                         
Covariance Type:            nonrobust                                         
                                             coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------------------

In [None]:
# marginal effect
"""
How does Household size shift the demand curve?  
    want to quantfy the effect
    control for ommitted variable bias   
want a to use a range not unit effect to get a more meaningful interpretation
    this will help later on for elasticity measurements
"""
beta_hhlarge = model.params["HHLARGE"]

q75 = data_oj['HHLARGE'].quantile(.75)
q50 = data_oj['HHLARGE'].quantile(.5)
# predicted change in logmove when HHLARGE increases from the median to the 75th percentile
effect = beta_hhlarge * (q75 - q50)
print(effect)

-0.02846848801312334


In [8]:
# want to quantify the effect of Education as well
beta_educ = model.params["EDUC"]

q75 = data_oj['EDUC'].quantile(.75)
q50 = data_oj['EDUC'].quantile(.5)

effect = beta_educ * (q75 - q50)
print(effect)

0.025995936419361317


In [25]:
# we're gonna add the interacion terms into the model to get a more "true" effect of price change
m2_formula = 'logmove ~ log_price*C(brand)*feat + log_price:EDUC + log_price:HHLARGE + AGE60 + EDUC + ETHNIC + ' \
'INCOME + HHLARGE + WORKWOM + HVAL150'

model_2 = smf.ols(formula=m2_formula, data=pd_oj).fit()
print(model_2.summary())

                            OLS Regression Results                            
Dep. Variable:                logmove   R-squared:                       0.590
Model:                            OLS   Adj. R-squared:                  0.590
Method:                 Least Squares   F-statistic:                     2084.
Date:                Sat, 31 Jan 2026   Prob (F-statistic):               0.00
Time:                        12:30:11   Log-Likelihood:                -28713.
No. Observations:               28947   AIC:                         5.747e+04
Df Residuals:                   28926   BIC:                         5.764e+04
Df Model:                          20                                         
Covariance Type:            nonrobust                                         
                                             coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------------------

we can see that there's now a more drastic negative effect of price increase than the first model

In [11]:
# heterogeneity matters
"""
beta_hhlarge_int measures to slope change (price elasticity)
    measuring which households will respond more or less to price changes
    postitive:bigger households are less price sensitive (more inelastic)
        the slope becomes less negative
    negative: bigger households are more pirce sensitive (more elastic)
        the slope becomes more negative (higher magnitude)

beta_hhlarge measured a shift in demand 
    How does quantity change as household size increases, holding price constant?
"""
beta_hhlarge_int = model_2.params["log_price:HHLARGE"]

q75 = data_oj['HHLARGE'].quantile(.75)
q50 = data_oj['HHLARGE'].quantile(.5)

effect = beta_hhlarge_int * (q75 - q50)
print(effect)

-0.12744696682189127


In [None]:
"""
elasticity change:
    postitive:more education -->  more elastic
        the slope becomes more positive
    negative: more education --> more inelastic
        "counteracts" beta_educ 
        the slope becomes more gets closer to zero
"""
beta_educ_int = model_2.params["log_price:EDUC"]

q75 = data_oj['EDUC'].quantile(.75)
q50 = data_oj['EDUC'].quantile(.5)

effect = beta_educ_int * (q75 - q50)
print(effect)

0.19761101058996217


# Question 2

In [21]:
# table to join
df1 = (
    data_oj
    .select(['store', 'brand', 'week', 'log_price'])
    .sort(['store', 'brand', 'week'])
    .with_columns(
        (pl.col('week')+1).alias('week'),
    )
)

In [None]:
# dataframe with lagged log price
df2 = (
    data_oj
    .join(df1, on=['brand','store','week'], how='left')
)
df2.sort(["store", "brand", "week"])

lag_price_data = (
    df2
    .rename({'log_price_right': 'lag_price'})
)

lag_price_data.sort(['store','brand','week'])

In [None]:
# gonna use the lag dataframe and see how that effects our model
lag_formula1 = "logmove ~ log_price*C(brand)*feat + lag_price*C(brand) + log_price:EDUC + log_price:HHLARGE + " \
"AGE60 + EDUC + ETHNIC + INCOME + HHLARGE + WORKWOM + HVAL150"

pd_lag1 = lag_price_data.to_pandas()

model_4 = smf.ols(formula=lag_formula1, data=pd_lag1).fit()
print(model_4.summary())

the effect of log_price is even bigger than before

## Cross Fold Validation

In [26]:
folds = KFold(n_splits=5, shuffle=True, random_state=487)

In [28]:
# storing MSE for each fold
mse = []

for train_i, test_i in folds.split(lag_price_data):
        # split the data into training and testing data
        train_data = lag_price_data[train_i].to_pandas()
        test_data = lag_price_data[test_i].to_pandas()
        
        y_train, X_train = patsy.dmatrices(lag_formula1, data=train_data, return_type='dataframe')
        y_test, X_test = patsy.dmatrices(lag_formula1, data=test_data, return_type='dataframe')
        
        
        model = sm.OLS(y_train, X_train).fit()
        # predict on the TEST set
        y_hat = model.predict(X_test)
        # MSE comparing the prediction to the true value 
        model_mse = mean_squared_error(y_test, y_hat)
        mse.append(model_mse)
    
        avg_mse = np.mean(mse)
    
        print(f"Avg MSE = {avg_mse:.2f}")
# final average
np.mean(mse)

Avg MSE = 0.41
Avg MSE = 0.42
Avg MSE = 0.42
Avg MSE = 0.42
Avg MSE = 0.42


np.float64(0.41730015510091106)

# Bonus

In [None]:
# setup for cross price data
# creating seperate columns for log_prices separated by brand
wide_data = (
    data_oj
    .select(["store", "week", "brand", "log_price"])
    .pivot(
        values="log_price",
        index=["store", "week"],
        on="brand"           
    )
    .sort(["store", "week"])
)

wide_data

In [33]:
# joining to get logmove and brand columns as well
cross_price_data = (
    data_oj
    .select(['store', 'week', 'logmove', 'brand'])
    .join(wide_data, how='left', on=['store', 'week'])
    # rename minute.maid to avoid problems in the future
    .rename({'minute.maid': 'minute_maid'})
)

In [34]:
"""
we want to see how one brand's price effects another brand's sales
    cross price elasticities
    measure substitution 
    helps with our "optimal pricing" strategy
"""
cross_price_formula = 'logmove ~ C(brand) + C(brand):tropicana + C(brand):minute_maid + C(brand):dominicks'
pd_cp_oj = cross_price_data.to_pandas()

model_5 = smf.ols(formula=cross_price_formula, data=pd_cp_oj).fit()
print(model_5.summary())

                            OLS Regression Results                            
Dep. Variable:                logmove   R-squared:                       0.449
Model:                            OLS   Adj. R-squared:                  0.449
Method:                 Least Squares   F-statistic:                     2142.
Date:                Sat, 31 Jan 2026   Prob (F-statistic):               0.00
Time:                        12:46:00   Log-Likelihood:                -33007.
No. Observations:               28947   AIC:                         6.604e+04
Df Residuals:                   28935   BIC:                         6.614e+04
Df Model:                          11                                         
Covariance Type:            nonrobust                                         
                                        coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------------
Interc