## Problem 1

### (a)

In [10]:
import pandas as pd
import numpy as np
import statsmodels.api as sm

# Load the Problem 1 sample.
df = pd.read_csv('ps1small.csv')

# Fail early with a clear message: np.log would otherwise silently turn
# non-positive (or missing) wages into -inf/NaN and pass them on to OLS.
assert (df['wage'] > 0).all(), "wage must be strictly positive to take logs"

# Dependent variable: log wage.
df['lwage'] = np.log(df['wage'])

# One label per observed (education, age) combination, e.g. "12_26".
df['edu_age'] = df['education'].astype(str) + '_' + df['age'].astype(str)

# One indicator column per label. dtype=float is requested explicitly because
# pandas >= 2.0 returns boolean dummies by default; a numeric design matrix
# keeps the regression independent of that default.
dummies = pd.get_dummies(df['edu_age'], prefix='d', dtype=float)

# No constant is added: the indicators are mutually exclusive and sum to one
# in every row, so each coefficient is the mean of lwage within its cell.
X = dummies
y = df['lwage']
model = sm.OLS(y, X).fit()

print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                  lwage   R-squared:                       0.089
Model:                            OLS   Adj. R-squared:                  0.059
Method:                 Least Squares   F-statistic:                     2.938
Date:                Thu, 30 Oct 2025   Prob (F-statistic):           4.90e-07
Time:                        12:16:21   Log-Likelihood:                -1099.1
No. Observations:                 898   AIC:                             2258.
Df Residuals:                     868   BIC:                             2402.
Df Model:                          29                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
d_12_26        9.4513      0.091    104.112      0.0

### (b)

In [12]:
import statsmodels.formula.api as smf

# Unrestricted model: education and age both enter as categorical factors,
# fully interacted (the full set of education–age dummies).
unrestricted_formula = 'lwage ~ C(education)*C(age)'
ur = smf.ols(unrestricted_formula, data=df).fit()

# Restricted model: lwage is linear in education, age and their product.
restricted_formula = 'lwage ~ education + age + I(education*age)'
r = smf.ols(restricted_formula, data=df).fit()

# F-test of the restricted specification against the unrestricted one.
# The printed tuple is (F statistic, p-value, number of restrictions).
f_test = ur.compare_f_test(r)
print(f_test)

(0.9199279217973357, 0.5805908886458666, 26.0)


## Problem 3

### (e)

In [5]:
import pandas as pd
import numpy as np
from scipy import stats

# Load the wage data and split log wages by the `male` indicator
# (male == 0 -> `female` sample, male == 1 -> `male` sample).
df = pd.read_excel('wage.xlsx')
female = np.log(df.loc[df['male'] == 0, 'wage'])
male = np.log(df.loc[df['male'] == 1, 'wage'])

# Group sizes and unbiased (ddof=1) sample variances of log wage.
n1, n2 = len(female), len(male)
s1_sq, s2_sq = np.var(female, ddof=1), np.var(male, ddof=1)

# Test statistic: scaled difference of the log sample variances.
T = np.sqrt(n1 * n2 / (n1 + n2)) * (np.log(s1_sq) - np.log(s2_sq))

# Deviations from each group's own mean, computed once and reused for the
# second and fourth moments below.
female_dev = female - np.mean(female)
male_dev = male - np.mean(male)

# Pooled kurtosis estimate: (n1 + n2) * (sum of 4th powers) / (sum of squares)^2.
num = (n1 + n2) * ((female_dev**4).sum() + (male_dev**4).sum())
den = ((female_dev**2).sum() + (male_dev**2).sum())**2
kappa_hat = num / den

# Kurtosis-robust statistic: T standardised by sqrt(kappa_hat - 1) and
# compared with the standard normal distribution.
Z = T / np.sqrt(kappa_hat - 1)

# NOTE(review): both p-values below are upper-tail only, i.e. they test against
# "female variance > male variance". Confirm against the problem statement
# whether a two-sided alternative is intended.
# sf(x) is the upper-tail probability 1 - cdf(x), evaluated directly so that
# precision is not lost to the subtraction when the p-value is small.
p_value_robust = stats.norm.sf(Z)

# Classical variance-ratio F-test with (n1 - 1, n2 - 1) degrees of freedom.
F_stat = s1_sq / s2_sq
p_value_F = stats.f.sf(F_stat, n1 - 1, n2 - 1)

print(f"Sample sizes: n1={n1}, n2={n2}")
print(f"Sample variances: female={s1_sq:.5f}, male={s2_sq:.5f}")
print(f"T statistic = {T:.5f}")
print(f"kappa_hat = {kappa_hat:.5f}")
print(f"Z_robust = {Z:.5f}, p-value (robust test) = {p_value_robust:.5f}")
print(f"F_statistic = {F_stat:.5f}, p-value (F-test) = {p_value_F:.5f}")


Sample sizes: n1=1569, n2=1727
Sample variances: female=0.39788, male=0.36641
T statistic = 2.36210
kappa_hat = 7.37812
Z_robust = 0.93530, p-value (robust test) = 0.17482
F_statistic = 1.08587, p-value (F-test) = 0.04735
