In [1]:
import pandas as pd
from statsmodels.regression.linear_model import OLS
import numpy as np
import scipy.stats as st

In [2]:
# Location of the raw data file read by this notebook.
DATASET_URL = "https://www.ssc.wisc.edu/~bhansen/econometrics/cps09mar.txt"

# Column labels assigned to the file's fields, in order, when it is loaded.
DATASET_COLUMNS = [
    "age",
    "female",
    "hisp",
    "education",
    "earnings",
    "hours",
    "week",
    "union",
    "uncov",
    "region",
    "race",
    "marital",
]

In [3]:
# Load the tab-separated data file; column labels come from DATASET_COLUMNS
# and no column is used as the row index (index_col=False).
df = pd.read_csv(
    DATASET_URL,
    sep="\t",
    names=DATASET_COLUMNS,
    index_col=False,
)

In [4]:
# Show the loaded frame as the cell output to check that the columns parsed as expected.
df

Unnamed: 0,age,female,hisp,education,earnings,hours,week,union,uncov,region,race,marital
0,52,0,0,12,146000,45,52,0,0,1,1,1
1,38,0,0,18,50000,45,52,0,0,1,1,1
2,38,0,0,14,32000,40,51,0,0,1,1,1
3,41,1,0,13,47000,40,52,0,0,1,1,1
4,42,0,0,13,161525,50,52,1,0,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...
50737,58,1,0,11,30000,40,52,0,0,4,4,6
50738,62,1,0,16,35000,40,52,0,0,4,4,6
50739,58,0,0,12,75000,50,52,0,0,4,1,1
50740,45,1,0,12,40000,60,52,0,0,4,1,1


In [5]:
# Build the regression variables for the full sample.
# wage: earnings divided by (hours * week) -- presumably an hourly wage; confirm units.
wage = df["earnings"] / (df["hours"] * df["week"])
education = df["education"]
# experience: age minus education minus 6.
experience = df["age"] - education - 6
# Squared experience, scaled down by 100.
exp_squared = experience ** 2 / 100

# Dependent variable: natural log of wage.
all_Y = wage.apply(np.log)

# Regressor matrix; the column order here fixes the coefficient order used later
# (education, experience, exp_squared, constant).
all_X = pd.DataFrame(
    {
        "education": education,
        "experience": experience,
        "exp_squared": exp_squared,
        "constant": 1,
    }
)

In [6]:
'''Filter out subsample of white male hispanics'''
# The mask below KEEPS (rather than removes) the rows with race == 1,
# female == 0 and hisp == 1.
# NOTE(review): race code 1 is presumably "white" per the description above -- confirm against the codebook.
sample_filter = df["race"].eq(1) & df["female"].eq(0) & df["hisp"].eq(1)

# Restrict the dependent variable and the regressors to the selected rows.
Y = all_Y.loc[sample_filter]
X = all_X.loc[sample_filter]
X

Unnamed: 0,education,experience,exp_squared,constant
112,9,38,14.44,1
267,13,2,0.04,1
271,20,14,1.96,1
460,12,9,0.81,1
461,12,37,13.69,1
...,...,...,...,...
50635,14,23,5.29,1
50643,13,39,15.21,1
50659,6,23,5.29,1
50670,6,21,4.41,1


In [7]:
# OLS of Y on the columns of X, passed as plain arrays.
# hasconst=True tells statsmodels that the regressors already include a constant column.
ols_model = OLS(endog=Y.to_numpy(), exog=X.to_numpy(), hasconst=True)

In [8]:
# Fit the model, requesting the HC0 covariance type for the coefficient standard errors.
results = ols_model.fit(cov_type='HC0')

In [9]:
# Regression table. The model was fit on bare arrays, so variable names are supplied
# here; xname must follow the column order of X.
results.summary(
    yname="log_wage",
    xname=["education", "experience", "exp_squared", "constant"],
)

0,1,2,3
Dep. Variable:,log_wage,R-squared:,0.233
Model:,OLS,Adj. R-squared:,0.233
Method:,Least Squares,F-statistic:,372.7
Date:,"Tue, 16 Mar 2021",Prob (F-statistic):,8.79e-215
Time:,13:29:54,Log-Likelihood:,-3651.2
No. Observations:,4230,AIC:,7310.0
Df Residuals:,4226,BIC:,7336.0
Df Model:,3,,
Covariance Type:,HC0,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
education,0.0904,0.003,31.028,0.000,0.085,0.096
experience,0.0354,0.003,13.691,0.000,0.030,0.040
exp_squared,-0.0465,0.005,-8.767,0.000,-0.057,-0.036
constant,1.1852,0.046,25.722,0.000,1.095,1.276

0,1,2,3
Omnibus:,1562.688,Durbin-Watson:,1.77
Prob(Omnibus):,0.0,Jarque-Bera (JB):,34837.478
Skew:,-1.229,Prob(JB):,0.0
Kurtosis:,16.843,Cond. No.,140.0


In [10]:
# Number of observations in the estimation subsample.
n = X.shape[0]
# HC0 covariance matrix of the coefficient estimates, used for the standard errors below.
V_beta = results.cov_HC0

In [11]:
# Coefficients in regressor order: education, experience, exp_squared (the constant is not needed).
beta_edu, beta_exp, beta_exp2 = results.params[:3]

# theta = beta_edu / (beta_exp + beta_exp2 / 5).
# NOTE(review): since exp_squared = experience^2 / 100, the denominator equals the
# derivative of the fitted value w.r.t. experience at experience = 10 -- confirm that
# this is the intended evaluation point.
theta = beta_edu / (beta_exp + beta_exp2 / 5)
theta

3.4683354206976524

In [12]:
# Gradient of theta = b0 / (b1 + b2/5) with respect to (b0, b1, b2, b3), where b0..b3
# are the coefficients on education, experience, exp_squared and the constant.
beta_edu, beta_exp, beta_exp2 = results.params[:3]
theta_denominator = beta_exp + beta_exp2 / 5
G = [
    1 / theta_denominator,                       # d theta / d b0
    -beta_edu / theta_denominator**2,            # d theta / d b1
    -(beta_edu / 5) / theta_denominator**2,      # d theta / d b2
    0,                                           # theta does not depend on the constant
]

In [13]:
# Standard error of theta: sqrt(G' V_beta G), the quadratic form of the gradient G
# in the coefficient covariance matrix.
gradient = np.asarray(G)
theta_error = np.sqrt(gradient @ V_beta @ gradient)
theta_error

0.22673412999594114

In [14]:
# Two-sided 90% normal-approximation interval for theta: the 0.95 quantile leaves
# 5% in each tail.
z_90 = st.norm.ppf(.95)
theta_margin = theta_error * z_90
[theta - theta_margin, theta + theta_margin]

[3.095390964620142, 3.841279876775163]

In [15]:
# Regressor values at which to evaluate the fitted regression, in column order:
# education = 12, experience = 20, exp_squared = 20^2 / 100 = 4, constant = 1.
H = [12, 20, 4, 1]
point = np.asarray(H)

# Fitted value H'beta and its standard error sqrt(H' V_beta H).
est = point @ results.params
est_error = np.sqrt(point @ V_beta @ point)
est, est_error

(2.7921668432176703, 0.011667088241661091)

In [16]:
# Two-sided 95% normal-approximation interval for the fitted value: the 0.975 quantile
# leaves 2.5% in each tail.
z_95 = st.norm.ppf(.975)
est_margin = est_error * z_95
[est - est_margin, est + est_margin]

[2.769299770459564, 2.8150339159757767]