In [142]:
import pyreadstat  
import pandas as pd
import numpy as np  
import statsmodels.api as sm
from helper import *
dataframe, meta = pyreadstat.read_dta('morg79.dta')

# Video 1

In [143]:
dataframe['gradecp']

0         2
1         1
2         2
3         1
4         1
         ..
328401    2
328402    1
328403    1
328404    2
328405    1
Name: gradecp, Length: 328406, dtype: int64

In [144]:
dataframe.columns

Index(['minsamp', 'intmonth', 'hhid', 'state', 'smsarank', 'hhnum', 'activlwr',
       'hourslw', 'reasonlw', 'absentlw', 'classer', 'ind70', 'occ70',
       'lineno', 'relahh', 'age', 'marital', 'race', 'sex', 'veteran',
       'gradeat', 'gradecp', 'esr', 'weight', 'smsastat', 'centcity', 'ethnic',
       'ptstat', 'ftpt79', 'docc70', 'doinglw', 'hourslwa', 'uhours35',
       'why35lw', 'class', 'uhours', 'paidhr', 'earnhr', 'uearnwk', 'earnwt',
       'eligible', 'uhourse', 'paidhre', 'earnhre', 'earnwke', 'I25a', 'I25b',
       'I25c', 'I25d', 'uearnwke', 'year', 'smsa70', 'dind'],
      dtype='object')

In [145]:
dataframe['uhours']

0          40
1          40
2         NaN
3         NaN
4          40
         ... 
328401    NaN
328402     40
328403    NaN
328404    NaN
328405    NaN
Name: uhours, Length: 328406, dtype: object

In [146]:
# remove all NaN values and 0 for the uhours column
dataframe = dataframe.dropna(subset=['uhours'])
dataframe = dataframe.dropna(subset=['earnwke'])
dataframe = dataframe[dataframe['uhours'] != 0]
dataframe = dataframe[dataframe['uhours'] != 0]
# make sure to compare the right row entries so the division makes sense
dataframe['hrwage'] = dataframe['earnwke'] / dataframe['uhours']

In [147]:
summarize(dataframe['hrwage'])

Num observations:  167565
Mean:  6.098992663916908
Median:  5.0
Standard Deviation:  4.266460415857638
Variance:  18.202684480080126
25th percentile:  3.5
50th percentile:  5.0
75th percentile:  7.514285714285714
Skewness:  21.070557491609996
Kurtosis:  1686.5797055425337
Min:  0.03333333333333333
Max:  500.0


In [148]:
dataframe['race']

0         1
1         1
4         1
6         1
7         1
         ..
328388    1
328389    1
328390    1
328391    1
328402    3
Name: race, Length: 167565, dtype: int64

In [149]:
# look at race, where race is 1 for white
# sum the values with race == 1
white = dataframe['race'].values == 1
black = dataframe['race'].values == 2
hispanic = dataframe['race'].values == 3

In [150]:
hispanic.sum()

4459

In [151]:
hrearnwt = dataframe['earnwt'] * dataframe['uhours']
dataframe['hrearnwt'] = hrearnwt

In [152]:
summarize(hrearnwt)

Num observations:  167565
Mean:  233780.32185387824
Median:  251810.8037109375
Standard Deviation:  117174.66850211626
Variance:  13729902938.580835
25th percentile:  146587.1953125
50th percentile:  251810.8037109375
75th percentile:  299838.80859375
Skewness:  0.7797426670293288
Kurtosis:  5.5944302269080755
Min:  441.19998931884766
Max:  2302304.4140625


In [153]:
# look at distribution of hrearnwt for white, black, and hispanic
summarize(dataframe['hrwage'][white])

Num observations:  148095
Mean:  6.1905374465650596
Median:  5.2
Standard Deviation:  4.1525473329644775
Variance:  17.2436493525104
25th percentile:  3.525
50th percentile:  5.2
75th percentile:  7.75
Skewness:  13.520667666961137
Kurtosis:  740.1086620875421
Min:  0.03333333333333333
Max:  369.0


In [154]:
summarize(dataframe['hrwage'][black])

Num observations:  15011
Mean:  5.220620553434161
Median:  4.375
Standard Deviation:  5.2928734593595
Variance:  28.0145094567922
25th percentile:  3.1096096096096097
50th percentile:  4.375
75th percentile:  6.294736842105263
Skewness:  58.61353857885214
Kurtosis:  5215.76191945734
Min:  0.23333333333333334
Max:  500.0


In [155]:
summarize(dataframe['hrwage'][hispanic])

Num observations:  4459
Mean:  6.015547757919675
Median:  5.0
Standard Deviation:  3.761691125157011
Variance:  14.150320121085018
25th percentile:  3.5
50th percentile:  5.0
75th percentile:  7.5
Skewness:  3.4326062830296764
Kurtosis:  33.544705861690744
Min:  0.23333333333333334
Max:  74.0


In [156]:
# get sex
male = dataframe['sex']==1
female = dataframe['sex']==2

In [157]:
# male and white
summarize(dataframe['hrwage'][white & male])

Num observations:  83781
Mean:  7.295179263927489
Median:  6.555555555555555
Standard Deviation:  4.51645764087981
Variance:  20.398389621861618
25th percentile:  4.5
50th percentile:  6.555555555555555
75th percentile:  9.0
Skewness:  13.952536799682152
Kurtosis:  780.6198066010592
Min:  0.03333333333333333
Max:  369.0


In [158]:
dataframe['esr']==1

0         True
1         True
4         True
6         True
7         True
          ... 
328388    True
328389    True
328390    True
328391    True
328402    True
Name: esr, Length: 167565, dtype: bool

In [159]:
condition = white & male 
summarize(dataframe['hrwage'][condition ], weight=dataframe['hrearnwt'][condition])

Num observations:  83781
Weighted Mean:  7.364672538755782
Median:  6.555555555555555
Weighted Standard Deviation:  3.8032694522271666
Weighted Variance:  14.464858526244331
25th percentile:  4.5
50th percentile:  6.555555555555555
75th percentile:  9.0
Skewness:  13.952536799682152
Kurtosis:  780.6198066010592
Min:  0.03333333333333333
Max:  369.0


In [160]:
summarize(dataframe['esr'])

Num observations:  167565
Mean:  1.0580192761018112
Median:  1.0
Standard Deviation:  0.23378059343028038
Variance:  0.054653365864614056
25th percentile:  1.0
50th percentile:  1.0
75th percentile:  1.0
Skewness:  3.781203502057148
Kurtosis:  12.297646704439746
Min:  1
Max:  2


In [161]:
# replace values < 2$ per hour with NaN
dataframe.loc[dataframe['hrwage'] < 2, 'hrwage'] = np.nan

In [162]:
dataframe.columns

Index(['minsamp', 'intmonth', 'hhid', 'state', 'smsarank', 'hhnum', 'activlwr',
       'hourslw', 'reasonlw', 'absentlw', 'classer', 'ind70', 'occ70',
       'lineno', 'relahh', 'age', 'marital', 'race', 'sex', 'veteran',
       'gradeat', 'gradecp', 'esr', 'weight', 'smsastat', 'centcity', 'ethnic',
       'ptstat', 'ftpt79', 'docc70', 'doinglw', 'hourslwa', 'uhours35',
       'why35lw', 'class', 'uhours', 'paidhr', 'earnhr', 'uearnwk', 'earnwt',
       'eligible', 'uhourse', 'paidhre', 'earnhre', 'earnwke', 'I25a', 'I25b',
       'I25c', 'I25d', 'uearnwke', 'year', 'smsa70', 'dind', 'hrwage',
       'hrearnwt'],
      dtype='object')

In [163]:
summarize(dataframe['gradecp'])

Num observations:  167565
Mean:  1.2141676364395906
Median:  1.0
Standard Deviation:  0.41024488337209725
Variance:  0.1683008643329857
25th percentile:  1.0
50th percentile:  1.0
75th percentile:  1.0
Skewness:  1.393488556646848
Kurtosis:  -0.05819033717721922
Min:  1
Max:  2


In [164]:
edyears = dataframe['gradeat']
# if gradecp==2, then subtract 1 from edyears
edyears = np.where(dataframe['gradecp']==2, edyears-1, edyears)
# if < 0, then set to 0
edyears = np.where(edyears < 0, 0, edyears)
dataframe['edyears'] = edyears
summarize(edyears)

Num observations:  167565
Mean:  12.43587861426909
Median:  12.0
Standard Deviation:  2.813188343559732
Variance:  7.914028656340349
25th percentile:  12.0
50th percentile:  12.0
75th percentile:  14.0
Skewness:  -0.45452390389718256
Kurtosis:  1.5934761522954393
Min:  0
Max:  18


In [165]:
# get potential experience
potexp = dataframe['age'] - edyears - 6
# if potexp < 0, set to 0
potexp = np.where(potexp < 0, 0, potexp)
# add to dataframe
dataframe['potexp'] = potexp
summarize(potexp)

Num observations:  167565
Mean:  18.007644794557336
Median:  15.0
Standard Deviation:  14.517285036022871
Variance:  210.75156481713358
25th percentile:  5.0
50th percentile:  15.0
75th percentile:  29.0
Skewness:  0.632317091622861
Kurtosis:  -0.5653835184926255
Min:  0
Max:  87


In [166]:
potexp2 = potexp**2
dataframe['potexp2'] = potexp2

In [167]:
# hrwage if january and working
summarize(dataframe['hrwage'][dataframe['esr']==1 & (dataframe['intmonth']==1)])

Num observations:  13327
Mean:  6.006270829270184
Median:  5.0
Standard Deviation:  3.9798656764440903
Variance:  15.839330802537775
25th percentile:  3.5
50th percentile:  5.0
75th percentile:  7.5
Skewness:  9.509127446103788
Kurtosis:  270.2898206309136
Min:  2.0
Max:  150.0


In [168]:
# hrwage if january and working
summarize(dataframe['hrwage'][dataframe['esr']==1 & (dataframe['intmonth']==12)])

Num observations:  13971
Mean:  6.400493990752086
Median:  5.35
Standard Deviation:  4.050269910019446
Variance:  16.40468634400893
25th percentile:  3.75
50th percentile:  5.35
75th percentile:  8.0
Skewness:  10.107670614203371
Kurtosis:  397.68327048529085
Min:  2.0
Max:  200.0


In [169]:
# inflation adjustment
# use bea.gov
# note, 2012 dollars = 100
# for 1979, 33.804	34.728	35.590	36.451

realhrwage = dataframe['hrwage'] 
# if month between 1 and 3, then multiply by 100/33.804
realhrwage = np.where(dataframe['intmonth'] <= 3, realhrwage * 100/33.804, realhrwage)
# if month between 4 and 6, then multiply by 100/34.728
realhrwage = np.where((dataframe['intmonth'] > 3) & (dataframe['intmonth'] <= 6), realhrwage * 100/34.728, realhrwage)
# if month between 7 and 9, then multiply by 100/35.590
realhrwage = np.where((dataframe['intmonth'] > 6) & (dataframe['intmonth'] <= 9), realhrwage * 100/35.590, realhrwage)
# if month between 10 and 12, then multiply by 100/36.451
realhrwage = np.where((dataframe['intmonth'] > 9) & (dataframe['intmonth'] <= 12), realhrwage * 100/36.451, realhrwage)
dataframe['realhrwage'] = realhrwage
# remove values < 2 and > 250
dataframe.loc[dataframe['realhrwage'] < 2, 'realhrwage'] = np.nan
dataframe.loc[dataframe['realhrwage'] > 250, 'realhrwage'] = np.nan
summarize(realhrwage)

Num observations:  167565
Mean:  17.637541352577802
Median:  14.79114897645249
Standard Deviation:  12.070981730217532
Variance:  145.70859993124546
25th percentile:  10.287783599901237
50th percentile:  14.79114897645249
75th percentile:  21.947271679789306
Skewness:  20.96362617107457
Kurtosis:  1620.3729905674757
Min:  5.486817919947327
Max:  1371.7044799868315


In [170]:
summarize(dataframe['hrwage'])

Num observations:  167565
Mean:  6.199488560424233
Median:  5.15
Standard Deviation:  4.253624405209862
Variance:  18.093320580596956
25th percentile:  3.6
50th percentile:  5.15
75th percentile:  7.656493506493507
Skewness:  21.66955662890217
Kurtosis:  1741.0264243623174
Min:  2.0
Max:  500.0


In [171]:
lrealhrwage = np.log(dataframe['realhrwage'].to_numpy().astype(float))
dataframe['lrealhrwage'] = lrealhrwage
summarize(lrealhrwage)

Num observations:  167565
Mean:  2.7373189716718036
Median:  2.694028959739956
Standard Deviation:  0.49113434037747017
Variance:  0.24121294029801274
25th percentile:  2.3309571330498224
50th percentile:  2.694028959739956
75th percentile:  3.088642834747339
Skewness:  0.4548798600152436
Kurtosis:  -0.1659015167955582
Min:  1.7023484736274483
Max:  5.480840086431072


## regression!

In [172]:
dataframe.columns

Index(['minsamp', 'intmonth', 'hhid', 'state', 'smsarank', 'hhnum', 'activlwr',
       'hourslw', 'reasonlw', 'absentlw', 'classer', 'ind70', 'occ70',
       'lineno', 'relahh', 'age', 'marital', 'race', 'sex', 'veteran',
       'gradeat', 'gradecp', 'esr', 'weight', 'smsastat', 'centcity', 'ethnic',
       'ptstat', 'ftpt79', 'docc70', 'doinglw', 'hourslwa', 'uhours35',
       'why35lw', 'class', 'uhours', 'paidhr', 'earnhr', 'uearnwk', 'earnwt',
       'eligible', 'uhourse', 'paidhre', 'earnhre', 'earnwke', 'I25a', 'I25b',
       'I25c', 'I25d', 'uearnwke', 'year', 'smsa70', 'dind', 'hrwage',
       'hrearnwt', 'edyears', 'potexp', 'potexp2', 'realhrwage',
       'lrealhrwage'],
      dtype='object')

In [173]:
# want to regress lhrwage, edyears, potexp, potexp2, weighted by hrearnwt if white and not hispanic and working and male
condition = (dataframe['race'] == 1)  & (dataframe['esr'] == 1) & (dataframe['sex'] == 1)
# make sure no NaN values
condition = condition & (~np.isnan(lrealhrwage))
y = lrealhrwage[condition]
X = dataframe[['edyears', 'potexp', 'potexp2']][condition]
# make sure all are type float
y = y.astype(float)
X = X.astype(float)
w = dataframe['hrearnwt'][condition]
w = w.astype(float)

In [196]:
from importlib import reload
import helper
reload(helper)
from helper import run_WLS, summarize

In [192]:
run_WLS(X, y, w)

                            WLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.294
Model:                            WLS   Adj. R-squared:                  0.294
Method:                 Least Squares   F-statistic:                     8248.
Date:                Mon, 05 Feb 2024   Prob (F-statistic):               0.00
Time:                        15:55:06   Log-Likelihood:                -45709.
No. Observations:               78568   AIC:                         9.143e+04
Df Residuals:                   78564   BIC:                         9.146e+04
Df Model:                           3                                         
Covariance Type:                  HC1                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          1.7521      0.009    204.529      0.0

In [244]:
# again, but use black men
# want to regress lhrwage, edyears, potexp, potexp2, weighted by hrearnwt if white and not hispanic and working and male
condition = (dataframe['race'] == 2)  & (dataframe['esr'] == 1) & (dataframe['sex'] == 1)
# make sure no NaN values
condition = condition & (~np.isnan(lrealhrwage))
y = lrealhrwage[condition]
X = dataframe[['edyears', 'potexp', 'potexp2']][condition]
# make sure all are type float
y = y.astype(float)
X = X.astype(float)
w = dataframe['hrearnwt'][condition]
w = w.astype(float)

In [245]:
run_WLS(X, y, w)

                            WLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.210
Model:                            WLS   Adj. R-squared:                  0.209
Method:                 Least Squares   F-statistic:                     529.9
Date:                Mon, 05 Feb 2024   Prob (F-statistic):          1.30e-309
Time:                        16:15:54   Log-Likelihood:                -3762.3
No. Observations:                6831   AIC:                             7533.
Df Residuals:                    6827   BIC:                             7560.
Df Model:                           3                                         
Covariance Type:                  HC1                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          1.7238      0.027     63.364      0.0

In [197]:
summarize(dataframe['sex']==1, weight=dataframe['hrearnwt'])

Num observations:  167565
Weighted Mean:  0.6042783800653752
Weighted Standard Deviation:  0.48900513233599213
Weighted Variance:  0.23912601945094117
25th percentile:  0.0
50th percentile:  1.0
75th percentile:  1.0
Skewness:  -0.23486364849810273
Kurtosis:  -1.9448622799846824
Min:  0.0
Max:  1.0


In [198]:
summarize(dataframe['sex']==2, weight=dataframe['hrearnwt'])

Num observations:  167565
Weighted Mean:  0.3957216199346248
Weighted Standard Deviation:  0.48900513233599213
Weighted Variance:  0.23912601945094117
25th percentile:  0.0
50th percentile:  0.0
75th percentile:  1.0
Skewness:  0.2348636484981022
Kurtosis:  -1.9448622799846829
Min:  0.0
Max:  1.0


In [200]:
summarize(dataframe['sex']==1, weight=dataframe['weight'])

Num observations:  167565
Weighted Mean:  0.5647431938726633
Weighted Standard Deviation:  0.4957905997970986
Weighted Variance:  0.24580831884716678
25th percentile:  0.0
50th percentile:  1.0
75th percentile:  1.0
Skewness:  -0.23486364849810273
Kurtosis:  -1.9448622799846824
Min:  0.0
Max:  1.0


In [201]:
summarize(dataframe['sex']==2, weight=dataframe['weight'])

Num observations:  167565
Weighted Mean:  0.43525680612733675
Weighted Standard Deviation:  0.4957905997970986
Weighted Variance:  0.24580831884716678
25th percentile:  0.0
50th percentile:  0.0
75th percentile:  1.0
Skewness:  0.2348636484981022
Kurtosis:  -1.9448622799846829
Min:  0.0
Max:  1.0


# Video 2 -- Oaxaca

In [231]:
from importlib import reload
import helper
reload(helper)
from helper import summarize

In [232]:
summarize(dataframe['race']==1, weight=dataframe['weight'])

Num observations:  167565
Weighted Mean:  0.879793997136449
Weighted Standard Deviation:  0.32520227511368816
Weighted Variance:  0.10575651973911893
1st percentile:  0.0
5th percentile:  0.0
10th percentile:  0.0
25th percentile:  1.0
50th percentile:  1.0
75th percentile:  1.0
90th percentile:  1.0
95th percentile:  1.0
99th percentile:  1.0
Skewness:  -2.3953892803156043
Kurtosis:  3.737934418848031
Min:  0.0
Max:  1.0


In [233]:
summarize(dataframe['race']==2, weight=dataframe['weight'])

Num observations:  167565
Weighted Mean:  0.10181166529271284
Weighted Standard Deviation:  0.30240047966734024
Weighted Variance:  0.09144605010303745
1st percentile:  0.0
5th percentile:  0.0
10th percentile:  0.0
25th percentile:  0.0
50th percentile:  0.0
75th percentile:  0.0
90th percentile:  0.0
95th percentile:  1.0
99th percentile:  1.0
Skewness:  2.874258128238122
Kurtosis:  6.2614345219963266
Min:  0.0
Max:  1.0


In [234]:
summarize(dataframe['realhrwage'])

Num observations:  164086
Mean:  17.576591154407158
Median:  14.79114897645249
Standard Deviation:  10.293266115658863
Variance:  105.95132732777088
1st percentile:  6.447011055938108
5th percentile:  8.058763819922635
10th percentile:  8.429334082607474
25th percentile:  10.287783599901237
50th percentile:  14.79114897645249
75th percentile:  21.947271679789306
90th percentile:  29.58229795290498
95th percentile:  35.99401059663672
99th percentile:  54.813311020273794
Skewness:  3.1179083939532886
Kurtosis:  27.967125357348458
Min:  5.486817919947327
Max:  240.04828399769553


In [238]:
condition = (dataframe['race'] == 1) & (dataframe['sex'] == 1) & (dataframe['esr'] == 1) & (dataframe['age'] >= 24) & (dataframe['age'] <= 65)
summarize(dataframe['lrealhrwage'][condition])

Num observations:  61540
Mean:  3.028746679256125
Median:  3.0480085070082548
Standard Deviation:  0.45060254696648155
Variance:  0.2030426553326802
1st percentile:  2.0327638890544177
5th percentile:  2.2568809180881058
10th percentile:  2.4244714112336667
25th percentile:  2.7160078664587313
50th percentile:  3.0480085070082548
75th percentile:  3.319771151802933
90th percentile:  3.5833525522990546
95th percentile:  3.765674109093009
99th percentile:  4.109293581945737
Skewness:  0.02751180271187005
Kurtosis:  0.10395236145023246
Min:  1.7023484736274483
Max:  5.466617681979737


In [262]:
# again, but use black men
# want to regress lhrwage, edyears, potexp, potexp2, weighted by hrearnwt if white and not hispanic and working and male
condition = (dataframe['race'] == 1)  & (dataframe['sex'] == 1) & (dataframe['age'] >= 24) & (dataframe['age'] <= 65)
# make sure no NaN values
condition = condition & (~np.isnan(lrealhrwage))
y1 = lrealhrwage[condition]
X1 = dataframe[['edyears', 'potexp', 'potexp2']][condition]
# make sure all are type float
y1 = y1.astype(float)
X1 = X1.astype(float)
w1 = dataframe['hrearnwt'][condition]
w1 = w1.astype(float)

for col in X1.columns:
    print(col)
    summarize(X1[col], weight=w1)
    print('-----')

edyears
Num observations:  65144
Weighted Mean:  12.80134001622942
Weighted Standard Deviation:  3.0945890249742622
Weighted Variance:  9.576481233491155
1st percentile:  4.0
5th percentile:  8.0
10th percentile:  9.0
25th percentile:  12.0
50th percentile:  12.0
75th percentile:  15.0
90th percentile:  17.0
95th percentile:  18.0
99th percentile:  18.0
Skewness:  -0.49730677262491146
Kurtosis:  0.9161553232364086
Min:  0.0
Max:  18.0
-----
potexp
Num observations:  65144
Weighted Mean:  21.210349500842867
Weighted Standard Deviation:  12.37403402971353
Weighted Variance:  153.11671816850844
1st percentile:  2.0
5th percentile:  5.0
10th percentile:  7.0
25th percentile:  10.0
50th percentile:  19.0
75th percentile:  32.0
90th percentile:  40.0
95th percentile:  43.0
99th percentile:  48.0
Skewness:  0.40011658333380384
Kurtosis:  -0.9507567921697535
Min:  0.0
Max:  59.0
-----
potexp2
Num observations:  65144
Weighted Mean:  602.9956441164136
Weighted Standard Deviation:  605.932343787

In [263]:
run_WLS(X1, y1, w1)

                            WLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.168
Model:                            WLS   Adj. R-squared:                  0.168
Method:                 Least Squares   F-statistic:                     3153.
Date:                Mon, 05 Feb 2024   Prob (F-statistic):               0.00
Time:                        16:33:12   Log-Likelihood:                -38164.
No. Observations:               65144   AIC:                         7.634e+04
Df Residuals:                   65140   BIC:                         7.637e+04
Df Model:                           3                                         
Covariance Type:                  HC1                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          1.9126      0.011    166.723      0.0

In [264]:
# again, but use black men
# want to regress lhrwage, edyears, potexp, potexp2, weighted by hrearnwt if white and not hispanic and working and male
condition = (dataframe['race'] == 2)  & (dataframe['sex'] == 1) & (dataframe['age'] >= 24) & (dataframe['age'] <= 65)
# make sure no NaN values
condition = condition & (~np.isnan(lrealhrwage))
y2 = lrealhrwage[condition]
X2 = dataframe[['edyears', 'potexp', 'potexp2']][condition]
# make sure all are type float
y2 = y2.astype(float)
X2 = X2.astype(float)
w2 = dataframe['hrearnwt'][condition]
w2 = w2.astype(float)

In [265]:
run_WLS(X2, y2, w2)

                            WLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.139
Model:                            WLS   Adj. R-squared:                  0.139
Method:                 Least Squares   F-statistic:                     256.3
Date:                Mon, 05 Feb 2024   Prob (F-statistic):          4.38e-156
Time:                        16:33:23   Log-Likelihood:                -3242.3
No. Observations:                5746   AIC:                             6493.
Df Residuals:                    5742   BIC:                             6519.
Df Model:                           3                                         
Covariance Type:                  HC1                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          1.9498      0.035     55.137      0.0

In [277]:
from importlib import reload
import helper
reload(helper)
from helper import oaxaca_blinder

In [278]:
# oaxaca decomposition
oaxaca_blinder(X1, X2, y1, y2, w1,w2)

Mean diff:  const       0.000000
edyears     1.496042
potexp     -1.474405
potexp2   -78.681763
dtype: float64
Coeff1:  [ 1.91264202e+00  5.77945396e-02  3.21556552e-02 -5.09025686e-04]
Coeff2:  [ 1.94977648e+00  5.61089405e-02  1.47280559e-02 -1.83825662e-04]
Explained component: 0.07910363078735927
Unexplained component: 0.15509659997572392
Total gap: 0.2342002307630832
