In [1]:
import pyreadstat as prs
from helper import *
import os

In [2]:
files = ['data/morg79.dta','data/morg19.dta' ]
# for each year, log hourly wage, % < high school, % with college, years edu, potential exp, potential exp squared, sample size
# do this for 3 criteria: men and women age 24-65, men age 24-65, women age 24-65
# weight based on hrearnwt = dataframe['earnwt'] * dataframe['uhours']

# define dictionary of dataframes
dfs = {}

for file in files:
    print(file)
    # create dataframe
    df, meta = prs.read_dta(file)
    # create year variable
    # split .dta, take last two characters, convert to int
    year = int(file.split('.')[0][-2:])
    dfs[year] = df

print(dfs.keys())

data/morg79.dta
data/morg19.dta
dict_keys([79, 19])


In [3]:
inflation = [[33.804,34.728,35.590,36.451], [102.877, 103.422, 103.674, 104.080]]
inflation_2019 = [102.877, 103.422, 103.674, 104.080]
# now that data is loaded in dictionary, can create tables
# add to each df the new variables
for j, year in enumerate(dfs.keys()):
    df = dfs[year]
    # create new variables

    # make sure to compare the right row entries so the division makes sense
    if 'uhours' in df.columns:
        # drop all values =< 0
        df.loc[df['uhours'] <= 0, 'uhours'] = np.nan
        df['hrwage'] = df['earnwke'] / df['uhours']
        dfs[year]['hrearnwt'] = dfs[year]['earnwt'] * dfs[year]['uhours']
    elif 'uhourse' in df.columns:
        # drop all values =< 0
        df.loc[df['uhourse'] <= 0, 'uhourse'] = np.nan
        df['hrwage'] = df['earnwke'] / df['uhourse']
        dfs[year]['hrearnwt'] = dfs[year]['earnwt'] * dfs[year]['uhourse']
    
    ## first adjust for inflation for each quarter ##
    # get the row from the inflation list
    inflation_row = inflation[j]
    realhrwage = df['hrwage'] 
    # if month between 1 and 3, then multiply by 100/33.804
    realhrwage = np.where(df['intmonth'] <= 3, realhrwage * 100/inflation_row[0], realhrwage)
    # if month between 4 and 6, then multiply by 100/34.728
    realhrwage = np.where((df['intmonth'] > 3) & (df['intmonth'] <= 6), realhrwage * inflation_2019[0]/inflation_row[1], realhrwage)
    # if month between 7 and 9, then multiply by 100/35.590
    realhrwage = np.where((df['intmonth'] > 6) & (df['intmonth'] <= 9), realhrwage * inflation_2019[1]/inflation_row[2], realhrwage)
    # if month between 10 and 12, then multiply by 100/36.451
    realhrwage = np.where((df['intmonth'] > 9) & (df['intmonth'] <= 12), realhrwage * inflation_2019[2]/inflation_row[3], realhrwage)
    
    df['realhrwage'] = realhrwage
    # remove values < 2 and > 250
    df.loc[df['realhrwage'] < 2, 'realhrwage'] = np.nan
    df.loc[df['realhrwage'] > 250, 'realhrwage'] = np.nan


    ## get educ; treat gradeat vs grade92 separately ##
    # if grade32 exists, then need to convert that; otheerwise have to deal with gradeat and gradecp
    # create educ variable
    if 'grade92' in df.columns:
        df['educ'] = df['grade92']
        
        df.loc[df['grade92'] == 32, 'educ'] = 2.5
        df.loc[df['grade92'] == 33, 'educ'] = 5.5
        df.loc[df['grade92'] == 34, 'educ'] = 7.5
        df.loc[df['grade92'] == 35, 'educ'] = 9
        df.loc[df['grade92'] == 36, 'educ'] = 10
        df.loc[df['grade92'] == 37, 'educ'] = 11
        df.loc[df['grade92'] == 38, 'educ'] = 12
        df.loc[df['grade92'] == 39, 'educ'] = 12
        df.loc[df['grade92'] == 40, 'educ'] = 13
        df.loc[df['grade92'] == 41, 'educ'] = 14
        df.loc[df['grade92'] == 42, 'educ'] = 14
        df.loc[df['grade92'] == 43, 'educ'] = 16
        df.loc[df['grade92'] == 44, 'educ'] = 18
        df.loc[df['grade92'] == 45, 'educ'] = 18
        df.loc[df['grade92'] == 46, 'educ'] = 18

    else:
        df['educ'] = df['gradeat']
        # if gradecp is 0, then subtract 1
        df.loc[df['gradecp'] == 0, 'educ'] = df['educ'] - 1
    

    logwage = dfs[year]['realhrwage'].to_numpy()
    # conver to float
    logwage = logwage.astype(float)
    df['logwage'] = np.log(logwage)
    df['lths'] = dfs[year]['educ'] < 12
    df['college'] = dfs[year]['educ'] >= 16
    df['exp'] = dfs[year]['age'] - dfs[year]['educ'] - 6
    df['exp2'] = dfs[year]['exp']**2

    # reset key value
    dfs[year] = df

  df.loc[df['grade92'] == 32, 'educ'] = 2.5


In [4]:
combined_df = pd.concat([dfs[79], dfs[19]], ignore_index=True)
fulltime = combined_df['uhourse'] >= 35
combined_df['fulltime'] = fulltime

In [5]:
combined_df

Unnamed: 0,minsamp,intmonth,hhid,state,smsarank,hhnum,activlwr,hourslw,reasonlw,absentlw,...,ym,ch02,ch35,ch613,ch1417,ch05,ihigrdc,docc00,dind02,fulltime
0,4,1,003001221503,93.0,0.0,1,1.0,40,,,...,,,,,,,,,,True
1,4,1,003001221503,93.0,0.0,1,1.0,40,,,...,,,,,,,,,,True
2,4,1,003001221503,93.0,0.0,1,5.0,,,,...,,,,,,,,,,False
3,4,1,003001222503,93.0,0.0,1,7.0,,,,...,,,,,,,,,,False
4,4,1,003001222503,93.0,0.0,1,1.0,40,,,...,,,,,,,,,,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
619791,4,12,802505349610555,,,1,,32,,,...,716.0,0.0,0.0,0.0,0.0,0.0,12.0,21,47,False
619792,4,12,802505349610555,,,1,,40,,,...,716.0,0.0,0.0,0.0,0.0,0.0,,20,21,True
619793,4,12,876944601471509,,,1,,,,,...,716.0,0.0,0.0,0.0,0.0,0.0,14.0,22,23,False
619794,4,12,905527030610215,,,1,,40,,,...,716.0,0.0,0.0,0.0,0.0,0.0,13.0,19,4,True


In [62]:
# X_cols = ['bin_age', 'bin_educ'] # make into bins and leave one out
X_cols = ['age', 'educ']
y_col = 'sex'
w_col = 'hrearnwt'
X, y, w, df_cond = prepare_data(combined_df, None, X_cols, y_col, w_col, return_dataframe=True)
y = np.where(y == 2, 0, 1) # male is 1, female is 0. # whatever is 0 gets reweighted

In [63]:
w_new, Psi_x = run_DFL(X, y, w)

Weighted Logit Model Summary (GLM):
                 Generalized Linear Model Regression Results                  
Dep. Variable:                      y   No. Observations:               332520
Model:                            GLM   Df Residuals:                   332518
Model Family:                Binomial   Df Model:                            1
Link Function:                  Logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:            -7.4227e+10
Date:                Tue, 20 Feb 2024   Deviance:                   1.4845e+11
Time:                        22:34:23   Pearson chi2:                 1.09e+11
No. Iterations:                     4   Pseudo R-squ. (CS):        -1.336e+210
Covariance Type:            nonrobust                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
age            0

In [65]:
summarize(w_new)

Num observations:  332520
Mean:  323486.1453285388
Median:  274558.013221396
Standard Deviation:  223641.23733051747
Variance:  50015403034.724846
1st percentile:  20164.151436
5th percentile:  49794.90498670141
10th percentile:  73571.12158203125
25th percentile:  158380.46249017422
50th percentile:  274558.013221396
75th percentile:  451973.76025752164
90th percentile:  640167.2808269913
95th percentile:  741634.6358157528
99th percentile:  985061.9370700333
Skewness:  1.179650664769934
Kurtosis:  2.2124367718215017
Min:  441.19998931884766
Max:  3120656.1659999997


In [66]:
summarize(w)

Num observations:  332520
Mean:  326654.7394281741
Median:  278833.76187499997
Standard Deviation:  223903.64148584436
Variance:  50132840670.62152
1st percentile:  20989.733999999997
5th percentile:  50881.1611328125
10th percentile:  75213.7311
25th percentile:  161668.3544921875
50th percentile:  278833.76187499997
75th percentile:  454258.7585
90th percentile:  642293.836
95th percentile:  745412.590565
99th percentile:  994555.22816
Skewness:  1.1859598233244584
Kurtosis:  2.3016476657954708
Min:  441.19998931884766
Max:  3120656.1659999997


In [67]:
combined_df['dfl_weight'] = w_new

In [69]:
combined_df['dfl_weight']

0         282229.609375
1         254033.209190
2                   NaN
3                   NaN
4         287004.242585
              ...      
619791     51239.672965
619792     58052.112000
619793              NaN
619794     61429.452000
619795     53214.859608
Name: dfl_weight, Length: 619796, dtype: float64

In [70]:
X, y, w = prepare_data(combined_df, (combined_df['sex']==2), ['educ'], 'realhrwage', 'hrearnwt')

In [71]:
summarize(y, w)

Num observations:  145604
Weighted Mean:  20.334743399277826
Weighted Standard Deviation:  13.233246699352737
Weighted Variance:  175.1188182059301
1st percentile:  4.35889294745715
5th percentile:  7.731162104288442
10th percentile:  8.545997186394771
25th percentile:  10.303384265067702
50th percentile:  14.580518483237265
75th percentile:  21.65456689524808
90th percentile:  33.518736548808604
95th percentile:  44.14473168995438
99th percentile:  70.09851570321841
Skewness:  2.9952705315934893
Kurtosis:  19.241823476994632
Min:  2.015089993633891
Max:  249.02478862413528


In [72]:
X, y, w = prepare_data(combined_df, (combined_df['sex']==2), ['educ'], 'realhrwage', 'dfl_weight')

In [73]:
summarize(y, w)

Num observations:  145604
Weighted Mean:  20.57085816897337
Weighted Standard Deviation:  13.365197701943664
Weighted Variance:  178.6285096120402
1st percentile:  4.35889294745715
5th percentile:  7.731162104288442
10th percentile:  8.545997186394771
25th percentile:  10.303384265067702
50th percentile:  14.580518483237265
75th percentile:  21.65456689524808
90th percentile:  33.518736548808604
95th percentile:  44.14473168995438
99th percentile:  70.09851570321841
Skewness:  2.9952705315934893
Kurtosis:  19.241823476994632
Min:  2.015089993633891
Max:  249.02478862413528


In [74]:
X, y, w = prepare_data(combined_df, combined_df['sex']==1, ['educ'], 'realhrwage', 'hrearnwt')

In [75]:
summarize(y, w)

Num observations:  167288
Weighted Mean:  25.08356929912618
Weighted Standard Deviation:  14.797965111830939
Weighted Variance:  218.97977145096564
1st percentile:  5.916459590580996
5th percentile:  8.578866406342444
10th percentile:  9.72034565549151
25th percentile:  13.886208079273587
50th percentile:  20.22543505893757
75th percentile:  29.277230963753855
90th percentile:  43.022086693353444
95th percentile:  54.60343507195956
99th percentile:  71.8339395513067
Skewness:  2.1386773308054305
Kurtosis:  11.267729467803589
Min:  2.013543601127318
Max:  249.39232594478847


In [80]:
X, y, w = prepare_data(combined_df, combined_df['sex']==1, ['educ'], 'realhrwage', 'dfl_weight')

In [81]:
summarize(y, w)

Num observations:  167288
Weighted Mean:  25.08356929912618
Weighted Standard Deviation:  14.797965111830939
Weighted Variance:  218.97977145096564
1st percentile:  5.916459590580996
5th percentile:  8.578866406342444
10th percentile:  9.72034565549151
25th percentile:  13.886208079273587
50th percentile:  20.22543505893757
75th percentile:  29.277230963753855
90th percentile:  43.022086693353444
95th percentile:  54.60343507195956
99th percentile:  71.8339395513067
Skewness:  2.1386773308054305
Kurtosis:  11.267729467803589
Min:  2.013543601127318
Max:  249.39232594478847
