First, read in Guvenen's original data file:

In [1]:
import numpy as np; import pandas as pd; import matplotlib.pyplot as plt; import statsmodels.formula.api as sm
%matplotlib inline

data = pd.read_stata('/Users/tew207/Desktop/RED_ACCEPTED_FINAL_DATA_CODE/ready_newdata.dta')

Some preliminary defintions:

In [2]:
tinit = 67; tlast = 96
ageinit = 20; agelast = 64
agecell = 4
minyrs = 20
nlag = 29
agelb = 19; ageub = agelb + agecell
agemidpt = (agelb+ageub)/2  # = 21
agemax = (agelast + agelast - agecell)/2 - (ageub+agelb)/2  # = 41 (maximum age a cohort can reach)
oldcoh = agelast - minyrs - ageinit   # number of cohort existing in the first year
newcoh = tlast - (minyrs-1) - (tinit+1)  # number of cohorts entering after first year
maxcoh = oldcoh + newcoh   # total number of cohorts

Some changes to the variables containing information about educational achievement (basically interpolating grades in 1969, 1970, 1971, 1972, 1973 and 1974):

In [3]:
for i in range(94,99):
    data.rename(columns={"upedu"+str(i)+"h": "grade"+str(i)}, inplace=True)

data["grade72"] = data["edcn72"]
data.loc[data["grade72"]>25, "grade72"] = np.nan # take out values of grade above 25
data["grade75"] = data["edcn75"]
data.loc[data["grade75"]>25, "grade75"] = np.nan

data["grade69"] = 0
data.loc[(data.seqno68==1) & (data.seqno69==1), "grade69"] = data[(data.seqno68==1) & (data.seqno69==1)].grade68
cond = (data.seqno72==1) & (data.seqno69==1) & (data.grade69==0)
data.loc[cond, "grade69"] = data[cond].grade72

data.loc[(data.seqno68==1) & (data.seqno70==1), "grade70"] = data[(data.seqno68==1) & (data.seqno69==1)].grade68
cond = (data.seqno70==1) & (data.seqno72==1) & (data.grade69==0)
data.loc[cond, "grade70"] = data[cond].grade72

data.loc[(data.seqno71==1) & (data.seqno70==1), "grade71"] = data[(data.seqno71==1) & (data.seqno70==1)].grade70
cond = (data.seqno71==1) & (data.seqno72==1) & (data.grade71==0)
data.loc[cond, "grade71"] = data[cond].grade72

data.loc[(data.seqno73==1) & (data.seqno72==1), "grade73"] = data[(data.seqno73==1) & (data.seqno72==1)].grade72
cond = (data.grade73==0) & (data.seqno73==1) & (data.seqno75==1)
data.loc[cond, "grade73"] = data[cond].grade75

data.loc[(data.seqno74==1) & (data.seqno72==1), "grade74"] = data[(data.seqno74==1) & (data.seqno72==1)].grade72
cond = (data.grade74==0) & (data.seqno74==1) & (data.seqno75==1)
data.loc[cond, "grade74"] = data[cond].grade75

for i in range(68, 98):
    data.loc[data["grade"+str(i)]>30, "grade"+str(i)] = np.nan # take out grades above 30

First define a dictionary containing the relative real wages for years 1967 to 1996 (recall that in the PSID, household income refers to the year prior to the survey, i.e. the survey in 1968 asked about income in the year 1967), then replace all with missing all observations which:

1. Head real wage less than \$ 2 or more than \$ 520 in 1996 prices
2. Head hours less than 520 or more than 5096
3. Positive hours and no income or positive income and no hours

Then, create a college dummy, using the direct measure (`hdedcnXX` are the variables `V313` ff., whose value is 7 or 8 if college was completed) or proxying with more than 15 years of education for post-1990, when the direct measure is not available.

In [4]:
awg = {"67":2.85, "68":3.02, "69":3.22, "70":3.40, "71":3.63, "72":3.90,
       "73":4.14, "74":4.43, "75":4.73, "76":5.06, "77":5.44, "78":5.87,
       "79":6.33, "80":6.84, "81":7.43, "82":7.86, "83":8.19, "84":8.48,
       "85":8.73, "86":8.92, "87":9.13, "88":9.43, "89":9.80, "90":10.19,
       "91":10.50, "92":10.76, "93":11.03, "94":11.32, "95":11.64, "96":12.03}

for i in range(67,97):
    data["rawg"+str(i)] = awg[str(i)]/data["prc"+str(i)]
    cond1 = (data["rhdwg"+str(i)] <= 2*data["rawg"+str(i)]/awg["96"]) | (data["rhdwg"+str(i)] > 400*data["rawg"+str(i)]/awg["96"]) 
    cond2 = (data["hwkhrs"+str(i)] > 5096) | (data["hwkhrs"+str(i)] < 520)
    cond3 = (data["rhdlbin"+str(i)] == 0) & (data["hwkhrs"+str(i)] > 0)
    cond4 = (data["rhdlbin"+str(i)] > 0) & (data["hwkhrs"+str(i)] == 0)
    data.loc[np.logical_or.reduce((cond1, cond2, cond3, cond4)), "rhdlbin"+str(i)] = np.nan 
    
"""

Only needed for analysis by education level: generate college dummies

for i in range(68,91): # dummy variable coldum68-90, 1 if hdedcnXX is 7 or 8
    data["coldum"+str(i)] = 0
    data.loc[(data["hdedcn"+str(i)]==7) | (data["hdedcn"+str(i)]==8), "coldum"+str(i)] = 1
    
for i in range(91,98): # dummy variable coldum91-98, 1 if gradeXX is >16
    data["coldum"+str(i)] = 0
    data.loc[(data["grade"+str(i)] >= 16) & (np.isfinite(data["grade"+str(i)])), "coldum"+str(i)] = 1
""";

Then, create dummies for:

1. Age between 20 and 64 inclusive
2. Individual is head
2. Sex male
3. Labour income positive

and record the number of years for which all four dummies are equal to one in a new variable `kept`.
Remove all observations with less than `minyrs` years for which all dummies are one.

In [5]:
data["kept"] = 0

for i in range(68,98):
    ii = i-1
    # Dummy for 19<age<65
    data["dum_age"+str(i)] = 0 
    data.loc[(data["agehd"+str(i)] >= ageinit) & (data["agehd"+str(i)] <= agelast), "dum_age"+str(i)] = 1  
    # Dummy for head of household
    data["dum_seq"+str(i)] = 0 
    data.loc[data["seqno"+str(i)]==1, "dum_seq"+str(i)] = 1 
    # Dummy for sex of head 
    data["dum_sex"+str(i)] = 0 
    data.loc[data["sexhd"+str(i)]==1 ,"dum_sex"+str(i)] = 1  
    # Dummy for positive labour income
    data["dum_lab"+str(ii)] = 0 
    data.loc[data["rhdlbin"+str(ii)]>0, "dum_lab"+str(ii)] = 1
    # Dummy for agedum*headdum*sexdum*incdum 
    data.kept += data["dum_age"+str(i)]*data["dum_seq"+str(i)]*data["dum_sex"+str(i)]*data["dum_lab"+str(ii)]
    # Generate log income
    data["logrinc"+str(ii)] = np.log(data["rhdlbin"+str(ii)])
    # Experience is age - education (up to 12 years) - 6 
    data["edu_capped"] = data["grade"+str(i)].fillna(0)
    data.loc[data["edu_capped"]<12, "edu_capped"] = 12
    data["expr"+str(i)] = data["agehd"+str(i)] - data["edu_capped"] - 6
    
print "There are", sum(data.kept>=minyrs), "individuals with at least", minyrs, "years of valid observations"
data = data[data.kept>=minyrs]

data["unidno"] = range(1,data.shape[0]+1)
data["k"] = data.shape[0]

There are 1270 individuals with at least 20 years of valid observations


Drop some variables and rename those variables that might create problems when identifying stub names in wide-to-long conversion:

In [6]:
to_drop = ["educ", "edcn", "prc", "rawg", "awg", "upedu", "_merge", "edu_capped", "yrdum"]
drop_all = []

for d in to_drop:
    for colname in data.columns:
        if colname[0:len(d)]==d:
            drop_all.append(colname)

data.drop(drop_all, axis=1, inplace=True)

data.rename({"age": "indage", "sex":"indsex"}, inplace=True)
for i in range(67,100):
    try:
        data.rename(columns={"agehd"+str(i) : "hdage"+str(i)}, inplace=True)
    except:
        pass
    try:
        data.rename(columns={"hdwg"+str(i) : "nhdwg"+str(i)}, inplace=True)
    except:
        pass

Reshape the data set from wide to long format; create squares, cubes and quadruples of experience, drop observations outside first and last year of analysis and create year dummies:

In [7]:
data_long = pd.wide_to_long(data, ['age', 'hdage', 'dum_age', 'expr', 'hdedcn', 'hdlbin', 'nhdwg', 'rhdwg', 'grade', 
                                   'hwkhrs', 'id', 'dum_lab', 'rhdlbin', 'logrinc', 'numfam', 'relh', 'dum_seq', 
                                   'seqno', 'dum_sex', 'sexhd'], i="unidno", j="year")

data_long.reset_index(inplace=True)
data_long["year"] = data_long.year.astype(int)

# create squared, cubed, quadrupled experience variables
data_long["agehdsq"] = data_long["expr"]**2/100
data_long["agehdcu"] = data_long["expr"]**3/1000
data_long["agehdqr"] = data_long["expr"]**4/10000

# drop sample outside initial/last year range
print sum(data_long.year<=tinit), "observations below year", tinit, "dropped"
print sum(data_long.year>tlast), "observations above year", tlast, "dropped"
data_long = data_long[(data_long.year>tinit)&(data_long.year<=tlast)]

# Create year dummies, run regression
data = pd.concat([data_long, pd.get_dummies(data_long["year"], prefix="yrdum")], axis=1)
result = sm.ols(formula = "logrinc ~ hdage + agehdsq +"+"+".join(["yrdum_"+str(i) for i in range(68,96)]), data=data).fit()
data = data[[col for col in data.columns if col[:5] != 'yrdum']]

1270 observations below year 67 dropped
3810 observations above year 96 dropped


For each year from 1968 to 1996, fit the regression:
$$
y_t = \beta_0 + \beta_1 age_t + \beta_2 expr^2_t + \beta_3 expr^3_t + \varepsilon_t
$$
and record the resulting contants, coefficients and residuals:

In [8]:
newcols = ["alphaage", "alphagesq", "alphagecu", "alphacons", "residual"]
data = pd.concat([data, pd.DataFrame(index=data.index, columns=newcols, dtype=float)], axis=1)

for i in range(tinit+1,tlast+1):
    cond = (data["year"]==i) & (data.seqno*data.dum_lab*data.dum_age*data.dum_sex==1)
    # fit regression
    result = sm.ols("logrinc ~ hdage + agehdsq + agehdcu", data=data[cond]).fit()
    # save coefficients
    data.loc[cond, ["alphaage", "alphagesq", "alphagecu", "alphacons"]] = list(result.params)
    # save residuals
    data.loc[cond, "residual"] = result.resid

In [9]:
# Pre-allocate columns for speed
data["combined"] = data.seqno*data.dum_lab*data.dum_sex*data.dum_age
newcols = ["resid"+str(i)+"_"+str(j) for i in range(1,agemax+1) for j in range(1,tlast-tinit+2)]
data = pd.concat([data, pd.DataFrame(index=data.index, columns=newcols, dtype=float)], axis=1)
newcols = ["resid"+str(i)+"f"+str(j) for i in range(1,agemax+1) for j in range(1,tlast-tinit+2)]
data = pd.concat([data, pd.DataFrame(index=data.index, columns=newcols, dtype=float)], axis=1)

j = ageub  # Cohort age upper bound
m = 0

for i in range(agelb+1,agelast-agecell+1):
    j += 1
    m += 1 
    n = 0
    t = 0
    for k in range(tinit+1,tlast+1):
        t += 1
        cond = (data.combined==1) & (data.hdage.isin(range(i,j+1))) & (data.year==k)
        data.loc[cond, "resid"+str(m)+"_"+str(t)] = data[cond].residual
        data.loc[:, "resid"+str(m)+"f"+str(t)] = data["resid"+str(m)+"_"+str(t)].shift(-n)
        #data.drop("resid"+str(m)+str(t), axis=1)
        n += 1    

In [10]:
data.sort(columns=["unidno", "year"], inplace=True)
data.index = range(1, len(data)+1)

In [37]:
data.loc[~np.isnan(data.resid12_1), ["unidno", "year", "residual", "resid12_1", "resid12_2", "resid12f1", "resid12f2"]]

Unnamed: 0,unidno,year,residual,resid12_1,resid12_2,resid12f1,resid12f2
10412,360,68,-0.52864,-0.52864,,-0.52864,
10760,372,68,0.765665,0.765665,,0.765665,
10847,375,68,-0.13332,-0.13332,,-0.13332,
10934,378,68,0.259608,0.259608,,0.259608,
10992,380,68,0.170992,0.170992,,0.170992,
11282,390,68,0.564258,0.564258,,0.564258,
11543,399,68,-0.718674,-0.718674,,-0.718674,
11804,408,68,-0.163354,-0.163354,,-0.163354,
11949,413,68,0.22119,0.22119,,0.22119,
12094,418,68,0.315043,0.315043,,0.315043,


In [35]:
data.resid12f2.describe()

count    127.000000
mean       0.005381
std        0.424028
min       -1.144452
25%       -0.308309
50%        0.027956
75%        0.291979
max        1.134756
Name: resid12f2, dtype: float64

In [11]:
Cov = np.zeros((maxcoh, agemax, agemax), float)
N = np.zeros((maxcoh, agemax, agemax), float)

for time in range(1, tlast-tinit+1):
    for age in range(1, agemax+1):
        c = min(age,time,nlag)
        for k in range(1,c+1):
            m = c - k
            l = age - m
            x = time - m
            cohort = age - time + newcoh + 1
            obs = min(data["resid"+str(age)+"f"+str(time)].count(), data["resid"+str(l)+"f"+str(x)].count())
            if (obs>10 and cohort in range(1, maxcoh+1)):
                cov = np.cov(data["resid"+str(age)+"f"+str(time)], data["resid"+str(l)+"f"+str(time)])
                Cov[cohort-1, age-1, l-1] = cov[0,1]
                N[cohort-1, age-1, l-1] = obs