# Dataset & Summary Statistics

First, write functions that create the dataset for analysis and visualizations.

In [5]:
import os
import subprocess

import numpy as np
import pandas as pd

In [None]:
# Create dataset
# Locate the data files relative to the notebook's working directory instead of
# through an absolute path that exists on one machine only. This assumes the
# notebook runs two levels below the repository root, which the build step
# below already relies on when it enters '../../data/'.
data_dir = os.path.join('..', '..', 'data')
fname = os.path.join(data_dir, 'output', 'original.pkl')
fname2 = os.path.join(data_dir, 'input', 'TNFI_TRUNC_79.csv')
if not os.path.exists(fname):
    # The pickle is missing: run the `create_data` script from inside the data
    # directory. try/finally restores the notebook's working directory even
    # when the script exits with an error, so later relative paths still work.
    cwd = os.getcwd()
    os.chdir(data_dir)
    try:
        subprocess.check_call('./create_data')
    finally:
        os.chdir(cwd)

In [None]:
# Load the observed panel and record which survey years it contains.
OBS_DATASET = pd.read_pickle(fname)
SURVEY_YEARS = OBS_DATASET['SURVEY_YEAR'].unique()

# Stamp every family-income row with survey year 1978 so that the left join
# below attaches TNFI data to the 1978 observations only.
TNFI_79 = pd.read_csv(fname2)
TNFI_79['SURVEY_YEAR'] = 1978
OBS_DATASET = OBS_DATASET.merge(TNFI_79, how='left', on=['IDENTIFIER', 'SURVEY_YEAR'])

In [None]:
# This function returns the observed dataset.
def get_dataset(dataset=None):
    """Return the observed dataset with age and family-income-quartile columns.

    Two columns are written onto the frame in place (and simply overwritten when
    the function is called again):

    * ``AGE`` -- ``SURVEY_YEAR`` minus ``YEAR_OF_BIRTH``. This is a crude measure,
      as the times of the interview are spread all over the year.
    * ``FAMILY_INCOME_QUARTILE`` -- ``'first quartile'``, ``'second quartile'``,
      ``'third quartile'`` or ``'fourth quartile'``, found by comparing each row's
      ``TNFI_TRUNC`` with the 25th/50th/75th percentiles of the non-missing
      ``TNFI_TRUNC`` values among rows with ``SURVEY_YEAR == 1978``. Rows whose
      ``TNFI_TRUNC`` is missing get ``None``.

    Parameters
    ----------
    dataset : pandas.DataFrame, optional
        Frame with ``SURVEY_YEAR``, ``YEAR_OF_BIRTH`` and ``TNFI_TRUNC`` columns.
        Defaults to the notebook-level ``OBS_DATASET``.

    Returns
    -------
    pandas.DataFrame
        The same frame object that was passed in (or ``OBS_DATASET``).
    """
    if dataset is None:
        dataset = OBS_DATASET

    # Add a crude measure on an individual's age. This is not the most accurate as the
    # times of the interview are spread all over the year.
    dataset['AGE'] = dataset['SURVEY_YEAR'] - dataset['YEAR_OF_BIRTH']

    # Quartile cutoffs come from the non-missing family incomes of the 1978 rows.
    base_income = dataset.loc[dataset['SURVEY_YEAR'] == 1978, 'TNFI_TRUNC'].dropna()
    if base_income.empty:
        # Without any baseline income there are no cutoffs to compare against.
        dataset['FAMILY_INCOME_QUARTILE'] = None
        return dataset

    first_q, second_q, third_q = np.percentile(base_income, [25, 50, 75])

    def label_quartile(income):
        # Missing income has no quartile; state that explicitly rather than relying
        # on NaN failing every comparison.
        if pd.isna(income):
            return None
        # Cutoffs are ascending, so each test only needs the upper bound; a value
        # equal to a cutoff falls into the higher quartile.
        if income < first_q:
            return 'first quartile'
        if income < second_q:
            return 'second quartile'
        if income < third_q:
            return 'third quartile'
        return 'fourth quartile'

    dataset['FAMILY_INCOME_QUARTILE'] = dataset['TNFI_TRUNC'].apply(label_quartile)

    return dataset

In [None]:
# Re-read the pickle into a separate frame for exploration and preview its first
# five rows.
df = pd.read_pickle(fname)
SURVEY_YEAR = pd.unique(df['SURVEY_YEAR'])
df.head()

In [None]:
# List every column (variable) name of the exploration frame.
[column_name for column_name in df.columns]

Next, write a function for nice / uniform formatting across plots.

In [None]:
def set_formatter(ax):
    """Show the y-axis tick labels of ``ax`` as integers with thousands separators."""
    # NOTE(review): `mplib` is not imported in any of the cells above -- presumably
    # `import matplotlib as mplib`; confirm it is in scope before this cell runs.
    def with_commas(value, position):
        # Tick values are truncated to int before the separators are added.
        return '{:,}'.format(int(value))

    ax.get_yaxis().set_major_formatter(mplib.ticker.FuncFormatter(with_commas))

In [None]:
# For every survey year, count the non-missing IDENTIFIER values among the rows
# flagged as interviewed.
num_obs = []
for year in SURVEY_YEARS:
    year_rows = df[df['SURVEY_YEAR'] == year]
    cond = year_rows['IS_INTERVIEWED'].isin([True])
    num_obs.append(year_rows.loc[cond, 'IDENTIFIER'].count())

In [None]:
# Bar chart of the per-year observation counts, saved next to the notebook.
fig, ax = plt.subplots()
set_formatter(ax)

ax.bar(df['SURVEY_YEAR'].unique(), num_obs)
ax.set(xlabel='Year', ylabel='Observations')

plt.savefig('fig-dataset-basic-observations.png')

* The NLSY79 is a nationally representative sample of 12,686 individuals who were between the ages of 14 and 22 when first surveyed in 1979. It makes sense that the sample gets smaller over time as people drop out, stop responding, die, etc. In 1994, the survey moved to being administered every other year, which is why the bars are spaced further apart after that point.

In [None]:
# Plot the years of birth among the sample; removing years of birth with a very small
# number of individuals for clarity of the visualization
dat = df.loc[df['SURVEY_YEAR'] == 1978, 'YEAR_OF_BIRTH']
dat = dat.value_counts().to_dict()

# pop() with a default instead of `del`: a birth year that does not occur in the data
# is simply skipped rather than aborting the cell with a KeyError.
for year in [1955, 1956, 1965]:
    dat.pop(year, None)

ax = plt.figure().add_subplot(111)
set_formatter(ax)

ax.set_ylabel('Observations')
ax.set_xlabel('Year of Birth')

# Hand bar() plain lists rather than dict views.
ax.bar(list(dat.keys()), list(dat.values()))

plt.savefig('fig-dataset-basic-birth.png')

* The sample is well distributed across respondents who were between 14 and 22 years old when first interviewed, with fewer of the youngest individuals.

In [None]:
# View the number of respondents in the labor force over time 

In [None]:
# Distinct values of EMP_STATUS_WK_1 among the 1978 rows.
is_1978 = df['SURVEY_YEAR'] == 1978
emp_stat = df.loc[is_1978, 'EMP_STATUS_WK_1'].unique()
emp_stat