In [None]:
####################################################################
# Prepared for Gabor's Data Analysis
#
# Data Analysis for Business, Economics, and Policy
# by Gabor Bekes and Gabor Kezdi
# Cambridge University Press 2021
#
# gabors-data-analysis.com
#
# License: Free to share, modify and use for educational purposes.
# 	Not to be used for commercial purposes.
#
####################################################################

####################################################################
# cps-earnings dataset
#
# input:
#       morg2014.csv

# output:
#       morg-2014-emp.csv

# version 1.0   2021-05-23
####################################################################

In [None]:
### SETTING UP DIRECTORIES

# import packages
import pandas as pd
import os
import numpy as np

# Set the working directory to the root folder of da_data_repo.
# The default below is the path hard-coded in the original script; to run
# on another machine, set the DA_DATA_REPO environment variable to your own
# copy of the repository instead of editing this cell.
os.chdir(os.environ.get("DA_DATA_REPO", "C:/workspace/stata/"))

# location folders, relative to the working directory set above
data_in = "./cps-earnings/raw/"
data_out = "./cps-earnings/clean/"

In [None]:
# Read the raw extract with every column typed as text ("unicode");
# per the original note this avoids size and memory warnings on load.
df = pd.read_csv(
    f"{data_in}morg2014.csv",
    sep=",",
    quotechar='"',
    dtype="unicode",
    encoding="utf-8",
)

In [None]:
# Keep only the listed columns, in this order; every other column is dropped.
df = df.loc[
    :,
    [
        "lfsr94", "hhid", "lineno", "intmonth", "stfips", "weight",
        "earnwke", "uhourse", "grade92",
        "race", "ethnic", "age", "sex", "marital",
        "ownchild", "chldpres", "prcitshp", "state",
        "ind02", "occ2012", "class94", "unionmme", "unioncov",
    ],
]

In [None]:
# Renumber the rows with a fresh default index, then give two columns
# shorter names: class94 -> class, uhourse -> uhours.
df = df.reset_index(drop=True).rename(
    columns={"class94": "class", "uhourse": "uhours"}
)

In [None]:
# Convert the text columns used by the sample filters to numbers.
# Values that cannot be parsed become missing (errors="coerce"):
#   - age:     nullable integer, missing values kept as <NA>
#   - earnwke: numeric, missing values replaced by 0
#   - uhours:  nullable integer, missing values replaced by 0
df = df.assign(
    age=lambda d: pd.to_numeric(d["age"], errors="coerce").astype("Int64"),
    earnwke=lambda d: pd.to_numeric(d["earnwke"], errors="coerce").fillna(0),
    uhours=lambda d: (
        pd.to_numeric(d["uhours"], errors="coerce").astype("Int64").fillna(0)
    ),
)

In [None]:
# Restrict the sample, then renumber the remaining rows from 0:
#   1. age between 16 and 64 (both ends included; missing age is excluded)
#   2. labour-force status is "Employed-At Work" or "Employed-Absent"
#   3. weekly earnings and usual hours are both non-zero
# Step 3 relies on earnwke and uhours having no missing values, which holds
# because the destring step fills their missing entries with 0.
df = (
    df.loc[lambda d: d["age"].between(16, 64)]
    .loc[lambda d: d["lfsr94"].isin(["Employed-At Work", "Employed-Absent"])]
    .loc[lambda d: d["earnwke"].ne(0) & d["uhours"].ne(0)]
    .reset_index(drop=True)
)

In [None]:
# save table
# Create the output folder first so the write does not fail when the
# "clean" directory is missing (e.g. on a fresh copy of the data repo).
os.makedirs(data_out, exist_ok=True)

# index=False: the row index is not written as a column
df.to_csv(data_out + "morg-2014-emp.csv", index=False)