In [100]:
import os, sys 
import pandas as pd
import numpy as np
from collections import defaultdict

In [125]:
data_dir = "../../data/ipums/"

In [173]:
HEADER_END = 9
VARIABLE_LIST = [10, 92]
VAL_LIST = [99, 4167]
def in_range(R, l):
    return l >= R[0] and l <= R[1]

In [174]:
lines = ['DUMMY']
with open(os.path.join(data_dir, "codebook.txt"), "r") as in_file:
    line_num = 0
    for line in in_file:
        lines.append(line.strip())

var_cols = {} # the columns containing data for a variable
var_descriptions = {} # the column descriptions for a variable 
var_values = {} # a map of the numeric codes to the values they signify for different variables 

line_num = 0

active_var = None
in_box = False
while True:
    line_num += 1
    if line_num == len(lines):
        break
    line = lines[line_num]
        
    if line_num < HEADER_END:
        continue

    if in_range(VARIABLE_LIST, line_num): 
        items = line.strip().replace("\t", " ").split(" ")
        items = [x for x in items if not x == ""]
        var_name, _, cols, length, _ = items
        length = int(length)
        start_col = int(cols.split('-')[0])
        var_cols[var_name] = list(range(start_col, start_col+length))
    
    if  in_range(VAL_LIST, line_num):
        if len(line) == 0:
            in_box = False
            active_var = None
            continue
        
        line = line.replace("\t", " ")
        items = line.split(" ")
        if in_box:
            if len(items) <2:
                continue
            code = items[0]
            value = ' '.join(items[1:])
            #print("Code (%s): %s" % (code, value))
            var_values[active_var][code] = value
        else:
            # starting new var descriptor 
            var_name = items[0]
            var_desc = ' '.join(items[1:])
            var_descriptions[var_name] = var_desc
            print("Adding Variable (%s): %s" % (var_name, var_desc))
            active_var = var_name
            var_values[active_var] = {}
            in_box = True 

Adding Variable (YEAR): Census year
Adding Variable (SAMPLE): IPUMS sample identifier
Adding Variable (GQ): Group quarters status
Adding Variable (SEX): Sex
Adding Variable (AGE): Age
Adding Variable (MARST): Marital status
Adding Variable (MARRNO): Times married
Adding Variable (RACE): Race [general version]
Adding Variable (RACED): Race [detailed version]
Adding Variable (HISPAN): Hispanic origin [general version]
Adding Variable (HISPAND): Hispanic origin [detailed version]
Adding Variable (BPL): Birthplace [general version]
Adding Variable (BPLD): Birthplace [detailed version]
Adding Variable (CITIZEN): Citizenship status
Adding Variable (YRNATUR): Year naturalized
Adding Variable (YRIMMIG): Year of immigration
Adding Variable (YRSUSA1): Years in the United States
Adding Variable (YRSUSA2): Years in the United States, intervalled
Adding Variable (HCOVANY): Any health insurance coverage
Adding Variable (HCOVPRIV): Private health insurance coverage
Adding Variable (HINSEMP): Health i

In [175]:
data = []
max_count = 100
counter = 0
with open(os.path.join(data_dir, 'usa_00003.dat'), 'r') as in_file:
    for line in in_file:
        counter += 1
        if counter > max_count:
            break
        txt = line.strip()
        row = [x for x in txt]
        data.append(txt)
data = np.array(data)

In [176]:
IGNORE_VARS = [
    'HHWT',    # Household weight 
    'SERIAL', 
    'CBSERIAL',
    'PERNUM',
    'PERWT',
     'IND', 
    'INDNAICS',
    'FTOTINC',
    'INCWAGE',
    'INCSS',
    'PWCOUNTY',
    'SAMPLE',
    'GQ'
]
AS_IS_VARS = [
    'BIRTHYR', # Year in which respondent was born
    'YRMARR', # Year in which respondent was married
    'INCTOT', # each respondent's total pre-tax personal income or losses from all sources for the previous year  
    'INCWELFR', # income from welfare programs
    'INCINVST', # how much pre-tax money the respondent received or lost during the previous year in the form of income from an estate or trust
    'POVERTY', # 3-digit numeric code expressing each family's total income for the previous year as a percentage of the poverty thresholds 
    'YRSUSA1', # 2-digit numeric code reporting how long a person who was born in a foreign country or U.S. outlying area had been living in the United States
    'YEAR'
]

In [177]:
data_dict = defaultdict(list)
for record in data:
    for var, indices in var_cols.items():
        if var in IGNORE_VARS:
            continue
        code = record[indices[0]-1:indices[-1]]
        print(var, code)
        if var in AS_IS_VARS:
            value = code
        else:
            value = var_values[var][code]
        data_dict[var].append(value)

YEAR 2017
SEX 1
AGE 073
MARST 3
BIRTHYR 1944
MARRNO 1
YRMARR 1979
RACE 2
RACED 200
HISPAN 0
HISPAND 000
BPL 001
BPLD 00100
CITIZEN 0
YRNATUR 9999
YRIMMIG 0000
YRSUSA1 00
YRSUSA2 0
HCOVANY 2
HCOVPRIV 1
HINSEMP 1
HCOVPUB 2
HINSCAID 1
HINSCARE 2
HINSVA 1
HINSIHS 1
SCHOOL 1
EDUC 02
EDUCD 025
GRADEATT 0
GRADEATTD 00
SCHLTYPE 1
DEGFIELD 00
DEGFIELDD 0000
DEGFIELD2 00
DEGFIELD2D 0000
EMPSTAT 3
EMPSTATD 30
LABFORCE 1
OCC 0000


KeyError: '0000'

In [89]:
data = pd.DataFrame.from_dict(data_dict)

In [123]:
pd.set_option('display.max_columns', None)  
data.head()

Unnamed: 0,YEAR,SAMPLE,SERIAL,CBSERIAL,HHWT,GQ,PERNUM,PERWT,SEX,AGE,MARST,BIRTHYR,MARRNO,YRMARR,RACE,RACED,HISPAN,HISPAND,BPL,BPLD,CITIZEN,YRNATUR,YRIMMIG,YRSUSA1,YRSUSA2,HCOVANY,HCOVPRIV,HINSEMP,HCOVPUB,HINSCAID,HINSCARE,HINSVA,HINSIHS,SCHOOL,EDUC,EDUCD,GRADEATT,GRADEATTD,SCHLTYPE,DEGFIELD,DEGFIELDD,DEGFIELD2,DEGFIELD2D,EMPSTAT,EMPSTATD,LABFORCE,OCC,IND,CLASSWKR,CLASSWKRD,INDNAICS,WKSWORK2,INCTOT,FTOTINC,INCWAGE,INCSS,INCWELFR,INCINVST,INCRETIR,POVERTY,MIGPLAC1,MIGCOUNTY1,VETDISAB,DIFFREM,DIFFPHYS,DIFFMOB,DIFFCARE,DIFFSENS,DIFFEYE,DIFFHEAR,VETSTAT,VETSTATD,VET01LTR,VET90X01,VET75X90,VETVIETN,VET55X64,VETKOREA,VET47X50,VETWWII,PWSTATE2,PWCOUNTY,TRANWORK
0,2017,2017 ACS,1,2017000000016,20600,Households under 1970 definition,1,20600,Male,73,Separated,1944,Married once,1979,Black/African American/Negro,Black/African American/Negro,Not Hispanic,Not Hispanic,Alabama,Alabama,,,,0,,With health insurance coverage,Without private health insurance coverage,No insurance through employer/union,With public health insurance coverage,No insurance through Medicaid,Yes,No insurance through VA,No insurance through Indian Health Service,"No, not in school","Grade 5, 6, 7, or 8",Grade 7,,,Not enrolled,,,,,Not in labor force,Not in Labor Force,"No, not in the labor force",0,0,,,0,,10000,10000,0,10000,0,0,0,85,,0,,Has cognitive difficulty,Has ambulatory difficulty,No independent living difficulty,No,No vision or hearing difficulty,No,No,Not a veteran,No military service,,,N/A or No,N/A (all years) and No),N/A or No,N/A or No,,"N/A; N/A or No (1980, 1990 US)",,0,
1,2017,2017 ACS,2,2017000000031,4500,Households under 1970 definition,1,4500,Female,31,Never married/single,1986,Not Applicable,0,White,White,Not Hispanic,Not Hispanic,Georgia,Georgia,,,,0,,With health insurance coverage,With private health insurance coverage,Has insurance through employer/union,Without public health insurance coverage,No insurance through Medicaid,No,No insurance through VA,No insurance through Indian Health Service,"No, not in school",4 years of college,Bachelor's degree,,,Not enrolled,Medical and Health Sciences and Services,General Medical and Health Services,,,Employed,At work,"Yes, in the labor force",350,8090,Works for wages,"Wage/salary, private",6214,50-52 weeks,38500,38500,38500,0,0,0,0,302,,0,,No cognitive difficulty,No ambulatory difficulty,No independent living difficulty,No,No vision or hearing difficulty,No,No,Not a veteran,No military service,,,N/A or No,N/A (all years) and No),N/A or No,N/A or No,,"N/A; N/A or No (1980, 1990 US)",Alabama,117,"Auto, truck, or van"
2,2017,2017 ACS,3,2017000000061,13600,Households under 1970 definition,1,13600,Male,41,"Married, spouse present",1976,Married twice (or more),2000,White,White,Mexican,Mexican,Georgia,Georgia,,,,0,,With health insurance coverage,With private health insurance coverage,Has insurance through employer/union,With public health insurance coverage,Has insurance through Medicaid,No,No insurance through VA,No insurance through Indian Health Service,"No, not in school",Grade 12,GED or alternative credential,,,Not enrolled,,,,,Employed,At work,"Yes, in the labor force",6260,770,Works for wages,"Wage/salary, private",23,50-52 weeks,82000,90700,72000,0,0,10000,0,467,,0,,No cognitive difficulty,No ambulatory difficulty,No independent living difficulty,No,No vision or hearing difficulty,No,No,Not a veteran,No military service,,,N/A or No,N/A (all years) and No),N/A or No,N/A or No,,"N/A; N/A or No (1980, 1990 US)",Georgia,0,"Auto, truck, or van"
3,2017,2017 ACS,3,2017000000061,13600,Households under 1970 definition,2,12100,Female,48,"Married, spouse present",1969,Married twice (or more),2000,White,White,Not Hispanic,Not Hispanic,Illinois,Illinois,,,,0,,With health insurance coverage,With private health insurance coverage,Has insurance through employer/union,With public health insurance coverage,Has insurance through Medicaid,No,No insurance through VA,No insurance through Indian Health Service,"No, not in school",Grade 12,Regular high school diploma,,,Not enrolled,,,,,Not in labor force,Not in Labor Force,"No, not in the labor force",0,0,,,0,,8700,90700,0,0,0,0,0,467,,0,,No cognitive difficulty,No ambulatory difficulty,No independent living difficulty,No,No vision or hearing difficulty,No,No,Not a veteran,No military service,,,N/A or No,N/A (all years) and No),N/A or No,N/A or No,,"N/A; N/A or No (1980, 1990 US)",,0,
4,2017,2017 ACS,3,2017000000061,13600,Households under 1970 definition,3,11100,Male,16,Never married/single,2001,Not Applicable,0,White,White,Mexican,Mexican,Alabama,Alabama,,,,0,,With health insurance coverage,With private health insurance coverage,Has insurance through employer/union,With public health insurance coverage,Has insurance through Medicaid,No,No insurance through VA,No insurance through Indian Health Service,"Yes, in school",Grade 10,Grade 10,Grade 9 to grade 12,Grade 10,Public school,,,,,Not in labor force,Not in Labor Force,"No, not in the labor force",0,0,,,0,,0,90700,0,0,0,0,0,467,,0,,No cognitive difficulty,No ambulatory difficulty,No independent living difficulty,No,No vision or hearing difficulty,No,No,,,,,N/A or No,N/A (all years) and No),N/A or No,N/A or No,,"N/A; N/A or No (1980, 1990 US)",,0,


In [172]:
var_values['OCC']['0000']

KeyError: '0000'