In [43]:
import pandas as pd
import re

In [49]:
def map_code_values(category_code_list, print_errors=False):
    """Parse per-column code lists into ``{code: label}`` dictionaries.

    Each element of ``category_code_list`` is expected to be a list of strings
    of the form ``"code = label"`` (e.g. ``"1=Northeast"``). Three special
    cases are handled before parsing:

    * a list containing ``'no code'`` is replaced by the single entry
      ``'no code = continuous numerical value'``;
    * a bare numeric range (e.g. ``"1946 - 2012"``) in the first position, or
      otherwise in the last position, gets the label
      ``'continuous numerical range'`` appended;
    * otherwise, a trailing entry containing ``'continued'`` is dropped.

    The input lists are never modified; all edits happen on a copy.

    Parameters
    ----------
    category_code_list : iterable
        One code list per codebook column, in column order.
    print_errors : bool, default False
        When truthy, print every entry that could not be parsed.

    Returns
    -------
    dict
        Maps the positional index of each successfully parsed code list to
        its ``{code: label}`` dict. Indices whose list could not be parsed
        are absent from the result.
    """
    equal_split = re.compile(r'\s?=\s?')
    cont_num_range = re.compile(r'^\d+\,?\d+?\s?-\s?\d+\,?\d+|^\d+\s?-\s?\d+\,?\d+|^\d+?\s?-\s?\d+')
    range_label = ' = continuous numerical range'

    def is_bare_range(entry):
        # A range that already carries its own "= label" must not get a second one.
        return '=' not in entry and bool(cont_num_range.search(entry))

    code_map = {}
    error_map = {}
    for i, raw_codes in enumerate(category_code_list):
        try:
            has_no_code = 'no code' in raw_codes
        except TypeError as e:
            # Not a container at all, e.g. NaN from ``Series.str.split`` on a
            # missing cell: record it instead of aborting the whole run.
            error_map[i] = (raw_codes, e)
            continue

        if has_no_code:
            code_list = ['no code = continuous numerical value']
        else:
            # Copy so the caller's list is left untouched.
            code_list = list(raw_codes)

        if code_list:
            if is_bare_range(code_list[0]):
                # numerical range is first, e.g. "1 - 10"
                code_list[0] += range_label
            elif is_bare_range(code_list[-1]):
                # numerical range is last, e.g. "1946 - 2012"
                code_list[-1] += range_label
            elif 'continued' in code_list[-1]:
                # "(continued in next cell)" marker is last: drop it
                del code_list[-1]

        # Split on the first '=' only, so labels may themselves contain '='.
        try:
            code_map[i] = dict(
                equal_split.split(code.replace("'", "").strip(), maxsplit=1)
                for code in code_list
            )
        except ValueError as e:
            # At least one entry had no '=' separator.
            error_map[i] = (code_list, e)

    if print_errors:
        for k, v in error_map.items():
            print(f'codelist: {k} | error: {v}\n-----------')
    return code_map

In [50]:
# Raw codebook header -> short snake_case name used by the rest of the notebook.
# The keys are matched against the CSV headers verbatim (including the
# 'CATEOGORIES' spelling and the embedded newline in 'Variable\nname').
codebook_col_map = dict([
    ('CATEOGORIES', 'category'),
    ('File order', 'file_order'),
    ('Variable\nname', 'col_name'),
    ('Variable type', 'col_type'),
    ('Length', 'length'),
    ('Label', 'col_description'),
    ('Values/Format codes', 'codes'),
    ('Unnamed: 8', 'empty_col'),  # (continued in next cell) code prompt?
])

# Load first, then rename. An alternative that was tried also chained
# .drop(['empty_col'], axis=1) after the rename.
df_codebook = pd.read_csv('2018microdata_codebook.csv')
df_codebook = df_codebook.rename(columns=codebook_col_map)
df_codebook.head(5)

Unnamed: 0,category,file_order,col_name,col_type,length,col_description,codes
0,0,1,PUBID,Char,5,Public use file building identifier,00001 - 06436
1,0,2,REGION,Char,1,Census Region,1=Northeast\n2=Midwest\n3=South\n4=West
2,0,3,CENDIV,Char,1,Census division,1=New England\n2=Middle Atlantic \n3=East Nor...
3,0,4,PBA,Num,8,Principal building activity,1=Vacant\n2=Office\n4=Laboratory\n5=Nonrefrige...
4,1,5,SQFT,Num,8,Square footage,"1,001 - 2,100,000"


In [51]:
# Split the codebook by category, skipping the rows flagged 'get rid of'.
categories = df_codebook.category.unique()
print(f'categories: {categories}')
category_map = {}  # category id -> array of the column names in that category
category_dfs = {}  # category id -> codebook rows belonging to that category
for cat_name in categories:
    if cat_name == 'get rid of':
        continue
    # Filter once and reuse the subset for both lookups.
    cat_rows = df_codebook[df_codebook['category'] == cat_name]
    category_dfs[cat_name] = cat_rows
    category_map[cat_name] = cat_rows.col_name.values
# The label names the category that is actually shown ('0').
print(f"category = 0 columns: {category_map['0']}")
category_dfs['0']

categories: ['0' '1' 'get rid of' '2' '3' '4' '5' '6' '7']
category = 1 columns: ['PUBID' 'REGION' 'CENDIV' 'PBA']


Unnamed: 0,category,file_order,col_name,col_type,length,col_description,codes
0,0,1,PUBID,Char,5,Public use file building identifier,00001 - 06436
1,0,2,REGION,Char,1,Census Region,1=Northeast\n2=Midwest\n3=South\n4=West
2,0,3,CENDIV,Char,1,Census division,1=New England\n2=Middle Atlantic \n3=East Nor...
3,0,4,PBA,Num,8,Principal building activity,1=Vacant\n2=Office\n4=Laboratory\n5=Nonrefrige...


In [52]:
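# Original note: only category 10 has missing codes.
# NOTE(review): the `categories:` output printed earlier lists only '0'-'7' (plus 'get rid of'), so there is no '10' key here — confirm before re-enabling.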
# category_dfs['10'].loc[:,'codes'] = category_dfs['10'].codes.fillna('no code')
# category_dfs['10'].head()

In [53]:
# Build {category id: {column name: {code: label}}} from each category's rows.
all_cats_code_maps = {}
for cat_num, df in category_dfs.items():
    print(f'category: {cat_num}')
    if cat_num == 'get rid of':
        continue

    # Keep only the columns needed to parse the codes.
    category_code_df = df.drop(['category', 'file_order', 'col_type', 'length'], axis=1)
    # One list entry per line of the multi-line "codes" cell.
    category_code_df['code_split'] = category_code_df.codes.str.split('\n')
    category_code_list = category_code_df.code_split.values
    # Not used by the cells shown here; kept in case other cells rely on it.
    category_cols = category_code_df.columns

    # Result is keyed by row position; positions that failed to parse are absent.
    parsed_by_position = map_code_values(category_code_list, print_errors=True)

    # Re-key by column name. Positions with no parsed dict are skipped rather
    # than looked up, so one unparseable code list no longer raises KeyError.
    category_code_map = {
        col_name: parsed_by_position[i]
        for i, col_name in enumerate(category_code_df.col_name.values)
        if i in parsed_by_position
    }
    all_cats_code_maps[cat_num] = category_code_map

category: 0
category: 1
category: 2
category: 3
category: 4
category: 5
category: 6
category: 7


In [54]:
# Dump every category's parsed code map, one separator line after each.
for cat_num, code_map in all_cats_code_maps.items():
    print(
        cat_num,
        code_map,
        '---------------------------------------------------------------------------------------------------------',
        sep='\n',
    )

0
{'PUBID': {'00001 - 06436': 'continuous numerical range'}, 'REGION': {'1': 'Northeast', '2': 'Midwest', '3': 'South', '4': 'West'}, 'CENDIV': {'1': 'New England', '2': 'Middle Atlantic', '3': 'East North Central', '4': 'West North Central', '5': 'South Atlantic', '6': 'East South Central', '7': 'West South Central', '8': 'Mountain', '9': 'Pacific'}, 'PBA': {'1': 'Vacant', '2': 'Office', '4': 'Laboratory', '5': 'Nonrefrigerated warehouse', '6': 'Food sales', '7': 'Public order and safety', '8': 'Outpatient health care', '11': 'Refrigerated warehouse', '12': 'Religious worship', '13': 'Public assembly', '14': 'Education', '15': 'Food service', '16': 'Inpatient health care', '17': 'Nursing', '18': 'Lodging', '23': 'Strip shopping center', '24': 'Enclosed mall', '25': 'Retail other than mall', '26': 'Service', '91': 'Other'}}
---------------------------------------------------------------------------------------------------------
1
{'SQFT': {'1,001 - 2,100,000': 'continuous numerical ran

In [55]:
# Outer keys (category ids) become the columns; inner keys (column names)
# become the index, with NaN where a column name is not in that category.
new_codes_df = pd.DataFrame(data=all_cats_code_maps)
print(new_codes_df.columns)
# Inspect category '0' only, without the NaN padding.
new_codes_df.loc[:, '0'].dropna()

Index(['0', '1', '2', '3', '4', '5', '6', '7'], dtype='object')


PUBID       {'00001 - 06436': 'continuous numerical range'}
REGION    {'1': 'Northeast', '2': 'Midwest', '3': 'South...
CENDIV    {'1': 'New England', '2': 'Middle Atlantic', '...
PBA       {'1': 'Vacant', '2': 'Office', '4': 'Laborator...
Name: 0, dtype: object

In [56]:
# One two-column frame (col_name, codes_dict) per category, then stacked
# into a single lookup table.
to_add_dfs = [
    new_codes_df[category_num]
    .dropna()
    .reset_index()
    .rename(columns={'index': 'col_name', category_num: 'codes_dict'})
    for category_num in new_codes_df.columns
]
to_merge = pd.concat(to_add_dfs)

In [57]:
# Attach each column's parsed code dict to its codebook row, matching on col_name.
# No `how` is passed, so pandas' default join type applies.
new_codebook = pd.merge(df_codebook, to_merge, on='col_name')
new_codebook  # a .drop_duplicates(subset='col_name') was tried here and left disabled

Unnamed: 0,category,file_order,col_name,col_type,length,col_description,codes,codes_dict
0,0,1,PUBID,Char,5,Public use file building identifier,00001 - 06436,{'00001 - 06436': 'continuous numerical range'}
1,0,2,REGION,Char,1,Census Region,1=Northeast\n2=Midwest\n3=South\n4=West,"{'1': 'Northeast', '2': 'Midwest', '3': 'South..."
2,0,3,CENDIV,Char,1,Census division,1=New England\n2=Middle Atlantic \n3=East Nor...,"{'1': 'New England', '2': 'Middle Atlantic', '..."
3,0,4,PBA,Num,8,Principal building activity,1=Vacant\n2=Office\n4=Laboratory\n5=Nonrefrige...,"{'1': 'Vacant', '2': 'Office', '4': 'Laborator..."
4,1,5,SQFT,Num,8,Square footage,"1,001 - 2,100,000","{'1,001 - 2,100,000': 'continuous numerical ra..."
...,...,...,...,...,...,...,...,...
556,7,558,TINT,Num,8,Tinted window glass,1=Yes\n2=No\nMissing=Not applicable,"{'1': 'Yes', '2': 'No', 'Missing': 'Not applic..."
557,7,559,REFL,Num,8,Reflective window glass,1=Yes\n2=No\nMissing=Not applicable,"{'1': 'Yes', '2': 'No', 'Missing': 'Not applic..."
558,7,560,AWN,Num,8,External overhangs or awnings,1=Yes\n2=No\nMissing=Not applicable,"{'1': 'Yes', '2': 'No', 'Missing': 'Not applic..."
559,7,561,SKYLT,Num,8,Skylights or atriums designed to provide light,1=Yes\n2=No\nMissing=Not applicable,"{'1': 'Yes', '2': 'No', 'Missing': 'Not applic..."


In [58]:
# Persist the enriched codebook without writing the row index.
new_codebook.to_csv(path_or_buf='2018cbecs_codebook_wDicts.csv', index=False)