In [17]:
import pandas as pd
import re

In [18]:
def map_code_values(category_code_list, print_errors=False):
    """Convert lists of "code = label" strings into per-column lookup dicts.

    Parameters
    ----------
    category_code_list : iterable of list of str
        One list per codebook column; each entry is a raw code line such as
        ``"1=Yes"``. The lists are only read, never modified, so the cell that
        calls this function can safely be re-run on the same data.
    print_errors : bool, default False
        When true, print every code list that could not be converted,
        together with the exception that was raised for it.

    Returns
    -------
    dict
        Maps the positional index of each successfully converted code list to
        a ``{code: label}`` dict. Lists that fail conversion are left out
        (their index is simply missing from the result).
    """
    equal_split = re.compile(r'\s?=\s?')
    cont_num_range = re.compile(r'\d+\s?-\s?\d+')
    range_label = ' = continuous numerical range'

    def is_bare_range(entry):
        # A numeric range with no label of its own yet, e.g. "1946 - 2012".
        # Entries that already contain '=' are left alone so that something
        # like "1=Between 1 - 10" does not get a second label appended.
        return '=' not in entry and cont_num_range.search(entry) is not None

    code_map = {}
    error_map = {}
    for i, code_list in enumerate(category_code_list):
        # Work on a copy: the caller's list must stay untouched.
        if 'no code' in code_list:
            codes = ['no code = continuous numerical value']
        else:
            codes = list(code_list)

        # Normalise entries that do not follow the "num = category" pattern.
        # An empty list has nothing to normalise and maps to an empty dict.
        if codes:
            if is_bare_range(codes[0]):
                # numerical range is first, i.e. "1 - 10"
                codes[0] = codes[0] + range_label
            elif is_bare_range(codes[-1]):
                # numerical range is last, i.e. "1946 - 2012"
                codes[-1] = codes[-1] + range_label
            elif 'continued' in codes[-1]:
                # trailing "(continued in next cell)" marker carries no code
                codes = codes[:-1]

        # Split each entry on its FIRST '=' only, so labels that themselves
        # contain '=' are kept intact instead of being rejected.
        try:
            code_map[i] = dict(
                equal_split.split(code.replace("'", "").strip(), maxsplit=1)
                for code in codes
            )
        except ValueError as e:
            # An entry without any '=' yields a 1-item sequence, which dict()
            # rejects; record it so it can be reported below.
            error_map[i] = (codes, e)

    if print_errors:
        for k, v in error_map.items():
            print(f'codelist: {k} | error: {v}\n-----------')
    return code_map

In [24]:
# Raw codebook header -> snake_case column name used in the rest of the notebook.
# rename() only touches columns whose header matches a key exactly, so the keys
# are kept verbatim (including the 'CATEOGORIES' spelling and the embedded newline).
codebook_col_map = dict([
    ('CATEOGORIES', 'category'),
    ('File order', 'file_order'),
    ('Variable\nname', 'col_name'),
    ('Variable type', 'col_type'),
    ('Length', 'length'),
    ('Format', 'format'),
    ('Label', 'col_description'),
    ('Values/Format codes', 'codes'),
    # possibly the "(continued in next cell)" code prompt column?
    ('Unnamed: 8', 'empty_col'),
])

# Load the codebook and apply the friendlier column names.
# (An earlier variant additionally chained .drop(['empty_col'], axis=1).)
df_codebook = (
    pd.read_csv('2018microdata_codebook.csv')
    .rename(columns=codebook_col_map)
)
df_codebook.head()

Unnamed: 0,category,file_order,col_name,col_type,length,col_description,codes
0,0,1,PUBID,Char,5,Public use file building identifier,00001 - 06436
1,0,2,REGION,Char,1,Census Region,1=Northeast\n2=Midwest\n3=South\n4=West
2,0,3,CENDIV,Char,1,Census division,1=New England\n2=Middle Atlantic \n3=East Nor...
3,0,4,PBA,Num,8,Principal building activity,1=Vacant\n2=Office\n4=Laboratory\n5=Nonrefrige...
4,1,5,SQFT,Num,8,Square footage,"1,001 - 2,100,000"


In [25]:
# Split the codebook by its 'category' column.
#   category_map[cat] -> array of column names belonging to that category
#   category_dfs[cat] -> the codebook rows belonging to that category
categories = df_codebook.category.unique()
print(f'categories: {categories}')
category_map = {}
category_dfs = {}
for cat_name in categories:
    # Rows tagged 'get rid of' are intentionally excluded from both lookups.
    if cat_name == 'get rid of':
        continue
    # Filter once and reuse the result for both lookups.
    # Note: a NaN category never compares equal to itself, so its entry is empty.
    cat_rows = df_codebook[df_codebook['category'] == cat_name]
    category_map[cat_name] = cat_rows.col_name.values
    category_dfs[cat_name] = cat_rows
# The label now names the category that is actually shown ('0').
print(f"category = 0 columns: {category_map['0']}")
category_dfs['0']

categories: ['0' '1' nan '2' '3' '4' '5' '6' '7' 'get rid of']
category = 1 columns: ['PUBID' 'REGION' 'CENDIV' 'PBA']


Unnamed: 0,category,file_order,col_name,col_type,length,col_description,codes
0,0,1,PUBID,Char,5,Public use file building identifier,00001 - 06436
1,0,2,REGION,Char,1,Census Region,1=Northeast\n2=Midwest\n3=South\n4=West
2,0,3,CENDIV,Char,1,Census division,1=New England\n2=Middle Atlantic \n3=East Nor...
3,0,4,PBA,Num,8,Principal building activity,1=Vacant\n2=Office\n4=Laboratory\n5=Nonrefrige...


In [26]:
# NOTE: some codes span two rows.
# TODO: find a way to concatenate the continuation row with the row it belongs to,
# then update the '(continued in next cell)' branch of map_code_values to match.
# Display positional rows 44-47 of category '1' for inspection.
category_dfs['1'].iloc[44:48]

Unnamed: 0,category,file_order,col_name,col_type,length,col_description,codes
49,1,50,FACIL,Num,8,On a multibuilding complex,1=Yes\n2=No
50,1,51,FEDFAC,Num,8,Federal complex,1=Yes\n2=No\nMissing=Not applicable
51,1,52,FACACT,Num,8,Type of complex,"1=College, university, or junior college\n2=Pr..."
52,1,53,MANIND,Num,8,Manufacturing industrial,1=Yes\n2=No\nMissing=Not applicable


In [28]:
# only category 10 has missing codes
# category_dfs['10'].loc[:,'codes'] = category_dfs['10'].codes.fillna('no code')
# category_dfs['10'].head()

In [30]:
# all_cats_code_maps = {}
# for cat_num, df in category_dfs.items(): 
#     print(f'category: {cat_num}')
#     if cat_num != 'get rid of':
#         i = 0
#         category_code_df = category_dfs[cat_num].drop(['category', 'file_order', 'col_type', 'length', 'format',], axis=1)
#         category_code_df['code_split'] = category_code_df.codes.str.split('\n')
#         category_code_list = category_code_df.code_split.values
#         category_cols = category_code_df.columns
#         category_code_map = map_code_values(category_code_list, print_errors=True)
#         for col_name in category_code_df.col_name.values:
#             category_code_map[col_name] = category_code_map.pop(i)
#             i += 1
#         all_cats_code_maps[cat_num] = category_code_map

In [33]:
# for cat_num, code_map in all_cats_code_maps.items():
#     print(cat_num)
#     print(code_map)
#     print('---------------------------------------------------------------------------------------------------------')

In [34]:
# new_codes_df = pd.DataFrame(all_cats_code_maps)
# print(new_codes_df.columns)
# new_codes_df['0'].dropna()

In [36]:
# to_add_dfs = []
# for category_num in new_codes_df.columns:
#     to_add = new_codes_df[category_num].dropna().reset_index().rename(columns={'index':'col_name', category_num:'codes_dict'})
#     to_add_dfs.append(to_add)
# to_merge = pd.concat(to_add_dfs)

In [37]:
# new_codebook = df_codebook.merge(to_merge, on='col_name')
# new_codebook#.drop_duplicates(subset='col_name')

In [13]:
# Write the codebook with the parsed code dicts to CSV, without the index column.
# NOTE(review): in this notebook `new_codebook` is only assigned inside a
# commented-out cell, so this line raises NameError on a fresh kernel
# (Restart & Run All) until that merge step is restored.
new_codebook.to_csv(
    '2018cbecs_codebook_wDicts.csv',
    index=False,
)