In [1]:
import pandas as pd
import re

In [2]:
def map_code_values(category_code_list, print_errors=False):
    equal_split = re.compile(r'\s?=\s?')
    cont_num_range = re.compile(r'\d+\s?-\s?\d+')
    code_map = {}
    error_map = {}
    i = 0 # col index
    for code_list in category_code_list:
        # check codes to follow "num = category pattern"
        if 'no code' in code_list:
            code_list = ['no code = continuous numerical value']
        if bool(cont_num_range.search(code_list[0])) == True:
            # if numerical range is first ie "1 - 10"
            code_list[0] = code_list[0] + ' = continuous numerical range'
        elif bool(cont_num_range.search(code_list[-1])) == True:
            # if numerical range is last ie "1946 - 2012"
            code_list[-1] = code_list[-1] + ' = continuous numerical range'
        elif 'continued' in code_list[-1]:
            # if (continued in next cell) is last, delete from list
            del code_list[-1]

        # convert codes to dict
        try:
            code_map[i] = dict([re.split(equal_split.pattern, code.replace("'", "").strip()) for code in code_list])
        except ValueError as e:
            error_map[i] = (code_list, e)
        
        # increment col index
        i += 1
    if print_errors == True:
        for k,v in error_map.items():
            print(f'codelist: {k} | error: {v}\n-----------')
    return code_map

In [3]:
codebook_col_map = {
    'CATEOGORIES': 'category', 
    'File order': 'file_order', 
    'Variable\r\nname': 'col_name', 
    'Variable type': 'col_type',
    'Len-\r\ngth': 'length', 
    'Format': 'format', 
    'Label': 'col_description', 
    'Values/Format codes': 'codes', 
    'Unnamed: 8': 'empty_col' # (continued in next cell) code prompt?
}
df_codebook = pd.read_csv('grouped_2012microdata_codebook.csv').rename(columns=codebook_col_map).drop(['empty_col'], axis=1)
df_codebook.head(5)

Unnamed: 0,category,file_order,col_name,col_type,length,format,col_description,codes
0,0,1.0,PUBID,Char,5.0,,Building identifier,00001 - 06720
1,0,2.0,REGION,Char,1.0,$REGION.,Census region,'1' = 'Northeast'\r\n'2' = 'Midwest'\r\n'3' = ...
2,0,3.0,CENDIV,Char,1.0,$CENDIV.,Census division,'1' = 'New England'\r\n'2' = 'Middle Atlantic'...
3,0,4.0,PBA,Char,2.0,$PBA.,Principal building activity,'01' = 'Vacant'\r\n'02' = 'Office'\r\n'04' = '...
4,1,5.0,FREESTN,Char,1.0,$YES.,Freestanding building,1' = 'Yes'\r\nMissing='No'


In [4]:
categories = df_codebook.category.unique()
print(f'categories: {categories}')
category_map = {}
category_dfs = {}
for cat_name in categories:
    category_map[cat_name] = df_codebook[df_codebook['category'] == cat_name].col_name.values
    category_dfs[cat_name] = df_codebook[df_codebook['category'] == cat_name]
print(f"category = 1 columns: {category_map['0']}")
category_dfs['0']

categories: ['0' '1' '2' '3' '4' '5' '6' '7' 'get rid of' '10']
category = 1 columns: ['PUBID' 'REGION' 'CENDIV' 'PBA']


Unnamed: 0,category,file_order,col_name,col_type,length,format,col_description,codes
0,0,1.0,PUBID,Char,5.0,,Building identifier,00001 - 06720
1,0,2.0,REGION,Char,1.0,$REGION.,Census region,'1' = 'Northeast'\r\n'2' = 'Midwest'\r\n'3' = ...
2,0,3.0,CENDIV,Char,1.0,$CENDIV.,Census division,'1' = 'New England'\r\n'2' = 'Middle Atlantic'...
3,0,4.0,PBA,Char,2.0,$PBA.,Principal building activity,'01' = 'Vacant'\r\n'02' = 'Office'\r\n'04' = '...


In [5]:
# NOTE: some codes span two rows
# need to find a way to concatenate this, then update map_codes function '(continued in next cell)' condition
category_dfs['1'].iloc[45:47]

Unnamed: 0,category,file_order,col_name,col_type,length,format,col_description,codes
49,1,50.0,PBAPLUS,Char,2.0,$PBAPLUS.,More specific building activity,'01' = 'Vacant'\r\n'02' = 'Administrative/prof...
50,1,,,,,,"More specific building activity, continued",'28' = 'Elementary/middle school'\r\n'29' = 'H...


In [6]:
# only category 10 has missing codes
category_dfs['10'].loc[:,'codes'] = category_dfs['10'].codes.fillna('no code')
category_dfs['10'].head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


Unnamed: 0,category,file_order,col_name,col_type,length,format,col_description,codes
1049,10,1049.0,HDD65,Num,8.0,,Heating degree days (base 65),no code
1050,10,1050.0,CDD65,Num,8.0,,Cooling degree days (base 65),no code
1051,10,1051.0,MFUSED,Char,1.0,$YESNO.,Any major fuel used,'1' = 'Yes'\r\n'2' = 'No'\r\nMissing = Not app...
1052,10,1052.0,MFBTU,Num,8.0,,Annual major fuel consumption (thous Btu),no code
1053,10,1053.0,MFEXP,Num,8.0,,Annual major fuel expenditures ($),no code


In [7]:
all_cats_code_maps = {}
for cat_num, df in category_dfs.items(): 
    print(f'category: {cat_num}')
    if cat_num == 'get rid of':
        all_cats_code_maps[cat_num] = 'no_code_map'
    else:
        category_code_df = category_dfs[cat_num].drop(['category', 'file_order', 'col_type', 'length', 'format',], axis=1)
        category_code_df['code_split'] = category_code_df.codes.str.split('\r\n')
        category_code_list = category_code_df.code_split.values
        category_code_map = map_code_values(category_code_list, print_errors=True)
        all_cats_code_maps[cat_num] = category_code_map
print(all_cats_code_maps['0'])

category: 0
category: 1
category: 2
category: 3
category: 4
category: 5
category: 6
category: 7
category: get rid of
category: 10
{0: {'00001 - 06720': 'continuous numerical range'}, 1: {'1': 'Northeast', '2': 'Midwest', '3': 'South', '4': 'West'}, 2: {'1': 'New England', '2': 'Middle Atlantic', '3': 'East North Central', '4': 'West North Central', '5': 'South Atlantic', '6': 'East South Central', '7': 'West South Central', '8': 'Mountain', '9': 'Pacific'}, 3: {'01': 'Vacant', '02': 'Office', '04': 'Laboratory', '05': 'Nonrefrigerated warehouse', '06': 'Food sales', '07': 'Public order and safety', '08': 'Outpatient health care', '11': 'Refrigerated warehouse', '12': 'Religious worship', '13': 'Public assembly', '14': 'Education', '15': 'Food service', '16': 'Inpatient health care', '17': 'Nursing', '18': 'Lodging', '23': 'Strip shopping mall', '24': 'Enclosed mall', '25': 'Retail other than mall', '26': 'Service', '91': 'Other'}}
