In [1]:
import json
import csv
import re
import numpy as np

In [2]:
# Load the raw variable definitions and keep everything except the final
# three entries.
# NOTE(review): why the last three entries are dropped is not recorded in
# this notebook -- confirm against variables.json.
with open("variables.json", "r") as source:
    variables = json.load(source)['variables'][:-3]

In [3]:
# Manual type assignments, keyed by variable name. variableType() consults
# this table before applying any codebook heuristics.
#
# Inactive (commented-out) candidates kept for reference:
#   SIZEOFGARAGE -> coded, SIZEOFDETACH -> multi, OVENUSE -> coded,
#   NUMMEAL -> coded, NUMFREEZ -> continuous, DWASHUSE -> coded
variableOverrides = dict(
    # Handled by hand (or skipped) rather than by a codebook parser.
    [(field, 'special') for field in (
        'DOEID',
        'NWEIGHT',
        'TYPEHUQ',
    )] +
    # Forced to categorical.
    [(field, 'categorical') for field in (
        'AIA_Zone',
        'STORIES',
        'SIZRFRI1',
        'SIZRFRI2',
        'SIZRFRI3',
        'SIZFREEZ',
        'SIZFREEZ2',
        'DWASHUSE',
        'USENGFP',
        'USEMOISTURE',
        'PERIODEL',
        'PERIODNG',
        'PERIODLP',
        'PERIODFO',
        'PERIODKR',
        'Householder_Race',
        'EDUCATION',
        'USENOTMOIST',
    )]
)

In [4]:
def variableType(var):
    """Classify one codebook variable.

    Parameters
    ----------
    var : dict
        Variable record with at least 'name', 'description' and 'codebook'
        (a sequence of ``[code, description]`` pairs).

    Returns
    -------
    str or None
        The entry from ``variableOverrides`` when the name is listed there;
        otherwise 'special' for imputation flags, 'continuous', 'coded' or
        'categorical' according to the codebook, or None when no rule applies.
    """
    # Use the record's own name; the warning must not depend on whatever
    # module-level `name` happens to exist when this function is called.
    name = var['name']
    if name in variableOverrides:
        typ = variableOverrides[name]
        if typ == 'special':
            print("Warning: Skipping %s" % name)
        return typ
    if 'Imputation flag' in var['description']:
        return 'special'
    # A single entry with an empty code is treated as a free numeric value.
    if len(var['codebook']) == 1 and var['codebook'][0][0] == '':
        return 'continuous'
    numerical_count = 0
    for code, desc in var['codebook']:
        # A code written as "<digits> - <digits>" marks a numeric range.
        if re.search('[0-9]+ - [0-9]+', code):
            return 'continuous'
        if re.search('[0-9]', desc):
            numerical_count += 1
    if len(var['codebook']) > 1:
        # Any digit in a description means the codes stand for quantities.
        if numerical_count > 0:
            return 'coded'
        return 'categorical'
    return None

In [5]:
# Dry run of the classifier over every variable. Variables classified as
# 'special' are skipped; for the rest nothing is done unless the debug
# prints below are re-enabled.
# Note: `name` is assigned at module level before variableType() is called,
# and variableType's warning message reads a name called `name`.
for var in variables:
    name = var['name']
    typ = variableType(var)
    if typ == 'special':
        continue
    # Debug dump of each classified variable and its codebook (disabled):
#     print("%s: %s" % (typ, name))
#     print(var['description'])
#     for code, desc in var['codebook']:
#         print('\t%s: %s' % (code, desc)) 
#     print()



In [6]:
# Display the raw records of every variable explicitly overridden to 'special'.
[entry for entry in variables
 if variableOverrides.get(entry['name']) == 'special']

[{'codebook': [['00001 - 12083', 'Unique identifier for each respondent']],
  'description': 'Unique identifier for each respondent',
  'name': 'DOEID',
  'position': 0},
 {'codebook': [['1', 'Mobile Home'],
   ['2', 'Single-Family Detached'],
   ['3', 'Single-Family Attached'],
   ['4', 'Apartment in Building with 2 - 4 Units'],
   ['5', 'Apartment in Building with 5+ Units']],
  'description': 'Type of housing unit',
  'name': 'TYPEHUQ',
  'position': 4},
 {'codebook': [['', 'Final sample weight']],
  'description': 'Final sample weight',
  'name': 'NWEIGHT',
  'position': 5}]

In [10]:
# Hand-written normalized entries; both are derived from the TYPEHUQ field.
normalizedSpecial = [
    # TYPEHUQ codes 1-5 mapped onto bin edges over the number of units.
    dict(
        name='num units',
        type='Coded',
        description='Coarse number of units in building',
        edges=[0, 1, 1, 2, 5, np.inf],
        values=[1, 2, 3, 4, 5],
        field='TYPEHUQ',
    ),
    # TYPEHUQ codes 1-5 mapped onto labels; codes 4 and 5 share 'apartment'.
    # NOTE(review): this entry carries no 'type' key, unlike the entries the
    # parse* functions produce -- confirm the consumer does not require one.
    dict(
        name='building type',
        description='Building Type',
        labels=[
            'mobile home',
            'single family detached',
            'single family attached',
            'apartment',
            'apartment',
        ],
        values=[1, 2, 3, 4, 5],
        na_values=[''],
        field='TYPEHUQ',
    ),
]

In [11]:
# Codebook descriptions that mark a code as "missing". parseDefault() compares
# descriptions against this set by exact, case-sensitive membership.
na_phrases = {
    # generic non-answers
    'Not Sure',
    'Refuse',
    "Don't Know",
    'Not Applicable',
    'Not applicable',
} | {
    # fuel-specific "not applicable" wordings
    'Not applicable, no LPG/propane consumption',
    'Not applicable, no fuel oil consumption',
    'Not applicable, no kerosene consumption',
    'Not applicable, no natural gas consumption',
}

In [12]:
def parseDefault(var):
    """Build the fields shared by every normalized variable.

    Returns a dict with 'name', 'description', 'field' (same as the name) and
    'na_values': the empty string followed by every code whose description is
    listed in ``na_phrases``.

    As a diagnostic, the description of any remaining code that parses as a
    negative integer is printed; codes that are not integers are ignored.
    """
    field = var['name']
    record = dict(
        name=field,
        description=var['description'],
        field=field,
        na_values=[''],
    )
    missing_codes = record['na_values']
    for code, desc in var['codebook']:
        if desc in na_phrases:
            missing_codes.append(code)
        else:
            # Best effort: flag negative codes that were not recognized as
            # missing-value markers; anything non-numeric is skipped silently.
            try:
                if int(code) < 0:
                    print(desc)
            except Exception:
                pass
    return record

In [13]:
def _parseBound(text):
    """Convert a matched bound such as '00001' or '99.5' to an int or float."""
    # int()/float() accept zero-padded digits (eval() rejects them on
    # Python 3) and never execute the text.
    try:
        return int(text)
    except ValueError:
        return float(text)


def parseContinuous(var):
    """Normalize a continuous variable.

    Starts from ``parseDefault(var)``, sets 'type' to 'Continuous' and, when
    the first codebook code has the form "<low> - <high>", records the bounds
    as 'min' and 'max' (int when the text is an integer, float otherwise).

    Any later codebook entry whose code is not a missing-value code triggers a
    'Malformed continuous variable' diagnostic; the variable is still returned.

    Raises
    ------
    ValueError
        If a matched bound is not a valid number (for example '1.2.3').
    """
    normalized = parseDefault(var)
    normalized['type'] = 'Continuous'

    codebook = var['codebook']
    if len(codebook) >= 1:
        # Raw string: "\." is not a valid escape in an ordinary literal.
        match = re.match(r'([0-9\.]+) - ([0-9\.]+)', codebook[0][0])
        if match:
            normalized['min'] = _parseBound(match.group(1))
            normalized['max'] = _parseBound(match.group(2))
    # Everything after the range entry is expected to be a missing-value code.
    for code, desc in codebook[1:]:
        if code not in normalized['na_values']:
            print(normalized)
            print('Malformed continuous variable')
            break
    return normalized

In [14]:
# Codes that are kept as strings instead of being converted to integers.
specialCategories = {'METRO', 'MICRO', 'NONE', 'U', 'R'}

def parseCategorical(var):
    """Normalize a categorical variable.

    Starts from ``parseDefault(var)`` and adds parallel 'values' and 'labels'
    lists plus 'type' = 'Categorical'. Missing-value codes are left out.
    Codes listed in ``specialCategories`` are stored as-is; every other code
    is converted with ``int()`` (which raises ValueError for non-integers).
    Negative integer codes are dropped, printing their description first.
    """
    normalized = parseDefault(var)
    missing_codes = normalized['na_values']
    values, labels = [], []
    normalized['values'] = values
    normalized['labels'] = labels

    for raw_code, label in var['codebook']:
        if raw_code in missing_codes:
            continue
        if raw_code in specialCategories:
            value = raw_code
        else:
            value = int(raw_code)
            if value < 0:
                # Negative codes never become categories.
                if value not in missing_codes:
                    print(label)
                continue
        values.append(value)
        labels.append(label)

    normalized['type'] = 'Categorical'
    return normalized

In [15]:
# Regular-expression fragments used by matchCodeRange() to read numeric
# meaning out of (lower-cased) codebook descriptions.
#
# Patterns containing backslashes are raw strings: "\." and "\-" are not
# valid escapes in ordinary string literals and trigger a SyntaxWarning on
# recent Python versions. The resulting pattern text is unchanged.

# A plain number with an optional decimal part.
num_pattern = r"[0-9]+(?:\.[0-9]+)?"

# A dollar amount; thousands separators are allowed.
dollar_pattern = r"\$[0-9,]+(?:\.[0-9]+)?"

# The whole description is a single number.
exact_patterns = [
    '^(%s)$' % num_pattern
]

# Descriptions that stand for zero.
zero_patterns = [
    'none'
]

# Open-ended at the bottom: group 1 is the upper bound.
bottom_patterns = [
    'less than (%s)' % dollar_pattern,
    'before (%s)' % num_pattern,
    'prior to (%s)' % num_pattern,
    'less than (%s)' % num_pattern,
    '(%s) (?:[^ ]* )?or less' % num_pattern,
    '< (%s)' % num_pattern,
    'in the last (%s)' % num_pattern,
]

# Open-ended at the top: group 1 is the lower bound.
top_patterns = [
    '(%s) or more' % dollar_pattern,
    'more than (%s)' % num_pattern,
    '(%s) (?:[^ ]* )?or older' % num_pattern,
    '(%s) (?:[^ ]* )?or more' % num_pattern,
]

# Closed ranges: group 1 is the lower bound, group 2 the upper bound.
range_patterns = [
    r'(%s)\-(%s)' % (num_pattern, num_pattern),
    r'(%s) \- (%s)' % (num_pattern, num_pattern),
    '(%s) to (%s)' % (num_pattern, num_pattern),
    '(%s) to (%s)' % (dollar_pattern, dollar_pattern),
    '(%s) or (%s)' % (num_pattern, num_pattern),
    'between (%s) and (%s)' % (num_pattern, num_pattern),
]

In [16]:
def matchCodeRange(desc):
    """Interpret a codebook description as a number or a numeric range.

    The pattern tables are tried in a fixed order -- bottom, top, range,
    exact, zero -- and the first hit wins. Returns

    * ``('range', low, high)`` where an open end is ``-np.inf`` / ``np.inf``
      and matched bounds are the captured text (not yet converted),
    * ``('exact', text)`` for an exact number, or ``('exact', 0)`` for a
      zero pattern,
    * ``None`` after printing a diagnostic when nothing matches.
    """
    def first_hit(patterns):
        # First successful search among `patterns`, or None.
        for patt in patterns:
            found = re.search(patt, desc)
            if found:
                return found
        return None

    hit = first_hit(bottom_patterns)
    if hit:
        return ('range', -np.inf, hit.group(1))

    hit = first_hit(top_patterns)
    if hit:
        return ('range', hit.group(1), np.inf)

    hit = first_hit(range_patterns)
    if hit:
        return ('range', hit.group(1), hit.group(2))

    hit = first_hit(exact_patterns)
    if hit:
        return ('exact', hit.group(1))

    if first_hit(zero_patterns):
        return ('exact', 0)

    print("Could not match %s" % desc)
    return None

In [17]:
def to_numeric(s):
    """Convert matched number text such as '$1,500' or '2.5' to a number.

    Strings have '$' and ',' removed and are parsed as int when possible,
    otherwise as float. Non-string values (for example 0 or np.inf) are
    returned unchanged.

    Raises
    ------
    ValueError
        If the cleaned string is not a number.
    """
    if not isinstance(s, str):
        return s
    text = s.replace('$', '').replace(',', '')
    # Parse instead of eval(): eval() rejects zero-padded digits on Python 3
    # and would execute arbitrary text taken from the codebook.
    try:
        return int(text)
    except ValueError:
        return float(text)

def parseCoded(var):
    """Normalize a coded variable: codes that each stand for a numeric range.

    Starts from ``parseDefault(var)``, sets 'type' to 'Coded' and adds

    * 'edges': n + 1 bin edges for the n non-missing codes, ordered by range.
      Each inner edge is the larger of the next range's lower bound and the
      previous range's upper bound.
    * 'values': the raw codebook codes, in the same order as the bins.

    NOTE(review): 'values' holds the codebook's code strings, while the
    hand-written 'num units' entry in normalizedSpecial uses integers --
    confirm which form the consumer expects.

    Raises
    ------
    ValueError
        If a code is not an integer, no non-missing code exists, a
        description cannot be interpreted, a range has low > high, or two
        ranges overlap.
    """
    normalized = parseDefault(var)
    normalized['type'] = 'Coded'
    na_values = normalized['na_values']
    codebook = [(code, desc) for code, desc in var['codebook']
                if code not in na_values]
    if not codebook:
        raise ValueError('Coded variable %s has no non-missing codes'
                         % var['name'])

    bounds = []
    for code, desc in codebook:
        int(code)  # validation only: coded variables must use integer codes
        match = matchCodeRange(desc.lower())
        if match is None:
            # matchCodeRange returns None for text it cannot interpret.
            raise ValueError('Could not derive a range for code %s (%r) of %s'
                             % (code, desc, var['name']))
        if match[0] == 'exact':
            low = high = to_numeric(match[1])
        else:
            low, high = to_numeric(match[1]), to_numeric(match[2])
        if low > high:
            raise ValueError('Descending ranges')
        bounds.append((low, high))

    # Sort on (low, high) so ranges sharing a lower bound -- e.g. an exact 1
    # next to "1 or 2" -- are ordered deterministically.
    order = sorted(range(len(bounds)), key=lambda i: bounds[i])
    ordered = np.array([bounds[i] for i in order], dtype='d')
    lows, highs = ordered[:, 0], ordered[:, 1]
    if np.any(highs[:-1] > lows[1:]):
        raise ValueError('Overlapping ranges')

    inner_edges = np.maximum(lows[1:], highs[:-1])
    normalized['edges'] = [float(lows[0])] + inner_edges.tolist() + [float(highs[-1])]
    normalized['values'] = [codebook[i][0] for i in order]
    return normalized

In [18]:
# Dispatch table: the type string returned by variableType() -> the function
# that normalizes a variable of that type. 'special' and None have no entry.
codebookParsers = dict(
    continuous=parseContinuous,
    categorical=parseCategorical,
    coded=parseCoded,
)

In [19]:
# Start from a copy of the hand-written entries, then append one normalized
# record for every variable that has a parser.
normalizedVariables = normalizedSpecial[:]

for var in variables:
    name = var['name']
    typ = variableType(var)
    if typ == 'special':
        continue
    if typ in codebookParsers:
        normalized = codebookParsers[typ](var)
        normalizedVariables.append(normalized)
    else:
        # variableType() returns None when no rule applies, and an override
        # may name a type without a parser. Report the omission instead of
        # dropping the variable from the output unnoticed.
        print("Warning: No parser for %s (type %r); omitted from output"
              % (name, typ))



In [23]:
# Write the normalized records as indented JSON with sorted keys.
# Note: normalizedSpecial's edges contain np.inf, which json.dump writes as
# the non-standard token `Infinity` under its default allow_nan setting.
with open("normalized_variables.json", "w") as out_file:
    json.dump(normalizedVariables, out_file, indent=2, sort_keys=True)