# Normalization
Here we normalize the census variables to a format that is easy to interpret.

Census variables come in the format:

`TENURE BY AGE OF HOUSEHOLDER BY OCCUPANTS PER ROOM% Owner occupied:% Householder 15 to 34 years:% 1.01 to 1.50 occupants per room`

Each field is separated by a `%`. There are typically summary variables for all subfields of a particular field. For example, the following variables also exist:

`TENURE BY AGE OF HOUSEHOLDER BY OCCUPANTS PER ROOM% Owner occupied:% Householder 15 to 34 years:`

`TENURE BY AGE OF HOUSEHOLDER BY OCCUPANTS PER ROOM% Owner occupied:`

`TENURE BY AGE OF HOUSEHOLDER BY OCCUPANTS PER ROOM for Occupied housing units`

The last variable is the root variable, which names the total population described by the table (all of its subfields).

The number of fields is the depth of that particular variable. Within any census table, there are variables of many different depths. Sometimes the data is tabular and the maximum depth is the same for all parts of the table. Other times, the data is not tabular.

## Variable Selection
We load variables.json as our schema. We use model-variables.csv to select only the variables we want to normalize. To initially populate model-variables.csv, use the code below to get all variables from a subset of tables deemed to be relevant.

In [1]:
# desiredTables = ["B08006",
# "B19001",
# "B19051",
# "B19054",
# "B19056",
# "B19057",
# "B19059",
# "B19060",
# "B22010",
# "B25003",
# "B25006",
# "B25007",
# "B25009",
# "B25013",
# "B25020",
# "B25024",
# "B25032",
# "B25036",
# "B25038",
# "B25040",
# "B25042",
# "B25069",
# "C17002"]

# for tableId in desiredTables:
#     for varId in data['tables'][tableId]['variables']:
#         print(varId)

Refine these selections in select.html so that the data is tabular. Once the right variables have been selected, load the selections.

In [2]:
import json
import re
import numpy as np

with open("variables.json") as f:
    data = json.load(f)
    
with open("model-variables.csv") as f:
    selected_variables = set(line.strip() for line in f.readlines())

Select the tables whose variables are relevant

In [3]:
tables = []

for tableId in data['tableIds']:
    table = data['tables'][tableId]
    
    # Get all variables for this table that have been selected
    varlist = [
        data['variables'][variableId] for variableId in table['variables']
        if (variableId in selected_variables)]
    if len(varlist) == 0:
        continue
    tables.append((table, varlist))

# Initial Parsing

The goal of this section is to turn flat lists of census variables into n-d tables of variables

In [4]:
# Normalize the variable names given in the census data to eliminate extraneous characters

def normalize_desc(d):
    return d.strip(" :%")

def normalize_field(f):
    f = f.strip(" :-").lower()
    return re.sub("\s+", " ", f)

In [5]:
# Check if a list of variables can be interpreted as an n-d array

def tabular(varlist):
    cross_product = set(var.fields for var in varlist)
    labels = [set(s) for s in zip(*cross_product)]
    full_cross_product = set(itertools.product(*labels))
    cross_product = set(var.fields for var in varlist)
    return full_cross_product == cross_product

In [6]:

class Variable:
    """The variable class is structured like a tree. Each element has some number of children.
    The goal of this class is to take this tree structure and convert it into a tabular form.
    """
    def __init__(self, fields, id):
        self.fields = tuple(fields)
        self.id = id
        self.key = fields[-1]
        self.depth = len(fields) - 1
        self.parent = None
        self.children = []
        self.child_keys = []
        
    def add_child(self, child):
        self.children.append(child)
        self.child_keys.append(child.key)
        child.parent = self
        
    def get_max_depth(self):
        if len(self.children) > 0:
            self.max_depth = max(child.get_max_depth() for child in self.children)
        else:
            self.max_depth = self.depth
        return self.max_depth
    
    def get_min_depth(self):
        if len(self.children) > 0:
            self.min_depth = min(child.get_min_depth() for child in self.children)
        else:
            self.min_depth = self.depth
        return self.min_depth
    
    def deepest(self, clip_depth = None):
        """Returns a list of the deepest elements in this table, skipping summary counts"""
        self.get_max_depth()
        if clip_depth is None:
            clip_depth = self.get_min_depth()
        for child in self.walk():
            if child.depth == child.max_depth:
                yield child.clip_id_to_depth(clip_depth)
                
    def table(self, clip_depth = None):
        """Returns a list of the deepest elements in this table in tabular form as a numpy array"""
        if clip_depth is None:
            clip_depth = self.get_min_depth()
        t = self._table(clip_depth)
        if not isinstance(t, np.ndarray):
            t = np.array(t)
        return t
        
    def _table(self, clip_depth):
        """Recursive function call for table"""
        if self.depth == self.max_depth:
            return self.clip_id_to_depth(clip_depth)
        else:
            return np.array([child._table(clip_depth) for child in self.children])
    
    def walk(self):
        """Generator for all elements in this tree"""
        yield self
        for child in self.children:
            for var in child.walk():
                yield var
    
    def clip_id_to_depth(self, depth):
        """Combine any fields past a certain depth into one combination field.
        
        For example ('male', 20 to 25 years', 'income > 5000') would be clipped to
        ('male', '20 to 25 years - income > 5000')
        
        This is a way of making a tree tabular if it is partially tabular (a field always
        has subfields, but other fields at the same depth never have subfields).
        """
        new_fields = self.fields[:depth] + ("-".join(self.fields[depth:]),)
        return Variable(new_fields, self.id)
    
    def filter_to_depth(self, depth):
        """Eliminate any fields beyond a certain depth."""
        new = Variable(self.fields, self.id)
        if self.depth < depth:
            for child in self.children:
                new.add_child(child.filter_to_depth(depth))
        return new

    def __repr__(self):
        return "<Variable " + "%".join(self.fields) + ">"

In [7]:
import itertools

def parse_table(table, varlist):
    """Return a numpy array of Variable objects.
    
    Parameters
    ----------
    table: dict from `variables.json`
    
    varlist: list of Variable objects
    """
    
    # Map from fields to variable
    all_variables = {}
    
    for var in varlist:
        # Parse the variable names from the census schema
        desc = normalize_desc(var['description'])
        fields = [normalize_field(f) for f in desc.split('%')]
        
        # Avoid issues with root variable
        fields[0] = table['description']
        if fields[-1] == 'total':
            fields = fields[:-1]
            
        v = Variable(fields, var['id'])
        all_variables[v.fields] = v
        
        # Link tree structure together
        parent_fields = tuple(fields[:-1])
        if parent_fields in all_variables:
            all_variables[parent_fields].add_child(v)
        
    # Find root
    root_key = (table['description'],)
    if not root_key in all_variables:
        print(all_variables)
        print(table['id'])
    root = all_variables[root_key]
    
    # Find best depth
    max_depth = root.get_max_depth()
    min_depth = root.get_min_depth()
    
    # Try combinations of flattening and filtering to make the data tabular
    valid_depths = np.zeros((max_depth + 1, max_depth + 1))
    for filter_depth in range(max_depth, -1, -1):
        for clip_depth in range(min(min_depth, filter_depth), -1, -1):
            tablevars = list(root.filter_to_depth(filter_depth).deepest(clip_depth))
            if tabular(tablevars):
                valid_depths[filter_depth, clip_depth] = 1
    
    # Get best combination
    max_tabular_depth = np.argmax(np.max(valid_depths, 0) * np.arange(max_depth + 1))
    if max_depth != max_tabular_depth:
        print("Warning: Clipping table %s from %d to %d" % (table['description'], max_depth, max_tabular_depth))
    
    full_table = list(root.deepest())
    
    if not tabular(full_table):
        print("Not tabular")
        print(table['id'])
        print(table['description'])
        print('')
        return None
    
    return root.table()

Here we generate variables, a list of numpy arrays of variables. We have converted the data into a tabular format.

In [8]:
variables = []
for i, t in enumerate(tables):
    table, varlist = t
    tablevars = parse_table(table, varlist)
    if tablevars is None:
        raise Exception('Could not parse table %s' % table['description'])
    else:
        variables.append(tablevars)

In [9]:
 def make_labels(group):
    """ Get a set of labels for each dimension
    
    Parameters
    ----------
    group: list of Variable
    """
    flat = group.flatten()
    return [frozenset(s) for s in zip(*[var.fields[1:] for var in flat])]

Here we generate labels, which is an set of sets of fields that occur together. For example, the labels object for heating fuel will contain the strings:

`'bottled, tank, or lp gas',
'coal or coke',
'electricity',
'fuel oil, kerosene, etc.',
'no fuel used',
'other fuel',
'solar energy',
'utility gas',
'wood'`

In [113]:
labels = set()

for group in variables:
    labels = labels.union(set(make_labels(group)))

# Variable Name Parsing

The goal of this section is to interpret the names of variables, i.e. `built from 1950 to 1959` into useful forms like `('built', 'range', 1950, 1959)`

In [114]:
num_pattern = "(?:(?:[0-9]+(?:\.[0-9]*)?)|(?:\.[0-9]+))"
dollar_pattern = "\$[0-9]+(?:,[0-9][0-9][0-9])*(?:\.[0-9]*)?"

# Generate a dict
patterns = {}

We construct rules for each group of fields. This is not scalable, but it is not that slow either.

* `patterns` is a dictionary from 
    * kev: a group of field labels (`frozenset(str)`) to
    * value: `list` of tuples
        * `str` regex pattern
        * parser function from
            * input: `m`, a regex match object to
            * output: `tuple('field name', 'type', ...)` or list of tuples where `type` is 'category', 'range',
            or 'exact'. 'field name' should be the category that all of the field labels fall into.

In [175]:
def categorical_as_is(name):
    return [
        (
            '.*',
            lambda m: (name, 'category', m.group(0))
        )
    ]

def categorical_replacement(name, replacements):
    def parse(m):
        match = m.group(0)
        for from_patt, to_patt in replacements:
            match = re.sub(from_patt, to_patt, match)
        return (name, 'category', match)
    return [('.*', parse)]
    
def categorical_pairs(name, replacements):
    return [(
           from_patt,
            lambda m: (name, 'category', to_patt)
    ) for from_patt, to_patt in replacements]

def numerical_match(name, range_pattern=None,
                    bottom_pattern=None, exact_pattern=None,
                    top_pattern=None, dtype=int):
    p = []
    if range_pattern:
        p.append((
                range_pattern,
                lambda m: (name, 'range', dtype(m.group(1)), dtype(m.group(2)))
            ))
    if exact_pattern:
        p.append((
                exact_pattern,
                lambda m: (name, 'exact', dtype(m.group(1)))
            ))
    if bottom_pattern:
        p.append((
                bottom_pattern,
                lambda m: (name, 'range', -np.inf, dtype(m.group(1)))
            ))
    if top_pattern:
        p.append((
                top_pattern,
                lambda m: (name, 'range', dtype(m.group(1)), np.inf)
            ))
    return p

In [176]:
patterns[frozenset({
            'high school graduate (including equivalency)', "some college or associate's degree",
            'less than high school graduate', "bachelor's degree or higher"})] = categorical_as_is('education')

In [177]:
patterns[frozenset([
            'under .50', '1.25 to 1.49', '1.50 to 1.84', '.50 to .99',
            '1.85 to 1.99', '2.00 and over', '1.00 to 1.24'])] = \
    numerical_match('poverty to income',
                    range_pattern="(%s) to (%s)" % (num_pattern, num_pattern),
                    bottom_pattern="under (%s)" % (num_pattern,),
                    top_pattern="(%s) and over" % (num_pattern,),
                    dtype=float
                )

In [178]:
patterns[frozenset(['renter occupied',
                    'owner occupied'])] = \
    categorical_replacement('own_rent', [
        ('er .*', ''),
    ])

patterns[frozenset({
            'owner-occupied housing units', 'renter-occupied housing units'
        })] = \
    categorical_replacement('own_rent', [
        ('er-.*', ''),
    ])

In [179]:
patterns[frozenset({'households with no persons with a disability',
                    'households with 1 or more persons with a disability'
                   })] = \
    categorical_pairs('disability', [
        ('with no', 'no'),
        ('with 1 or more', 'yes'),
    ])

In [180]:
def with_without(name):
    return categorical_pairs(name, [('with', 'yes'), ('no', 'no')])

patterns[frozenset(['with earnings', 'no earnings'])] = with_without('earnings')

patterns[frozenset({'no other types of income',
                    'with other types of income'})] = with_without('other income')

patterns[frozenset({'no retirement income',
                    'with retirement income'})] = with_without('retirement income')

patterns[frozenset({'with public assistance income',
                    'no public assistance income'})] = with_without('assistance income')

patterns[frozenset({'no supplemental security income (ssi)',
                    'with supplemental security income (ssi)'})] = with_without('supplemental security income')

patterns[frozenset([
            'with interest, dividends, or net rental income',
            'no interest, dividends, or net rental income'])] = with_without('investment income')

In [181]:
patterns[frozenset(['wood', 'solar energy', 'other fuel', 'electricity',
                    'utility gas', 'bottled, tank, or lp gas', 'coal or coke',
                    'fuel oil, kerosene, etc.', 'no fuel used'])] = \
    categorical_pairs('fuel source', [
            ('solar', 'solar'),
            ('other', 'other'),
            ('utility gas', 'natural gas'),
            ('lp', 'propane'),
            ('coal', 'coal'),
            ('fuel oil', 'fuel oil or kerosene'),
            ('no fuel', 'none')
        ]) + categorical_as_is('fuel source')

In [182]:
patterns[frozenset([
            '5-person household', '2-person household', '4-person household',
            '7-or-more person household', '1-person household',
            '3-person household', '6-person household'])] = \
    numerical_match('household size',
                    exact_pattern="(%s)-person" % (num_pattern,),
                    top_pattern="(%s)-or-more" % (num_pattern,)
                    )

In [183]:
patterns[frozenset(['no extra payment for any utilities', 'pay extra for one or more utilities'])] = \
    categorical_pairs('pay for utility', [
        ('^no', 'none'),
        ('^pay', 'some or all')
    ])

In [184]:
patterns[frozenset({'household did not receive food stamps/snap in the past 12 months',
                    'household received food stamps/snap in the past 12 months'})] = \
    categorical_pairs('food stamps', [
            ('did not', 'no'),
            ('received', 'yes')
        ])

In [185]:
def dollar_to_num(s):
    return int(s.replace('$', '').replace(',', ''))

patterns[frozenset(['$100,000 to $124,999', '$40,000 to $44,999', '$75,000 to $99,999', '$35,000 to $39,999',
                    '$60,000 to $74,999', '$125,000 to $149,999', '$30,000 to $34,999', '$25,000 to $29,999',
                    '$50,000 to $59,999', '$200,000 or more', '$45,000 to $49,999', '$150,000 to $199,999',
                    '$10,000 to $14,999', '$20,000 to $24,999', 'less than $10,000', '$15,000 to $19,999'])] = \
    numerical_match('household income',
                   range_pattern='(%s) to (%s)' % (dollar_pattern, dollar_pattern),
                   top_pattern='(%s) or more' % (dollar_pattern,),
                   bottom_pattern='less than (%s)' % (dollar_pattern,),
                   dtype=dollar_to_num
                   )

In [186]:
patterns[frozenset({'taxicab, motorcycle, or other means', 'bus or trolley bus', 'carpooled', 'subway or elevated',
                    'worked at home', 'bicycle', 'walked', 'in 4-or-more-person carpool',
                    'streetcar or trolley car (carro publico in puerto rico)', 'public transportation (excluding taxicab)',
                    'railroad', 'in 2-person carpool', 'car, truck, or van', 'drove alone', 'in 3-person carpool', 'ferryboat'})] = [
    (
        'worked at home',
        lambda m: [
            ('transportation', 'category', m.group(0)),
            ('work from home', 'category', 'yes'),
        ]
    ),(
        '.*',
        lambda m: [
            ('transportation', 'category', m.group(0)),
            ('work from home', 'category', 'no'),
        ]
    )
]

In [187]:
patterns[frozenset({'50 or more', 'boat, rv, van, etc.',
                    '5 to 9', '20 to 49',
                    'mobile home', '1, attached',
                    '2', '10 to 19', '3 or 4',
                    '1, detached'})] = [
    (
        '(%s) to (%s)' % (num_pattern, num_pattern),
        lambda m: [
            ('num units', 'range', int(m.group(1)), int(m.group(2))),
            ('building type', 'category', 'apartment')
        ]
    ),(
        '(%s) or more' % (num_pattern,),
        lambda m: [
            ('num units', 'range', int(m.group(1)), np.inf),
            ('building type', 'category', 'apartment')
        ]
    ),(
        'mobile',
        lambda m: [
            ('num units', 'exact', 1),
            ('building type', 'category', 'mobile home')
        ]
    ),(
        '1, attached',
        lambda m: [
            ('num units', 'exact', 1),
            ('building type', 'category', 'single family attached')
        ]
    ),(
        '1, detached',
        lambda m: [
            ('num units', 'exact', 1),
            ('building type', 'category', 'single family detached')
        ]
    ),(
        '2',
        lambda m: [
            ('num units', 'exact', 2),
            ('building type', 'category', 'apartment')
        ]
    ),(
        '3 or 4',
        lambda m: [
            ('num units', 'range', 3, 4),
            ('building type', 'category', 'apartment')
        ]
    ),(
        'boat',
        lambda m: [
            ('num units', 'exact', 1),
            ('building type', 'category', 'other')
        ]
    )
]

In [188]:
def small_count(name, suffix):
    return [(
        '^no %s' % (suffix,),
        lambda m: (name, 'exact', 0)
    ),(
        '(%s) %s' % (num_pattern, suffix),
        lambda m: (name, 'exact', int(m.group(1)))
    ),(
        '(%s) or more' % (num_pattern,),
        lambda m: (name, 'range', int(m.group(1)), np.inf)
    )]
    
    
patterns[frozenset({'no bedroom', '2 bedrooms', '3 bedrooms',
                    '1 bedroom', '5 or more bedrooms', '4 bedrooms'})] = small_count('bedrooms', 'bedroom')

patterns[frozenset({
            '5 rooms', '6 rooms', '3 rooms', '4 rooms', '2 rooms',
            '1 room', '7 rooms', '9 or more rooms', '8 rooms'})] = small_count('rooms', 'room')

In [189]:
patterns[frozenset({
            'householder who is black or african american alone', 'householder who is white alone',
            'householder who is asian alone', 'householder who is two or more races',
            'householder who is native hawaiian and other pacific islander alone',
            'householder who is american indian and alaska native alone',
            'householder who is some other race alone'})] = [
    (
        '.* alone',
        lambda m: ('householder race', 'category', m.group(0).replace('householder who is ','').replace(' alone',''))
    ),(
        'two or more',
        lambda m: ('householder race', 'category', 'two or more')
    ),(
        'some other',
        lambda m: ('householder race', 'category', 'other')
    )
]

In [190]:
def year_range(name):
    return numerical_match(
        name,
        range_pattern="(%s) to (%s)" % (num_pattern, num_pattern),
        top_pattern="(%s) or later" % (num_pattern,),
        bottom_pattern="(%s) or earlier" % (num_pattern,),
    )

patterns[frozenset([
            'built 1970 to 1979', 'built 2010 or later', 'built 1980 to 1989',
            'built 1950 to 1959', 'built 1940 to 1949', 'built 1960 to 1969',
            'built 2000 to 2009', 'built 1939 or earlier',
            'built 1990 to 1999'])] = year_range('built')

patterns[frozenset({'moved in 2010 or later', 'moved in 1970 to 1979', 'moved in 1990 to 1999',
                    'moved in 2000 to 2009', 'moved in 1980 to 1989',
                    'moved in 1969 or earlier'})] = year_range('moved in')

In [191]:
patterns[frozenset({'householder 15 to 24 years', 'householder 85 years and over', 'householder 55 to 59 years',
                    'householder 45 to 54 years', 'householder 60 to 64 years', 'householder 75 to 84 years',
                    'householder 25 to 34 years', 'householder 35 to 44 years', 'householder 65 to 74 years'})] = \
    numerical_match(
        'householder age',
        range_pattern="(%s) to (%s)" % (num_pattern, num_pattern),
        top_pattern="(%s) years and over" % (num_pattern,)
    )

# Metadata Generation

First, use the code below to validate the patterns above. It will print any fields or field groups that are not matched.

In [192]:
for group in labels:
    if group in patterns:
        for label in group:
            found = False
            try: 
                for pattern, parser in patterns[group]:
                    m = re.search(pattern, label)
                    if m:
                        print(parser(m))
                        found = True
                        break
            except Exception:
                pass
            if not found:
                print(label)
    else:
        print(group)
print("Done")

('bedrooms', 'exact', 3)
('bedrooms', 'exact', 2)
('bedrooms', 'exact', 4)
('bedrooms', 'range', 5, inf)
('bedrooms', 'exact', 0)
('bedrooms', 'exact', 1)
('supplemental security income', 'category', 'no')
('supplemental security income', 'category', 'no')
('built', 'range', -inf, 1939)
('built', 'range', 2000, 2009)
('built', 'range', 1980, 1989)
('built', 'range', 1960, 1969)
('built', 'range', 1990, 1999)
('built', 'range', 2010, inf)
('built', 'range', 1950, 1959)
('built', 'range', 1940, 1949)
('built', 'range', 1970, 1979)
('food stamps', 'category', 'yes')
('food stamps', 'category', 'yes')
('household income', 'range', -inf, 10000)
('household income', 'range', 100000, 124999)
('household income', 'range', 150000, 199999)
('household income', 'range', 200000, inf)
('household income', 'range', 50000, 59999)
('household income', 'range', 60000, 74999)
('household income', 'range', 45000, 49999)
('household income', 'range', 20000, 24999)
('household income', 'range', 40000, 4499

Now we use these patterns to normalize the variable tables and generate metadata for them.

In [193]:
def normalize_numerical(descriptors):
    """Take in a list of descriptor tuples and output metadata for a single field group.
    
    Parameters
    ----------
    descriptors: list of tuple descriptors describing a single field group.
    """
    # Get all the edges
    edge_tuples = []
    for d in descriptors:
        if d[1] == 'exact':
            edge_tuples.append((d[2], d[2]))
        else:
            edge_tuples.append((d[2], d[3]))
            
    # Sort the edges and return the order used for sorting
    order = [t[1] for t in sorted(zip(edge_tuples, range(len(edge_tuples))))]
    edge_tuples = list(sorted(edge_tuples))
    
    # Generate edges
    edges = [edge_tuples[0][0]]
    for i in range(len(descriptors) - 1):
        # Take maximum on right side of bin, so that bins are left-inclusive right-exclusive
        edges.append(max(edge_tuples[i][1],edge_tuples[i+1][0]))
    edges.append(edge_tuples[i+1][1])
    return {
        'name': descriptors[0][0],
        'edges': edges,
        'labels': None,
        'order': order,
        'type': 'Continuous'
    }

def normalize_categorical(descriptors):
    """Take in a list of descriptor tuples and output metadata for a single field group.
    
    Parameters
    ----------
    descriptors: list of tuple descriptors describing a single field group.
    """
    return {
        'name': descriptors[0][0],
        'edges': None,
        # Extract label from descriptors
        'labels': [d[2] for d in descriptors],
        'order': None,
        'type': 'Categorical'
    }

def normalize_dimension(descriptors):
    """Take in a list of descriptor tuples and output metadata for a single field group.
    
    Parameters
    ----------
    descriptors: list of tuple descriptors describing a single field group.
    """
    # Categorical
    if all(d[1] == 'category' for d in descriptors):
        return normalize_categorical(descriptors)
    # Numerical
    elif all((d[1] in ['exact', 'range']) for d in descriptors):
        return normalize_numerical(descriptors)
    # Mixed
    else:
        print(descriptors)
        raise Exception('Bad descriptors')

def normalize_descriptors(descriptors):
    """Take in a list of list of descriptor tuples, one for each dimension, and output
    metadata for the table.
    
    Parameters
    ----------
    descriptors: list of list of tuple descriptors describing multiple field groups.
    """
    names = []
    edges = []
    labels = []
    order = []
    types = []
    # Extract metadata for each dimension and concatenate
    for dimension in descriptors:
        normalized = normalize_dimension(dimension)
        names.append(normalized['name'])
        edges.append(normalized['edges'])
        labels.append(normalized['labels'])
        types.append(normalized['type'])
        order.append(normalized['order'])
    name = ' by '.join(names)
    return {
        'name': name,
        'names': names,
        'edges': edges,
        'labels': labels,
        'na': ['.', ''],
        'types': types,
        'order': order
    }

parsed_names = set()
parsed_variables = []

for group in variables:
    label_dict = {}
    labels = make_labels(group)
    # Extract descriptors for fields from patterns
    for label_group in labels:
        for label in label_group:
            for pattern, parser in patterns[label_group]:
                m = re.search(pattern, label)
                if m:
                    p = parser(m)
                    if not isinstance(p, list):
                        p = [p]
                    label_dict[label] = p
                    break
       
    first_var = group.flatten()[0]
    d = len(first_var.fields)
    
    # Group descriptors into list structure
    # List of list of descriptor tuples
    descriptors = [list() for i in range(d - 1)]
    for var in group.ravel():
        # For each field dimension
        for i in range(d - 1):
            # Extract the descriptor for the field
            descriptor = label_dict[var.fields[i + 1]]
            # Add to data structure
            if not descriptor in descriptors[i]:
                descriptors[i].append(descriptor)
    
    # In the case of multiple descriptors returned for a single field, take all combinations
    for indices in itertools.product(*[range(len(d[0])) for d in descriptors]):
        # Get only the descriptors for the particular combination described by indices
        sub_descriptors = [list() for i in range(d - 1)]
        for i in range(d - 1):
            for desc in descriptors[i]:
                sub_descriptors[i].append(desc[indices[i]])
        
        # Normalize
        desc = normalize_descriptors(sub_descriptors)
        
        # Apply ordering
        desc_vars = group.copy()
        for i in range(d - 1):
            if desc['order'][i]:
                order = desc['order'][i]
                desc_vars = np.swapaxes(np.swapaxes(desc_vars, 0, i)[order], 0, i)
        del desc['order']
        
        # Convert variables to variable ids and convert to list structure
        desc_vars = np.array(
            [v.id for v in desc_vars.ravel()]
        ).reshape(desc_vars.shape).tolist()
        
        desc['fields'] = desc_vars
        
        # Get descriptions
        tableId = first_var.id.split("_")[0]
        desc['description'] = data['tables'][tableId]['description']
        desc['type'] = 'Histogram'
        
        # Add to global list
        parsed_variables.append(desc)
        if desc['name'] in parsed_names:
            raise Exception('Duplicate Variable: %s' % desc['name'])
        parsed_names.add(desc['name'])

In [194]:
import json

with open('normalized_variables.json', 'w') as f:
    json.dump(parsed_variables, f, sort_keys=True, indent=2)

In [195]:
for i, v in enumerate(parsed_variables):
    print('%s (%d)' % (v['name'], i))

transportation (0)
work from home (1)
household income (2)
earnings (3)
investment income (4)
supplemental security income (5)
assistance income (6)
retirement income (7)
other income (8)
food stamps by disability (9)
own_rent (10)
householder race (11)
own_rent by householder age (12)
own_rent by household size (13)
own_rent by education (14)
own_rent by rooms (15)
num units (16)
building type (17)
own_rent by num units (18)
own_rent by building type (19)
own_rent by built (20)
own_rent by moved in (21)
fuel source (22)
own_rent by bedrooms (23)
pay for utility (24)
poverty to income (25)


In [37]:
parsed_variables[12]

{'description': 'Tenure By Age Of Householder',
 'edges': [None, [15, 25, 35, 45, 55, 60, 65, 75, 85, inf]],
 'fields': [['B25007_003',
   'B25007_004',
   'B25007_005',
   'B25007_006',
   'B25007_007',
   'B25007_008',
   'B25007_009',
   'B25007_010',
   'B25007_011'],
  ['B25007_013',
   'B25007_014',
   'B25007_015',
   'B25007_016',
   'B25007_017',
   'B25007_018',
   'B25007_019',
   'B25007_020',
   'B25007_021']],
 'labels': [['own', 'rent'], None],
 'na': ['.', ''],
 'name': 'own_rent by householder age',
 'names': ['own_rent', 'householder age'],
 'type': 'Histogram',
 'types': ['Categorical', 'Continuous']}

In [38]:
# yes_count = 0
# no_count = 0

# population_patterns = [' for (.*)$', '%\s*(.*?):*\s*$', '.* FOR (.*?)$']

# bins = set()

# for i, table in enumerate(tables):
#     varlist = [data['variables'][variableId] for variableId in table['variables'] if (variabledId in selected_variables)]
#     print(varlist)
#     if len(varlist) == 0:
#         print(table)
#         break
#     descriptions = [var['description'] for var in varlist]
#     population = None
#     for patt in population_patterns:
#         match = re.search(patt, descriptions[0])
#         if match:
#             population = match.group(1)
#     split_length = max(len(normalize_desc(desc).split('%')) for desc in descriptions)
#     deepest = [normalize_desc(desc) for desc in descriptions if len(normalize_desc(desc).split('%')) == split_length]
#     if split_length == 1 and len(deepest) > 1:
#         raise Exception('No detail')
        
#     groups = []
#     for desc in deepest:
#         desc = desc
#         for j, census_bin in enumerate(desc.split('%')[1:]):
#             while len(groups) <= j:
#                 groups.append(set())
#             b = normalize_bin(census_bin)
#             if b == '':
#                 break
#             groups[j].add(b)
#     for group in groups:
#         bins.add(frozenset(group))