In [1]:
from census_utils import *
import pandas as pd
import numpy as np

Load the original block-level Census data and the synthetic data

In [2]:
block_df = pd.read_csv(get_block_out_file())

In [3]:
synth_df = pd.read_csv(get_synthetic_out_file())

In [16]:
synth_df.head()

Unnamed: 0,YEAR,STATE,STATEA,COUNTY,COUNTYA,COUSUBA,TRACTA,BLKGRPA,BLOCKA,NAME,...,AS,H_PI,OTH,TWO_OR_MORE,NUM_HISP,18_PLUS,HH_NUM,ACCURACY,AGE_ACCURACY,identifier
0,2010,Vermont,50,Addison County,1,70075,960100,1,1000,Block 1000,...,0,0,0,2,0,2,0,1,True,19601001000
1,2010,Vermont,50,Addison County,1,70075,960100,1,1000,Block 1000,...,0,0,0,0,0,1,1,1,True,19601001000
2,2010,Vermont,50,Addison County,1,70075,960100,1,1000,Block 1000,...,0,0,0,0,0,1,2,1,True,19601001000
3,2010,Vermont,50,Addison County,1,70075,960100,1,1000,Block 1000,...,0,0,0,0,0,1,3,1,True,19601001000
4,2010,Vermont,50,Addison County,1,70075,960100,1,1000,Block 1000,...,0,0,0,0,0,1,4,1,True,19601001000


Aggregate the household-level synthetic data to the block level

In [6]:
def aggregate_to_block(df):
    """Collapse rows that share an ``identifier`` into one row per identifier.

    The demographic count columns are summed within each group, while
    ``AGE_ACCURACY`` and ``ACCURACY`` are averaged.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``identifier`` column, the ten demographic count
        columns listed below, and ``AGE_ACCURACY`` / ``ACCURACY``.
        (Presumably one row per synthetic household -- confirm against the
        code that writes the synthetic file.)

    Returns
    -------
    pandas.DataFrame
        Indexed by ``identifier``; columns are the demographic counts followed
        by ``AGE_ACCURACY`` and ``ACCURACY``.
    """
    demo_cols = ['TOTAL', 'W', 'B', 'AI_AN', 'AS', 'H_PI', 'OTH', 'TWO_OR_MORE', 'NUM_HISP', '18_PLUS']
    # Use pandas' string aliases rather than the numpy callables: recent pandas
    # versions emit a FutureWarning for np.sum / np.mean inside .agg() and will
    # eventually stop mapping them to the optimized groupby reductions.
    agg_dict = {col: 'sum' for col in demo_cols}
    agg_dict['AGE_ACCURACY'] = 'mean'
    agg_dict['ACCURACY'] = 'mean'
    return df.groupby('identifier').agg(agg_dict)

In [7]:
agg_df = aggregate_to_block(synth_df)

In [15]:
agg_df.head()

Unnamed: 0_level_0,TOTAL,W,B,AI_AN,AS,H_PI,OTH,TWO_OR_MORE,NUM_HISP,18_PLUS,AGE_ACCURACY,ACCURACY
identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
71001001,66,59,7,0,0,0,0,0,4,51,1.0,2.0
71001002,8,6,0,0,0,0,0,2,0,4,1.0,1.0
71001004,36,36,0,0,0,0,0,0,4,26,1.0,1.0
71001005,35,27,8,0,0,0,0,0,0,28,1.0,2.0
71001008,720,605,23,3,61,0,5,23,21,558,1.0,2.0


Merge with block data

In [9]:
def combine_dfs(block_df, agg_df):
    """Join the block table with the aggregated table on ``identifier``.

    Only identifiers present in both inputs are kept (inner join). pandas is
    asked to validate that ``identifier`` is unique on each side, so a
    duplicated key raises ``pandas.errors.MergeError`` rather than silently
    multiplying rows.
    """
    merge_options = {
        'how': 'inner',
        'on': 'identifier',
        'validate': 'one_to_one',
    }
    return block_df.merge(agg_df, **merge_options)

In [14]:
df = combine_dfs(block_df, agg_df)
df.head()

Unnamed: 0,YEAR,STATE,STATEA,COUNTY,COUNTYA,COUSUBA,TRACTA,BLKGRPA,BLOCKA,NAME,...,B,AI_AN,AS,H_PI,OTH,TWO_OR_MORE,NUM_HISP,18_PLUS,AGE_ACCURACY,ACCURACY
0,2010,Vermont,50,Addison County,1,70075,960100,1,1000,Block 1000,...,0,0,0,0,0,3,1,101,1.0,1.0
1,2010,Vermont,50,Addison County,1,70075,960100,1,1002,Block 1002,...,0,0,0,0,0,2,2,79,1.0,1.0
2,2010,Vermont,50,Addison County,1,70075,960100,1,1005,Block 1005,...,0,0,0,0,0,0,0,7,1.0,1.0
3,2010,Vermont,50,Addison County,1,70075,960100,1,1006,Block 1006,...,0,0,0,0,0,1,0,38,1.0,1.0
4,2010,Vermont,50,Addison County,1,70075,960100,1,1009,Block 1009,...,0,0,0,0,0,0,3,38,1.0,1.0


Get rid of columns we don't need

In [11]:
def simplify_df(df):
    """Drop, in place, every column whose name starts with 'IA', 'H9' or 'H8M'.

    The frame passed in is modified directly and nothing is returned.
    """
    unwanted_prefixes = ('IA', 'H9', 'H8M')
    # str.startswith accepts a tuple, so one pass over the columns is enough.
    doomed = [name for name in df.columns if name.startswith(unwanted_prefixes)]
    df.drop(columns=doomed, inplace=True)

In [12]:
simplify_df(df)

In [13]:
df.head()

Unnamed: 0,YEAR,STATE,STATEA,COUNTY,COUNTYA,COUSUBA,TRACTA,BLKGRPA,BLOCKA,NAME,...,B,AI_AN,AS,H_PI,OTH,TWO_OR_MORE,NUM_HISP,18_PLUS,AGE_ACCURACY,ACCURACY
0,2010,Vermont,50,Addison County,1,70075,960100,1,1000,Block 1000,...,0,0,0,0,0,3,1,101,1.0,1.0
1,2010,Vermont,50,Addison County,1,70075,960100,1,1002,Block 1002,...,0,0,0,0,0,2,2,79,1.0,1.0
2,2010,Vermont,50,Addison County,1,70075,960100,1,1005,Block 1005,...,0,0,0,0,0,0,0,7,1.0,1.0
3,2010,Vermont,50,Addison County,1,70075,960100,1,1006,Block 1006,...,0,0,0,0,0,1,0,38,1.0,1.0
4,2010,Vermont,50,Addison County,1,70075,960100,1,1009,Block 1009,...,0,0,0,0,0,0,3,38,1.0,1.0


Check that the total population matches

In [22]:
def check_total(df):
    """Print the column sums of ``H7X001`` and ``TOTAL``, one per line.

    ``H7X001`` is reported as the original total population and ``TOTAL`` as
    the synthetic total population.
    """
    report = (
        ('Original total population:', 'H7X001'),
        ('Synthetic total population:', 'TOTAL'),
    )
    for label, column in report:
        print(label, df[column].sum())

In [23]:
check_total(df)

Original total population: 625741
Synthetic total population: 625741


Check that voting age population matches

In [24]:
def check_18_plus(df):
    """Print the column sums of ``H8A003`` and ``18_PLUS``, one per line.

    ``H8A003`` is reported as the original voting age population and
    ``18_PLUS`` as the synthetic voting age population.
    """
    report = (
        ('Original voting age population:', 'H8A003'),
        ('Synthetic voting age population:', '18_PLUS'),
    )
    for label, column in report:
        print(label, df[column].sum())

In [25]:
check_18_plus(df)

Original voting age population: 471768
Synthetic voting age population: 489376


These are off by about 17,600 (489,376 synthetic vs. 471,768 original). Where is this coming from?

In [41]:
def check_18_plus_again(df):
    """Compare adult counts separately for rows with and without AGE_ACCURACY == 1.

    Rows are split on whether ``AGE_ACCURACY`` equals 1. For each half this
    prints the summed ``H8A003`` (original) and ``18_PLUS`` (synthetic) counts,
    followed by the adult fractions ``H8A003 / H7X001`` (original) and
    ``18_PLUS / TOTAL`` (synthetic), formatted to three decimals.
    """
    is_accurate = df['AGE_ACCURACY'] == 1
    accurate_rows = df[is_accurate]
    inaccurate_rows = df[~is_accurate]

    def adult_counts(rows):
        # (original adults, synthetic adults)
        return rows['H8A003'].sum(), rows['18_PLUS'].sum()

    def adult_fractions(rows):
        # (original adult share, synthetic adult share)
        original = rows['H8A003'].sum() / rows['H7X001'].sum()
        synthetic = rows['18_PLUS'].sum() / rows['TOTAL'].sum()
        return original, synthetic

    print('With accurate age totals (original, synthetic):', *adult_counts(accurate_rows))
    print('With bad age totals (original, synthetic):', *adult_counts(inaccurate_rows))
    print('Accurate age adult fraction.\nOriginal: {:.3f}\tSynthetic: {:.3f}'.format(*adult_fractions(accurate_rows)))
    print('Without accurate age adult fraction.\nOriginal: {:.3f}\tSynthetic: {:.3f}'.format(*adult_fractions(inaccurate_rows)))

In [42]:
check_18_plus_again(df)

With accurate age totals (original, synthetic): 445513 445513
With bad age totals (original, synthetic): 26255 43863
Accurate age adult fraction.
Original: 0.783	Synthetic: 0.783
Without accurate age adult fraction.
Original: 0.461	Synthetic: 0.770


As expected, when blocks have accurate age information, we exactly match the voting age population. When they don't, we're off by quite a lot. But something weird is going on with these blocks: when we have accurate age information (no inconsistencies in original data), about 78% of the population is voting age, which matches US population statistics. But when we don't have accurate age information, this drops to 46% in the original data and is about 77% in our synthetic data. This suggests there are systematic errors in the original data, and our reconstruction is actually a better approximation to the truth.

Next, let's check the race and ethnicity counts

In [46]:
def check_race_counts(df):
    """Print, per race/ethnicity category, the total absolute count mismatch.

    Each ``H7X00n`` / ``H7Z010`` column is paired with a short-named column
    (``W``, ``B``, ...). For every pair the row-wise absolute differences are
    summed and printed next to the short name; a value of 0 means the two
    columns agree on every row.
    """
    column_pairs = (
        ('H7X002', 'W'),
        ('H7X003', 'B'),
        ('H7X004', 'AI_AN'),
        ('H7X005', 'AS'),
        ('H7X006', 'H_PI'),
        ('H7X007', 'OTH'),
        ('H7X008', 'TWO_OR_MORE'),
        ('H7Z010', 'NUM_HISP'),
    )
    for census_col, synth_col in column_pairs:
        mismatch = (df[census_col] - df[synth_col]).abs().sum()
        print(synth_col, mismatch)

In [47]:
check_race_counts(df)

W 0
B 0
AI_AN 0
AS 0
H_PI 0
OTH 0
TWO_OR_MORE 0
NUM_HISP 0


As expected, these match exactly, since our solver holds them invariant.