# Processing the data

In [1]:
from census_utils import *
import pandas as pd
import numpy as np

Load original Census data

In [2]:
# Read the block table from the CSV path returned by get_block_out_file()
# (not defined in this notebook; presumably provided by the census_utils star import).
block_df = pd.read_csv(get_block_out_file())

In [3]:
# Read the synthetic table from the CSV path returned by get_synthetic_out_file()
# (not defined in this notebook; presumably provided by the census_utils star import).
synth_df = pd.read_csv(get_synthetic_out_file())

In [4]:
# Display the first rows of the synthetic table as loaded.
synth_df.head()

Unnamed: 0,YEAR,STATE,STATEA,COUNTY,COUNTYA,COUSUBA,TRACTA,BLKGRPA,BLOCKA,NAME,...,AS,H_PI,OTH,TWO_OR_MORE,NUM_HISP,18_PLUS,HH_NUM,ACCURACY,AGE_ACCURACY,identifier
0,2010,California,6,Alameda County,1,92230,400100,1,1007,Block 1007,...,0,0,0,4,0,3,0,2,False,001-400100-1007
1,2010,California,6,Alameda County,1,92230,400100,1,1007,Block 1007,...,0,0,2,0,0,2,1,2,False,001-400100-1007
2,2010,California,6,Alameda County,1,92230,400100,1,1007,Block 1007,...,0,0,2,0,2,1,2,2,False,001-400100-1007
3,2010,California,6,Alameda County,1,92230,400100,1,1007,Block 1007,...,4,0,0,0,0,4,3,2,False,001-400100-1007
4,2010,California,6,Alameda County,1,92230,400100,1,1007,Block 1007,...,5,0,0,0,0,4,4,2,False,001-400100-1007


Aggregate the household-level synthetic data to the block level

In [5]:
def aggregate_to_block(df):
    """Aggregate the rows of ``df`` to one row per ``identifier``.

    Rows are grouped by the ``identifier`` column. Within each group the
    demographic count columns are summed, and ``AGE_ACCURACY`` and
    ``ACCURACY`` are averaged.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``identifier``, every column named in ``demo_cols``
        below, ``AGE_ACCURACY`` and ``ACCURACY``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``identifier``; columns are the summed count columns
        followed by the mean ``AGE_ACCURACY`` and mean ``ACCURACY``.

    Raises
    ------
    KeyError
        If ``identifier`` or any aggregated column is missing from ``df``.
    """
    demo_cols = ['TOTAL', 'W', 'B', 'AI_AN', 'AS', 'H_PI', 'OTH', 'TWO_OR_MORE', 'NUM_HISP', '18_PLUS']
    # Name the reductions by string instead of passing np.sum / np.mean:
    # pandas >= 2.1 emits a FutureWarning for NumPy callables in groupby.agg
    # and recommends the string names to keep the current behaviour.
    agg_dict = {col: 'sum' for col in demo_cols}
    agg_dict['AGE_ACCURACY'] = 'mean'
    agg_dict['ACCURACY'] = 'mean'
    return df.groupby('identifier').agg(agg_dict)

In [6]:
# Collapse the synthetic rows to one row per 'identifier' (sums of counts, means of accuracy flags).
agg_df = aggregate_to_block(synth_df)

In [7]:
# Display the first aggregated rows; the index is 'identifier'.
agg_df.head()

Unnamed: 0_level_0,TOTAL,W,B,AI_AN,AS,H_PI,OTH,TWO_OR_MORE,NUM_HISP,18_PLUS,AGE_ACCURACY,ACCURACY
identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
001-400100-1007,117,100,0,0,9,0,4,4,6,109,0.0,2.0
001-400100-1008,20,16,0,0,2,0,2,0,0,20,1.0,1.0
001-400100-1011,13,12,0,0,1,0,0,0,1,12,1.0,1.0
001-400100-1013,1,1,0,0,0,0,0,0,0,1,1.0,1.0
001-400100-1014,20,18,0,0,2,0,0,0,2,18,1.0,1.0


Merge with block data

In [8]:
def combine_dfs(block_df, agg_df):
    """Inner-join ``block_df`` and ``agg_df`` on ``identifier``.

    Only identifiers present in both inputs survive the join. The merge is
    validated as one-to-one, so pandas raises ``MergeError`` if
    ``identifier`` is duplicated on either side.

    Parameters
    ----------
    block_df : pandas.DataFrame
        Left-hand frame; must expose ``identifier``.
    agg_df : pandas.DataFrame
        Right-hand frame; must expose ``identifier``.

    Returns
    -------
    pandas.DataFrame
        The merged frame.
    """
    merge_options = {
        'how': 'inner',
        'on': 'identifier',
        'validate': 'one_to_one',
    }
    merged = pd.merge(block_df, agg_df, **merge_options)
    return merged

In [9]:
# Inner-join the block table with the per-identifier aggregates, then preview the result.
df = combine_dfs(block_df, agg_df)
df.head()

Unnamed: 0,YEAR,STATE,STATEA,COUNTY,COUNTYA,COUSUBA,TRACTA,BLKGRPA,BLOCKA,NAME,...,B,AI_AN,AS,H_PI,OTH,TWO_OR_MORE,NUM_HISP,18_PLUS,AGE_ACCURACY,ACCURACY
0,2010,California,6,Alameda County,1,92230,400100,1,1007,Block 1007,...,0,0,9,0,4,4,6,109,0.0,2.0
1,2010,California,6,Alameda County,1,92230,400100,1,1008,Block 1008,...,0,0,2,0,2,0,0,20,1.0,1.0
2,2010,California,6,Alameda County,1,92230,400100,1,1011,Block 1011,...,0,0,1,0,0,0,1,12,1.0,1.0
3,2010,California,6,Alameda County,1,92230,400100,1,1013,Block 1013,...,0,0,0,0,0,0,0,1,1.0,1.0
4,2010,California,6,Alameda County,1,92230,400100,1,1014,Block 1014,...,0,0,2,0,0,0,2,18,1.0,1.0


Get rid of columns we don't need

In [10]:
def simplify_df(df):
    """Drop, in place, every column whose name starts with 'IA', 'H9' or 'H8M'.

    The frame passed in is mutated and nothing is returned, so callers keep
    using the same object afterwards.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to prune; column names must be strings.
    """
    unwanted_prefixes = ('IA', 'H9', 'H8M')
    # str.startswith accepts a tuple, so one pass over the columns is enough.
    doomed = [name for name in df.columns if name.startswith(unwanted_prefixes)]
    df.drop(columns=doomed, inplace=True)

In [11]:
# simplify_df mutates df in place and returns None, so df itself loses the
# 'IA*', 'H9*' and 'H8M*' columns from this cell onward.
simplify_df(df)

In [12]:
# Preview df after the in-place column drop.
df.head()

Unnamed: 0,YEAR,STATE,STATEA,COUNTY,COUNTYA,COUSUBA,TRACTA,BLKGRPA,BLOCKA,NAME,...,B,AI_AN,AS,H_PI,OTH,TWO_OR_MORE,NUM_HISP,18_PLUS,AGE_ACCURACY,ACCURACY
0,2010,California,6,Alameda County,1,92230,400100,1,1007,Block 1007,...,0,0,9,0,4,4,6,109,0.0,2.0
1,2010,California,6,Alameda County,1,92230,400100,1,1008,Block 1008,...,0,0,2,0,2,0,0,20,1.0,1.0
2,2010,California,6,Alameda County,1,92230,400100,1,1011,Block 1011,...,0,0,1,0,0,0,1,12,1.0,1.0
3,2010,California,6,Alameda County,1,92230,400100,1,1013,Block 1013,...,0,0,0,0,0,0,0,1,1.0,1.0
4,2010,California,6,Alameda County,1,92230,400100,1,1014,Block 1014,...,0,0,2,0,0,0,2,18,1.0,1.0


# Analysis
## Total population
Check that the total population matches

In [13]:
def check_total(df):
    """Print the sum of the original (``H7X001``) and synthetic (``TOTAL``) population columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the ``H7X001`` and ``TOTAL`` columns.
    """
    report = (
        ('Original total population:', 'H7X001'),
        ('Synthetic total population:', 'TOTAL'),
    )
    for label, column in report:
        print(label, df[column].sum())

In [14]:
# Print the original (H7X001) and synthetic (TOTAL) population sums for the merged frame.
check_total(df)

Original total population: 37253956
Synthetic total population: 37253956


## Voting age population
Check that voting age population matches

In [15]:
def check_18_plus(df):
    """Print the sum of the original (``H8A003``) and synthetic (``18_PLUS``) voting-age columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the ``H8A003`` and ``18_PLUS`` columns.
    """
    report = (
        ('Original voting age population:', 'H8A003'),
        ('Synthetic voting age population:', '18_PLUS'),
    )
    for label, column in report:
        print(label, df[column].sum())

In [16]:
# Print the original (H8A003) and synthetic (18_PLUS) voting-age sums for the merged frame.
check_18_plus(df)

Original voting age population: 27170431
Synthetic voting age population: 27528877


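These are off by almost 20,000 (VT) and 100,000 (GA), and by about 358,000 in the California run shown above (27,528,877 synthetic vs. 27,170,431 original). Where is this coming from?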

In [17]:
def check_18_plus_again(df):
    """Print voting-age comparisons split by whether ``AGE_ACCURACY`` equals 1.

    Rows with ``AGE_ACCURACY == 1`` form the "accurate" subset; all other
    rows form the "bad" subset. For each subset the function prints the
    original (``H8A003``) and synthetic (``18_PLUS``) adult sums and the
    adult fraction relative to ``H7X001`` / ``TOTAL``. It then prints the
    share of ``H7X001`` that falls in the "bad" subset, the summed
    ``18_PLUS - H8A003`` difference divided by the overall ``H7X001`` sum,
    and finally the overall ``H8A003`` sum (printed without a label).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``AGE_ACCURACY``, ``H8A003``, ``18_PLUS``, ``H7X001``
        and ``TOTAL``.
    """
    is_accurate = df['AGE_ACCURACY'] == 1
    accurate = df.loc[is_accurate]
    inaccurate = df.loc[~is_accurate]

    # Adult sums per subset, named once and reused for the fractions below.
    orig_adults_ok = accurate['H8A003'].sum()
    synth_adults_ok = accurate['18_PLUS'].sum()
    print('With accurate age totals (original, synthetic):', orig_adults_ok, synth_adults_ok)

    orig_adults_bad = inaccurate['H8A003'].sum()
    synth_adults_bad = inaccurate['18_PLUS'].sum()
    print('With bad age totals (original, synthetic):', orig_adults_bad, synth_adults_bad)

    orig_share_ok = orig_adults_ok / accurate['H7X001'].sum()
    synth_share_ok = synth_adults_ok / accurate['TOTAL'].sum()
    print('Accurate age adult fraction.\nOriginal: {:.3f}\tSynthetic: {:.3f}'.format(orig_share_ok, synth_share_ok))

    orig_pop_bad = inaccurate['H7X001'].sum()
    orig_share_bad = orig_adults_bad / orig_pop_bad
    synth_share_bad = synth_adults_bad / inaccurate['TOTAL'].sum()
    print('Without accurate age adult fraction.\nOriginal: {:.3f}\tSynthetic: {:.3f}'.format(orig_share_bad, synth_share_bad))

    orig_pop_all = df['H7X001'].sum()
    print('Fraction of people who live in blocks with bad age information', orig_pop_bad / orig_pop_all)

    adult_gap = (df['18_PLUS'] - df['H8A003']).sum()
    print('Estimated fraction undercount', adult_gap / orig_pop_all)

    # Overall original adult sum, emitted unlabeled.
    print(df['H8A003'].sum())

In [18]:
# Break the voting-age comparison down by whether AGE_ACCURACY equals 1.
check_18_plus_again(df)

With accurate age totals (original, synthetic): 24845060 24845060
With bad age totals (original, synthetic): 2325371 2683817
Accurate age adult fraction.
Original: 0.744	Synthetic: 0.744
Without accurate age adult fraction.
Original: 0.599	Synthetic: 0.692
Fraction of people who live in blocks with bad age information 0.10413836318483868
Estimated fraction undercount 0.009621689573048296
27170431


VT: As expected, when blocks have accurate age information, we exactly match the voting age population. When they don't, we're off by quite a lot. But something weird is going on with these blocks: when we have accurate age information (no inconsistencies in original data), about 78% of the population is voting age, which matches US population statistics. But when we don't have accurate age information, this drops to 46% in the original data and is about 77% in our synthetic data. This suggests there are systematic errors in the original data, and our reconstruction is actually a better approximation to the truth.

CA, GA: Similar

## Race and ethnicity
Next, let's check the race and ethnicity counts

In [19]:
def check_race_counts(df):
    """Print, per race/ethnicity category, the summed absolute row-wise difference
    between the original column and its synthetic counterpart.

    Each output line is the synthetic column name followed by
    ``sum(|original - synthetic|)`` over all rows; 0 means the two columns
    agree on every row.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain every original/synthetic column pair listed below.
    """
    # (original column, synthetic column) pairs, in print order.
    column_pairs = (
        ('H7X002', 'W'),
        ('H7X003', 'B'),
        ('H7X004', 'AI_AN'),
        ('H7X005', 'AS'),
        ('H7X006', 'H_PI'),
        ('H7X007', 'OTH'),
        ('H7X008', 'TWO_OR_MORE'),
        ('H7Z010', 'NUM_HISP'),
    )
    for original_col, synthetic_col in column_pairs:
        mismatch = (df[original_col] - df[synthetic_col]).abs().sum()
        print(synthetic_col, mismatch)

In [20]:
# Print the summed absolute original-vs-synthetic difference for each race/ethnicity column.
check_race_counts(df)

W 0
B 0
AI_AN 0
AS 0
H_PI 0
OTH 0
TWO_OR_MORE 0
NUM_HISP 0


As expected, these match exactly since they're held invariant by our solver