# American Community Survey (ACS)

In [1]:
from probplots import plot_dists_pps

from collections import OrderedDict
from imp import reload
import pdb
import random

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np
from IPython.display import SVG, display

from hud_geo_conversions import read_zips_to_fips
from map_maker import draw_county_data_svg
from utilities import (display_cb, is_outlier_instance, is_outlier_val,
                       boxcox_standardize, read_fips_codes)
from probplots import ProbPlots, plot_dists_pps



from ajpp import read_ajpp_geo, read_ajpp_pop
import jdata
from jewish_county_data import read_jdata_counties, clean_jdata_county
# from acs import read_acs_israeli_ancestry
from religion_census import read_all_denoms, read_cb, read_judaic_denoms




pd.set_option('display.float_format', lambda x: '%.3f' % x)
from sklearn.decomposition import PCA

%matplotlib inline

SEED = 42
random.seed(SEED)

In [2]:
# Root directories, relative to the notebook's location.
DATA_DIR = '../Data/'
IMG_DIR = '../Images/'
DEM_DIR = DATA_DIR + 'Demography/'

# One CSV per ACS 2015 5-year table; each lives in a folder named after
# its table id.
POP_FP = DEM_DIR + 'ACS_15_5YR_B01003/' + 'ACS_15_5YR_B01003_with_ann.csv'
ACS_FOREIGN_BIRTH_FP = (DEM_DIR + 'ACS_15_5YR_B05006/'
                        + 'ACS_15_5YR_B05006_with_ann.csv')
SNGL_ANCE_FP = DEM_DIR + 'ACS_15_5YR_B04004/' + 'ACS_15_5YR_B04004_with_ann.csv'
MULT_ANCE_FP = DEM_DIR + 'ACS_15_5YR_B04005/' + 'ACS_15_5YR_B04005_with_ann.csv'
ALL_ANCE_FP = DEM_DIR + 'ACS_15_5YR_B04006/' + 'ACS_15_5YR_B04006_with_ann.csv'

# Every table path, in the order the constants are declared above.
fps = [
    POP_FP,
    ACS_FOREIGN_BIRTH_FP,
    SNGL_ANCE_FP,
    MULT_ANCE_FP,
    ALL_ANCE_FP,
]

# Scratch output file for experimenting with coloured maps.
COLORED_FP = IMG_DIR + 'temp.svg'  # for playing around

In [36]:
import pandas as pd
from utilities import split_state, state_to_abbr, code_to_str

class ACSReader:
    """Reader for a county-level American Community Survey (ACS) CSV table.

    The CSV is expected to carry two header rows: the first is skipped and
    the second supplies the column names (``Id``, ``Id2``, ``Geography``,
    ``Estimate; ...`` and ``Margin of Error; ...``).
    """

    # _OG indicates the column name in original table
    FIPS_COL_OG, FIPS_COL = 'Id2', 'FIPS'
    GEO_COL_OG,  GEO_COL  =  'Geography', 'County'

    EST_PRE, MOE_PRE = 'Estimate; ', 'Margin of Error; '
    TOT_EST_OG, TOT_EST = EST_PRE+'Total', 'Tot'

    NEW_MOE_SUFFIX = '_Moe'
    TOT_MOE_OG, TOT_MOE = MOE_PRE+'Total', 'Tot'+NEW_MOE_SUFFIX

    # general columns, i.e. not specific var from kw
    GEN_COLS_OG = [FIPS_COL_OG, GEO_COL_OG, TOT_EST_OG, TOT_MOE_OG]
    GEN_COLS    = [FIPS_COL, GEO_COL, TOT_EST, TOT_MOE]

    TO_DROP = ['Id']  # never necessary

    def __init__(self, fp):
        """Remember the path (or buffer) of the ACS CSV to read.

        Parameters
        ----------
        fp : str or file-like
            Passed unchanged to ``pandas.read_csv`` on every read.
        """
        self.fp = fp

    @staticmethod
    def _filter_columns(df, keep):
        """Return a copy of ``df`` holding only columns where ``keep(name)`` is true."""
        mask = [bool(keep(col)) for col in df.columns]
        # .copy() so later column assignment never writes into a view.
        return df.loc[:, mask].copy()

    def _is_moe_col(self, col):
        """True for margin-of-error columns, whether renamed or still original."""
        return col.endswith(self.NEW_MOE_SUFFIX) or col.startswith(self.MOE_PRE)

    def read_counties(self, kw=None, name=None, total=True, moe=False):
        """Reads ACS county-level demographic data.

        Parameters
        ----------
        kw : str, optional
            When truthy, keep only the general columns plus the columns
            whose name contains ``kw``.
        name : str or True, optional
            New name for the keyword variable. ``True`` reuses ``kw``;
            ``None`` leaves the matched columns unrenamed. The estimate
            column becomes ``name`` and its margin of error ``name + '_Moe'``.
            Ignored when ``kw`` is falsy.
        total : bool, default True
            Keep the ``Tot`` / ``Tot_Moe`` columns.
        moe : bool, default False
            Keep margin-of-error columns.

        Returns
        -------
        pandas.DataFrame
            Indexed by ``FIPS``, after ``split_state`` has processed the
            ``County`` column and ``state_to_abbr`` has been applied to
            the ``State`` column.

        Raises
        ------
        ValueError
            If ``name`` is given and ``kw`` matches more than one estimate
            column or more than one margin-of-error column.
        """
        df = pd.read_csv(self.fp, skiprows=1, encoding='latin')
        df = df.drop(self.TO_DROP, axis=1)

        # inconsistency across tables with total ending in ':'
        df = df.rename(columns=lambda x: x.strip(':'))

        df = df.rename(columns=dict(zip(self.GEN_COLS_OG, self.GEN_COLS)))

        if kw:
            kw_cols = [col for col in df.columns if kw in col]

            # Column-wise selection. The removed DataFrame.select defaulted
            # to axis=0 here and therefore filtered rows, not columns.
            df = self._filter_columns(
                df, lambda col: col in self.GEN_COLS or col in kw_cols
            )

            if name is not None:
                # use kw arg value as new col name
                name = kw if name is True else name

                # A single variable contributes at most one estimate and
                # one margin-of-error column. Anything more would be
                # renamed to duplicate, ambiguous column names. Zero
                # matches (kw not found) is allowed.
                n_est = sum(self.EST_PRE in col for col in kw_cols)
                n_moe = sum(self.MOE_PRE in col for col in kw_cols)
                if len(kw_cols) > 2 or n_est > 1 or n_moe > 1:
                    raise ValueError('Renaming only supported when '
                                    'keyword present in only one variable')

                renames = {}
                for kw_col in kw_cols:
                    if self.EST_PRE in kw_col:
                        renames[kw_col] = name
                    elif self.MOE_PRE in kw_col:
                        renames[kw_col] = name + self.NEW_MOE_SUFFIX
                df = df.rename(columns=renames)

        if not moe:
            # Covers both renamed ('*_Moe') and untouched
            # ('Margin of Error; *') columns.
            df = self._filter_columns(df, lambda col: not self._is_moe_col(col))
        if not total:
            df = self._filter_columns(
                df, lambda col: col not in (self.TOT_EST, self.TOT_MOE)
            )

        # presumably pads FIPS codes to 5 characters -- confirm in utilities
        df[self.FIPS_COL] = code_to_str(df[self.FIPS_COL], 5)
        df = df.set_index(self.FIPS_COL)
        # split_state is expected to add the `State` column used below
        df = split_state(df, self.GEO_COL)
        df.State = state_to_abbr(df.State)

        return df


In [41]:
# POP_FP and the other ACS table paths are already defined in the
# configuration cell near the top of the notebook, so they are not
# redefined here.

# The reader class defined above is `ACSReader`; `ACSCountyReader` does not
# exist and raised NameError.
acs_reader = ACSReader(POP_FP)
acs_pop = acs_reader.read_counties()

In [43]:
from test_acs import test_acs

In [46]:
!python test_acs.py

Basic tests passed: no exceptions raised


In [39]:
import itertools as it

def test_acs(data_dir):
    """Smoke-test ``ACSReader.read_counties`` on every ACS table.

    Each table under ``data_dir`` is read once for every combination of
    the ``kw``, ``name``, ``total`` and ``moe`` arguments. Nothing is
    asserted about the results; the test passes if no call raises.

    Parameters
    ----------
    data_dir : str
        Directory prefix (with trailing separator) holding the ACS table
        folders.
    """
    # (folder, file) for each table, in the order they are read.
    table_parts = (
        ('ACS_15_5YR_B01003/', 'ACS_15_5YR_B01003_with_ann.csv'),  # population
        ('ACS_15_5YR_B05006/', 'ACS_15_5YR_B05006_with_ann.csv'),  # foreign birth
        ('ACS_15_5YR_B04004/', 'ACS_15_5YR_B04004_with_ann.csv'),  # single ancestry
        ('ACS_15_5YR_B04005/', 'ACS_15_5YR_B04005_with_ann.csv'),  # multiple ancestry
        ('ACS_15_5YR_B04006/', 'ACS_15_5YR_B04006_with_ann.csv'),  # all ancestry
    )
    table_fps = [data_dir + folder + fname for folder, fname in table_parts]

    # Candidate values for each read_counties argument, in signature order.
    kw_options = [False, 'test_kw']
    name_options = [None, True, 'test_names']
    total_options = [False, True]
    moe_options = [False, True]
    option_grid = list(
        it.product(kw_options, name_options, total_options, moe_options)
    )

    for table_fp in table_fps:
        reader = ACSReader(table_fp)
        for kw, name, total, moe in option_grid:
            reader.read_counties(kw=kw, name=name, total=total, moe=moe)

    print('Basic tests passed: no exceptions raised')


test_acs(DEM_DIR)

Tests passed: no exceptions raised
