In [159]:
import pandas as pd
import numpy as np
from censusdata import censusgeo, download # https://jtleider.github.io/censusdata/
from mlgear.utils import show

# Two-digit FIPS codes (as zero-padded strings) keyed by two-letter state
# abbreviation. Covers the 50 states only: there is no entry for the
# District of Columbia or for any territory.
state_fips = {
    'AL': '01', 'AK': '02', 'AZ': '04', 'AR': '05', 'CA': '06',
    'CO': '08', 'CT': '09', 'DE': '10', 'FL': '12', 'GA': '13',
    'HI': '15', 'ID': '16', 'IL': '17', 'IN': '18', 'IA': '19',
    'KS': '20', 'KY': '21', 'LA': '22', 'ME': '23', 'MD': '24',
    'MA': '25', 'MI': '26', 'MN': '27', 'MS': '28', 'MO': '29',
    'MT': '30', 'NE': '31', 'NV': '32', 'NH': '33', 'NJ': '34',
    'NM': '35', 'NY': '36', 'NC': '37', 'ND': '38', 'OH': '39',
    'OK': '40', 'OR': '41', 'PA': '42', 'RI': '44', 'SC': '45',
    'SD': '46', 'TN': '47', 'TX': '48', 'UT': '49', 'VT': '50',
    'VA': '51', 'WA': '53', 'WV': '54', 'WI': '55', 'WY': '56',
}

# Variables from American Community Survey
# Maps the column name used in this notebook to the ACS variable code that is
# requested from the Census API. Insertion order matters: the downloaded
# columns are renamed positionally from the keys of this dict.
var = {'total_pop': 'B01001_001E',
       'male': 'B01001_002E',
       'white': 'B01001H_001E',
       'hispanic': 'B01001I_001E',
       'hispanic_mexican': 'C03001_004E',
       'hispanic_puerto_rican': 'C03001_005E',
       'hispanic_cuban': 'C03001_006E',
       'black': 'B01001B_001E',
       'asian': 'B01001D_001E',
       'native_born': 'B05001_002E',
       'earn_75k+': 'B06010_011E',
       # Age bands: consecutive B07001 codes, one per contiguous band.
       'age_18-19': 'B07001_004E',
       'age_20-24': 'B07001_005E',
       'age_25-29': 'B07001_006E',
       'age_30-34': 'B07001_007E',
       'age_35-39': 'B07001_008E',
       'age_40-44': 'B07001_009E',
       'age_45-49': 'B07001_010E',
       'age_50-54': 'B07001_011E',
       # Previously mislabelled 'age_50-59'. This code sits between the
       # 50-54 and 60-64 bands, so it is the 55-59 band; the old label
       # overlapped 'age_50-54' and hid the fact that 55-59 was covered.
       'age_55-59': 'B07001_012E',
       'age_60-64': 'B07001_013E',
       'age_65-69': 'B07001_014E',
       'age_70-74': 'B07001_015E',
       'age_75+': 'B07001_016E',
       # Educational attainment, population aged 25 and over.
       'age_25+_less_hs': 'B07009_002E',
       'age_25+_hs_grad': 'B07009_003E',
       'age_25+_some_college': 'B07009_004E',
       'age_25+_college': 'B07009_005E',
       'age_25+_grad_degree': 'B07009_006E'}

# rural
# more income?
# education x race
# religion
# urbanization
# separate: 2016 vote, 2020 vote

# Full name -> short code for states, the District of Columbia and territories.
# The 'Maine-N' / 'Nebraska-N' entries use non-postal codes (M1, N1, ...);
# presumably these are per-district units - confirm against the data that
# uses them.
us_state_abbrev = {
    'Alabama': 'AL',
    'Alaska': 'AK',
    'American Samoa': 'AS',
    'Arizona': 'AZ',
    'Arkansas': 'AR',
    'California': 'CA',
    'Colorado': 'CO',
    'Connecticut': 'CT',
    'Delaware': 'DE',
    'District of Columbia': 'DC',
    'Florida': 'FL',
    'Georgia': 'GA',
    'Guam': 'GU',
    'Hawaii': 'HI',
    'Idaho': 'ID',
    'Illinois': 'IL',
    'Indiana': 'IN',
    'Iowa': 'IA',
    'Kansas': 'KS',
    'Kentucky': 'KY',
    'Louisiana': 'LA',
    'Maine': 'ME',
    'Maine-1': 'M1',
    'Maine-2': 'M2',
    'Maryland': 'MD',
    'Massachusetts': 'MA',
    'Michigan': 'MI',
    'Minnesota': 'MN',
    'Mississippi': 'MS',
    'Missouri': 'MO',
    'Montana': 'MT',
    'Nebraska': 'NE',
    'Nebraska-1': 'N1',
    'Nebraska-2': 'N2',
    'Nebraska-3': 'N3',
    'Nevada': 'NV',
    'New Hampshire': 'NH',
    'New Jersey': 'NJ',
    'New Mexico': 'NM',
    'New York': 'NY',
    'North Carolina': 'NC',
    'North Dakota': 'ND',
    'Northern Mariana Islands': 'MP',
    'Ohio': 'OH',
    'Oklahoma': 'OK',
    'Oregon': 'OR',
    'Pennsylvania': 'PA',
    'Puerto Rico': 'PR',
    'Rhode Island': 'RI',
    'South Carolina': 'SC',
    'South Dakota': 'SD',
    'Tennessee': 'TN',
    'Texas': 'TX',
    'Utah': 'UT',
    'Vermont': 'VT',
    'Virgin Islands': 'VI',
    'Virginia': 'VA',
    'Washington': 'WA',
    'West Virginia': 'WV',
    'Wisconsin': 'WI',
    'Wyoming': 'WY'
}

# Reverse lookup: short code -> full name.
abbrev_us_state = {abbrev: name for name, abbrev in us_state_abbrev.items()}

# Case-insensitive index of the full names. Input cannot be normalised with
# str.title() because it capitalises every word ('District Of Columbia'),
# which never matches the 'District of Columbia' key.
_state_abbrev_by_lower_name = {name.lower(): abbrev
                               for name, abbrev in us_state_abbrev.items()}

def state_to_abbrev(state):
    """Return the short code for a full state/territory name.

    The lookup ignores the case of ``state``. A name that is not present in
    ``us_state_abbrev`` is returned title-cased instead of raising.
    """
    return _state_abbrev_by_lower_name.get(state.lower(), state.title())

def abbrev_to_state(abbrev):
    """Return the full name for a short state/territory code.

    The lookup ignores the case of ``abbrev``. A code that is not present in
    ``abbrev_us_state`` is returned upper-cased instead of raising.
    """
    code = abbrev.upper()
    return abbrev_us_state.get(code, code)


In [160]:
# Sanity check: how many entries the FIPS lookup table holds.
len(state_fips)

50

In [161]:
# Download the selected ACS 1-year variables for every (state, year) pair and
# stack the results into one long frame `dfs`, one row per pair, with columns
# ['year', 'state'] + use_vars.
years = range(2009, 2019)
states = {'AL': '01'}.items() # state_fips.items()
use_vars = list(var.keys())
acs_codes = [var[v] for v in use_vars]  # loop-invariant; same order as use_vars

total_downloads = len(years) * len(states)
frames = []  # collected per-request frames; concatenated once after the loop
done = 0
for state_abbrev, fips_num in states:
    state_geo = censusgeo([('state', fips_num)])
    state_name = abbrev_to_state(state_abbrev)
    for y in years:
        done += 1
        print('... {} {} ({}/{} {}%)'.format(state_name,
                                             y + 1,
                                             done,
                                             total_downloads,
                                             np.round((done / total_downloads) * 100, 1)))
        df = download('acs1', y + 1, state_geo, acs_codes).reset_index(drop=True)
        # Columns come back in the order they were requested, so they can be
        # renamed positionally to the friendly names.
        df.columns = use_vars
        # NOTE(review): data is requested for y + 1 but the row is labelled y.
        # Confirm this one-year offset is intentional.
        df['year'] = y
        df['state'] = state_name
        df = df[['year', 'state'] + use_vars]
        frames.append(df)

# A single concat avoids re-copying the accumulated frame on every iteration.
# `dfs` stays None when nothing was downloaded.
dfs = pd.concat(frames).reset_index(drop=True) if frames else None

show(dfs)

... Alabama 2010 (1/10 10.0%)
... Alabama 2011 (2/10 20.0%)
... Alabama 2012 (3/10 30.0%)
... Alabama 2013 (4/10 40.0%)
... Alabama 2014 (5/10 50.0%)
... Alabama 2015 (6/10 60.0%)
... Alabama 2016 (7/10 70.0%)
... Alabama 2017 (8/10 80.0%)
... Alabama 2018 (9/10 90.0%)
... Alabama 2019 (10/10 100.0%)
   year    state  total_pop     male    white  hispanic  hispanic_mexican  \
0  2009  Alabama    4785298  2322243  3206485    182795            127157   
1  2010  Alabama    4802740  2324413  3202563    186204            125912   
2  2011  Alabama    4822023  2336291  3212468    185441            128836   
3  2012  Alabama    4833722  2341869  3204524    189934            123415   
4  2013  Alabama    4849377  2347969  3205535    192413            115753   
5  2014  Alabama    4858979  2352810  3204076    192870            126295   
6  2015  Alabama    4863300  2356052  3197147    199686            126577   
7  2016  Alabama    4874747  2359896  3191450    201970            117518   
8  20