# This notebook will get census data from the API. 

### Steps: 


1. Define census variables to be retrieved
2. Format request for API
3. Get request
4. Format returned data and save as csv or json




In [40]:
import pandas as pd, numpy as np
import os
from pprint import pprint
import requests
from timeit import default_timer as timer

In [41]:
# Working directory for the notebook: os.chdir makes every relative filename used
# in later cells resolve against this folder.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
path='/Users/lisarayle/Dropbox/sf_data/census'
os.chdir(path)

## Define census variables

In [42]:
# Years used: 1990, 2000 and 2010, plus the ACS 5-year files ending in 2009 (2005-2009)
# and 2014 (2010-2014).
# TODO: might have to add ACS 2008-2012 in order to get housing unit variables for 2010. 
# The variables I want are prepared in an excel file: one row per variable,
# with one column of variable codes per dataset.
# note age variables are separate; can add them later if needed. 
filename='variable_codes.xlsx'
vars_df = pd.read_excel(filename)
vars_df

Unnamed: 0,name,description,1990_sf1,1990_sf3,2000_sf1,2000_sf3,2010_sf1,acs5,2012_acs5,use_in_final
0,tot_pop_occ_units,total population in occupied housing units,,,H010001,,H0100001,B25026_001E,,
1,tot_hu,total housing units,H0010001,,H001001,,H00010001,B25034_001E,,yes
2,avg_hh_size,average household size for occupied housing units,,,H012001,,H0120001,B25010_001E,,
3,occ_hu,occupation status - occupied,H0020001,,H003002,,H0030002,B25002_002E,,yes
4,mortgage,tenure - owned with mortgage,,,,H080002,H0040002,B25027_002E,,
5,no_mort,tenure - owned free and clear,,,,H080008,H0040003,B25027_010E,,
6,owned,owner-occupied housing units,H0030001,,H004002,,,B25036_002E,B25036_002E,yes
7,rented,renter-occupied housing units,H0030002,,H004003,,H0040004,B25036_012E,,yes
8,hu_detatched,HUs 1 detached unit/struct,H0410001,,,H030002,,B25024_002E,B25024_002E,yes
9,hu_attached,HUs 1 attached unit/struct,H0410002,,,H030003,,B25024_003E,B25024_003E,


In [43]:
# Readable variable names from the spreadsheet's 'name' column, in row order.
var_names = list(vars_df.name)

Make a nested dictionary (`my_vars`) to hold the variable names and their census codes. 
The structure will be this: 

my_vars = year > dataset > {var name : var code}

In [44]:
# One top-level key per data year. dict.fromkeys leaves every value as None;
# the per-dataset dictionaries are attached in the following cells.
years = ['1990','2000','2010','2009','2012','2014']
my_vars = dict.fromkeys(years)
my_vars

{'1990': None,
 '2000': None,
 '2009': None,
 '2010': None,
 '2012': None,
 '2014': None}

In [45]:
# The datasets available differ from year to year, so each group of years is
# filled in separately; the lookup they all share lives in one helper.
def _codes_by_name(column):
    """Return {variable name: code} for the spreadsheet rows that have a code in `column`."""
    has_code = pd.notnull(vars_df[column])
    rows = vars_df[has_code]
    return dict(zip(rows['name'], rows[column]))

# Decennial census 1990 and 2000: both summary files (sf1 and sf3).
for yr in ['1990','2000']:
    my_vars[yr] = {}
    for sf in ['sf1','sf3']:
        my_vars[yr][sf] = _codes_by_name(yr+'_'+sf)

# Decennial census 2010: sf1 only.
yr = '2010'
sf = 'sf1'
my_vars[yr] = {sf: _codes_by_name(yr+'_'+sf)}

# ACS 5-year files ending 2009 and 2014 share the single 'acs5' code column.
ds = 'acs5'
for yr in ['2009','2014']:
    my_vars[yr] = {ds: _codes_by_name(ds)}
    

In [46]:
# test cell: eyeball the name -> code mapping that was built for 2014
pprint(my_vars['2014'])

{'acs5': {'asian': 'B03002_006E',
          'avg_hh_size': 'B25010_001E',
          'black': 'B03002_004E',
          'families': 'B25024_003E',
          'foreign_born': 'B05002_013E',
          'hispanic': 'B03002_012E',
          'hu_10-19': 'B25024_007E',
          'hu_2': 'B25024_004E',
          'hu_20-49': 'B25024_008E',
          'hu_3-4': 'B25024_005E',
          'hu_5-9': 'B25024_006E',
          'hu_50': 'B25024_009E',
          'hu_attached': 'B25024_003E',
          'hu_detatched': 'B25024_002E',
          'med_age': 'B01002_001E',
          'med_inc': 'B19013_001E',
          'med_value': 'B25077_001E',
          'med_yr_built': 'B25035_001E',
          'med_yr_moved_all': 'B25039_001E',
          'med_yr_moved_owner': 'B25039_002E',
          'mortgage': 'B25027_002E',
          'no_mort': 'B25027_010E',
          'occ_hu': 'B25002_002E',
          'owned': 'B25036_002E',
          'rented': 'B25036_012E',
          'same_house_1': 'B07001_017E',
          'tot_hhs': 'B1

In [47]:
# 2012 is a special case: only the variables that have a code in the spreadsheet's
# '2012_acs5' column are taken from this ACS file; everything else is in the census.
yr = '2012'
ds = '2012_acs5'
has_code = pd.notnull(vars_df[ds])
rows_with_code = vars_df[has_code]
name_list = list(rows_with_code['name'])
code_list = list(rows_with_code[ds])
# Stored under the spreadsheet column name ('2012_acs5'), not the API dataset name.
my_vars[yr] = {ds: dict(zip(name_list, code_list))}
pprint(my_vars['2012'])

{'2012_acs5': {'foreign_born': 'B05002_013E',
               'hu_10-19': 'B25024_007E',
               'hu_2': 'B25024_004E',
               'hu_20-49': 'B25024_008E',
               'hu_3-4': 'B25024_005E',
               'hu_5-9': 'B25024_006E',
               'hu_50': 'B25024_009E',
               'hu_attached': 'B25024_003E',
               'hu_detatched': 'B25024_002E',
               'med_inc': 'B19013_001E',
               'med_value': 'B25077_001E',
               'med_yr_built': 'B25035_001E',
               'med_yr_moved_all': 'B25039_001E',
               'med_yr_moved_owner': 'B25039_002E',
               'owned': 'B25036_002E',
               'same_house_1': 'B07001_017E'}}


## Define geographies

The data are requested at the block group level.

I made a list of all the census tracts in SF; each request asks for every block group within one tract. 


In [48]:
# Tract lists, one csv per vintage (2000 census, 2010 census, 2014 ACS 5-year).
# dtype=str keeps the GEO.id2 identifiers as text so they can be sliced later.
tr00 = pd.read_csv('DEC_00_SF1_sftracts.csv', dtype=str)
tr10 = pd.read_csv('DEC_10_SF1_sftracts.csv', dtype=str)
tr14 = pd.read_csv('ACS_14_5YR_sftracts.csv',dtype=str)
tr14.head()


Unnamed: 0,GEO.id2,GEO.display-label
0,6075010100,"Census Tract 101, San Francisco County, Califo..."
1,6075010200,"Census Tract 102, San Francisco County, Califo..."
2,6075010300,"Census Tract 103, San Francisco County, Califo..."
3,6075010400,"Census Tract 104, San Francisco County, Califo..."
4,6075010500,"Census Tract 105, San Francisco County, Califo..."


In [49]:
# what we need for the requests is a list of tracts (won't use bg's until later. )
def _tract_codes(tract_table):
    """Return the 6-digit tract code of every row in a tract table.

    GEO.id2 is state + county + tract. The tract code is always the last six
    characters, so slicing from the end works whether or not the leading zero
    of the state code survived in the csv (it is missing in the files shown
    above, e.g. '6075010100').
    """
    return [geo_id[-6:] for geo_id in tract_table['GEO.id2']]


tracts00 = _tract_codes(tr00)
tracts10 = _tract_codes(tr10)
tracts14 = _tract_codes(tr14)
tracts90 = []  # need to add 1990 bgs later. 


# Which tract list each data year is requested with: 2009 reuses the 2000 list,
# 2012 reuses the 2014 list.
tracts_by_year = {'1990':tracts90, '2000':tracts00, '2010':tracts10, '2009':tracts00, '2014':tracts14,'2012':tracts14}

#tracts_by_year['2010']


## Get data

In [50]:
# Census 2010 example
#http://api.census.gov/data/2010/sf1?get=P0010001,P0030001&for=block+group:1&in=state:02+county:170+tract:000101&key=YOUR_KEY_GOES_HERE
# ACS example
#http://api.census.gov/data/2014/acs5?get=NAME,B01001_001E&for=block+group:0&in=state:06+county:061+tract:990000&key=YOUR_KEY_GOES_HERE


In [51]:

def make_var_string(year='2010', dataset='sf1',var_dict=None):
    """Return the variable codes of one year/dataset as a comma-separated string.

    Parameters
    ----------
    year : str
        Top-level key of ``var_dict`` (e.g. '2010').
    dataset : str
        Second-level key of ``var_dict`` (e.g. 'sf1', 'acs5').
    var_dict : dict, optional
        Nested mapping year -> dataset -> {variable name: variable code}.
        Defaults to the notebook-level ``my_vars``.

    Returns
    -------
    str
        The codes joined by ',' in dictionary order; '' when there are none.

    Raises
    ------
    KeyError
        If ``year`` or ``dataset`` is not present in ``var_dict``.
    """
    if var_dict is None:
        # Resolved at call time rather than frozen into the signature, so a
        # rebuilt my_vars is picked up without re-running this definition.
        var_dict = my_vars
    return ','.join(var_dict[year][dataset].values())

# Example: the variable list that goes into a request for the 2009 ACS.
var_string = make_var_string(year='2009', dataset='acs5')
var_string

'B25024_003E,B03002_006E,B25039_001E,B03002_012E,B25027_002E,B19013_001E,B01002_001E,B25024_008E,B25035_001E,B25026_001E,B25024_007E,B03002_003E,B05002_013E,B25024_005E,B25077_001E,B07001_017E,B25034_001E,B25002_002E,B25024_009E,B25024_003E,B03002_004E,B25024_004E,B25039_002E,B25024_002E,B25036_012E,B11001_001E,B25010_001E,B01001_001E,B25027_010E,B25036_002E,B25024_006E'

In [52]:
# prepare parameters
# The API key can be supplied through the CENSUS_API_KEY environment variable.
# The literal below is kept only as a fallback so the notebook still runs as before.
# NOTE(review): a key committed in a notebook should be treated as exposed —
# rotate it and delete the literal once the environment variable is in use.
key = os.environ.get('CENSUS_API_KEY') or '77393a88b95432a421e17611c44b7b24fee721cf'

#var_string = 'P0010001,P0030001'  # an example to test
tract = '600200'  # just an example

def make_url(year, dataset,bg='*',state='06',county='075',tract=tract,key=key):
    """Build the request URL for the block groups of one tract.

    Parameters
    ----------
    year, dataset : str
        Keys into ``my_vars``; they select the variable list and, with one
        exception, also name the API path (``/data/<year>/<dataset>``).
    bg : str
        Block group to request; '*' asks for all of them.
    state, county, tract : str
        Codes placed in the ``in=`` clause of the query.
    key : str
        API key appended to the query string.

    Returns
    -------
    str
        The complete URL.
    """
    # The variable list must be looked up with the dataset name exactly as given.
    requested_vars = make_var_string(year, dataset)
    # For 2012 the variables are stored under '2012_acs5', but the API path is 'acs5'.
    api_dataset = 'acs5' if year=='2012' else dataset
    geography = 'state:{}+county:{}+tract:{}'.format(state,county,tract)
    template = 'http://api.census.gov/data/{yr}/{ds}?get={v}&for=block+group:{b}&in={geo}&key={k}'
    return template.format(yr=year, ds=api_dataset, v=requested_vars, b=bg, geo=geography, k=key)

# Example: URL for the 2012 ACS variables of the sample tract defined above.
url = make_url(year='2012',dataset='2012_acs5',tract=tract)
url

# A shorter URL to try by hand when testing:
#http://api.census.gov/data/2009/acs5?get=B01001H_001E&for=block+group:*&in=state:06+county:075+tract:600200&key=77393a88b95432a421e17611c44b7b24fee721cf



'http://api.census.gov/data/2012/acs5?get=B25077_001E,B25024_003E,B07001_017E,B25024_008E,B25024_009E,B19013_001E,B25035_001E,B25024_004E,B25039_002E,B25024_002E,B25024_005E,B25039_001E,B25036_002E,B25024_007E,B05002_013E,B25024_006E&for=block+group:*&in=state:06+county:075+tract:600200&key=77393a88b95432a421e17611c44b7b24fee721cf'

In [53]:
# time the request
# One-off check of how long a single request takes. The URL is written out by
# hand rather than produced by make_url.
# NOTE(review): the tract here is '20300' (5 digits) while the tract codes used
# elsewhere have 6 — confirm this is intended.
start = timer()
url='http://api.census.gov/data/2014/acs5?get=B25036_002E,B25024_003E,B05002_013E,B03002_012E,B25035_001E,B25024_004E,B03002_004E,B25039_002E,B25034_001E,B07001_017E,B25024_002E,B25010_001E,B19013_001E,B01001H_001E,B25039_001E,B25024_005E,B25002_002E,B25024_006E,B25036_012E,B25027_002E,B25024_007E,B01002_001E,B25027_010E,B25024_003E,B25026_001E,B03002_003E,B11001_001E,B25024_008E,B03002_006E,B25077_001E,B25024_009E&for=block+group:*&in=state:06+county:075+tract:20300&key=77393a88b95432a421e17611c44b7b24fee721cf'
r = requests.get(url)
end = timer()
print(r.url)
print(end - start)    


# The JSON payload is a list of rows: the first row holds the column labels,
# the remaining rows hold the data.
results = r.json()
labels = results[0]
tract_data=pd.DataFrame(results[1:])
print(labels,tract_data)

  

http://api.census.gov/data/2014/acs5?get=B25036_002E,B25024_003E,B05002_013E,B03002_012E,B25035_001E,B25024_004E,B03002_004E,B25039_002E,B25034_001E,B07001_017E,B25024_002E,B25010_001E,B19013_001E,B01001H_001E,B25039_001E,B25024_005E,B25002_002E,B25024_006E,B25036_012E,B25027_002E,B25024_007E,B01002_001E,B25027_010E,B25024_003E,B25026_001E,B03002_003E,B11001_001E,B25024_008E,B03002_006E,B25077_001E,B25024_009E&for=block+group:*&in=state:06+county:075+tract:20300&key=77393a88b95432a421e17611c44b7b24fee721cf
2.747545316000469
['B25036_002E', 'B25024_003E', 'B05002_013E', 'B03002_012E', 'B25035_001E', 'B25024_004E', 'B03002_004E', 'B25039_002E', 'B25034_001E', 'B07001_017E', 'B25024_002E', 'B25010_001E', 'B19013_001E', 'B01001H_001E', 'B25039_001E', 'B25024_005E', 'B25002_002E', 'B25024_006E', 'B25036_012E', 'B25027_002E', 'B25024_007E', 'B01002_001E', 'B25027_010E', 'B25024_003E', 'B25026_001E', 'B03002_003E', 'B11001_001E', 'B25024_008E', 'B03002_006E', 'B25077_001E', 'B25024_009E', 'st

In [54]:

# for each year, for each dataset, for each tract
# Result layout: full_data[year][dataset] -> one DataFrame with the rows of all tracts.
#years = ['2014','2012','2010','2009','2000']
years = ['2014','2012','2009']
full_data = dict.fromkeys(years)

for yr in years:
    full_data[yr] = {}
    for ds in my_vars[yr].keys():

        print(yr, ds)
        tract_frames = []   # one frame per tract, combined once after the loop
        labels = []
        for tract in tracts_by_year[yr]:
            url = make_url(year=yr, dataset=ds, tract=tract)
            r = requests.get(url)
            # Stop on an HTTP error instead of trying to parse an error page as JSON.
            r.raise_for_status()

            results = r.json()
            # keep column labels to match them up with variable names
            # (the header of the last response is applied to all rows; this
            # assumes every tract returns the same columns)
            labels = results[0]
            # separate data from labels so can loop through tracts.
            tract_frames.append(pd.DataFrame(results[1:]))

        if tract_frames:
            # A single concat avoids re-copying the accumulated rows on every iteration.
            data = pd.concat(tract_frames)
            data.columns = labels
        else:
            # No tracts listed for this year: keep an empty frame rather than failing.
            data = pd.DataFrame()
        full_data[yr][ds] = data


2014 acs5
2012 2012_acs5
2009 acs5


In [55]:
# Replace the census codes in the column headers with the readable names kept in my_vars.
for yr, datasets in full_data.items():
    for ds, df in datasets.items():
        # my_vars maps name -> code, so invert it to look names up by code.
        # NOTE(review): when two names share one code (the 2014 acs5 listing above
        # shows 'families' and 'hu_attached' both as B25024_003E) the inverted
        # mapping keeps only one of them and both columns receive the same name.
        rev_vars = {code: name for name, code in my_vars[yr][ds].items()}
        # Only headers starting with P, H or B are variable codes; the geography
        # columns (tract, block group, ...) keep their labels.
        df.columns = [rev_vars[col] if col[0] in ('P','H','B') else col
                      for col in df.columns]
        full_data[yr][ds] = df

In [56]:
# Write one csv per (year, dataset) pair, named census_data_<year>_<dataset>.csv.
for yr, datasets in full_data.items():
    for ds, df in datasets.items():
        print(len(df))  # row count, as a quick sanity check
        df.to_csv('census_data_{}_{}.csv'.format(yr,ds), index=False)

581
575
581


## Some more processing, after we've gotten the data

In [68]:
# actually we really just want a set for each year. 
# and we need to use percentages for the explanatory variables, not totals

# To prepare for merging, construct GEOID column. 
# Construct geoid. format: '060750157003'
def make_geoid_field(df):
    """Add a 'geoid' column made of state + county + tract + block group.

    The four source columns are joined with ``+`` in that order, so they are
    expected to hold strings (e.g. giving '060750157003'). The frame is
    modified in place and the same object is returned.
    """
    id_parts = ['state', 'county', 'tract', 'block group']
    geoid = df[id_parts[0]]
    for part in id_parts[1:]:
        geoid = geoid + df[part]
    df['geoid'] = geoid
    return df


    

In [69]:
# function to calculate percentages for each variable. 
def calc_percentages(df):
    """Turn the count columns of df into shares of their matching total, in place.

    Every recognised column is overwritten with column / denominator, where the
    denominator column depends on the group the variable belongs to:
      * household variables    -> tot_hhs
      * housing-unit variables -> tot_hu
      * population variables   -> tot_pop
    All other columns (the three totals included) are left as they are.
    The DataFrame passed in is modified and returned, so calling this twice on
    the same frame divides twice.

    NOTE(review): foreign_born and same_house_* are scaled by tot_hhs; confirm
    they are household counts rather than person counts.
    """
    groups = (
        ('tot_hhs', ['families','same_house_5','foreign_born','same_house_1']),
        ('tot_hu', ['owned','occ_hu','rented','hu_attached','hu_5-9','hu_3-4','hu_2',
                    'mortgage','hu_10-19','no_mort','hu_detatched','hu_20-49','hu_50']),
        ('tot_pop', ['hispanic','asian','black','white','tot_pop_occ_units']),
    )
    # Flatten into one lookup: variable name -> name of its denominator column.
    denominator_of = {}
    for total_name, members in groups:
        for member in members:
            denominator_of[member] = total_name

    for col in df.columns:
        total_name = denominator_of.get(col)
        if total_name is not None:
            df[col] = df[col] / getattr(df, total_name)
    return df
        

In [70]:
# combine df_2000 data.
# The geography identifiers are read as text so they can be concatenated into a geoid.
# NOTE: the download loop above currently lists only 2014/2012/2009, so these
# 2000 files have to exist on disk from an earlier run.
geo_dtypes = {'state':str,'county':str,'tract':str,'block group':str}
df_2000_sf1 = pd.read_csv('census_data_2000_sf1.csv', dtype=geo_dtypes)
df_2000_sf3 = pd.read_csv('census_data_2000_sf3.csv', dtype=geo_dtypes)

In [71]:
# for 2000: give both summary-file tables a geoid key, then join them on it.
df_2000_sf1 = make_geoid_field(df_2000_sf1)
df_2000_sf3 = make_geoid_field(df_2000_sf3)

# merge on geoid
# The geography columns exist in both tables, so they are left out of the sf3
# side of the merge. They are dropped from a temporary copy (not in place):
# df_2000_sf3 keeps its columns and this cell can be re-run without
# make_geoid_field failing on a missing 'state' column.
geo_cols = ['state','county','tract','block group']
df_2000 = pd.merge(df_2000_sf1, df_2000_sf3.drop(geo_cols, axis=1), on='geoid')
df_2000.columns


Index(['owned', 'hispanic', 'occ_hu', 'rented', 'asian', 'black', 'tot_hu',
       'med_age', 'families', 'tot_pop_occ_units', 'white', 'tot_hhs',
       'avg_hh_size', 'tot_pop', 'state', 'county', 'tract', 'block group',
       'geoid', 'same_house_5', 'med_yr_moved_all', 'hu_attached', 'hu_5-9',
       'hu_3-4', 'med_inc', 'hu_2', 'mortgage', 'hu_10-19',
       'med_yr_moved_owner', 'foreign_born', 'no_mort', 'hu_detatched',
       'med_yr_built', 'hu_20-49', 'med_value', 'hu_50'],
      dtype='object')

In [72]:
# combine 2010 and 2012 acs
# NOTE: the download loop above currently lists only 2014/2012/2009, so
# census_data_2010_sf1.csv has to exist on disk from an earlier run.
geo_cols = ['state','county','tract','block group']
geo_dtypes = dict.fromkeys(geo_cols, str)   # read the identifiers as text
df_2012_acs = make_geoid_field(pd.read_csv('census_data_2012_2012_acs5.csv', dtype=geo_dtypes))
df_2010_sf1 = make_geoid_field(pd.read_csv('census_data_2010_sf1.csv', dtype=geo_dtypes))


# merge on geoid
# The geography columns are present on both sides; keep only the sf1 copies.
df_2012_acs = df_2012_acs.drop(geo_cols, axis=1)
df_2010 = pd.merge(df_2010_sf1, df_2012_acs, on='geoid')
df_2010.columns

Index(['hispanic', 'occ_hu', 'rented', 'asian', 'mortgage', 'black', 'tot_hu',
       'med_age', 'no_mort', 'families', 'tot_pop_occ_units', 'white',
       'tot_hhs', 'avg_hh_size', 'tot_pop', 'state', 'county', 'tract',
       'block group', 'geoid', 'med_value', 'hu_attached', 'same_house_1',
       'hu_20-49', 'hu_50', 'med_inc', 'med_yr_built', 'hu_2',
       'med_yr_moved_owner', 'hu_detatched', 'hu_3-4', 'med_yr_moved_all',
       'owned', 'hu_10-19', 'foreign_born', 'hu_5-9'],
      dtype='object')

In [73]:
# Load the two ACS extracts saved earlier; geography identifiers are read as text.
acs_geo_dtypes = {'state':str,'county':str,'tract':str,'block group':str}
df_2009_acs = pd.read_csv('census_data_2009_acs5.csv', dtype=acs_geo_dtypes)
df_2014_acs = pd.read_csv('census_data_2014_acs5.csv', dtype=acs_geo_dtypes)
#print(df_2009_acs.columns)
#print(df_2014_acs.columns)
#df_2014_acs['tot_pop'].head()


In [74]:
# turn variables into percentages
# calc_percentages overwrites columns of the frame it receives, so it is given
# copies: the input frames keep their raw counts and re-running this cell does
# not divide values that were already divided.
df_2000_new = calc_percentages(df_2000.copy())
df_2010_new = calc_percentages(df_2010.copy())
df_2009_new = calc_percentages(df_2009_acs.copy())
df_2014_new = calc_percentages(df_2014_acs.copy())

In [75]:
# Write the final per-year tables.
final_outputs = [
    ('census_data_2010.csv', df_2010_new),
    ('census_data_2000.csv', df_2000_new),
    # rename ACS data files so the year is the middle year, not the end year
    ('census_data_2007.csv', df_2009_new),
    ('census_data_2012.csv', df_2014_new),
]
for out_name, frame in final_outputs:
    frame.to_csv(out_name, index=False)

In [67]:
# Don't need this anymore
# rename ACS data files so the year is the middle year, not the end year
#
#from shutil import copyfile
#copyfile('census_data_2009_acs5.csv','census_data_2007.csv')
#copyfile('census_data_2014_acs5.csv','census_data_2012.csv')

'census_data_2012.csv'

### 