In [1]:
import matplotlib.pyplot as plt # for plotting maps
import pandas as pd # standard python data library
import geopandas as gp # the geo-version of pandas
import numpy as np
import time
import requests
import os
from urllib.request import urlopen
import certifi
import ssl
from zipfile import ZipFile
from datetime import date
import shutil
#ef8d7d2d71226a4e4f86b6ee741c8d8f979d6c7b

In [2]:
# Remember the directory the notebook was started in. get_census_data uses this
# value as the default for its parent_dir argument, i.e. the root under which the
# output folders are created.
parent_dir = os.getcwd()
# Bare expression so the notebook displays the captured path.
parent_dir

'/Users/adrianarogers/Documents/datahub/partner_data_validation/pdv-oh/mggg-oh'

In [3]:
def get_and_unzip(url, data_dir=os.getcwd()):
    """Download a zip archive from ``url`` into ``data_dir`` and extract it there.

    The archive is stored under the last path component of ``url``. If a file
    with that name already exists in ``data_dir`` the function returns without
    downloading or extracting anything.

    Args:
        url: Address of the zip archive to fetch.
        data_dir: Directory that receives both the archive and its extracted
            contents. The default is evaluated once, when this function is
            defined, so it is the working directory at definition time; pass
            the directory explicitly if the working directory may have changed.

    Returns:
        None.
    """
    basename = url.split("/")[-1]  # file name the archive is saved under
    name_with_path = os.path.join(data_dir, basename)
    if os.path.exists(name_with_path):
        # Already downloaded: keep the original "do nothing" behaviour.
        return

    # Verify the server certificate against certifi's CA bundle.
    tls_context = ssl.create_default_context(cafile=certifi.where())
    # Context managers close the HTTP response, the output file and the archive
    # handle even if reading, writing or extracting raises.
    with urlopen(url, context=tls_context) as response:
        data_to_write = response.read()
    with open(name_with_path, "wb") as f:
        f.write(data_to_write)
    with ZipFile(name_with_path) as zip_obj:
        # Extraction leaves the downloaded .zip file itself in place.
        zip_obj.extractall(data_dir)

In [4]:
def counties(state_fips):
    """Look up every county of a state through the 2010 SF1 Census API.

    Args:
        state_fips: State FIPS code as a string (for example '39'). It is
            interpolated into the request URL and prefixed to each county code.

    Returns:
        pandas.DataFrame with one row per county, sorted by county code, and
        three columns:
            COUNTYFP10        - the API's "county" value for the row
            COUNTYNAMES       - the API's "NAME" value for the same row
            COUNTY_STATE_FIPS - state_fips concatenated with COUNTYFP10

    Raises:
        requests.HTTPError: if the API answers with an error status.
    """
    resp = requests.get(
        "https://api.census.gov/data/2010/dec/sf1"
        "?get=NAME&for=county:*&in=state:{}".format(state_fips)
    )
    # Fail with the HTTP error instead of a confusing JSON/unpacking error.
    resp.raise_for_status()

    # The payload is a list of rows whose first row is the header.
    header, *rows = resp.json()
    fips_idx = header.index("county")
    name_idx = header.index("NAME")

    # Keep each code together with the name from the same row. Building the
    # two columns from independent sets would shuffle them separately and
    # attach names to the wrong county codes.
    name_by_fips = {}
    for row in rows:
        name_by_fips.setdefault(row[fips_idx], row[name_idx])

    ordered_fips = sorted(name_by_fips)  # deterministic row order
    df = pd.DataFrame({
        'COUNTYFP10': ordered_fips,
        'COUNTYNAMES': [name_by_fips[code] for code in ordered_fips],
    })
    df['COUNTY_STATE_FIPS'] = state_fips + df['COUNTYFP10']
    return df

In [5]:
def remove_geog_files(name, zip_remove = 'remove_zip'):
    """Delete the geography files that share the base path ``name``.

    Every file ``name`` + suffix that exists is removed, for the shapefile
    component and metadata suffixes listed below. The matching ``name``.zip is
    then handled according to ``zip_remove``:
        'remove_zip' - the archive is deleted if it exists (default)
        'keep_zip'   - the archive is kept and its name is printed if it exists
    Any other value leaves the archive untouched and prints nothing.
    """
    component_suffixes = (
        '.cpg',
        '.dbf',
        '.prj',
        '.shp',
        '.shp.ea.iso.xml',
        '.shp.iso.xml',
        '.shx',
        '.shp.xml',
        '.xml',
    )
    for suffix in component_suffixes:
        candidate = name + suffix
        if os.path.exists(candidate):
            os.remove(candidate)

    archive = name + '.zip'
    if not os.path.exists(archive):
        return
    if zip_remove == 'remove_zip':
        os.remove(archive)
    elif zip_remove == 'keep_zip':
        print('Zipped file is: ', archive)

In [6]:
def assign_postalcode(fips):
    """Return the lowercase two-letter postal abbreviation for a state FIPS code.

    ``fips`` is compared against the two-character codes in the table below;
    the matching abbreviation is returned, or an empty string when no entry
    equals ``fips``.
    """
    # (FIPS code, postal abbreviation) for the fifty states.
    state_table = (
        ('01', 'al'), ('02', 'ak'), ('04', 'az'), ('05', 'ar'), ('06', 'ca'),
        ('08', 'co'), ('09', 'ct'), ('10', 'de'), ('12', 'fl'), ('13', 'ga'),
        ('15', 'hi'), ('16', 'id'), ('17', 'il'), ('18', 'in'), ('19', 'ia'),
        ('20', 'ks'), ('21', 'ky'), ('22', 'la'), ('23', 'me'), ('24', 'md'),
        ('25', 'ma'), ('26', 'mi'), ('27', 'mn'), ('28', 'ms'), ('29', 'mo'),
        ('30', 'mt'), ('31', 'ne'), ('32', 'nv'), ('33', 'nh'), ('34', 'nj'),
        ('35', 'nm'), ('36', 'ny'), ('37', 'nc'), ('38', 'nd'), ('39', 'oh'),
        ('40', 'ok'), ('41', 'or'), ('42', 'pa'), ('44', 'ri'), ('45', 'sc'),
        ('46', 'sd'), ('47', 'tn'), ('48', 'tx'), ('49', 'ut'), ('50', 'vt'),
        ('51', 'va'), ('53', 'wa'), ('54', 'wv'), ('55', 'wi'), ('56', 'wy'),
    )
    matches = (abbreviation for code, abbreviation in state_table if code == fips)
    return next(matches, '')

In [7]:
def zip_folder(name,dtype):
    """Pack the files belonging to ``name`` into ``name``.zip and delete the originals.

    Args:
        name: Base path of the files, without extension.
        dtype: 'csv' packs ``name``.csv and then deletes it.
            'shp' packs the .cpg, .dbf, .prj, .shp and .shx files and then
            removes them through remove_geog_files(name, 'keep_zip').
            Any other value still creates ``name``.zip, but leaves it empty.

    Raises:
        FileNotFoundError: if one of the files to pack does not exist.
    """
    # The with-block guarantees the archive is finalised and its handle closed
    # on every path, including an unrecognised dtype or a missing member file.
    with ZipFile(name+'.zip', 'w') as archive:
        if dtype == 'csv':
            archive.write(name+'.csv')
        elif dtype == 'shp':
            for suffix in ('.cpg', '.dbf', '.prj', '.shp', '.shx'):
                archive.write(name+suffix)

    # Delete the sources only after the archive has been closed successfully.
    if dtype == 'csv':
        os.remove(name+'.csv')
    elif dtype == 'shp':
        remove_geog_files(name, 'keep_zip')

In [8]:
'''*** This is the list of table numbers of strings that are to be pulled from the 2010 SF1 Decennial Census. This code can be modified to read in a dictionary if you want to do the renaming in the function as well.'''
# Default field codes requested from the 2010 SF1 endpoint. This list is the
# default value of the `variables` argument of get_census_data, which joins the
# entries with commas to form the "get" parameter of each API request.
# NOTE(review): the meaning of the individual P005xxx / P011xxx codes is not
# stated in this notebook - confirm against the Census SF1 variable list.
variables = [
    # pop (presumably the total-population fields - TODO confirm)
    "P001001",
    "P005003",
    "P005004",
    "P005005",
    "P005006",
    "P005007",
    "P005008",
    "P005009",
    "P005010",
    "P005011",
    "P005012",
    "P005013",
    "P005014",
    "P005015",
    "P005016",
    "P005017",
    # vap (presumably the voting-age-population fields - TODO confirm)
    "P011001",
    "P011002",
    "P011005",
    "P011006",
    "P011007",
    "P011008",
    "P011009",
    "P011010",
    "P011011",
]

In [9]:
def get_census_data(fip, geog, save, zipf, variables=variables,parent_dir=parent_dir, CENSUS_API_KEY = 'ef8d7d2d71226a4e4f86b6ee741c8d8f979d6c7b'):
    """Download 2010 SF1 census fields for one state and join them to 2010 geometry.

    For every county of the state the requested fields are fetched from the
    Census API at block or block-group level. The matching shapefile is then
    downloaded from www2.census.gov into the current working directory, the
    two are merged on GEOID, and the result is optionally written to
    ``<parent_dir>/<geog>_demographics_2010/`` as
    ``<state abbreviation>_2010_<geog>_demographics``.

    Args:
        fip: State FIPS code as a string, e.g. '39'.
        geog: 'b' for census blocks or 'bg' for block groups.
        save: 'shp' writes a shapefile, 'csv' writes a CSV without geometry,
            'csv shp' (or 'shp csv') writes both. Any other value writes no
            files.
        zipf: 'zip' packs the written files into ``<name>.zip`` and deletes the
            loose files. When both outputs are written, the CSV is added to
            the shapefile's archive so neither overwrites the other. Any other
            value leaves the files unzipped.
        variables: Field codes to request. The list is not modified; 'NAME' is
            always requested in addition because the COUNTY and STATE columns
            are derived from it.
        parent_dir: Directory under which the output folder is created. The
            working directory is set to this directory when the function
            finishes, including when saving raises.
        CENSUS_API_KEY: Key sent with every API request.

    Returns:
        The merged table. When no CSV is written this is the GeoDataFrame that
        includes the geometry column; when a CSV is written ('csv',
        'csv shp', 'shp csv') it is the plain DataFrame without geometry that
        was saved to the CSV.

    Raises:
        ValueError: if ``geog`` is not 'b' or 'bg', or no county returned data.
        requests.HTTPError: if a Census API request fails.
    """
    # NOTE(review): the default CENSUS_API_KEY is a credential committed to the
    # notebook. It is kept only so the signature stays unchanged; consider
    # rotating the key and passing it in from the environment instead.

    # Everything that differs between the two geography levels lives here.
    if geog == 'b':
        unit_predicate = "block:*"
        unit_column = "block"
        # Positions of the county and state parts in the comma-separated NAME.
        county_pos, state_pos = 3, 4
        shp_base = "tl_2010_" + fip + "_tabblock10"
        shp_url = "https://www2.census.gov/geo/tiger/TIGER2010/TABBLOCK/2010/" + shp_base + ".zip"
        unit_columns = ['BLOCKCE', 'ALAND', 'AWATER']
    elif geog == 'bg':
        unit_predicate = "block group:*"
        unit_column = "block group"
        county_pos, state_pos = 2, 3
        shp_base = "gz_2010_" + fip + "_150_00_500k"
        shp_url = "https://www2.census.gov/geo/tiger/GENZ2010/" + shp_base + ".zip"
        unit_columns = ['BLKGRPCE']
    else:
        raise ValueError("geog must be 'b' or 'bg', got {!r}".format(geog))

    file_suffix = 'demographics'
    folder_suffix = file_suffix + '_2010'
    # 2010 decennial census, summary file 1.
    base_url = "/".join(["https://api.census.gov/data", "2010", "dec/sf1"])

    # Work on copies so the caller's list (by default the shared module-level
    # list) is never appended to or stripped of 'NAME'.
    query_vars = list(variables)
    if 'NAME' not in query_vars:
        query_vars.append('NAME')
    data_vars = list(query_vars)
    data_vars.remove('NAME')

    # The shapefile is downloaded to, read from and cleaned up in this directory.
    download_dir = os.getcwd()

    # ---- collect the tabular data, one request per county -------------------
    county_table = counties(fip)
    print('starting to collect data for ' + geog + ' ' + fip)
    col_names = None
    data = []
    for county in county_table['COUNTY_STATE_FIPS']:
        predicates = {
            "get": ",".join(query_vars),
            "for": unit_predicate,
            # county holds state+county FIPS; drop the two-character state part.
            "in": "state:" + fip + "+county:" + county[2:],
            "key": CENSUS_API_KEY,
        }
        response = requests.get(base_url, params=predicates)
        response.raise_for_status()
        payload = response.json()  # first row is the header
        col_names = payload[0]
        data.extend(payload[1:])
    print('done collecting data for', fip)
    if col_names is None:
        raise ValueError('no county data was returned for state ' + fip)

    pop_data = pd.DataFrame(columns=col_names, data=data)
    identifier_cols = ["NAME", "state", "county", "tract", 'block group', "block"]
    for col in [c for c in pop_data.columns if c not in identifier_cols]:
        pop_data[col] = pd.to_numeric(pop_data[col])
    # Column-wise string concatenation instead of a per-row Python loop.
    pop_data["GEOID"] = (
        pop_data["state"] + pop_data["county"] + pop_data["tract"] + pop_data[unit_column]
    )
    # The pieces are taken verbatim from NAME.split(','), so any whitespace
    # following a comma is retained.
    pop_data['COUNTY'] = pop_data['NAME'].apply(lambda x: x.split(',')[county_pos])
    pop_data['STATE'] = pop_data['NAME'].apply(lambda x: x.split(',')[state_pos])

    # ---- fetch the geometry and merge ---------------------------------------
    get_and_unzip(shp_url, download_dir)
    shp_geog = gp.read_file(os.path.join(download_dir, shp_base + '.shp'))
    if geog == 'b':
        shp_geog = shp_geog.rename(columns={'GEOID10':'GEOID','STATEFP10':'STATEFP','COUNTYFP10':'COUNTYFP','TRACTCE10':'TRACTCE','BLOCKCE10':'BLOCKCE','ALAND10':'ALAND','AWATER10':'AWATER'})
    else:
        # Everything after the first nine characters of GEO_ID is the join key.
        shp_geog['GEOID'] = shp_geog['GEO_ID'].str[9:]
        # The shapefile's own NAME is renamed out of the way of the API's NAME.
        shp_geog = shp_geog.rename(columns={'STATE':'STATEFP','COUNTY':'COUNTYFP','TRACT':'TRACTCE','BLKGRP':'BLKGRPCE','NAME':'x'})
    print('starting merge of data and ' + geog +  ' for ' +  fip)
    # Merging from the GeoDataFrame side keeps the result a GeoDataFrame.
    geog_data = shp_geog.merge(pop_data, on="GEOID")

    # Keep only recognised columns (exact name match), print them in merge
    # order, then put them into the final order for this geography level.
    cols_to_keep = ['GEOID','NAME','STATE','COUNTY'] + data_vars + ['STATEFP','COUNTYFP','TRACTCE','BLKGRPCE','BLOCKCE','ALAND','AWATER','geometry']
    geog_data = geog_data[[c for c in geog_data.columns if c in cols_to_keep]]
    print(geog_data.columns)
    ordered_cols = ['GEOID','NAME','STATE','COUNTY'] + data_vars + ['STATEFP','COUNTYFP','TRACTCE'] + unit_columns + ['geometry']
    if geog == 'b':
        print('reindex')
    geog_data = geog_data[ordered_cols]

    # ---- save ----------------------------------------------------------------
    out_dir = os.path.join(parent_dir, geog + '_' + folder_suffix)
    os.makedirs(out_dir, exist_ok=True)
    name = assign_postalcode(fip) + '_2010_' + geog + '_' + file_suffix
    want_shp = save in ('shp', 'csv shp', 'shp csv')
    want_csv = save in ('csv', 'csv shp', 'shp csv')

    # Files are written and zipped from inside the output folder so that the
    # archive stores bare file names; the finally clause always returns to
    # parent_dir, even if writing fails.
    os.chdir(out_dir)
    try:
        if want_shp:
            geog_data.to_file(name+'.shp')
            print('done merging data and geography for', fip)
            if zipf == 'zip':
                # Must run while still inside out_dir, where the files are.
                zip_folder(name, 'shp')
        if want_csv:
            # The CSV carries every column except the trailing geometry.
            table = pd.DataFrame(geog_data[ordered_cols[:-1]])
            table.to_csv(name+'.csv', index=False)
            if zipf == 'zip':
                if want_shp:
                    # zip_folder would recreate <name>.zip and discard the
                    # shapefile just packed, so append the CSV instead.
                    with ZipFile(name+'.zip', 'a') as archive:
                        archive.write(name+'.csv')
                    os.remove(name+'.csv')
                else:
                    zip_folder(name, 'csv')
            geog_data = table
    finally:
        os.chdir(parent_dir)

    # Remove the downloaded shapefile parts and their archive.
    remove_geog_files(os.path.join(download_dir, shp_base), 'remove_zip')
    return geog_data

In [10]:
#All states are in this FIPS list to iterate over
fips = ['01','02','04','05','06','08','09','10','12','13','15','16','17','18','19','20','21','22','23',
                  '24','25','26','27','28','29','30','31','32','33','34','35','36','37','38','39','40','41','42','44','45','46',
                  '47','48','49','50','51','53','54','55','56']
# The full list above is replaced straight away so that only Ohio is processed;
# remove the next line to run every state.
fips = ['39'] #ohio
#These are the two possible values for geog: 'b' (block) and 'bg' (block group)
geog = ['b','bg']
# Replaced straight away so that only the block level is run.
geog = ['b']
#save selects the files get_census_data writes: 'shp' for a shapefile, 'csv' for a csv,
#or 'csv shp' for both. Any other value writes no files; the function still returns the data.
save = 'shp'

#Set zipf to 'zip' to zip the saved outputs; any other value leaves them unzipped.
#Zipping probably doesn't make sense if you are manipulating the data afterwards.
zipf = ''

In [11]:
def run_2010_census(zipf = zipf, save=save,fips=fips,geog=geog):
    """Call get_census_data once for every (geography level, state) combination.

    All arguments are optional; their defaults are the values the variables of
    the same names held when this function was defined. Every state in
    ``fips`` is processed for the first entry of ``geog`` before the next
    entry of ``geog`` is started. ``save`` and ``zipf`` are forwarded unchanged
    to get_census_data, and the returned tables are discarded.
    """
    # Level-major order: the outer loop is over geog, the inner one over fips.
    jobs = ((level, state) for level in geog for state in fips)
    for level, state in jobs:
        print('NOW STARTING ', state, ' ', level)
        get_census_data(state, level, save, zipf)

In [12]:
# Run the download with the defaults configured in the cells above
# (Ohio only, block level, shapefile output, no zipping).
run_2010_census()

NOW STARTING  39   b
starting to collect data for b 39
done collecting data for 39
starting merge of data and b for 39
Index(['STATEFP', 'COUNTYFP', 'TRACTCE', 'BLOCKCE', 'GEOID', 'ALAND', 'AWATER',
       'geometry', 'P001001', 'P005003', 'P005004', 'P005005', 'P005006',
       'P005007', 'P005008', 'P005009', 'P005010', 'P005011', 'P005012',
       'P005013', 'P005014', 'P005015', 'P005016', 'P005017', 'P011001',
       'P011002', 'P011005', 'P011006', 'P011007', 'P011008', 'P011009',
       'P011010', 'P011011', 'NAME', 'COUNTY', 'STATE'],
      dtype='object')
reindex
done merging data and geography for 39
