In [None]:
#initialize packages
#import packages
import pandas as pd
import numpy as np
import geopandas as gpd
import matplotlib
from matplotlib import pyplot as plt
import mapclassify
# To display plots, maps, charts etc in the notebook
%matplotlib inline

# For area weighted interpolation
from tobler import area_weighted
from tobler.area_weighted import area_interpolate, area_tables

#for bringing in census data
import requests

In [None]:
#bringing in the variables that we care about
# race, household income, tenure, population
# need to show how things have changed 1990, 2000, 2010, 2015, 2018

In [None]:
#functions created by Data For Housing UC Berkeley Team

def validate_variables(year, state):
    '''
    Check that the user-defined `year` and `state` values are acceptable before
    any tabular or geographic data is requested.
    
    Parameters:
    
        year (int): the year of data (the accepted list is fixed, so not future proof)
        state (str): 2-digit state FIPS code
        
    Returns:
    
        True if both values are valid, False otherwise. When a value is rejected,
        a message describing the first problem found is printed.
    
    '''
    accepted_years = (2010, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019)
    accepted_states = (
        '01 02 04 05 06 08 09 10 12 13 15 '
        '16 17 18 19 20 21 22 23 24 25 26 '
        '27 28 29 30 31 32 33 34 35 36 37 '
        '38 39 40 41 42 44 45 46 47 48 49 '
        '50 51 53 54 55 56 60 66 69 72 78'
    ).split()

    # The year is checked first; the state is only examined when the year is fine.
    if year not in accepted_years:
        problem = ("PROBLEM: `year` not valid. Please enter one of the following: "
                   "2010, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019")
    elif state not in accepted_states:
        problem = ("PROBLEM: `state_fips` not valid. Consult https://census.missouri.edu/geocodes/?state=00 "
                   "and enter the proper 2-digit state FIPS code (string)")
    else:
        return True

    print(problem)
    return False

def format_cenvars(cenvar_dict):
    '''
    Build the list of variable names to request: the 'NAME' field, then every
    estimate variable, then every margin-of-error (moe) variable.
    
    Parameters:
    
        cenvar_dict (dict): Dictionary of ACS variables (value) and formatted names (key)
        
    Returns:
    
        get_vars: a list of variables to fetch from the Census API
    
    '''
    codes = list(cenvar_dict.values())

    get_vars = ["NAME"]
    # "E" marks the estimate and "M" its margin of error; all estimates come
    # first, followed by all margins of error, each in dictionary order.
    for suffix in ("E", "M"):
        get_vars.extend(code + suffix for code in codes)

    return get_vars

def get_predicates(cenvar_dict, state_fips):
    '''
    Assemble the query predicates sent to the Census API.

    `get` lists every variable produced by `format_cenvars(cenvar_dict)`, joined by commas.
    `for` is fixed to `tract:*`, using the wildcard to request all tracts.
    `in` limits the request to the given state.
    
    Parameters:
    
        cenvar_dict (dict): a dictionary of ACS variables (value) and formatted names (key)
        state_fips (str): 2-digit state FIPS code
        
    Returns:
    
        predicates: a dictionary of predicates to fetch from the Census API
    
    '''
    requested_vars = format_cenvars(cenvar_dict)

    return {
        "get": ",".join(requested_vars),
        "for": "tract:*",
        "in": "state:" + state_fips,
    }

def convert_type_to_numeric(df):
    '''
    Function to convert the dtype of variable columns fetched from the Census API to numeric
    values (int, float). When we pull data from the ACS it tends to come in as an `object` data 
    type, which implies that it's a string. This makes it harder to do calculations. This function 
    is used to clean the dataset before saving to file.

    Each column that is not an identifier/label column is cast to int; if that fails it is
    cast to float; if that also fails the column is left untouched and a message is printed.
    The dataframe is modified in place and also returned.
    
    Parameters:
    
        df: a dataframe of ACS 5-year data fetched from the Census API
        
    Returns:
    
        df: the same dataframe object, with convertible columns cast to int or float
    
    '''
    # Label and identifier columns are never converted (e.g. a FIPS code such as
    # '06' would lose its leading zero if cast to int).
    exclude_cols = {'NAME', 'state_fips', 'county_fips', 'tract_fips', 'year', 'FIPS_11_digit'}

    # Keep the dataframe's own column order so the progress messages are deterministic.
    change_cols = [c for c in df.columns if c not in exclude_cols]

    # Failures that mean "this cast is not possible for this column".
    # TypeError covers missing values (None) that int() cannot handle.
    cast_errors = (ValueError, TypeError, OverflowError)

    for c in change_cols:
        print("Converting", c, "to int")
        try:
            df[c] = df[c].astype(int)
            continue
        except cast_errors:
            print("Unable to convert", c, "to int - Converting to float")

        # The float attempt gets its own try block: an exception raised inside an
        # `except` handler is not caught by sibling handlers, so a non-numeric
        # column would otherwise abort the whole conversion.
        try:
            df[c] = df[c].astype(float)
        except cast_errors:
            print("Unable to coerce to int or float")

    return df


def replace_null(df):
    '''
    Swap the placeholder values returned by the Census API for real null values (np.nan).
    The API uses several large negative codes to flag estimates that are missing for
    different reasons; the meaning of each code is described in the Estimate and
    Annotation Values section of
    https://www.census.gov/data/developers/data-sets/acs-5year/data-notes.html
    
    Parameters:
    
        df: a dataframe of ACS 5-year data fetched from the Census API
        
    Returns:
    
        df: the same dataframe, modified in place, with placeholder codes replaced by np.nan
    
    '''
    placeholder_codes = (
        -222222222,  # no or too few sample observations to compute a standard error / margin of error
        -333333333,  # median falls in the lowest or upper interval of an open-ended distribution
        -555555555,  # the estimate is controlled
        -666666666,  # no or too few sample observations to compute an estimate
        -888888888,  # the estimate is not applicable or not available
        -999999999,  # data cannot be displayed because the number of sample cases is too small
    )

    # One in-place pass per code, in the order listed above.
    for code in placeholder_codes:
        df.replace(code, np.nan, inplace=True)

    return df

def fetch_census_data(HOST, dataset, cenvars_list, year, state, out_file_name):
    '''
    The function below uses the requests module to get specified parameters (all tracts for a state and 
    variables defined by user) for the specified year (defined by user). It then organizes the pulled 
    data into a pandas dataframe, renames the columns to more readable labels, and creates a new column, 
    `FIPS_11_digit`, with an 11-digit FIPS code for merging data in analysis. Finally, it saves the data 
    to a csv file.
    
    Parameters:
    
        HOST (string): base url for the API
        dataset (str): dataset path appended to the url after the year
        cenvars_list (list): a list of all the `cenvars_*` dictionaries with variables to fetch
        year (int): the year of ACS-5 data
        state (str): 2-digit state FIPS code
        out_file_name (str): file name for the output csv
        
    Returns:
    
        df: a dataframe of ACS 5-year data fetched from the Census API, or None when
            `year` / `state` fail validation (a message is printed in that case)

    Raises:

        ValueError: if `cenvars_list` is empty
        requests.HTTPError: if the API answers with an error status
    
    '''
    # Start function
    print("...fetching ACS 5-year data")
    
    # Test validity of `year` and `state` variables
    if not validate_variables(year, state):
        return

    # Fail early with a clear message instead of an IndexError after the loop
    if not cenvars_list:
        raise ValueError("`cenvars_list` must contain at least one dictionary of census variables")

    # Geography columns appended after the requested variables, and the keys
    # shared by every per-dictionary result that the merge below joins on
    geo_cols = ['state_fips', 'county_fips', 'tract_fips']
    merge_keys = ['NAME'] + geo_cols

    # Seconds to wait on the API before giving up, so the notebook cannot hang forever
    request_timeout = 120

    # Build a base url to use for whatever dataset you are trying to pull
    base_url = "/".join([HOST, str(year), dataset])

    # One dataframe per dict of census vars
    dfs_single = []
    for d in cenvars_list:
        
        # Formatted column names, in the same order the variables are requested:
        # NAME, estimates, margins of error, then the geography columns
        col_names = ['NAME'] + \
            list(d.keys()) + \
            [k + "_moe" for k in d.keys()] + \
            geo_cols

        # Fetch the data from the Census API
        r = requests.get(base_url, params=get_predicates(d, state), timeout=request_timeout)

        # Surface HTTP errors here rather than as a confusing JSON decoding error
        r.raise_for_status()

        # The first row of the response is the API's own header; skip it and
        # label the data rows with the readable column names instead
        dfs_single.append(pd.DataFrame(columns=col_names, data=r.json()[1:]))

    # Merge the per-dictionary results into a single dataframe
    df = dfs_single[0]
    for other in dfs_single[1:]:
        df = pd.merge(df, other, on=merge_keys)

    # Label fetched data with year
    df["year"] = year

    # Add the 11 digit FIPS code
    df['FIPS_11_digit'] = df['state_fips'] + df['county_fips'] + df['tract_fips']
    
    # Clean the dataframe
    print("...cleaning ACS data")
    df = convert_type_to_numeric(df)
    df = replace_null(df)
    
    # Save ACS 5-year data to file as a csv file
    print(f"...saving ACS data to file: {out_file_name}")
    df.to_csv(out_file_name, index=False)
    print("saved")
    
    # Return full dataframe
    return df

In [None]:
# Parameters for building the census dataset (California only)

state_fips = '06'  # only selecting CA variables

# NOTE(review): this cell was headed "1990 dataset", but `year` was never assigned
# anywhere in the notebook (NameError below) and validate_variables() rejects any
# year before 2010. 2010 is used as a placeholder — confirm the intended year.
year = 2010

# variables of interest
cenvars_pop = {
    'c_race': 'B03002_001',   # Total race
    'c_white': 'B03002_003',  # Total White non-Latinx
    'c_black': 'B03002_004',  # Total Black and African American non-Latinx
    'c_asian': 'B03002_006',  # Total Asian non-Latinx
    'c_latinx': 'B03002_012'  # Total Latinx
}
cenvars_rent = {
    'med_rent': 'B25064_001',   # Median gross rent (dollars)
    'med_hhinc': 'B19013_001',  # Median household income (dollars)
    'c_tenants': 'B25003_001',  # Total TENURE
    'c_owners': 'B25003_002',   # Owner occupied TENURE
    'c_renters': 'B25003_003'   # Renter occupied TENURE
}  # this closing brace was missing, which made the whole cell a SyntaxError

# Every dictionary of variables to fetch
cenvars_all = [cenvars_rent, cenvars_pop]

# Define the output file path for the ACS 5-year data (CSV file)
acs_out_file = "../outdata/census_variables_" + state_fips + "_" + str(year) + ".csv"


In [None]:
# renaming variables

# Maps raw census variable codes to readable column names.
# The key 'P0010001' used to appear twice (for pop_total and hh_total). In a dict
# literal the later entry silently wins, so the pop_total mapping was lost and the
# population column would have been labelled hh_total.
# NOTE(review): 'P0030001' is presumed to be the household-count variable —
# confirm against the variable list of the dataset being fetched.
renamed = {
    'H0010001': 'housing_units',
    'H0030001': 'housing_units_oo',
    'H0030002': 'housing_units_ro',
    'P0010001': 'pop_total',
    'P0030001': 'hh_total',
}

In [None]:
#bring in JSON for tracts in California
# The path used to sit here as a bare statement, which is a SyntaxError. It is now
# a raw string so the backslashes are kept literally.
# NOTE(review): this is an absolute, machine-specific path; consider a path relative
# to the project so the notebook runs on other machines.
# TODO: load the file, e.g. with gpd.read_file(CA_TRACTS_JSON)
CA_TRACTS_JSON = r"C:\Users\amarq\Python_Projects\out_data\calitracts.json"