In [None]:
import pandas as pd # standard python data library
import geopandas as gp # the geo-version of pandas
import numpy as np
import time
import os

# CVAP Disaggregation

Takes an RDH-processed block-group CVAP file (with appropriately summed columns) and disaggregates it to the block level.

In [None]:
# Two-digit year suffix appended to every renamed column
y = "21"

# Lookup from PL column code to the year-suffixed column name it stands in for
update_dict = {
    # P004 codes -> CVAP_* names
    'P0040001': f'CVAP_TOT{y}',
    'P0040003': f'CVAP_NHS{y}',
    'P0040007': f'CVAP_AIA{y}',
    'P0040008': f'CVAP_ASN{y}',
    'P0040006': f'CVAP_BLK{y}',
    'P0040009': f'CVAP_NHP{y}',
    'P0040005': f'CVAP_WHT{y}',
    'P0040014': f'CVAP_AIW{y}',
    'P0040015': f'CVAP_ASW{y}',
    'P0040013': f'CVAP_BLW{y}',
    'P0040018': f'CVAP_AIB{y}',
    'P0040011': f'CVAP_2OM{y}',
    'P0040002': f'CVAP_HSP{y}',
    # P002 codes -> C_* names
    'P0020001': f'C_TOT{y}',
    'P0020003': f'C_NHS{y}',
    'P0020007': f'C_AIA{y}',
    'P0020008': f'C_ASN{y}',
    'P0020006': f'C_BLK{y}',
    'P0020009': f'C_NHP{y}',
    'P0020005': f'C_WHT{y}',
    'P0020014': f'C_AIW{y}',
    'P0020015': f'C_ASW{y}',
    'P0020013': f'C_BLW{y}',
    'P0020018': f'C_AIB{y}',
    'P0020011': f'C_2OM{y}',
    'P0020002': f'C_HSP{y}',
}

In [None]:
def define_multi_cols(df):
    '''
    Fold the two-race combination columns into the single-race columns.

    Applied to both the ``C_`` and ``CVAP_`` column families (names carry the
    global year suffix ``y``). ``df`` is modified in place and also returned:
      * AIA gains AIB and AIW
      * BLK gains BLW and AIB
      * ASN gains ASW
      * 2OM loses AIB, AIW, BLW and ASW
    The AIB/AIW/BLW/ASW columns themselves are left untouched.
    '''
    for family in ('C_', 'CVAP_'):
        # Columns that get rewritten ...
        aia, blk, asn, two_or_more = (
            family + race + y for race in ('AIA', 'BLK', 'ASN', '2OM')
        )
        # ... and the combination columns that feed them
        aib, aiw, blw, asw = (
            family + race + y for race in ('AIB', 'AIW', 'BLW', 'ASW')
        )

        df[aia] = df[aia] + df[aib] + df[aiw]
        df[blk] = df[blk] + df[blw] + df[aib]
        df[asn] = df[asn] + df[asw]
        df[two_or_more] = df[two_or_more] - df[aib] - df[aiw] - df[blw] - df[asw]

    return df

In [None]:
# List of total population columns.
# Names are built from the year suffix `y` so they stay in step with
# `update_dict` and `define_multi_cols` when the year is changed.
tot_cols = ['C_TOT' + y,
 'CVAP_TOT' + y]

# List of non-total population columns (same order as before: all C_* columns,
# then the matching CVAP_* columns)
non_tot_cols = [
 'C_HSP' + y,
 'C_NHS' + y,
 'C_WHT' + y,
 'C_BLK' + y,
 'C_AIA' + y,
 'C_ASN' + y,
 'C_NHP' + y,
 'C_2OM' + y,
 'C_BLW' + y,
 'C_AIW' + y,
 'C_ASW' + y,
 'C_AIB' + y,
 'CVAP_HSP' + y,
 'CVAP_NHS' + y,
 'CVAP_WHT' + y,
 'CVAP_BLK' + y,
 'CVAP_AIA' + y,
 'CVAP_ASN' + y,
 'CVAP_NHP' + y,
 'CVAP_2OM' + y,
 'CVAP_BLW' + y,
 'CVAP_AIW' + y,
 'CVAP_ASW' + y,
 'CVAP_AIB' + y]

In [None]:
def get_state_bg_cvap(state):
    '''
    Placeholder: add code here to retrieve the block-group CVAP data.

    `run_disagg_changed` calls this with a lower-cased state abbreviation and
    reads from the result a "GEOID20" column (zero-padded there to 12
    characters) plus every column named in `tot_cols` and `non_tot_cols`.

    Raises
    ------
    ValueError
        Always, until the retrieval code is filled in.
    '''
    raise ValueError('Add In Code Here')
    
def get_state_block_pl(state):
    '''
    Placeholder: add code here to retrieve the block-level PL data.

    `run_disagg_changed` calls this with a lower-cased state abbreviation and
    reads from the result a "GEOID20" column (zero-padded there to 15
    characters), the columns keyed in `update_dict` (renamed on arrival), and
    the raw "P0010001" and "P0030001" columns.

    Raises
    ------
    ValueError
        Always, until the retrieval code is filled in.
    '''
    raise ValueError('Add In Code Here')

In [None]:
def run_disagg_changed(state):
    '''Runs a disaggregation to the block-level from CVAP data.

    Each block-group CVAP value is split across the blocks of that block
    group, weighted by the block's share of the matching PL column.

    Weights, in priority order, for the total columns (`tot_cols`):
      1. CVAP value is 0                      -> 0
      2. PL column sums to non-zero in the BG -> block / block-group share
      3. raw PL total (P0010001 / P0030001) sums to 0 in the BG
                                              -> equal split over the BG's blocks
      4. otherwise                            -> share of the raw PL total

    For the remaining columns (`non_tot_cols`):
      1. CVAP value is 0                      -> 0
      2. PL column sums to non-zero in the BG -> block / block-group share
      3. otherwise                            -> the block's share of the already
                                                 disaggregated total (CVAP_TOT for
                                                 "VAP" columns, C_TOT for the rest)

    Parameters
    ----------
    state : str
        State abbreviation; lower-cased before being passed to the getters
        and used in the output path.

    Returns
    -------
    None
        Writes ``./20<y>_cvap_disagg/<state>/<state>_20<y>_cvap_block.csv``
        with a ``GEOID20_block`` column followed by one ``<col>_DISAGG``
        column per entry of `tot_cols` and `non_tot_cols`.

    Notes
    -----
    NOTE(review): block groups without a CVAP match stay NaN after the left
    merge and are exported as NaN -- confirm this is intended.
    NOTE(review): GEOID20 is exported exactly as returned by
    `get_state_block_pl` (no zero-padding) -- confirm against consumers.
    '''

    # Set the state abbreviation to lower case
    state = state.lower()

    # Column names derived from the configured year suffix
    c_tot = "C_TOT" + y
    cvap_tot = "CVAP_TOT" + y

    # Mapping for total columns: the raw PL total used as the second-choice weight
    col_mapping = {c_tot: "P0010001", cvap_tot: "P0030001"}

    value_cols = list(tot_cols) + list(non_tot_cols)
    weight_cols = value_cols + list(col_mapping.values()) + ["COUNT"]

    # Get the block-group CVAP data and block-level PL data
    state_bg_cvap = get_state_bg_cvap(state)
    state_block_pl = get_state_block_pl(state)

    # PL-Data: Rename the columns to their proxies and fold the multi-race columns
    state_block_pl = define_multi_cols(state_block_pl.rename(columns=update_dict))

    # PL-Data: Create a GEOID for block groups (first 12 characters of the 15-character block ID)
    state_block_pl["BLKGRP"] = state_block_pl["GEOID20"].astype(str).str.zfill(15).str[0:12]

    # PL-Data: Create a count variable for number of blocks in block group
    state_block_pl["COUNT"] = 1

    # PL-Data: Aggregate to block groups. Only the weight columns are summed so that
    # ID / text / geometry columns can neither break nor silently alter the aggregation.
    state_bg_pl = state_block_pl.groupby("BLKGRP", as_index=False)[weight_cols].sum()

    # CVAP-Data: Clean the GEOID20, call it BLKGRP, keep only the value columns
    state_bg_cvap = state_bg_cvap.assign(
        BLKGRP=state_bg_cvap["GEOID20"].astype(str).str.zfill(12)
    )[["BLKGRP"] + value_cols]

    # Merge: block rows + their block-group PL sums ("_block"/"_bg") + CVAP values (no suffix)
    merged = (
        state_block_pl[["GEOID20", "BLKGRP"] + weight_cols]
        .merge(state_bg_pl, on="BLKGRP", how="left", suffixes=("_block", "_bg"))
        .merge(state_bg_cvap, on="BLKGRP", how="left")
    )

    # Output frame; the ID column keeps its historical exported name
    export = pd.DataFrame({"GEOID20_block": merged["GEOID20"]})

    # Iterate over the total columns first
    for val in tot_cols:
        raw_total = col_mapping[val]
        cvap = merged[val]
        export[val + "_DISAGG"] = np.select(
            [
                (cvap == 0).to_numpy(),
                (merged[val + "_bg"] != 0).to_numpy(),
                (merged[raw_total + "_bg"] == 0).to_numpy(),
            ],
            [
                0.0,
                (merged[val + "_block"] / merged[val + "_bg"]) * cvap,
                (1 / merged["COUNT_bg"]) * cvap,
            ],
            default=(merged[raw_total + "_block"] / merged[raw_total + "_bg"]) * cvap,
        )

    # Iterate over the remaining columns after (they rely on the disaggregated totals)
    for val in non_tot_cols:
        total = cvap_tot if "VAP" in val else c_tot
        cvap = merged[val]
        export[val + "_DISAGG"] = np.select(
            [
                (cvap == 0).to_numpy(),
                (merged[val + "_bg"] != 0).to_numpy(),
            ],
            [
                0.0,
                (merged[val + "_block"] / merged[val + "_bg"]) * cvap,
            ],
            default=(export[total + "_DISAGG"] / merged[total]) * cvap,
        )

    # Export the files to a particular path; makedirs also creates the missing
    # parent folder and tolerates an existing one
    out_dir = os.path.join(".", "20" + y + "_cvap_disagg", state)
    os.makedirs(out_dir, exist_ok=True)

    # Export to CSV
    export.to_csv(os.path.join(out_dir, state + "_20" + y + "_cvap_block.csv"), index = False)

In [None]:
# Run the disaggregation for every state abbreviation, timing each run and
# appending the timing to log.txt
for abbrev in [
    'al', 'ak', 'az', 'ar', 'ca', 'co', 'ct', 'de', 'fl', 'ga',
    'hi', 'id', 'il', 'in', 'ia', 'ks', 'ky', 'la', 'me', 'md',
    'ma', 'mi', 'mn', 'ms', 'mo', 'mt', 'ne', 'nv', 'nh', 'nj',
    'nm', 'ny', 'nc', 'nd', 'oh', 'ok', 'or', 'pa', 'ri', 'sc',
    'sd', 'tn', 'tx', 'ut', 'vt', 'va', 'wa', 'wv', 'wi', 'wy',
]:
    # Start timer
    start_process_time = time.time()

    # Run disagg
    run_disagg_changed(abbrev)

    # End timer
    end_process_time = time.time()

    # Build the message once, then echo it and append it to the log
    message = abbrev + " took " + str(round(end_process_time - start_process_time,3)) + " seconds "
    print(message)

    # The with-block closes the file on exit, so no explicit close() is needed
    with open('log.txt', 'a') as t:
        t.write(message + "\n")