In [1]:
import os
import shutil
from functools import reduce

import geopandas as gpd
import numpy as np
import pandas as pd

import sys; sys.path.append("..") # Adds parent directory to python modules path.
from topdown_parsers import *

*** Race Key ***

    White: 1
    Black: 2
    American Indian or Alaska Native: 3
    Asian: 4
    Hawaiian: 5
    Other, Multiracial: 6

In [2]:
# Run-directory settings. Only `without_hh_dirname` is assigned; every other
# setting in this cell is commented out.
# with_hh_dirname = "./with_hhs/"
without_hh_dirname = "./without_hhs/"  # presumably the runs made without households ("hhs") — TODO confirm
# dallas_filename = "DALLAS.dat"
# precinct_assignments_fp = "block_prec_assign.csv"
# state_id = 48

In [None]:
def get_precinct_assignments(precinct_assignments_fp):
    """ Loads the precinct-assignment CSV at `precinct_assignments_fp`.

        The file must have exactly two columns; whatever its header row calls
        them, they are relabelled "GEOID10" and "Precinct", and the GEOID10
        values are cast to `str`. Returns the resulting Pandas DataFrame.
    """
    assignments = pd.read_csv(precinct_assignments_fp)
    assignments.columns = ["GEOID10", "Precinct"]
    return assignments.assign(GEOID10=assignments["GEOID10"].astype(str))
        
def clean_df(df):
    """ Some simple cleanups on `df` to ready it to make ER csvs.

        The work is done on a copy, so the frame passed in is left untouched.

        Args:
            df (DataFrame) : must contain "State", "County" and "Enumdist" columns.

        Returns:
            A new DataFrame in which "Enumdist" and "County" are strings
            left-padded with zeros to 11 and 3 characters, a "GEOID10" column
            has been added, "State" has been dropped and every NaN is 0.
    """
    df = df.copy()  # never write the helper columns back into the caller's frame
    df["Enumdist"] = df["Enumdist"].astype(str).str.pad(width=11, side='left', fillchar='0')
    df["County"] = df["County"].astype(str).str.pad(width=3, side='left', fillchar='0')
    df["GEOID10"] = df["State"].astype(str) + df["County"] + df["Enumdist"]
    # Keep the first 11 and the last 4 characters of State+County+Enumdist,
    # dropping whatever sits between them.
    df["GEOID10"] = df["GEOID10"].str[:11] + df["GEOID10"].str[-4:]

    return df.drop(columns=["State"]).fillna(0)

def rename_cols(df, string):
    """ Returns `df` with NaNs replaced by 0 and its run columns relabelled.
        A column named Run_x comes back as "(x-1)_`string`_noise": with `string`
        equal to `HVAP`, the column Run_1 is returned as "0_HVAP_noise".
        Columns whose name does not start with "Run" keep their name.
    """
    suffix = "_{}_noise".format(string)
    relabelled = {
        label: str(int(label[4:]) - 1) + suffix
        for label in df.columns
        if label[:3] == "Run"
    }
    return df.fillna(0).rename(columns=relabelled)

def build_er_df(dir_name, state_id, filename, precinct_assignments_fp, county_fips):
    """ Builds the precinct-level table that, saved as a CSV, can be fed into
        plot_elect_grid() to easily make ER plots.

        Fifteen series are collected with collect_by_enumdist(), cleaned with
        clean_df(), joined to the precinct assignments on "GEOID10", relabelled
        with rename_cols(), summed per precinct and merged side by side.

        Args:
            dir_name (str) : Filepath where `filename` exists.
            state_id (int) : FIPS code of state the runs were run on.
            filename (str) : Name of MDF file in `dir_name` to open.
            precinct_assignments_fp (str) : Filepath to where the precinct_assignments file is.
            county_fips (str) : County FIPS code. "355", "061" and "027" get
                their GEOID10 rebuilt before the precinct merge; any other
                value is merged on the GEOID10 produced by clean_df().

        Returns:
            DataFrame indexed by "Precinct" holding the per-precinct sums of the
            numeric columns of every series (run columns are named
            "{run}_{NAME}_noise" by rename_cols()).
    """
    # (column label, keyword arguments for collect_by_enumdist), in output order.
    # Race codes follow the race key at the top of the notebook.
    series_specs = [
        ("HISP", dict(hisp=True, vap=False)),
        ("NHWHITE", dict(race=1, vap=False)),
        ("NHBLACK", dict(race=2, vap=False)),
        ("NHAMIN", dict(race=3, vap=False)),
        ("NHASIAN", dict(race=4, vap=False)),
        ("NHHAWAIIAN", dict(race=5, vap=False)),
        ("NHOTHER", dict(race=6, vap=False)),
        ("HVAP", dict(hisp=True, vap=True)),
        ("WVAP", dict(race=1, vap=True)),
        ("BVAP", dict(race=2, vap=True)),
        ("AMINVAP", dict(race=3, vap=True)),
        ("ASIANVAP", dict(race=4, vap=True)),
        ("HAWAIIANVAP", dict(race=5, vap=True)),
        ("OTHERVAP", dict(race=6, vap=True)),
        ("VAP", dict(vap=True)),
    ]

    # Tract codes that are rewritten for county "355" after padding.
    nueces_tract_fixes = {
        "009800": "980000",
        "009900": "990000",
        "005000": "000500",
        "007000": "000700",
        "008000": "000800",
        "009000": "000900",
    }

    # How many leading tract characters to keep, and which tract codes to
    # rewrite, for the counties whose GEOID10 is rebuilt. None = merge as-is.
    if county_fips == "355":
        geoid_rebuild = (4, nueces_tract_fixes)
    elif county_fips in ("061", "027"):
        geoid_rebuild = (5, {})
    else:
        geoid_rebuild = None

    print("Collecting races...")
    dfs = [collect_by_enumdist(dir_name, state_id, filename, **kwargs)
           for _, kwargs in series_specs]

    print("Cleaning...")
    dfs = [clean_df(df) for df in dfs]

    print("Merging with precincts...")
    precinct_assignments = get_precinct_assignments(precinct_assignments_fp)

    def attach_precincts(df):
        """ Outer-merges `df` with the precinct assignments on "GEOID10",
            rebuilding GEOID10 first for the special-cased counties.
        """
        if geoid_rebuild is None:
            # NOTE(review): blocks without a precinct keep a NaN "Precinct" here;
            # rename_cols() later fills NaN with 0, so they end up grouped under
            # precinct 0 instead of being dropped — confirm this is intended.
            return df.merge(precinct_assignments, on="GEOID10", how="outer")

        tract_chars, tract_fixes = geoid_rebuild
        df["State"] = "48"  # hard-coded; `state_id` is only passed to collect_by_enumdist
        df["County"] = county_fips
        # Characters 5-10 of GEOID10, truncated to `tract_chars`, then
        # zero-padded on the left back to 6 characters.
        tract = df["GEOID10"].str[5:11].str[:tract_chars].str.pad(6, side='left', fillchar='0')
        df["tract"] = tract.map(lambda code: tract_fixes.get(code, code))
        df["block"] = df["GEOID10"].str[-4:]
        df["GEOID10"] = df["State"] + df["County"] + df["tract"] + df["block"]
        df = df.merge(precinct_assignments, on="GEOID10", how="outer")
        # Drop blocks that did not match any precinct.
        return df[df["Precinct"].notna()]

    dfs = [attach_precincts(df) for df in dfs]

    print("Renaming cols...")
    dfs = [rename_cols(df, name) for (name, _), df in zip(series_specs, dfs)]

    print("Grouping by precincts..")
    # numeric_only=True spells out the behaviour older pandas applied by default.
    # Since pandas 2.0 a bare sum() also concatenates the string columns
    # (GEOID10, County, ...), whose identical names then collide in the merge below.
    dfs = [df.groupby("Precinct").sum(numeric_only=True) for df in dfs]

    print("The big merge...")
    return reduce(
        lambda left, right: pd.merge(left, right, how='outer', left_index=True, right_index=True),
        dfs,
    )

def get_pops(dir_name, state_id, filename):
    """ Collects the populations for `filename` under `dir_name` with
        collect_by_enumdist() (no race / vap filters passed), runs the result
        through clean_df() and returns it as a Pandas DataFrame.
        `state_id` is the FIPS code of the state where the runs are run on.
    """
    return clean_df(collect_by_enumdist(dir_name, state_id, filename))

def save_populations_at_block_level(runs_dirname,
                                    zips_arr,
                                    county_filenames,
                                    state_id=48):
    """ Save the population at the blocks level for the runs in `zips_arr`.

        For every run the archive is extracted, each county file is read with
        get_pops() and written to "{run}_{county}_block_pops.csv" in the working
        directory. "Nueces.dat", "Bell.dat" and "Cameron.dat" get their GEOID10
        rebuilt before saving.

        Args:
            runs_dirname (str) : directory where the files in `zips_arr` are stored.
            zips_arr (list str): list of zip files that contain the runs
            county_filenames (list str) : names of the county files to read from each run.
            state_id     (int) : FIPS code of state the runs are from. Defaults to 48 for TX
    """
    # Tract codes that are rewritten for Nueces after padding.
    nueces_tract_fixes = {
        "009800": "980000",
        "009900": "990000",
        "005000": "000500",
        "007000": "000700",
        "008000": "000800",
        "009000": "000900",
    }

    # county file -> (county FIPS, leading tract characters kept, tract rewrites)
    geoid_rebuilds = {
        "Nueces.dat": ("355", 4, nueces_tract_fixes),
        "Bell.dat": ("027", 5, {}),
        "Cameron.dat": ("061", 5, {}),
    }

    def rebuild_geoid(df, county_fips, tract_chars, tract_fixes):
        """ Rewrites "GEOID10" in `df` as State + County + tract + block. """
        df["State"] = "48"  # hard-coded rather than derived from `state_id`
        df["County"] = county_fips
        # Characters 5-10 of GEOID10, truncated to `tract_chars`, then
        # zero-padded on the left back to 6 characters.
        tract = df["GEOID10"].str[5:11].str[:tract_chars].str.pad(6, side='left', fillchar='0')
        df["tract"] = tract.map(lambda code: tract_fixes.get(code, code))
        df["block"] = df["GEOID10"].str[-4:]
        df["GEOID10"] = df["State"] + df["County"] + df["tract"] + df["block"]
        return df

    for run in zips_arr:
        filename = run[:-8]  # drops the last 8 characters (".ini.zip" in the example run names)
        print(filename)
        extract_from_zip(runs_dirname + run, runs_dirname)
        for county in county_filenames:
            print(county)
            fp = runs_dirname + "five_counties/" + run[:-4]
            print(fp)
            tot_pops = get_pops(fp, state_id, county)
            if county in geoid_rebuilds:
                tot_pops = rebuild_geoid(tot_pops, *geoid_rebuilds[county])
            tot_pops.to_csv(filename + "_" + county + "_block_pops.csv")

        # Best-effort cleanup of the extracted files. The paths are fixed and do
        # not depend on `runs_dirname`; a missing directory is not an error.
        shutil.rmtree("./five_counties_cleaned/five_counties_cleaned", ignore_errors=True)
        shutil.rmtree("./five_counties_cleaned/five_counties", ignore_errors=True)
        
def build_csvs(runs_dirname, county_filenames, precinct_assignments_fps, state_id, county_fips,
               only_counties=("Cameron.dat",)):
    """ Builds one ER csv per (zip archive, county) found under `runs_dirname`.

        Every ".zip" file met while walking `runs_dirname` is extracted next to
        itself, build_er_df() is run for each selected county and the result is
        written to "{archive name minus its last 8 chars}_{county}.csv" in the
        working directory.

        Args:
            runs_dirname (str) : directory to walk for zip archives; also passed
                to build_er_df() as its `dir_name`.
            county_filenames (list str) : county file names, e.g. "Cameron.dat".
            precinct_assignments_fps (list str) : precinct-assignment csv for each
                entry of `county_filenames` (same length, same order).
            state_id (int) : FIPS code of the state the runs were run on.
            county_fips (list str) : county FIPS code for each entry of `county_filenames`.
            only_counties (iterable of str or None) : county file names to process.
                Defaults to Cameron only, which is what this function was
                hard-coded to do; pass None to process every county.
    """
    assert(len(county_filenames) == len(precinct_assignments_fps))

    for root, dirs, files in os.walk(runs_dirname):
        for file in files:
            if not file.endswith(".zip"):
                continue
            print(file)
            # os.path.join instead of `root + file`: os.walk yields sub-directory
            # roots without a trailing separator.
            texas_fp = os.path.join(root, file)
            extract_from_zip(texas_fp, root)
            for i, county_filename in enumerate(county_filenames):
                if only_counties is not None and county_filename not in only_counties:
                    continue
                print("Doing ", county_filename[:-4], " for ", file[:-8])
                df = build_er_df(runs_dirname,
                                 state_id,
                                 county_filename,
                                 precinct_assignments_fps[i],
                                 county_fips[i]
                                )
                save_filename = file[:-8] + "_" + county_filename[:-4]
                df.to_csv(save_filename + ".csv")
            print("Deleting {}".format(runs_dirname + runs_dirname[:-1]))
            # Best-effort cleanup. The removed paths are fixed, whatever the
            # message above reports; a missing directory is not an error.
            shutil.rmtree("./five_counties_cleaned/five_counties_cleaned", ignore_errors=True)
            shutil.rmtree("./five_counties_cleaned/five_counties", ignore_errors=True)
            print()
                print()

In [None]:
# build_csvs(with_hh_dirname, dallas_filename, precinct_assignments_fp, state_id)

In [None]:
# build_csvs(without_hh_dirname, dallas_filename, precinct_assignments_fp, state_id)

In [None]:
# runs_with_hhs = ['TEXAS_STUB_HH_mid_1.ini.zip',
#                  'TEXAS_STUB_HH_eq_1.ini.zip',
#                  'TEXAS_STUB_HH_top_1.ini.zip']

# save_populations_at_block_level("with_hhs/", runs_with_hhs, dallas_filename)

In [None]:
# # five counties pops
# runs = ["five_counties_top.ini.zip", 
#         "five_counties_mid.ini.zip", "five_counties_bottom.ini.zip"]
# county_filenames = ["Nueces.dat", "Bell.dat", "Cameron.dat"]

# save_populations_at_block_level("./five_counties_cleaned/", runs, county_filenames)

In [None]:
# # five counties er
# five_counties_dir = "./five_counties_cleaned/"
# county_fips = ["167", "355", "027", "039", "061"]
# county_filenames = ["Galveston.dat", "Nueces.dat", "Bell.dat", "Brazoria.dat", "Cameron.dat"]
# precinct_assignments_fps = ["galveston_assignments.csv", "nueces_assignments.csv", "bell_assignments.csv",
#                             "brazoria_assignments.csv", "cameron_assignments.csv"]
# state_id = 48

# build_csvs(five_counties_dir, county_filenames, precinct_assignments_fps, state_id, county_fips)

In [12]:
def clean_df(df):
    """ Some simple cleanups on `df` to ready it to make ER csvs.

        Redefines the `clean_df` declared in an earlier cell; after a full run
        this later definition is the one in effect. The work is done on a copy,
        so the frame passed in is left untouched.

        Args:
            df (DataFrame) : must contain "State", "County" and "Enumdist" columns.

        Returns:
            A new DataFrame in which "Enumdist" and "County" are strings
            left-padded with zeros to 11 and 3 characters, a "GEOID10" column
            has been added, "State" has been dropped and every NaN is 0.
    """
    cleaned = df.copy()  # never write the helper columns back into the caller's frame
    cleaned["Enumdist"] = cleaned["Enumdist"].astype(str).str.pad(width=11, side='left', fillchar='0')
    cleaned["County"] = cleaned["County"].astype(str).str.pad(width=3, side='left', fillchar='0')
    full_id = cleaned["State"].astype(str) + cleaned["County"] + cleaned["Enumdist"]
    # Keep the first 11 and the last 4 characters of State+County+Enumdist,
    # dropping whatever sits between them.
    cleaned["GEOID10"] = full_id.str[:11] + full_id.str[-4:]

    return cleaned.drop(columns=["State"]).fillna(0)

def get_pops(dir_name, state_id, filename):
    """ Returns, as a Pandas DataFrame, the populations that
        collect_by_enumdist() gathers for `filename` under `dir_name` (no race /
        vap filters passed) after they have been run through clean_df().
        `state_id` is the FIPS code of the state where the runs are run on.
    """
    raw_pops = collect_by_enumdist(dir_name, state_id, filename)
    return clean_df(raw_pops)

def save_populations_at_block_level(runs_dirname,
                                    zips_arr,
                                    county_filenames,
                                    state_id=48):
    """ Save the population at the blocks level for the runs in `zips_arr`.

        Each county file of each run is read with get_pops() from
        `runs_dirname` + "remaining_runs/" + the run name minus its last 4
        characters, and written (without the index) to
        "{run}_{county}_block_pops.csv" in the working directory. The archives
        are not extracted here, so the runs must already be unpacked.

        Args:
            runs_dirname (str) : directory where the files in `zips_arr` are stored.
            zips_arr (list str): list of zip files that contain the runs
            county_filenames (list str) : names of the county files to read from each run.
            state_id     (int) : FIPS code of state the runs are from. Defaults to 48 for TX
    """
    for run in zips_arr:
        filename = run[:-8]  # drops the last 8 characters (".ini.zip" in the example run names)
        print(filename)
        for county in county_filenames:
            print(county)
            fp = runs_dirname + "remaining_runs/" + run[:-4]
            print(fp)
            tot_pops = get_pops(fp, state_id, county)
            tot_pops.to_csv(filename + "_" + county + "_block_pops.csv", index=False)

        # Best-effort removal of the nested `runs_dirname` directory. shutil.rmtree
        # takes the path as-is, so spaces or shell metacharacters in it cannot
        # break (or be interpreted by) a shell command; a missing directory is
        # not an error.
        shutil.rmtree(runs_dirname + runs_dirname, ignore_errors=True)

In [13]:
# Only the "bottom" archive is processed; the full list of runs is left
# commented out below.
runs = ['TEXAS_STUB_bottom_1.ini.zip']
# runs = ['TEXAS_STUB_eq_1.ini.zip',
#         'TEXAS_STUB_bottom_1.ini.zip',
#         'TEXAS_STUB_mid_1.ini.zip',]
county_filenames = ["Dallas.dat"]

# Writes one "{run}_{county}_block_pops.csv" per (run, county) pair into the
# working directory.
save_populations_at_block_level("./without_hhs/", runs, county_filenames)

TEXAS_STUB_bottom_1
Dallas.dat
./without_hhs/remaining_runs/TEXAS_STUB_bottom_1.ini
