In [1]:
import os
import pandas as pd
import maup
import geopandas as gp

import sys; sys.path.append("..") # Adds parent directory to python modules path.
from topdown_parsers import *

In [2]:
# Root folder holding the reconstruction output. Keep the trailing slash:
# a later cell builds a county path from it by plain string concatenation.
reconstructions_dir = "./new_Reconstructed/"

In [3]:
def make_dir(filepath):
    """ Creates the directory `filepath` unless something already exists there.

    Only the last path component is created (`os.mkdir`, not `os.makedirs`).
    A `FileExistsError` is swallowed; any other error from `os.mkdir`
    propagates to the caller. Always returns None.
    """
    try:
        os.mkdir(filepath)
    except FileExistsError:
        # The path is already taken, so there is nothing left to create.
        return None
    return None

def subdirs(dirname):
    """ Returns the filepaths to all the sub-directories in `dirname`.

    The tree rooted at `dirname` is walked recursively with `os.walk`, so
    nested sub-directories are listed as well as immediate children. Each
    entry is the walk root joined with the directory name, i.e. it keeps
    `dirname` as its prefix. A `dirname` without sub-directories (or one that
    does not exist) gives an empty list.
    """
    # Walk the directory the caller passed in rather than a notebook-level
    # global, so the function honours its argument and works outside this file.
    return [
        os.path.join(root, d)
        for root, dirs, _files in os.walk(dirname)
        for d in dirs
    ]

def modify_race(race):
    """ Converts a one-letter race code into its integer code.

    "w" -> 1, "b" -> 2, "i" -> 3, "a" -> 4, "h" -> 5; any other value
    (including other capitalisations or None) -> 6.
    """
    letter_codes = (("w", 1), ("b", 2), ("i", 3), ("a", 4), ("h", 5))
    for letter, code in letter_codes:
        if race == letter:
            return code
    # anything unrecognised falls into the catch-all bucket
    return 6

def compute_pops_and_vaps(df, groupby_cols):
    """ Aggregates per-person rows into population and voting-age counts.

    `df` is expected to carry the columns "race", "sex", "age", "id" and
    "sol" (all dropped before aggregation), the ethnicity flag "ethn"
    (renamed to "HISP"), and every column named in `groupby_cols`.
    One indicator column is derived per row for each count, then everything
    that is left is summed within each `groupby_cols` group.

    Derived columns:
      * male / female -- sex == 0 / sex == 1
      * VAP           -- age >= 18
      * HVAP          -- HISP == 1 and VAP
      * NH_*POP       -- race code 1..6 (from `modify_race`) and HISP != 1
      * NH_*VAP       -- the same, restricted to VAP
      * TOTPOP        -- 1 per row
    "HISP" itself is kept and summed like the other columns.

    The caller's frame is not modified. Returns a new DataFrame with
    `groupby_cols` as ordinary columns (index reset).
    """
    # `rename` returns a new frame, so every assignment below lands on our own
    # object. Writing "race"/"VAP"/... into the caller's frame would make a
    # second call on the same input recode every race to 6.
    df = df.rename(columns={"ethn": "HISP"})

    # letter codes -> integer codes used by the masks below
    df["race"] = df["race"].map(modify_race)

    # sex
    df["male"] = df["sex"] == 0
    df["female"] = df["sex"] == 1

    # voting age
    df["VAP"] = df["age"] >= 18

    of_age = df["VAP"]
    non_hisp = df["HISP"] != 1

    # ethnicity vap
    df["HVAP"] = (df["HISP"] == 1) & of_age

    # (race code, population column, voting-age column)
    race_columns = (
        (1, "NH_WHITEPOP", "NH_WVAP"),
        (2, "NH_BLACKPOP", "NH_BVAP"),
        (3, "NH_AMINPOP", "NH_AMINVAP"),
        (4, "NH_ASIANPOP", "NH_ASIANVAP"),
        (5, "NH_HAWAIIANPOP", "NH_HAWAIIANVAP"),
        (6, "NH_OTHERPOP", "NH_OTHERVAP"),
    )

    # Two passes keep the output column order: all *POP columns first,
    # then all *VAP columns.
    for code, pop_col, _vap_col in race_columns:
        df[pop_col] = (df["race"] == code) & non_hisp
    for code, _pop_col, vap_col in race_columns:
        df[vap_col] = (df["race"] == code) & of_age & non_hisp

    df["TOTPOP"] = 1

    # per-person attributes make no sense once rows are summed
    df = df.drop(columns=["sex", "age", "id", "race", "sol"])
    return df.groupby(groupby_cols).sum().reset_index()

The following code generates counts at the county level for Texas and at the block level for Dallas County. These are the files I send to JN for ToyDown runs, and also what I use in `compare_new_reconstructions_w_census_numbers.ipynb` to measure the difference from census numbers.

In [None]:
# Whole of Texas, aggregated to one row per county.
# `read_and_process_reconstructed_csvs` is not defined in this notebook
# (presumably provided by the `topdown_parsers` star import -- TODO confirm).
texas_df = read_and_process_reconstructed_csvs(reconstructions_dir)
print("Computing pops and vaps")
# Grouping on (state, county) sums the per-row indicators within each county.
texas_df = compute_pops_and_vaps(texas_df, ["state", "county"])
texas_df

In [None]:
# Dallas County only, aggregated at the block level.
# The county folder name is appended directly, so `reconstructions_dir` must
# end with a path separator.
dallas_df = read_and_process_reconstructed_csvs(reconstructions_dir + "Texas_Dallas County")
# Groups on the single "GEOID" column; the trailing comment keeps the
# alternative multi-column key for reference.
dallas_df = compute_pops_and_vaps(dallas_df, ["GEOID"])#["state", "county", "tract", "bg", "block"])
dallas_df

In [None]:
# Persist both aggregates as CSV in the working directory, without the
# DataFrame index column.
texas_df.to_csv("new_texas_reconstructions.csv", index=False)
dallas_df.to_csv("new_dallas_reconstructions.csv", index=False)

In [4]:
# put all of the county in the same enumdist (except for Dallas)
def convert_recons_to_ipums(reconstructions_dir, stubs=False):
    """ Converts reconstruction directories into per-county `.dat` files.

    Output is written under "./stubs_ipums_recons/" when `stubs` is true and
    under "./detailed_ipums_recons/" otherwise; the folder is created if it is
    missing. Every directory returned by `subdirs(reconstructions_dir)` is
    visited in sorted order and its progress is printed, but only the one
    whose parsed county name is "Dallas" is handed to a converter
    (`convert_reconstructions_to_ipums_same_block` for stubs,
    `convert_reconstructions_to_ipums` otherwise, both with break_size=1000).

    NOTE(review): the notebook prose says the stubs directory will not contain
    Dallas County, which looks at odds with this Dallas-only filter -- confirm
    the intended behaviour before relying on a `stubs=True` run.
    """
    if stubs:
        save_dir = "./stubs_ipums_recons/"
    else:
        save_dir = "./detailed_ipums_recons/"
    make_dir(save_dir)

    county_dirs = subdirs(reconstructions_dir)

    for position, county_dir in enumerate(sorted(county_dirs), start=1):
        # Drop the last 7 characters (presumably the " County" suffix, as in
        # "Texas_Dallas County") and take the third "_"-separated piece.
        # The index depends on how many underscores the directory path has.
        county_name = county_dir[:-7].split("_")[2]

        print()
        print(county_name)
        print("County {} of {}".format(position, len(county_dirs)))

        if county_name == "Dallas":
            out_file = save_dir + county_name + ".dat"
            if stubs:
                convert_reconstructions_to_ipums_same_block(county_dir, out_file, break_size=1000)
            else:
                convert_reconstructions_to_ipums(county_dir, out_file, break_size=1000)

At this point, follow the directions in https://www.howtogeek.com/278599/how-to-combine-text-files-using-the-cat-command-in-linux/ to concatenate the county-wise .dat files into one file. (It's a simple `cat *.dat > output.dat`.)

Keep in mind that the stubs directory will not have Dallas County, so you will have to manually copy that from the detailed directory before concatenation.

In [5]:
# Detailed (non-stub) run: output goes to "./detailed_ipums_recons/".
convert_recons_to_ipums(reconstructions_dir, stubs=False)


Anderson
County 1 of 254

Andrews
County 2 of 254

Angelina
County 3 of 254

Aransas
County 4 of 254

Archer
County 5 of 254

Armstrong
County 6 of 254

Atascosa
County 7 of 254

Austin
County 8 of 254

Bailey
County 9 of 254

Bandera
County 10 of 254

Bastrop
County 11 of 254

Baylor
County 12 of 254

Bee
County 13 of 254

Bell
County 14 of 254

Bexar
County 15 of 254

Blanco
County 16 of 254

Borden
County 17 of 254

Bosque
County 18 of 254

Bowie
County 19 of 254

Brazoria
County 20 of 254

Brazos
County 21 of 254

Brewster
County 22 of 254

Briscoe
County 23 of 254

Brooks
County 24 of 254

Brown
County 25 of 254

Burleson
County 26 of 254

Burnet
County 27 of 254

Caldwell
County 28 of 254

Calhoun
County 29 of 254

Callahan
County 30 of 254

Cameron
County 31 of 254

Camp
County 32 of 254

Carson
County 33 of 254

Cass
County 34 of 254

Castro
County 35 of 254

Chambers
County 36 of 254

Cherokee
County 37 of 254

Childress
County 38 of 254

Clay
County 39 of 254

Cochran
County

In [None]:
# convert_recons_to_ipums(reconstructions_dir, stubs=True)

In [6]:
# Sanity check on the concatenated stubs file.
# NOTE(review): `num_lines_by_type` is not defined in this notebook
# (presumably it comes from the `topdown_parsers` star import); the recorded
# output is a pair of counts -- confirm what each element stands for.
num_lines_by_type("./stubs_ipums_recons/texas_stubs.dat")

(5039660, 25145559)

In [7]:
# Scratch check: character count of this all-nines literal (11).
# NOTE(review): the purpose is not stated here -- consider removing the cell.
len("99999999999")

11