In [1]:
import pandas as pd
import geopandas as gp

## Updating CD Pop and Adjusted Districts

In [2]:
# Read the new national BAF boundary csv from the raw-from-source folder
baf_boundary_csv = "./raw-from-source/national_baf_boundary/national_baf_boundary.csv"
updated_baf_boundary = pd.read_csv(filepath_or_buffer = baf_boundary_csv)

  updated_baf_boundary = pd.read_csv("./raw-from-source/national_baf_boundary/national_baf_boundary.csv")


In [3]:
# Create a CD ID of the form "<STATE>-<CONG>"
updated_baf_boundary["CONG-ID"] = (
    updated_baf_boundary["STATE"] + "-" + updated_baf_boundary["CONG"].astype(str)
)

# Total the population for each CD ID. Only P0010001 is selected for the sum:
# a bare .sum() over every column depends on pandas silently discarding the
# text columns, which pandas 2.0+ no longer does. as_index = False keeps
# CONG-ID as a regular column, so no reset_index / drop step is needed.
cong_totals = updated_baf_boundary.groupby("CONG-ID", as_index = False)["P0010001"].sum()

# Cast pop data to integer and remove the "NO VALUE" districts.
# .copy() makes the filtered result an independent frame, so columns can be
# assigned to it later without a SettingWithCopyWarning.
cong_totals["P0010001"] = cong_totals["P0010001"].astype(int)
cong_totals = cong_totals[~cong_totals["CONG-ID"].str.contains("NO")].copy()

In [4]:
# Read the previous congressional population file; it is only used to check the new totals.
# NOTE(review): this is an absolute path into one user's home directory, so the
# cell will only run on that machine as written.
old_cong_csv = "/Users/peterhorton/Documents/RDH/Support/Processing-Requests/Updated_District_Pops_10_18_2022/raw-from-source/cd_pop_2022_csv/cd_pop_2022_csv.csv"
old_cong_file = pd.read_csv(filepath_or_buffer = old_cong_csv)

In [5]:
# District labels in the older file that have to be rewritten before the join will work
cong_dict = {"2801": "1", "2802": "2", "2803": "3", "2804": "4"}

# Rewrite the labels listed above; every other label keeps its existing value
remapped_district = old_cong_file["DISTRICT"].map(cong_dict)
old_cong_file["DISTRICT"] = remapped_district.fillna(old_cong_file["DISTRICT"])

# Build the same "<STATE>-<DISTRICT>" key used by cong_totals (district part upper-cased)
district_label = old_cong_file["DISTRICT"].astype(str).str.upper()
old_cong_file["CONG-ID"] = old_cong_file["STATE"] + "-" + district_label

# Outer join with an indicator column so rows missing from either side are visible
combined = cong_totals.merge(old_cong_file, how = "outer", on = "CONG-ID", indicator = True)

# Any row printed here failed to find a partner in the other file
unmatched_rows = combined[combined["_merge"] != "both"]
print(unmatched_rows)

Empty DataFrame
Columns: [CONG-ID, P0010001_x, STATE, DISTRICT, CD_ID, P0010001_y, _merge]
Index: []


In [6]:
# New total minus old total per district, then a tally of the distinct differences
combined["Pop_Diff"] = combined["P0010001_x"].sub(combined["P0010001_y"])
combined["Pop_Diff"].value_counts()

0    435
Name: Pop_Diff, dtype: int64

In [7]:
# Split each CONG-ID on its first hyphen only: the state abbreviation comes
# before it and the full district label (including any further hyphens) after it.
id_parts = cong_totals["CONG-ID"].str.split("-", n = 1)

# Build a new frame with the two derived columns and keep only the export columns.
# .assign returns a new frame rather than writing into a filtered slice.
cong_totals = cong_totals.assign(
    STATE = id_parts.str[0],
    DISTRICT = id_parts.str[1],
)[["STATE", "DISTRICT", "CONG-ID", "P0010001"]]

# Final population check. Printed explicitly: a bare expression in the middle
# of a cell is evaluated and then discarded, so it would show nothing.
print("Total population across all districts:", cong_totals["P0010001"].sum())

# Export to csv
cong_totals.to_csv("./cong_totals.csv", index = False)

## Adjusted District Populations

In [8]:
# Read the national BAF again, this time forcing the listed columns to be read as text
baf_text_columns = {"GEOID20":str, "STATEAB":str, "CONG":str, "SLDU":str, "SLDL":str, "FLOTERIAL":str}
national_baf = pd.read_csv(
    "./raw-from-source/national_baf_boundary/national_baf_boundary.csv",
    dtype = baf_text_columns,
)

# Add one "<STATE>-<district>" ID column per district type (congressional, lower house, upper house)
for district_column in ("CONG", "SLDL", "SLDU"):
    unique_id_column = "UNQ_" + district_column + "_DIST_ID"
    national_baf[unique_id_column] = national_baf["STATE"] + "-" + national_baf[district_column].astype(str)

In [9]:
# Convert GEOID20 to text and left-pad it with zeros to 16 characters
geoid_as_text = national_baf["GEOID20"].astype(str)
national_baf["GEOID20"] = geoid_as_text.str.zfill(16)

In [10]:
# State abbreviations whose "<state>_blocks.csv" adjusted-count files are read
adjusted_data_state_subset = [
    "CA", "CO", "CT", "DE", "HI", "MD", "MT",
    "NJ", "NV", "NY", "PA", "VA", "WA",
]

def mod_census(block_id):
    """Return a block GEOID as a string with any PA split-block suffix removed.

    PA appends a letter ("A", "B" or "C") to the GEOID of a split block.
    Dropping that trailing letter gives the unsplit GEOID, so the pieces of a
    split block can be grouped back together.

    Args:
        block_id: The GEOID, as a string or anything convertible with str().

    Returns:
        str: The GEOID without its trailing "A"/"B"/"C" if it has one,
        otherwise the GEOID unchanged.
    """
    block_id = str(block_id)

    # Test the suffix specifically: only a trailing letter marks a split block,
    # and only in that case is it correct to drop the last character.
    if block_id.endswith(("A", "B", "C")):
        return block_id[:-1]

    # If it's not one of these special blocks, just return the GEOID
    return block_id

In [11]:
# Create a list to store the state data
adjusted_data_list = []

# Iterate over the states
for state in adjusted_data_state_subset:

    # Load the state's file and keep only the block ID and adjusted population.
    # GEOID20 is read as text so the column has one consistent dtype (PA's IDs
    # can carry a letter suffix); .copy() gives an independent frame, so the PA
    # branch can add a column without a SettingWithCopyWarning.
    adj_state = pd.read_csv(
        "./raw-from-source/Adjusted_Counts/" + state + "_blocks.csv",
        dtype = {"GEOID20": str},
    )[["GEOID20", "Adj_Pop"]].copy()

    # Deal with PA split blocks
    if state == "PA":

        # Use mod_census to get the "unsplit" GEOID of every block
        adj_state["mod_GEOID20"] = adj_state["GEOID20"].map(mod_census)

        # Because the split blocks are in the same districts, we can add them
        # together to match PL geographies. Only Adj_Pop is summed: a bare
        # .sum() would also try to aggregate the text GEOID20 column on
        # pandas 2.0+, which no longer discards non-numeric columns.
        adj_state = (
            adj_state.groupby("mod_GEOID20", as_index = False)["Adj_Pop"].sum()
            .rename(columns = {"mod_GEOID20": "GEOID20"})
        )[["GEOID20", "Adj_Pop"]]

    # Every state, PA included, contributes a frame with GEOID20 and Adj_Pop
    adjusted_data_list.append(adj_state)

  adj_state = pd.read_csv("./raw-from-source/Adjusted_Counts/"+state+"_blocks.csv")


In [12]:
# Stack the per-state frames into one frame, casting population to int and the ID to text
adj_state_data_df = pd.concat(adjusted_data_list).astype({"Adj_Pop": int, "GEOID20": str})

# Left-pad the ID with zeros to 16 characters
adj_state_data_df["GEOID20"] = adj_state_data_df["GEOID20"].str.zfill(16)

In [None]:
# Outer join of the BAF and the adjusted counts on GEOID20; "_merge" records which side each row came from
adjusted_counts = national_baf.merge(
    adj_state_data_df,
    on = "GEOID20",
    how = "outer",
    indicator = True,
)

# Tally the rows by join outcome
adjusted_counts["_merge"].value_counts()

In [None]:
# Add up the adjusted population of rows found only in the adjusted-count data;
# the check passes when the total is zero
only_in_adjusted = adjusted_counts["_merge"] == "right_only"
sum(adjusted_counts.loc[only_in_adjusted, "Adj_Pop"])

In [None]:
# Filter down to joined blocks or blocks in RI (need that to get the district data for the state)
keep_row = (adjusted_counts["_merge"] == "both") | (adjusted_counts["STATE"] == "RI")

# .copy() makes the filtered rows an independent frame; assigning a column to a
# bare boolean-filtered slice triggers SettingWithCopyWarning.
joined = adjusted_counts[keep_row].copy()

# Clean the columns: rows without an adjusted count get 0, then cast to integer
joined["Adj_Pop"] = joined["Adj_Pop"].fillna(0).astype(int)

In [None]:
# Number of distinct STATE values among the kept rows (a missing value counts as one)
joined["STATE"].nunique(dropna = False)

In [None]:
# The distinct STATE values themselves, in order of first appearance
pd.unique(joined["STATE"])

In [None]:
def _sum_adj_pop_by_district(blocks, id_column, level):
    """Total Adj_Pop per district and label the result with its level.

    Args:
        blocks: Frame with an "Adj_Pop" column and the column named by id_column.
        id_column: Name of the column holding the district ID to group on.
        level: Value written to the "Level" column of every returned row.

    Returns:
        A frame with the columns "ID", "Adj_Pop" and "Level", one row per district ID.
    """
    # Sum only Adj_Pop. A bare .sum() over every column depends on pandas
    # discarding the text and categorical columns, which pandas 2.0+ no longer does.
    totals = blocks.groupby(id_column, as_index = False)["Adj_Pop"].sum()

    # Rename by name rather than by position
    totals = totals.rename(columns = {id_column: "ID"})
    totals["Level"] = level
    return totals


# Create a subset of states that use adjusted data for congressional redistricting
uses_cong = joined[joined["STATE"].isin(["CA", "MD", "NJ", "NV", "RI", "VA", "WA"])]

# Aggregate to the appropriate district levels
joined_cong = _sum_adj_pop_by_district(uses_cong, "UNQ_CONG_DIST_ID", "CONG")
joined_sldl = _sum_adj_pop_by_district(joined, "UNQ_SLDL_DIST_ID", "SLDL")
joined_sldu = _sum_adj_pop_by_district(joined, "UNQ_SLDU_DIST_ID", "SLDU")

# Join them back into one file; ignore_index gives every row a unique label
combined_files = pd.concat([joined_cong, joined_sldl, joined_sldu], ignore_index = True)

# Get the state abbreviation (the first two characters of the ID)
combined_files["State"] = combined_files["ID"].str[:2]

In [None]:
# NOTE(review): both statements in this cell are commented out, so running it
# changes nothing; the RI IDs are left exactly as they are.
# Add in a leading zero for the RI data so it will join
#ri_update_dict = {"RI-1":"RI-01","RI-2":"RI-02"}

# Apply this update
#combined_files["ID"] = combined_files["ID"].map(ri_update_dict).fillna(combined_files["ID"])

In [None]:
# Prefix each district ID with its level ("<Level>-<ID>") so it can be matched against the RI data
level_prefix = combined_files["Level"] + "-"
combined_files["unique_id"] = level_prefix + combined_files["ID"]

In [None]:
# Load in the RI data. The dtypes are fixed on read, so no later cast of
# Adj_Pop to integer is needed.
ri_data = pd.read_csv("./raw-from-source/ri_sizes.csv", dtype = {"Number":str, "Adj_Pop":int, "Level":str})

# Create a unique ID ("<Level>-RI-<Number>") to join with the pop. file
ri_data["unique_id"] = ri_data["Level"] + "-RI-" + ri_data["Number"].astype(str)

# Create a dictionary mapping from district ID to population in RI
ri_data_dict = dict(zip(ri_data["unique_id"], ri_data["Adj_Pop"]))

# Apply the above dictionary to the RI districts in the combined file;
# districts with no entry in the dictionary keep their existing Adj_Pop
combined_files["Adj_Pop"] = combined_files["unique_id"].map(ri_data_dict).fillna(combined_files["Adj_Pop"])

# Inspect the RI congressional rows. This expression used to sit in a cell
# above the load, where ri_data did not exist yet on a fresh kernel.
ri_data[ri_data["Level"]=="CONG"]

In [None]:
# Combine the state abbreviation and the level into one "<State>-<Level>" label
state_prefix = combined_files["State"] + "-"
combined_files["state-level"] = state_prefix + combined_files["Level"]


In [None]:
# Store the adjusted population as integers
combined_files = combined_files.astype({"Adj_Pop": int})


In [None]:
# Show the districts whose adjusted population is zero
combined_files.loc[combined_files["Adj_Pop"].eq(0)]

In [None]:
# Drop the "No Data" districts: any row whose ID contains "NO"
# (these were unassigned blocks we kept in the BAF)
is_no_data_district = combined_files["ID"].str.contains("NO")
combined_files = combined_files.loc[~is_no_data_district]

In [None]:


# Drop the helper columns. Returning a new frame (rather than inplace = True)
# avoids modifying what may be a filtered slice of an earlier frame.
combined_files = combined_files.drop(columns = ["unique_id", "state-level"])

# Strip the "<STATE>-" prefix from the ID. Splitting on the first hyphen only
# keeps the whole district label even if the label itself contains a hyphen,
# matching how the congressional DISTRICT column is derived.
combined_files["ID"] = combined_files["ID"].str.split("-", n = 1).str[1]

# Keep an independent copy with the current column names for the comparison against the old file
combined_files_join = combined_files.copy(deep = True)

In [None]:
# Final Cleaning
# Rename by column name instead of assigning a positional list of labels, so a
# change in column order upstream cannot silently mislabel the data.
combined_files = combined_files.rename(
    columns = {"ID": "DISTRICT", "Adj_Pop": "ADJ_POP", "Level": "LEVEL", "State": "STATE"}
)

# Put the columns in their export order
combined_files = combined_files[["STATE", "DISTRICT", "LEVEL", "ADJ_POP"]]

In [None]:
# Order the rows by state, then level, then district
sort_columns = ["STATE", "LEVEL", "DISTRICT"]
combined_files = combined_files.sort_values(by = sort_columns)

In [None]:

# Write the adjusted district populations to csv without the index column
combined_files.to_csv(path_or_buf = "./adjusted_districts_pop.csv", index = False)

In [None]:
## Check against the old adjusted district populations file

In [None]:
# Read the old adjusted district populations for comparison
old_adjusted_pops = pd.read_csv(
    "./raw-from-source/national_districts_adjusted_pop/adjusted_districts_pop.csv"
)

# Build a "<State>-<Level>-<ID>" key on the new data, with the ID zero-padded to 3 characters
new_padded_id = combined_files_join["ID"].astype(str).str.zfill(3)
combined_files_join["DIST-ID"] = (
    combined_files_join["State"] + "-" + combined_files_join["Level"] + "-" + new_padded_id
)

In [None]:
# Build the same "<State>-<Level>-<ID>" key on the old data (ID as text, zero-padded to 3 characters)
old_padded_id = old_adjusted_pops["ID"].astype(str).str.zfill(3)
old_adjusted_pops["DIST-ID"] = (
    old_adjusted_pops["State"] + "-" + old_adjusted_pops["Level"] + "-" + old_padded_id
)

In [None]:
# Outer join of new and old district populations on DIST-ID; "_merge" records which side each row came from
joined_adjusted = combined_files_join.merge(
    old_adjusted_pops,
    on = "DIST-ID",
    how = "outer",
    indicator = True,
)

In [None]:
# Tally the rows by join outcome
merge_outcome = joined_adjusted["_merge"]
merge_outcome.value_counts()

In [None]:
# New adjusted population minus old adjusted population for every district
joined_adjusted["Difference"] = joined_adjusted["Adj_Pop_x"].sub(joined_adjusted["Adj_Pop_y"])

In [None]:
# Show the districts where the new and old populations differ
joined_adjusted.loc[joined_adjusted["Difference"].ne(0)]

In [None]:
# Add up the non-zero differences
nonzero_differences = joined_adjusted.loc[joined_adjusted["Difference"] != 0, "Difference"]
sum(nonzero_differences)

In [None]:
# List just the ID and the size of the change for the districts that differ
has_difference = joined_adjusted["Difference"] != 0
joined_adjusted.loc[has_difference, ["DIST-ID", "Difference"]]