In [1]:
import pandas as pd
import geopandas as gp

# Congressional District Population (Nationwide) + Districts using Adjusted Data Population

## 2022 Congressional Districts with Total Population from Census PL file 10/18/2022

### Background:
We received a data request asking for total populations of the 2022 congressional districts.

Note that the following states use adjusted census data when drawing congressional districts: CA, MD, NJ, NV, RI, VA, WA. For the purposes of calculating population deviations, the adjusted population totals (calculated below) should be used.

### Approach:

- Load National BAF file, which contains a 2020 Census PL Total Population field
- Groupby congressional district, and join to the national 2022 congressional file
- Check file  
- Export file

### Links to Download Raw Files 
- [National BAF for 2022 Districts](https://redistrictingdatahub.org/dataset/national-block-assignment-file-for-2022-state-legislative-and-congressional-districts/)

### Processing Steps:
See attached notebook

**Note: A full "raw-from-source" file is also available upon request. Please email info@redistrictingdatahub.org for more info.

In [2]:
# Load in the new BAF file
updated_baf_boundary = pd.read_csv("./raw-from-source/national_baf_boundary/national_baf_boundary.csv")

# Create a CD ID
updated_baf_boundary["CONG-ID"] = updated_baf_boundary["STATE"]+"-"+updated_baf_boundary["CONG"].astype(str)

# Groupby this ID
cong_totals = updated_baf_boundary.groupby(["CONG-ID"]).sum()

# Reset index, drop columns, cast pop data to integer and remove "NO VALUE" districts
cong_totals.reset_index(drop = False, inplace = True)
cong_totals.drop(["GEOID20"], axis = 1, inplace = True)
cong_totals["P0010001"] = cong_totals["P0010001"].astype(int)
cong_totals = cong_totals[~cong_totals["CONG-ID"].str.contains("NO")]

  updated_baf_boundary = pd.read_csv("./raw-from-source/national_baf_boundary/national_baf_boundary.csv")


## Clean File

In [3]:
# Clean the columns
cong_totals["STATE"] = cong_totals["CONG-ID"].apply(lambda x: x.split("-")[0])
cong_totals["DISTRICT"] = cong_totals["CONG-ID"].apply(lambda x: "-".join(x.split("-")[1:]))

# Filter the columns
cong_totals = cong_totals[["STATE","DISTRICT","CONG-ID","P0010001"]]

# Final population check
sum(cong_totals["P0010001"])

330759736

## Check Against Prior File

In [4]:
# Load in old file as a check
old_cong_file = pd.read_csv("./raw-from-source/cd_pop_2022_csv/cd_pop_2022_csv.csv")

# Define a dictionary to make the join to the older file work
cong_dict = {"2801":"1",
"2802":"2",
"2803":"3",
"2804":"4"}

# Join the two together
old_cong_file["DISTRICT"] = old_cong_file["DISTRICT"].map(cong_dict).fillna(old_cong_file["DISTRICT"])
old_cong_file["CONG-ID"] = old_cong_file["STATE"] + "-" + old_cong_file["DISTRICT"].astype(str).str.upper()
combined = pd.merge(cong_totals, old_cong_file, how = "outer", on = "CONG-ID", indicator = True)

# Confirm that everything joins
print(combined[combined["_merge"]!="both"])

# See if there are any differences across the two files
combined["Pop_Diff"] = combined["P0010001_x"] - combined["P0010001_y"]
print(combined["Pop_Diff"].value_counts())

Empty DataFrame
Columns: [STATE_x, DISTRICT_x, CONG-ID, P0010001_x, STATE_y, DISTRICT_y, CD_ID, P0010001_y, _merge]
Index: []
0    435
Name: Pop_Diff, dtype: int64


## Check File On Its Own

In [5]:
# Can compare these numbers against this link: https://www.census.gov/library/visualizations/2021/dec/2020-apportionment-map.html
print(cong_totals["STATE"].value_counts())

CA    52
TX    38
FL    28
NY    26
IL    17
PA    17
OH    15
GA    14
NC    14
MI    13
NJ    12
VA    11
WA    10
TN     9
IN     9
AZ     9
MA     9
WI     8
MO     8
MN     8
CO     8
MD     8
SC     7
AL     7
OR     6
LA     6
KY     6
OK     5
CT     5
IA     4
NV     4
KS     4
MS     4
UT     4
AR     4
NM     3
NE     3
RI     2
ME     2
WV     2
HI     2
ID     2
NH     2
MT     2
VT     1
AK     1
SD     1
ND     1
DE     1
WY     1
Name: STATE, dtype: int64


In [6]:
# Looking for 330759736 (population of all 50 states)
sum(cong_totals["P0010001"])

330759736

In [7]:
# Check States with large differences
# Some of these states didn't exactly follow +/- 1 deviation or used adjusted data
def check_max_min(joined_cong):
    for val in list(joined_cong["STATE"].unique()):
        '''print(val)
        print("MAX:", max(joined_cong[joined_cong["STATE"]==val]["P0010001"]))
        print("MIN:", min(joined_cong[joined_cong["STATE"]==val]["P0010001"]))
        print("")'''
        #print("REPORT")
        if abs(max(joined_cong[joined_cong["STATE"]==val]["P0010001"]) - min(joined_cong[joined_cong["STATE"]==val]["P0010001"]))>10:
            print(val)
            print("MAX:", max(joined_cong[joined_cong["STATE"]==val]["P0010001"]))
            print("MIN:", min(joined_cong[joined_cong["STATE"]==val]["P0010001"]))
            print("")

check_max_min(cong_totals)

AR
MAX: 753219
MIN: 752509

CA
MAX: 782247
MIN: 754875

CO
MAX: 721794
MIN: 721664

HI
MAX: 728876
MIN: 726395

IA
MAX: 797645
MIN: 797551

LA
MAX: 776333
MIN: 776268

MD
MAX: 777845
MIN: 767247

MI
MAX: 775666
MIN: 774544

NE
MAX: 653847
MIN: 653822

NJ
MAX: 779056
MIN: 771744

NM
MAX: 705846
MIN: 705832

NV
MAX: 778140
MIN: 773758

RI
MAX: 549301
MIN: 548078

VA
MAX: 788614
MIN: 779587

WA
MAX: 774871
MIN: 768710

WV
MAX: 897649
MIN: 896067



## Export File

In [8]:
# Export to csv
cong_totals.to_csv("./cong_totals.csv", index = False)

## 2022 Districts using Adjusted Data - Populations  10/18/2022 

### Background:
- We received a data request asking for total populations of the 2022 districts.
- Although most states draw their redistricting plans using the census' population, a handful of states use adjusted data.
- The usage of adjusted data is made more complicated by the fact that not every state that produces adjusted data uses it for all levels of redistricting.
- Below is a list of the states that produce adjusted data and the level(s) of redistricting they use it for:
    - CA (Congressional and State Legislative)
    - CO (State Legislative)
    - CT (State Legislative)
    - DE (State Legislative)
    - HI (State Legislative)
    - MD (Congressional and State Legislative)
    - MT (State Legislative)
    - NJ (Congressional and State Legislative)
    - NV (Congressional and State Legislative)
    - NY (State Legislative)
    - PA (State Legislative)
    - RI (Congressional and State Legislative)
    - VA (Congressional and State Legislative)
    - WA (Congressional and State Legislative)
- Due to this nuance, we thought it would make sense to produce a dataset with the districts that used adjusted data and their adjusted population, rather than adding in an "adjusted population" column to the national block-assignment file, where the adjusted population would not be relevant in all cases.
- Furthermore, RI did not release block-level adjusted data, but they did release their district-level adjusted populations.

### Approach:
- For every state on the above list, except RI, load in files containing adjusted populations for each block.
  - Note, these files were produced for earlier work and involve manipulating states' adjusted datasets
- Join the adjusted block-level populations to the national block assignment file.
- Transform the block assignment file so that every row is now a particular congressional, state house, or state senate district with its population
- For RI, transcribe the district populations from an official report and join these populations to the relevant districts.
- Check the file
- Export the file   

### Links to Download Raw Files
- RI District Population Reports
  - Official state reports, available upon requests.   
- RI District Population csv
  - Created by the RDH using the reports, available upon request. 
- State Block-Level Adjusted Populations for all States except RI (where data not available)
  - Produced using official files on the RDH website, processed files available upon request.   
- [National BAF for 2022 Districts](https://redistrictingdatahub.org/dataset/national-block-assignment-file-for-2022-state-legislative-and-congressional-districts/)
    
### Processing Steps
- See attached notebook

#### Note: A full "raw-from-source" file is also available upon request. Please email info@redistrictingdatahub.org

In [9]:
# Load in the national BAF
national_baf = pd.read_csv("./raw-from-source/national_baf_boundary/national_baf_boundary.csv", dtype =({"GEOID20":str, "STATEAB":str, "CONG":str, "SLDU":str, "SLDL":str, "FLOTERIAL":str}))

# Create columns for the various districts
national_baf["UNQ_CONG_DIST_ID"] = national_baf["STATE"] + "-" + national_baf["CONG"].astype(str)
national_baf["UNQ_SLDL_DIST_ID"] = national_baf["STATE"] + "-" + national_baf["SLDL"].astype(str)
national_baf["UNQ_SLDU_DIST_ID"] = national_baf["STATE"] + "-" + national_baf["SLDU"].astype(str)

In [10]:
# Clean the GEOID column
national_baf["GEOID20"] = national_baf["GEOID20"].astype(str).str.zfill(16)

In [11]:
adjusted_data_state_subset = ['CA', 'CO', 'CT', 'DE', 'HI', 'MD', 'MT', 'NJ', 'NV',
'NY', 'PA', 'VA', 'WA']

def mod_census(block_id):
    block_id = str(block_id)
    
    # PA appends a letter to the GEOID for the split blocks
    if "A" in block_id or "B" in block_id or "C" in block_id:
        
        # Return the GEOID with out the split so the blocks can be combined
        return block_id[:len(block_id)-1]
    
    # If it's not one of these special blocks, just return the GEOID
    else:
        return block_id

In [12]:
# Create a list to store the state data
adjusted_data_list = []

# Iterate over the states
for state in adjusted_data_state_subset:
    
    # Load and filter the data
    adj_state = pd.read_csv("./raw-from-source/Adjusted_Counts/"+state+"_blocks.csv")
    adj_state = adj_state[["GEOID20", "Adj_Pop"]]
    
    # Deal with PA split blocks
    if state == "PA":
        
        # Use the above function to return the "unsplit" GEOID
        adj_state["mod_GEOID20"] = adj_state["GEOID20"].apply(lambda x: mod_census(x))
        
        # Because the splits blocks are in the same districts, we can join them together to match PL geographies
        adj_state_mod = adj_state.groupby("mod_GEOID20").sum()
        
        # Clean the index and rename columns to match others
        adj_state_mod.reset_index(drop = False, inplace = True)
        adj_state_mod.rename(columns = {"mod_GEOID20":"GEOID20"}, inplace = True)
        adj_state_mod = adj_state_mod[["GEOID20", "Adj_Pop"]]
        
        # Append the PA data to the list
        adjusted_data_list.append(adj_state_mod)
    
    # For other states just add the data
    else:
        adjusted_data_list.append(adj_state)

  adj_state = pd.read_csv("./raw-from-source/Adjusted_Counts/"+state+"_blocks.csv")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  adj_state["mod_GEOID20"] = adj_state["GEOID20"].apply(lambda x: mod_census(x))


In [13]:
# Transform the list to the dataframe
adj_state_data_df = pd.concat(adjusted_data_list)

# Clean the columns
adj_state_data_df["Adj_Pop"] = adj_state_data_df["Adj_Pop"].astype(int)
adj_state_data_df["GEOID20"] = adj_state_data_df["GEOID20"].astype(str).str.zfill(16)

In [14]:
# Join the two files together
adjusted_counts = pd.merge(national_baf, adj_state_data_df, how = "outer", on = "GEOID20", indicator = True)

# Check the join
adjusted_counts["_merge"].value_counts()

left_only     6067019
both          2059937
right_only          0
Name: _merge, dtype: int64

In [15]:
# Confirm that there is no population for any of the unjoined blocks
sum(adjusted_counts[adjusted_counts["_merge"]=="right_only"]["Adj_Pop"])

0

In [16]:
# Filter down to join blocks or blocks in RI (need that to get the district data for the state)
joined = adjusted_counts[(adjusted_counts["_merge"]=="both") | (adjusted_counts["STATE"]=="RI")]

# Clean the columns
joined["Adj_Pop"] = joined["Adj_Pop"].fillna(0)
joined["Adj_Pop"] = joined["Adj_Pop"].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  joined["Adj_Pop"] = joined["Adj_Pop"].fillna(0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  joined["Adj_Pop"] = joined["Adj_Pop"].astype(int)


In [17]:
len(joined["STATE"].unique())

14

In [18]:
joined["STATE"].unique()

array(['MD', 'RI', 'VA', 'NJ', 'CA', 'CT', 'PA', 'WA', 'NY', 'NV', 'HI',
       'DE', 'CO', 'MT'], dtype=object)

In [19]:
# Create a subset of states that use adjusted data for congressional redistricting
uses_cong = joined[joined["STATE"].isin(["CA", "MD", "NJ", "NV", "RI", "VA", "WA"])]

# Aggregate to the appropriate district levels
joined_cong = uses_cong.groupby("UNQ_CONG_DIST_ID").sum()
joined_sldl = joined.groupby("UNQ_SLDL_DIST_ID").sum()
joined_sldu = joined.groupby("UNQ_SLDU_DIST_ID").sum()

# Clean the aggregations
joined_cong.reset_index(inplace = True, drop = False)
joined_sldl.reset_index(inplace = True, drop = False)
joined_sldu.reset_index(inplace = True, drop = False)

joined_cong.drop(["P0010001"], axis = 1, inplace = True)
joined_sldl.drop(["P0010001"], axis = 1, inplace = True)
joined_sldu.drop(["P0010001"], axis = 1, inplace = True)

joined_cong.columns = ["ID", "Adj_Pop"]
joined_sldl.columns = ["ID", "Adj_Pop"]
joined_sldu.columns = ["ID", "Adj_Pop"]

joined_cong["Level"] = "CONG"
joined_sldl["Level"] = "SLDL"
joined_sldu["Level"] = "SLDU"

# Join them back into one file
combined_files = pd.concat([joined_cong, joined_sldl, joined_sldu])

# Get the state abbreviation
combined_files["State"] = combined_files["ID"].apply(lambda x: x[0:2])

In [20]:
# Create an ID of the level and the ID so we can join to RI
combined_files["unique_id"] = combined_files["Level"]+"-"+combined_files["ID"]

In [21]:
# Load in the RI data
ri_data = pd.read_csv("./raw-from-source/ri_sizes.csv",dtype={"Number":str, "Adj_Pop":int, "Level":str})

# Create a unique ID to join with the pop. file
ri_data["unique_id"] = ri_data["Level"]+"-RI-"+ri_data["Number"].astype(str)

# Make the population an integer
ri_data["Adj_Pop"] = ri_data["Adj_Pop"].astype(int)

# Create a dictionary mapping from district ID to population in RI
ri_data_dict = dict(zip(ri_data["unique_id"], ri_data["Adj_Pop"]))

# Apply the above dictionary to the RI districts in the combined file
combined_files["Adj_Pop"] = combined_files["unique_id"].map(ri_data_dict).fillna(combined_files["Adj_Pop"])

In [22]:
combined_files["state-level"] = combined_files["State"]+"-"+combined_files["Level"]


In [23]:
combined_files["Adj_Pop"] = combined_files["Adj_Pop"].astype(int)


In [24]:
combined_files[combined_files["Adj_Pop"]==0]

Unnamed: 0,ID,Adj_Pop,Level,State,unique_id,state-level
296,CT-NO VALUE,0,SLDL,CT,SLDL-CT-NO VALUE,CT-SLDL
111,CT-NO VALUE,0,SLDU,CT,SLDU-CT-NO VALUE,CT-SLDU
467,RI-NO VALUE,0,SLDU,RI,SLDU-RI-NO VALUE,RI-SLDU


In [25]:
# Remove the "No Data" districts (these were unassigned blocks we kept in the BAF)
combined_files = combined_files[~combined_files["ID"].str.contains("NO")]

## Check On Its Own

In [26]:
# Check how many of each district type there is (correct numbers are below)
combined_files["state-level"].value_counts()

PA-SLDL    203
CT-SLDL    151
NY-SLDL    150
VA-SLDL    100
MT-SLDL    100
CA-SLDL     80
RI-SLDL     75
MD-SLDL     71
CO-SLDL     65
NY-SLDU     63
CA-CONG     52
HI-SLDL     51
PA-SLDU     50
MT-SLDU     50
WA-SLDL     49
WA-SLDU     49
MD-SLDU     47
NV-SLDL     42
DE-SLDL     41
VA-SLDU     40
CA-SLDU     40
NJ-SLDU     40
NJ-SLDL     40
RI-SLDU     38
CT-SLDU     36
CO-SLDU     35
HI-SLDU     25
DE-SLDU     21
NV-SLDU     21
NJ-CONG     12
VA-CONG     11
WA-CONG     10
MD-CONG      8
NV-CONG      4
RI-CONG      2
Name: state-level, dtype: int64

Target District Numbers

- PA-SLDL    203
- CT-SLDL    151
- NY-SLDL    150
- VA-SLDL    100
- MT-SLDL    100
- CA-SLDL     80
- RI-SLDL     75
- MD-SLDL     71
- CO-SLDL     65
- NY-SLDU     63
- CA-CONG     52
- HI-SLDL     51
- PA-SLDU     50
- MT-SLDU     50
- WA-SLDL     49
- WA-SLDU     49
- MD-SLDU     47
- NV-SLDL     42
- DE-SLDL     41
- VA-SLDU     40
- CA-SLDU     40
- NJ-SLDU     40
- NJ-SLDL     40
- RI-SLDU     38
- CT-SLDU     36
- CO-SLDU     35
- HI-SLDU     25
- DE-SLDU     21
- NV-SLDU     21
- NJ-CONG     12
- VA-CONG     11
- WA-CONG     10
- MD-CONG      8
- NV-CONG      4
- RI-CONG      2

In [27]:
# Check the population totals for the various district types
state_sums = combined_files.groupby("state-level").sum()
print(state_sums["Adj_Pop"])

state-level
CA-CONG    39523437
CA-SLDL    39523437
CA-SLDU    39523437
CO-SLDL     5773714
CO-SLDU     5773714
CT-SLDL     3603566
CT-SLDU     3603566
DE-SLDL      989598
DE-SLDU      989598
HI-SLDL     1383606
HI-SLDU     1383606
MD-CONG     6175403
MD-SLDL     6175403
MD-SLDU     6175403
MT-SLDL     1082717
MT-SLDU     1082717
NJ-CONG     9283016
NJ-SLDL     9283016
NJ-SLDU     9283016
NV-CONG     3104614
NV-SLDL     3104614
NV-SLDU     3104614
NY-SLDL    20193858
NY-SLDU    20193858
PA-SLDL    13002700
PA-SLDU    13002700
RI-CONG     1097379
RI-SLDL     1097379
RI-SLDU     1097379
VA-CONG     8631393
VA-SLDL     8631393
VA-SLDU     8631393
WA-CONG     7705281
WA-SLDL     7705281
WA-SLDU     7705281
Name: Adj_Pop, dtype: int64


Target Pops

- CA 39523437
- CO     5773714
- CT     3603566
- DE      989598
- HI    1383606
- MD    6175403
- MT    1082717
- NJ    9283016
- NV     3104614
- NY   20193858
- PA   13002700
- RI    1097379
- VA    8631393
- WA    7705281

## Final File Cleaning and Exporting

In [28]:
combined_files.drop(["unique_id","state-level"], axis = 1, inplace = True)
combined_files["ID"] = combined_files["ID"].apply(lambda x: x.split("-")[1])

combined_files_join = combined_files.copy(deep = True)

In [29]:
# Final Cleaning
combined_files.columns = ["DISTRICT", "ADJ_POP", "LEVEL", "STATE"]
combined_files = combined_files[["STATE", "DISTRICT", "LEVEL", "ADJ_POP"]]

In [30]:
combined_files = combined_files.sort_values(["STATE", "LEVEL", "DISTRICT"])

In [31]:
combined_files.to_csv("./adjusted_districts_pop.csv", index = False)

## Check against old

In [32]:
old_adjusted_pops = pd.read_csv("./raw-from-source/national_districts_adjusted_pop/adjusted_districts_pop.csv")

combined_files_join["DIST-ID"] = combined_files_join["State"] +"-"+combined_files_join["Level"] +"-"+  combined_files_join["ID"].astype(str).str.zfill(3)

In [33]:
old_adjusted_pops["DIST-ID"] = old_adjusted_pops["State"]+ "-" + old_adjusted_pops["Level"] + "-" + old_adjusted_pops["ID"].astype(str).astype(str).str.zfill(3)

In [34]:
joined_adjusted = pd.merge(combined_files_join, old_adjusted_pops, how = "outer", on = "DIST-ID", indicator = True)

In [35]:
joined_adjusted["_merge"].value_counts()

both          1872
left_only        0
right_only       0
Name: _merge, dtype: int64

In [36]:
joined_adjusted["Difference"] = joined_adjusted["Adj_Pop_x"] - joined_adjusted["Adj_Pop_y"]

In [37]:
joined_adjusted[joined_adjusted["Difference"]!=0]

Unnamed: 0,ID_x,Adj_Pop_x,Level_x,State_x,DIST-ID,ID_y,Adj_Pop_y,Level_y,State_y,_merge,Difference
418,30,24889,SLDL,DE,DE-SLDL-030,30,24882,SLDL,DE,both,7
421,33,23821,SLDL,DE,DE-SLDL-033,33,23828,SLDL,DE,both,-7
1429,10,47281,SLDU,DE,DE-SLDU-010,10,47345,SLDU,DE,both,-64
1430,11,48081,SLDU,DE,DE-SLDU-011,11,47647,SLDU,DE,both,434
1433,14,49253,SLDU,DE,DE-SLDU-014,14,49189,SLDU,DE,both,64
1434,15,47104,SLDU,DE,DE-SLDU-015,15,47131,SLDU,DE,both,-27
1436,17,49042,SLDU,DE,DE-SLDU-017,17,49015,SLDU,DE,both,27
1437,18,48592,SLDU,DE,DE-SLDU-018,18,48541,SLDU,DE,both,51
1438,19,48726,SLDU,DE,DE-SLDU-019,19,49105,SLDU,DE,both,-379
1440,20,49198,SLDU,DE,DE-SLDU-020,20,48961,SLDU,DE,both,237


In [38]:
sum(joined_adjusted[joined_adjusted["Difference"]!=0]["Difference"])

0

In [39]:
joined_adjusted[joined_adjusted["Difference"]!=0][["DIST-ID","Difference"]]

Unnamed: 0,DIST-ID,Difference
418,DE-SLDL-030,7
421,DE-SLDL-033,-7
1429,DE-SLDU-010,-64
1430,DE-SLDU-011,434
1433,DE-SLDU-014,64
1434,DE-SLDU-015,-27
1436,DE-SLDU-017,27
1437,DE-SLDU-018,51
1438,DE-SLDU-019,-379
1440,DE-SLDU-020,237
