In [1]:
import pandas as pd
import geopandas as gp

# Congressional District Population (Nationwide) + Districts using Adjusted Data Population

## 2022 Congressional Districts with Total Population from Census PL file 03/20/2023

### Background:
We received a data request asking for total populations of the 2022 congressional districts.

Note that the following states use adjusted census data when drawing congressional districts: CA, MD, NJ, NV, RI, VA, WA. For the purposes of calculating population deviations, the adjusted population totals (calculated below) should be used.

### Approach:

- Load National BAF file, which contains a 2020 Census PL Total Population field
- Groupby congressional district, and join to the national 2022 congressional file
- Check file  
- Export file

### Links to Download Raw Files 
- [National BAF for 118th Congressional Districts](https://redistrictingdatahub.org/dataset/118th-congressional-districts-nov-2022-election-active-jan-2023-jan-2025/)

### Processing Steps:
See attached notebook

**Note: A full "raw-from-source" file is also available upon request. Please email info@redistrictingdatahub.org for more info.

In [2]:
# Load in the new BAF file
updated_baf_boundary = pd.read_csv("./raw-from-source/national_cong118_boundary/national_cong118_baf.csv")

# Create a CD ID
updated_baf_boundary["CONG-ID"] = updated_baf_boundary["STATE"]+"-"+updated_baf_boundary["DISTRICT"].astype(str)

# Load in blocks and population data
national_pop = pd.read_csv("./raw-from-source/national_blocks_2020_pop.csv")

# Join the block population data with the baf 
baf_pop = pd.merge(national_pop, updated_baf_boundary, how = "outer", on = "GEOID20", indicator = True)

# Cast the population data to an int, filter out "NO VALUE" areas
baf_pop["P0010001"] = baf_pop["P0010001"].astype(int)
baf_pop = baf_pop[~baf_pop["CONG-ID"].str.contains("NO")]

# Groupby this ID
cong_totals = baf_pop.groupby(["CONG-ID"]).sum()

# Reset index, drop columns, cast pop data to integer and remove "NO VALUE" districts
cong_totals.reset_index(drop = False, inplace = True)

  updated_baf_boundary = pd.read_csv("./raw-from-source/national_cong118_boundary/national_cong118_baf.csv")
  cong_totals = baf_pop.groupby(["CONG-ID"]).sum()


## Clean File

In [3]:
# Clean the columns
cong_totals["STATE"] = cong_totals["CONG-ID"].apply(lambda x: x.split("-")[0])
cong_totals["DISTRICT"] = cong_totals["CONG-ID"].apply(lambda x: "-".join(x.split("-")[1:]))

# Filter the columns
cong_totals = cong_totals[["STATE","DISTRICT","CONG-ID","P0010001"]]

# Final population check
sum(cong_totals["P0010001"])

330759736

## Check Against Prior File

In [4]:
old_file = pd.read_csv("./raw-from-source/old_file/national_cong_pop_2022/national_cong_pop_2022.csv")

cong_totals.sort_values("CONG-ID", inplace = True)
old_file.sort_values("CONG-ID", inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cong_totals.sort_values("CONG-ID", inplace = True)


In [5]:
cong_totals.equals(old_file)

True

## Check File On Its Own

Note: Our old file passed these tests and is identical to the new file

In [6]:
# Can compare these numbers against this link: https://www.census.gov/library/visualizations/2021/dec/2020-apportionment-map.html
print(cong_totals["STATE"].value_counts())

CA    52
TX    38
FL    28
NY    26
IL    17
PA    17
OH    15
GA    14
NC    14
MI    13
NJ    12
VA    11
WA    10
TN     9
IN     9
AZ     9
MA     9
WI     8
MO     8
MN     8
CO     8
MD     8
SC     7
AL     7
OR     6
LA     6
KY     6
OK     5
CT     5
IA     4
NV     4
KS     4
MS     4
UT     4
AR     4
NM     3
NE     3
RI     2
ME     2
WV     2
HI     2
ID     2
NH     2
MT     2
VT     1
AK     1
SD     1
ND     1
DE     1
WY     1
Name: STATE, dtype: int64


In [7]:
# Looking for 330759736 (population of all 50 states)
sum(cong_totals["P0010001"])

330759736

In [8]:
# Check States with large differences
# Some of these states didn't exactly follow +/- 1 deviation or used adjusted data
def check_max_min(joined_cong):
    for val in list(joined_cong["STATE"].unique()):
        '''print(val)
        print("MAX:", max(joined_cong[joined_cong["STATE"]==val]["P0010001"]))
        print("MIN:", min(joined_cong[joined_cong["STATE"]==val]["P0010001"]))
        print("")'''
        #print("REPORT")
        if abs(max(joined_cong[joined_cong["STATE"]==val]["P0010001"]) - min(joined_cong[joined_cong["STATE"]==val]["P0010001"]))>10:
            print(val)
            print("MAX:", max(joined_cong[joined_cong["STATE"]==val]["P0010001"]))
            print("MIN:", min(joined_cong[joined_cong["STATE"]==val]["P0010001"]))
            print("")

check_max_min(cong_totals)

AR
MAX: 753219
MIN: 752509

CA
MAX: 782247
MIN: 754875

CO
MAX: 721794
MIN: 721664

HI
MAX: 728876
MIN: 726395

IA
MAX: 797645
MIN: 797551

LA
MAX: 776333
MIN: 776268

MD
MAX: 777845
MIN: 767247

MI
MAX: 775666
MIN: 774544

NE
MAX: 653847
MIN: 653822

NJ
MAX: 779056
MIN: 771744

NM
MAX: 705846
MIN: 705832

NV
MAX: 778140
MIN: 773758

RI
MAX: 549301
MIN: 548078

VA
MAX: 788614
MIN: 779587

WA
MAX: 774871
MIN: 768710

WV
MAX: 897649
MIN: 896067



## Export File

In [9]:
# Export to csv
cong_totals.to_csv("./cong_totals.csv", index = False)

## 2022 Districts using Adjusted Data - Populations  03/20/23

### Background:
- We received a data request asking for total populations of the 2022 districts.
- Although most states draw their redistricting plans using the census' population, a handful of states use adjusted data.
- The usage of adjusted data is made more complicated by the fact that not every state that produces adjusted data uses it for all levels of redistricting.
- Below is a list of the states that produce adjusted data and the level(s) of redistricting they use it for:
    - CA (Congressional and State Legislative)
    - CO (State Legislative)
    - CT (State Legislative)
    - DE (State Legislative)
    - HI (State Legislative)
    - MD (Congressional and State Legislative)
    - MT (State Legislative)
    - NJ (Congressional and State Legislative)
    - NV (Congressional and State Legislative)
    - NY (State Legislative)
    - PA (State Legislative)
    - RI (Congressional and State Legislative)
    - VA (Congressional and State Legislative)
    - WA (Congressional and State Legislative)
- Due to this nuance, we thought it would make sense to produce a dataset with the districts that used adjusted data and their adjusted population, rather than adding in an "adjusted population" column to the national block-assignment file, where the adjusted population would not be relevant in all cases.
- Furthermore, RI did not release block-level adjusted data, but they did release their district-level adjusted populations.

### Approach:
- For every state on the above list, except RI, load in files containing adjusted populations for each block.
  - Note, these files were produced for earlier work and involve manipulating states' adjusted datasets
- Join the adjusted block-level populations to the national block assignment file.
- Transform the block assignment file so that every row is now a particular congressional, state house, or state senate district with its population
- For RI, transcribe the district populations from an official report and join these populations to the relevant districts.
- Check the file
- Export the file   

### Links to Download Raw Files
- RI District Population Reports
  - Official state reports, available upon requests.   
- RI District Population csv
  - Created by the RDH using the reports, available upon request. 
- State Block-Level Adjusted Populations for all States except RI (where data not available)
  - Produced using official files on the RDH website, processed files available upon request.   
- [National BAF for Post-Redistricting State Legislative Districts](https://redistrictingdatahub.org/dataset/national-post-redistricting-2021-cycle-state-legislative-district-boundaries-and-block-assignment-file/)
    
### Processing Steps
- See attached notebook

#### Note: A full "raw-from-source" file is also available upon request. Please email info@redistrictingdatahub.org

In [10]:
# Load in the national BAF
national_baf_st_leg = pd.read_csv("./raw-from-source/national_post_2021_redistricting_st_leg_boundaries/national_post_2021_redistricting_st_leg_boundaries_baf.csv", dtype = str)
national_baf_cong = pd.read_csv("./raw-from-source/national_cong118_boundary/national_cong118_baf.csv", dtype = str)



In [11]:
# Create columns for the various districts
national_baf_cong["UNQ_CONG_DIST_ID"] = national_baf_cong["STATE"] + "-" + national_baf_cong["DISTRICT"].astype(str)
national_baf_st_leg["UNQ_SLDL_DIST_ID"] = national_baf_st_leg["STATE"] + "-" + national_baf_st_leg["SLDL"].astype(str)
national_baf_st_leg["UNQ_SLDU_DIST_ID"] = national_baf_st_leg["STATE"] + "-" + national_baf_st_leg["SLDU"].astype(str)

# Clean the GEOID column
national_baf_cong["GEOID20"] = national_baf_cong["GEOID20"].astype(str).str.zfill(15)
national_baf_st_leg["GEOID20"] = national_baf_st_leg["GEOID20"].astype(str).str.zfill(15)

In [12]:
national_baf_joined = pd.merge(national_baf_st_leg, national_baf_cong, how = "outer", on = "GEOID20", indicator = True)

In [13]:
national_baf_joined

Unnamed: 0,GEOID20,STATE_x,SLDU,SLDL,FLOTERIAL,UNQ_SLDL_DIST_ID,UNQ_SLDU_DIST_ID,CONG118,STATE_y,DISTRICT,UNQ_CONG_DIST_ID,_merge
0,010010201001000,AL,30,69,NO FLOTERIAL,AL-69,AL-30,AL-2,AL,2,AL-2,both
1,010010201001001,AL,30,69,NO FLOTERIAL,AL-69,AL-30,AL-2,AL,2,AL-2,both
2,010010201001002,AL,30,69,NO FLOTERIAL,AL-69,AL-30,AL-2,AL,2,AL-2,both
3,010010201001003,AL,30,69,NO FLOTERIAL,AL-69,AL-30,AL-2,AL,2,AL-2,both
4,010010201001004,AL,30,69,NO FLOTERIAL,AL-69,AL-30,AL-2,AL,2,AL-2,both
...,...,...,...,...,...,...,...,...,...,...,...,...
8126951,121339703032149,FL,2,5,NO FLOTERIAL,FL-5,FL-2,FL-2,FL,2,FL-2,both
8126952,121339703032150,FL,2,5,NO FLOTERIAL,FL-5,FL-2,FL-2,FL,2,FL-2,both
8126953,121339703032151,FL,2,5,NO FLOTERIAL,FL-5,FL-2,FL-2,FL,2,FL-2,both
8126954,121339703032152,FL,2,5,NO FLOTERIAL,FL-5,FL-2,FL-2,FL,2,FL-2,both


In [14]:
national_baf_joined.drop(["_merge"], inplace = True, axis = 1)

In [15]:
adjusted_data_state_subset = ['CA', 'CO', 'CT', 'DE', 'HI', 'MD', 'MT', 'NJ', 'NV',
'NY', 'PA', 'VA', 'WA']

def mod_census(block_id):
    block_id = str(block_id)
    
    # PA appends a letter to the GEOID for the split blocks
    if "A" in block_id or "B" in block_id or "C" in block_id:
        
        # Return the GEOID with out the split so the blocks can be combined
        return block_id[:len(block_id)-1]
    
    # If it's not one of these special blocks, just return the GEOID
    else:
        return block_id

In [16]:
# Create a list to store the state data
adjusted_data_list = []

# Iterate over the states
for state in adjusted_data_state_subset:
    
    if state == "HI":
        adj_state = pd.read_csv("./raw-from-source/HI_blocks.csv")
        adj_state.rename(columns = {"Adj_Pop":"Adj_TotPop"}, inplace = True)
        
    else:
        # Load and filter the data
        adj_state = pd.read_csv("./raw-from-source/final_block_adjusted_data_csv/"+state.lower()+"_pl_2020_adj_block/"+state.lower()+"_pl_2020_adj_block.csv")
        adj_state = adj_state[["GEOID20", "Adj_TotPop"]]
    
    adjusted_data_list.append(adj_state)

In [17]:
# Transform the list to the dataframe
adj_state_data_df = pd.concat(adjusted_data_list)

# Clean the columns
adj_state_data_df["Adj_TotPop"] = adj_state_data_df["Adj_TotPop"].astype(int)
adj_state_data_df["GEOID20"] = adj_state_data_df["GEOID20"].astype(str).str.zfill(15)

In [18]:
national_baf_joined["GEOID20"] = national_baf_joined["GEOID20"].astype(str).str.zfill(15)

In [19]:
# Join the two files together
adjusted_counts = pd.merge(national_baf_joined, adj_state_data_df, how = "outer", on = "GEOID20", indicator = True)

# Check the join
adjusted_counts["_merge"].value_counts()

left_only     6067019
both          2059937
right_only          0
Name: _merge, dtype: int64

In [20]:
# Confirm that there is no population for any of the unjoined blocks
sum(adjusted_counts[adjusted_counts["_merge"]=="right_only"]["Adj_TotPop"])

0

In [21]:
adjusted_counts

Unnamed: 0,GEOID20,STATE_x,SLDU,SLDL,FLOTERIAL,UNQ_SLDL_DIST_ID,UNQ_SLDU_DIST_ID,CONG118,STATE_y,DISTRICT,UNQ_CONG_DIST_ID,Adj_TotPop,_merge
0,010010201001000,AL,30,69,NO FLOTERIAL,AL-69,AL-30,AL-2,AL,2,AL-2,,left_only
1,010010201001001,AL,30,69,NO FLOTERIAL,AL-69,AL-30,AL-2,AL,2,AL-2,,left_only
2,010010201001002,AL,30,69,NO FLOTERIAL,AL-69,AL-30,AL-2,AL,2,AL-2,,left_only
3,010010201001003,AL,30,69,NO FLOTERIAL,AL-69,AL-30,AL-2,AL,2,AL-2,,left_only
4,010010201001004,AL,30,69,NO FLOTERIAL,AL-69,AL-30,AL-2,AL,2,AL-2,,left_only
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8126951,121339703032149,FL,2,5,NO FLOTERIAL,FL-5,FL-2,FL-2,FL,2,FL-2,,left_only
8126952,121339703032150,FL,2,5,NO FLOTERIAL,FL-5,FL-2,FL-2,FL,2,FL-2,,left_only
8126953,121339703032151,FL,2,5,NO FLOTERIAL,FL-5,FL-2,FL-2,FL,2,FL-2,,left_only
8126954,121339703032152,FL,2,5,NO FLOTERIAL,FL-5,FL-2,FL-2,FL,2,FL-2,,left_only


In [22]:
# Filter down to join blocks or blocks in RI (need that to get the district data for the state)
joined = adjusted_counts[(adjusted_counts["_merge"]=="both") | (adjusted_counts["STATE_x"]=="RI")]

# Clean the columns
joined["Adj_TotPop"] = joined["Adj_TotPop"].fillna(0)
joined["Adj_TotPop"] = joined["Adj_TotPop"].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  joined["Adj_TotPop"] = joined["Adj_TotPop"].fillna(0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  joined["Adj_TotPop"] = joined["Adj_TotPop"].astype(int)


In [23]:
len(joined["STATE_x"].unique())

14

In [24]:
joined["STATE_x"].unique()

array(['CA', 'CO', 'CT', 'DE', 'HI', 'MD', 'MT', 'NV', 'NJ', 'NY', 'PA',
       'RI', 'VA', 'WA'], dtype=object)

In [25]:
# Create a subset of states that use adjusted data for congressional redistricting
uses_cong = joined[joined["STATE_x"].isin(["CA", "MD", "NJ", "NV", "RI", "VA", "WA"])]

# Aggregate to the appropriate district levels
joined_cong = uses_cong.groupby("UNQ_CONG_DIST_ID").sum()
joined_sldl = joined.groupby("UNQ_SLDL_DIST_ID").sum()
joined_sldu = joined.groupby("UNQ_SLDU_DIST_ID").sum()

# Clean the aggregations
joined_cong.reset_index(inplace = True, drop = False)
joined_sldl.reset_index(inplace = True, drop = False)
joined_sldu.reset_index(inplace = True, drop = False)

# joined_cong.drop(["P0010001"], axis = 1, inplace = True)
# joined_sldl.drop(["P0010001"], axis = 1, inplace = True)
# joined_sldu.drop(["P0010001"], axis = 1, inplace = True)

joined_cong.columns = ["ID", "Adj_TotPop"]
joined_sldl.columns = ["ID", "Adj_TotPop"]
joined_sldu.columns = ["ID", "Adj_TotPop"]

joined_cong["Level"] = "CONG"
joined_sldl["Level"] = "SLDL"
joined_sldu["Level"] = "SLDU"

# Join them back into one file
combined_files = pd.concat([joined_cong, joined_sldl, joined_sldu])

# Get the state abbreviation
combined_files["State"] = combined_files["ID"].apply(lambda x: x[0:2])

  joined_cong = uses_cong.groupby("UNQ_CONG_DIST_ID").sum()
  joined_sldl = joined.groupby("UNQ_SLDL_DIST_ID").sum()
  joined_sldu = joined.groupby("UNQ_SLDU_DIST_ID").sum()


In [26]:
# Create an ID of the level and the ID so we can join to RI
combined_files["unique_id"] = combined_files["Level"]+"-"+combined_files["ID"]

In [27]:
# Load in the RI data
ri_data = pd.read_csv("./raw-from-source/ri_sizes.csv",dtype={"Number":str, "Adj_Pop":int, "Level":str})

# Create a unique ID to join with the pop. file
ri_data["unique_id"] = ri_data["Level"]+"-RI-"+ri_data["Number"].astype(str)

# Make the population an integer
ri_data["Adj_Pop"] = ri_data["Adj_Pop"].astype(int)

# Create a dictionary mapping from district ID to population in RI
ri_data_dict = dict(zip(ri_data["unique_id"], ri_data["Adj_Pop"]))

# Apply the above dictionary to the RI districts in the combined file
combined_files["Adj_TotPop"] = combined_files["unique_id"].map(ri_data_dict).fillna(combined_files["Adj_TotPop"])

In [28]:
combined_files["state-level"] = combined_files["State"]+"-"+combined_files["Level"]


In [29]:
combined_files["Adj_TotPop"] = combined_files["Adj_TotPop"].astype(int)


In [30]:
combined_files[combined_files["Adj_TotPop"]==0]

Unnamed: 0,ID,Adj_TotPop,Level,State,unique_id,state-level
296,CT-NO VALUE,0,SLDL,CT,SLDL-CT-NO VALUE,CT-SLDL
111,CT-NO VALUE,0,SLDU,CT,SLDU-CT-NO VALUE,CT-SLDU


In [31]:
# Remove the "No Data" districts (these were unassigned blocks we kept in the BAF)
combined_files = combined_files[~combined_files["ID"].str.contains("NO")]

## Check On Its Own

In [32]:
# Check how many of each district type there is (correct numbers are below)
combined_files["state-level"].value_counts()

PA-SLDL    203
CT-SLDL    151
NY-SLDL    150
VA-SLDL    100
MT-SLDL    100
CA-SLDL     80
RI-SLDL     75
MD-SLDL     71
CO-SLDL     65
NY-SLDU     63
CA-CONG     52
HI-SLDL     51
PA-SLDU     50
MT-SLDU     50
WA-SLDL     49
WA-SLDU     49
MD-SLDU     47
NV-SLDL     42
DE-SLDL     41
VA-SLDU     40
CA-SLDU     40
NJ-SLDU     40
NJ-SLDL     40
RI-SLDU     38
CT-SLDU     36
CO-SLDU     35
HI-SLDU     25
DE-SLDU     21
NV-SLDU     21
NJ-CONG     12
VA-CONG     11
WA-CONG     10
MD-CONG      8
NV-CONG      4
RI-CONG      2
Name: state-level, dtype: int64

Target District Numbers

- PA-SLDL    203
- CT-SLDL    151
- NY-SLDL    150
- VA-SLDL    100
- MT-SLDL    100
- CA-SLDL     80
- RI-SLDL     75
- MD-SLDL     71
- CO-SLDL     65
- NY-SLDU     63
- CA-CONG     52
- HI-SLDL     51
- PA-SLDU     50
- MT-SLDU     50
- WA-SLDL     49
- WA-SLDU     49
- MD-SLDU     47
- NV-SLDL     42
- DE-SLDL     41
- VA-SLDU     40
- CA-SLDU     40
- NJ-SLDU     40
- NJ-SLDL     40
- RI-SLDU     38
- CT-SLDU     36
- CO-SLDU     35
- HI-SLDU     25
- DE-SLDU     21
- NV-SLDU     21
- NJ-CONG     12
- VA-CONG     11
- WA-CONG     10
- MD-CONG      8
- NV-CONG      4
- RI-CONG      2

In [33]:
# Check the population totals for the various district types
state_sums = combined_files.groupby("state-level").sum()
print(state_sums["Adj_TotPop"])

state-level
CA-CONG    39523437
CA-SLDL    39523437
CA-SLDU    39523437
CO-SLDL     5773714
CO-SLDU     5773714
CT-SLDL     3603566
CT-SLDU     3603566
DE-SLDL      989598
DE-SLDU      989598
HI-SLDL     1383606
HI-SLDU     1383606
MD-CONG     6175403
MD-SLDL     6175403
MD-SLDU     6175403
MT-SLDL     1082717
MT-SLDU     1082717
NJ-CONG     9283016
NJ-SLDL     9283016
NJ-SLDU     9283016
NV-CONG     3104614
NV-SLDL     3104614
NV-SLDU     3104614
NY-SLDL    20193858
NY-SLDU    20193858
PA-SLDL    13002700
PA-SLDU    13002700
RI-CONG     1097379
RI-SLDL     1097379
RI-SLDU     1097379
VA-CONG     8631393
VA-SLDL     8631393
VA-SLDU     8631393
WA-CONG     7705281
WA-SLDL     7705281
WA-SLDU     7705281
Name: Adj_TotPop, dtype: int64


  state_sums = combined_files.groupby("state-level").sum()


Target Pops

- CA 39523437
- CO     5773714
- CT     3603566
- DE      989598
- HI    1383606
- MD    6175403
- MT    1082717
- NJ    9283016
- NV     3104614
- NY   20193858
- PA   13002700
- RI    1097379
- VA    8631393
- WA    7705281

## Final File Cleaning and Exporting

In [34]:
combined_files

Unnamed: 0,ID,Adj_TotPop,Level,State,unique_id,state-level
0,CA-1,760066,CONG,CA,CONG-CA-1,CA-CONG
1,CA-10,760066,CONG,CA,CONG-CA-10,CA-CONG
2,CA-11,760067,CONG,CA,CONG-CA-11,CA-CONG
3,CA-12,760065,CONG,CA,CONG-CA-12,CA-CONG
4,CA-13,760065,CONG,CA,CONG-CA-13,CA-CONG
...,...,...,...,...,...,...
551,WA-5,157289,SLDU,WA,SLDU-WA-5,WA-SLDU
552,WA-6,157252,SLDU,WA,SLDU-WA-6,WA-SLDU
553,WA-7,157250,SLDU,WA,SLDU-WA-7,WA-SLDU
554,WA-8,157266,SLDU,WA,SLDU-WA-8,WA-SLDU


In [35]:
combined_files.drop(["unique_id","state-level"], axis = 1, inplace = True)
combined_files["ID"] = combined_files["ID"].apply(lambda x: x.split("-")[1])

combined_files_join = combined_files.copy(deep = True)

In [36]:
combined_files

Unnamed: 0,ID,Adj_TotPop,Level,State
0,1,760066,CONG,CA
1,10,760066,CONG,CA
2,11,760067,CONG,CA
3,12,760065,CONG,CA
4,13,760065,CONG,CA
...,...,...,...,...
551,5,157289,SLDU,WA
552,6,157252,SLDU,WA
553,7,157250,SLDU,WA
554,8,157266,SLDU,WA


In [37]:
# Final Cleaning
combined_files.columns = ["DISTRICT", "ADJ_POP", "LEVEL", "STATE"]
combined_files = combined_files[["STATE", "DISTRICT", "LEVEL", "ADJ_POP"]]

In [38]:
combined_files = combined_files.sort_values(["STATE", "LEVEL", "DISTRICT"])

In [40]:
combined_files.to_csv("./adjusted_districts_pop.csv", index = False)

## Check against old

In [41]:
old_adjusted_pops = pd.read_csv("./raw-from-source/old_file/districts_adj_pop_2022_csv/districts_adj_pop_2022_csv.csv")

combined_files_join["DIST-ID"] = combined_files_join["State"] +"-"+combined_files_join["Level"] +"-"+  combined_files_join["ID"].astype(str).str.zfill(3)

In [42]:
old_adjusted_pops

Unnamed: 0,STATE,DISTRICT,LEVEL,ADJ_POP
0,CA,1,CONG,760066
1,CA,10,CONG,760066
2,CA,11,CONG,760067
3,CA,12,CONG,760065
4,CA,13,CONG,760065
...,...,...,...,...
1867,WA,5,SLDU,157289
1868,WA,6,SLDU,157252
1869,WA,7,SLDU,157250
1870,WA,8,SLDU,157266


In [43]:
old_adjusted_pops["DIST-ID"] = old_adjusted_pops["STATE"]+ "-" + old_adjusted_pops["LEVEL"] + "-" + old_adjusted_pops["DISTRICT"].astype(str).astype(str).str.zfill(3)

In [44]:
joined_adjusted = pd.merge(combined_files_join, old_adjusted_pops, how = "outer", on = "DIST-ID", indicator = True)

In [45]:
joined_adjusted["_merge"].value_counts()

both          1872
left_only        0
right_only       0
Name: _merge, dtype: int64

In [46]:
combined_files_join

Unnamed: 0,ID,Adj_TotPop,Level,State,DIST-ID
0,1,760066,CONG,CA,CA-CONG-001
1,10,760066,CONG,CA,CA-CONG-010
2,11,760067,CONG,CA,CA-CONG-011
3,12,760065,CONG,CA,CA-CONG-012
4,13,760065,CONG,CA,CA-CONG-013
...,...,...,...,...,...
551,5,157289,SLDU,WA,WA-SLDU-005
552,6,157252,SLDU,WA,WA-SLDU-006
553,7,157250,SLDU,WA,WA-SLDU-007
554,8,157266,SLDU,WA,WA-SLDU-008


In [47]:
joined_adjusted["Difference"] = joined_adjusted["ADJ_POP"] - joined_adjusted["Adj_TotPop"]

In [48]:
joined_adjusted[(joined_adjusted["Difference"]!=0)&(joined_adjusted["STATE"]=="DE")]

Unnamed: 0,ID,Adj_TotPop,Level,State,DIST-ID,STATE,DISTRICT,LEVEL,ADJ_POP,_merge,Difference
1430,11,48494,SLDU,DE,DE-SLDU-011,DE,11,SLDU,48081,both,-413
1448,9,44989,SLDU,DE,DE-SLDU-009,DE,9,SLDU,45402,both,413


In [49]:
sum(joined_adjusted[joined_adjusted["Difference"]!=0]["Difference"])

0

In [50]:
joined_adjusted[joined_adjusted["Difference"]!=0][["DIST-ID","Difference"]]

Unnamed: 0,DIST-ID,Difference
558,MT-SLDL-001,-1312
559,MT-SLDL-010,705
560,MT-SLDL-100,-371
561,MT-SLDL-011,340
562,MT-SLDL-012,492
...,...,...
1566,MT-SLDU-050,-851
1567,MT-SLDU-006,832
1568,MT-SLDU-007,104
1569,MT-SLDU-008,-2287
