In [125]:
import pandas as pd
import geopandas as gp
import os

# Districts Drawn w/ Adjusted Data - Total Populations

- Note: The states and geographies that use adjusted data:
    - CA (Congressional and State Legislative)
    - CO (State Legislative)
    - CT (State Legislative)
    - DE (State Legislative)
    - MD (Congressional and State Legislative)
    - MT (State Legislative)
    - NJ (Congressional and State Legislative)
    - NV (Congressional and State Legislative)
    - NY (State Legislative)
    - RI (Congressional and State Legislative)
    - VA (Congressional and State Legislative)
    - WA (Congressional and State Legislative)
    
- Note: LA draws redistricting on an adjusted dataset, but its adjustment do not change any population numbers

## Load in Data for Every State Except RI

Note: These are processed files created for work for the "States that Adjust the Census Data for Redistricting" page on the RDH website. HI and PA need to be loaded separately

In [126]:
adjusted_data_state_subset = ['CA', 'CO', 'CT', 'DE', 'HI', 'MD', 'MT', 'NJ', 'NV',
'NY', 'PA', 'VA', 'WA']

In [127]:
def mod_census(block_id):
    block_id = str(block_id)
    if "A" in block_id or "B" in block_id or "C" in block_id:
        return block_id[:len(block_id)-1]
    else:
        return block_id

In [128]:
adjusted_data_list = []

for state in adjusted_data_state_subset:
    adj_state = pd.read_csv("./raw-from-source/Adjusted_Counts/"+state+"_blocks.csv")
    adj_state = adj_state[["GEOID20", "Adj_Pop"]]
    
    if state == "PA":
        adj_state["mod_GEOID20"] = adj_state["GEOID20"].apply(lambda x: mod_census(x))

        adj_state["mod_GEOID20"].value_counts(dropna = False)

        adj_state_mod = adj_state.groupby("mod_GEOID20").sum()

        adj_state_mod.reset_index(drop = False, inplace = True)

        adj_state_mod.rename(columns = {"mod_GEOID20":"GEOID20"}, inplace = True)

        adj_state_mod = adj_state_mod[["GEOID20", "Adj_Pop"]]
        adjusted_data_list.append(adj_state_mod)
    
    else:
        adjusted_data_list.append(adj_state)
    

  adj_state = pd.read_csv("./raw-from-source/Adjusted_Counts/"+state+"_blocks.csv")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  adj_state["mod_GEOID20"] = adj_state["GEOID20"].apply(lambda x: mod_census(x))


In [129]:
adj_state_data_df = pd.concat(adjusted_data_list)

adj_state_data_df["Adj_Pop"] = adj_state_data_df["Adj_Pop"].astype(int)
adj_state_data_df["GEOID20"] = adj_state_data_df["GEOID20"].astype(str).str.zfill(16)

# If desired, export this data to csv
adj_state_data_df.to_csv("./adjusted_data_pops.csv", index = False)

## Load in Block Assignment File

In [130]:
national_baf = pd.read_csv("/Users/peterhorton/Downloads/national_baf/national_baf.csv", dtype =({"GEOID20":str, "STATEAB":str, "CONG":str, "SLDU":str, "SLDL":str, "FLOTERIAL":str}))

national_baf["UNQ_CONG_DIST_ID"] = national_baf["STATEAB"] + "-" + national_baf["CONG"].astype(str)
national_baf["UNQ_SLDL_DIST_ID"] = national_baf["STATEAB"] + "-" + national_baf["SLDL"].astype(str)
national_baf["UNQ_SLDU_DIST_ID"] = national_baf["STATEAB"] + "-" + national_baf["SLDU"].astype(str)

In [131]:
len(national_baf[(national_baf["STATEAB"]=="PA")]["SLDU"].unique())

50

In [132]:
national_baf["GEOID20"] = national_baf["GEOID20"].astype(str).str.zfill(16)

## Join to National BAF

Note: This file is available for download from the RDH website (https://redistrictingdatahub.org/dataset/national-block-assignment-file-for-2022-state-legislative-and-congressional-districts/)

In [133]:
adjusted_counts = pd.merge(national_baf, adj_state_data_df, how = "outer", on = "GEOID20", indicator = True)

In [134]:
adjusted_counts["_merge"].value_counts()

left_only     6066848
both          2059864
right_only         73
Name: _merge, dtype: int64

In [135]:
adjusted_counts["_merge"]

0           left_only
1           left_only
2           left_only
3           left_only
4           left_only
              ...    
8126780    right_only
8126781    right_only
8126782    right_only
8126783    right_only
8126784    right_only
Name: _merge, Length: 8126785, dtype: category
Categories (3, object): ['left_only', 'right_only', 'both']

In [136]:
sum(adjusted_counts[adjusted_counts["_merge"]=="right_only"]["Adj_Pop"])

0.0

In [137]:
joined = adjusted_counts[(adjusted_counts["_merge"]=="both") | (adjusted_counts["STATEAB"]=="RI")]

In [138]:
joined["Adj_Pop"] = joined["Adj_Pop"].fillna(0)
joined["Adj_Pop"] = joined["Adj_Pop"].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  joined["Adj_Pop"] = joined["Adj_Pop"].fillna(0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  joined["Adj_Pop"] = joined["Adj_Pop"].astype(int)


## Aggregate to Districts

In [139]:
joined["STATEAB"].unique()

array(['CA', 'CO', 'CT', 'DE', 'HI', 'MD', 'MT', 'NJ', 'NV', 'NY', 'PA',
       'RI', 'VA', 'WA'], dtype=object)

In [140]:
uses_cong = joined[joined["STATEAB"].isin(["CA", "MD", "NJ", "NV", "RI", "VA", "WA"])]

joined_cong = uses_cong.groupby("UNQ_CONG_DIST_ID").sum()
joined_sldl = joined.groupby("UNQ_SLDL_DIST_ID").sum()
joined_sldu = joined.groupby("UNQ_SLDU_DIST_ID").sum()

joined_cong.reset_index(inplace = True, drop = False)
joined_sldl.reset_index(inplace = True, drop = False)
joined_sldu.reset_index(inplace = True, drop = False)

In [141]:
joined_cong.columns = ["ID", "Adj_Pop"]
joined_sldl.columns = ["ID", "Adj_Pop"]
joined_sldu.columns = ["ID", "Adj_Pop"]

joined_cong["Level"] = "CONG"
joined_sldl["Level"] = "SLDL"
joined_sldu["Level"] = "SLDU"

combined_files = pd.concat([joined_cong, joined_sldl, joined_sldu])

In [142]:
combined_files["State"] = combined_files["ID"].apply(lambda x: x[0:2])

In [143]:
combined_files["State"].value_counts()

PA    253
NY    213
CT    189
CA    172
VA    151
MT    150
MD    126
RI    116
WA    108
CO    100
NJ     92
HI     76
NV     67
DE     62
Name: State, dtype: int64

## Add in RI Data

In [164]:
ri_update_dict = {"RI-1":"RI-01",
"RI-2":"RI-02"}

combined_files["ID"] = combined_files["ID"].map(ri_update_dict).fillna(combined_files["ID"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  combined_files["ID"] = combined_files["ID"].map(ri_update_dict).fillna(combined_files["ID"])


In [165]:
combined_files["unique_id"] = combined_files["Level"]+"-"+combined_files["ID"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  combined_files["unique_id"] = combined_files["Level"]+"-"+combined_files["ID"]


In [166]:
combined_files

Unnamed: 0,ID,Adj_Pop,Level,State,unique_id,state-level,State-Level
0,CA-1,760066,CONG,CA,CONG-CA-1,CA-CONG,CA-CONG
1,CA-10,760066,CONG,CA,CONG-CA-10,CA-CONG,CA-CONG
2,CA-11,760067,CONG,CA,CONG-CA-11,CA-CONG,CA-CONG
3,CA-12,760065,CONG,CA,CONG-CA-12,CA-CONG,CA-CONG
4,CA-13,760065,CONG,CA,CONG-CA-13,CA-CONG,CA-CONG
...,...,...,...,...,...,...,...
552,WA-45,157270,SLDU,WA,SLDU-WA-45,WA-SLDU,WA-SLDU
553,WA-46,157255,SLDU,WA,SLDU-WA-46,WA-SLDU,WA-SLDU
554,WA-47,157240,SLDU,WA,SLDU-WA-47,WA-SLDU,WA-SLDU
555,WA-48,157252,SLDU,WA,SLDU-WA-48,WA-SLDU,WA-SLDU


In [167]:
ri_data = pd.read_csv("./raw-from-source/ri_sizes.csv",dtype={"Number":str, "Adj_Pop":int, "Level":str})

In [168]:
ri_data[ri_data["Level"]=="CONG"]

Unnamed: 0,Number,Adj_Pop,Level
75,1,548689,CONG
76,2,548690,CONG


In [169]:
ri_data["unique_id"] = ri_data["Level"]+"-RI-"+ri_data["Number"].astype(str).str.zfill(2)

In [170]:
ri_data["Adj_Pop"] = ri_data["Adj_Pop"].astype(int)

In [171]:
ri_data["unique_id"]

0      SLDL-RI-01
1      SLDL-RI-02
2      SLDL-RI-03
3      SLDL-RI-04
4      SLDL-RI-05
          ...    
110    SLDU-RI-34
111    SLDU-RI-35
112    SLDU-RI-36
113    SLDU-RI-37
114    SLDU-RI-38
Name: unique_id, Length: 115, dtype: object

In [174]:
ri_data[ri_data["Level"]=="CONG"]

Unnamed: 0,Number,Adj_Pop,Level,unique_id
75,1,548689,CONG,CONG-RI-01
76,2,548690,CONG,CONG-RI-02


In [175]:
ri_data_dict = dict(zip(ri_data["unique_id"], ri_data["Adj_Pop"]))

In [177]:
combined_files["Adj_Pop"] = combined_files["unique_id"].map(ri_data_dict).fillna(combined_files["Adj_Pop"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  combined_files["Adj_Pop"] = combined_files["unique_id"].map(ri_data_dict).fillna(combined_files["Adj_Pop"])


In [178]:
combined_files[combined_files["unique_id"].str.contains("RI")]

Unnamed: 0,ID,Adj_Pop,Level,State,unique_id,state-level,State-Level
76,RI-01,548689.0,CONG,RI,CONG-RI-01,RI-CONG,RI-CONG
77,RI-02,548690.0,CONG,RI,CONG-RI-02,RI-CONG,RI-CONG
995,RI-01,14793.0,SLDL,RI,SLDL-RI-01,RI-SLDL,RI-SLDL
996,RI-02,14166.0,SLDL,RI,SLDL-RI-02,RI-SLDL,RI-SLDL
997,RI-03,15151.0,SLDL,RI,SLDL-RI-03,RI-SLDL,RI-SLDL
...,...,...,...,...,...,...,...
462,RI-34,28505.0,SLDU,RI,SLDU-RI-34,RI-SLDU,RI-SLDU
463,RI-35,30309.0,SLDU,RI,SLDU-RI-35,RI-SLDU,RI-SLDU
464,RI-36,30078.0,SLDU,RI,SLDU-RI-36,RI-SLDU,RI-SLDU
465,RI-37,27638.0,SLDU,RI,SLDU-RI-37,RI-SLDU,RI-SLDU


In [179]:
combined_files["state-level"] = combined_files["State"]+"-"+combined_files["Level"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  combined_files["state-level"] = combined_files["State"]+"-"+combined_files["Level"]


In [180]:
combined_files["Adj_Pop"] = combined_files["Adj_Pop"].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  combined_files["Adj_Pop"] = combined_files["Adj_Pop"].astype(int)


In [181]:
combined_files[(combined_files["State"]=="PA") & (combined_files["Level"]=="SLDU")]

Unnamed: 0,ID,Adj_Pop,Level,State,unique_id,state-level,State-Level
379,PA-01,250243,SLDU,PA,SLDU-PA-01,PA-SLDU,PA-SLDU
380,PA-02,260277,SLDU,PA,SLDU-PA-02,PA-SLDU,PA-SLDU
381,PA-03,263993,SLDU,PA,SLDU-PA-03,PA-SLDU,PA-SLDU
382,PA-04,268248,SLDU,PA,SLDU-PA-04,PA-SLDU,PA-SLDU
383,PA-05,267205,SLDU,PA,SLDU-PA-05,PA-SLDU,PA-SLDU
384,PA-06,269699,SLDU,PA,SLDU-PA-06,PA-SLDU,PA-SLDU
385,PA-07,263697,SLDU,PA,SLDU-PA-07,PA-SLDU,PA-SLDU
386,PA-08,256726,SLDU,PA,SLDU-PA-08,PA-SLDU,PA-SLDU
387,PA-09,252137,SLDU,PA,SLDU-PA-09,PA-SLDU,PA-SLDU
388,PA-10,269925,SLDU,PA,SLDU-PA-10,PA-SLDU,PA-SLDU


## Check Again

In [182]:
combined_files["State-Level"] = combined_files["State"]+"-"+combined_files["Level"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  combined_files["State-Level"] = combined_files["State"]+"-"+combined_files["Level"]


In [183]:
combined_files["State-Level"].value_counts()

PA-SLDL    203
CT-SLDL    151
NY-SLDL    150
VA-SLDL    100
MT-SLDL    100
CA-SLDL     80
RI-SLDL     75
MD-SLDL     71
CO-SLDL     65
NY-SLDU     63
CA-CONG     52
HI-SLDL     51
PA-SLDU     50
MT-SLDU     50
WA-SLDL     49
WA-SLDU     49
MD-SLDU     47
NV-SLDL     42
DE-SLDL     41
VA-SLDU     40
CA-SLDU     40
NJ-SLDU     40
NJ-SLDL     40
RI-SLDU     38
CT-SLDU     36
CO-SLDU     35
HI-SLDU     25
DE-SLDU     21
NV-SLDU     21
NJ-CONG     12
VA-CONG     11
WA-CONG     10
MD-CONG      8
NV-CONG      4
RI-CONG      2
Name: State-Level, dtype: int64

In [184]:
combined_files = combined_files[~combined_files["ID"].str.contains("NO")]

- ~~PA-SLDL    203~~
- ~~CT-SLDL    151~~
- ~~NY-SLDL    150~~
- ~~VA-SLDL    100~~
- ~~MT-SLDL    100~~
- ~~CA-SLDL     80~~
- ~~RI-SLDL     75~~
- ~~MD-SLDL     71~~
- ~~CO-SLDL     65~~
- ~~NY-SLDU     63~~
- ~~CA-CONG     52~~
- ~~HI-SLDL     51~~
- ~~PA-SLDU     50~~
- ~~MT-SLDU     50~~
- ~~WA-SLDL     49~~
- ~~WA-SLDU     49~~
- ~~MD-SLDU     47~~
- ~~NV-SLDL     42~~
- ~~DE-SLDL     41~~
- ~~VA-SLDU     40~~
- ~~CA-SLDU     40~~
- ~~NJ-SLDU     40~~
- ~~NJ-SLDL     40~~
- ~~RI-SLDU     38~~
- ~~CT-SLDU     36~~
- ~~CO-SLDU     35~~
- ~~HI-SLDU     25~~
- ~~DE-SLDU     21~~
- ~~NV-SLDU     21~~
- ~~NJ-CONG     12~~
- ~~VA-CONG     11~~
- ~~WA-CONG     10~~
- ~~MD-CONG      8~~
- ~~NV-CONG      4~~
- ~~RI-CONG      2~~

In [185]:
state_sums = combined_files.groupby("state-level").sum()

In [186]:
state_sums["Adj_Pop"]

state-level
CA-CONG    39523437
CA-SLDL    39523437
CA-SLDU    39523437
CO-SLDL     5773714
CO-SLDU     5773714
CT-SLDL     3603566
CT-SLDU     3603566
DE-SLDL      989598
DE-SLDU      989598
HI-SLDL     1383606
HI-SLDU     1383606
MD-CONG     6175403
MD-SLDL     6175403
MD-SLDU     6175403
MT-SLDL     1082717
MT-SLDU     1082717
NJ-CONG     9283016
NJ-SLDL     9283016
NJ-SLDU     9283016
NV-CONG     3104614
NV-SLDL     3104614
NV-SLDU     3104614
NY-SLDL    20193858
NY-SLDU    20193858
PA-SLDL    13002700
PA-SLDU    13002700
RI-CONG     1097379
RI-SLDL     1097379
RI-SLDU     1097379
VA-CONG     8631393
VA-SLDL     8631393
VA-SLDU     8631393
WA-CONG     7705281
WA-SLDL     7705281
WA-SLDU     7705281
Name: Adj_Pop, dtype: int64

- CA 39523437

- CO     5773714

- CT     3603566
- DE      989598

- HI    1383606
- MD    6175403

- MT    1082717

- NJ    9283016
- NV     3104614

- NY   20193858
- PA   13002700

- RI    1097379

- VA    8631393

- WA    7705281

In [121]:
combined_files.drop(["unique_id","state-level","State-Level"], axis = 1, inplace = True)
combined_files["ID"] = combined_files["ID"].apply(lambda x: x.split("-")[1])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  combined_files.drop(["unique_id","state-level","State-Level"], axis = 1, inplace = True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  combined_files["ID"] = combined_files["ID"].apply(lambda x: x.split("-")[1])


In [187]:
combined_files.to_csv("./adjusted_districts_pop.csv", index = False)

## Unadjusted Data

In [None]:
# national_baf.columns
# national_baf.loc[national_baf.shape[0]] = [20160001001440, "AK", "AT-LARGE", "S", 37, None]
# national_baf.loc[national_baf.shape[0]] = [20160001001441, "AK", "AT-LARGE", "S", 37, None]

In [None]:
unadjusted_data_dict = {}
unadjusted_data_list = []

path = "/Users/peterhorton/Documents/RDH/raw_data/census/2020_PL_csv/"


for file in os.listdir(path):
    if "pl2020_b" in file and "bg" not in file and ".zip" not in file:
        header = file[0:2].upper()
        print(file)
        unadj_state = pd.read_csv(path + "/" + file + "/" + file + ".csv")
        unadj_state.drop([i for i in unadj_state.columns if i not in ["GEOID20", "P0010001"]], inplace = True, axis = 1)
        unadjusted_data_list.append(unadj_state)

In [None]:
unadj_state_data_df = pd.concat(unadjusted_data_list)

In [None]:
# unadj_pop_dict = dict(zip(unadj_state_data_df["GEOID20"], unadj_state_data_df["P0010001"]))

# adj_pop_dict = dict(zip(adj_state_data_df["GEOID20"], adj_state_data_df["Adj_Pop"]))

# national_baf["TOT_POP"] = national_baf["GEOID20"].map(unadj_pop_dict)
# national_baf["ADJ_POP"] = national_baf["GEOID20"].map(adj_pop_dict)

# tots = pd.read_csv("./national_baf_w_pop.csv")

In [None]:
final_merge.drop(["_merge","final_merge"], axis = 1, inplace = True)

## Checks:

In [None]:
final_merge["Adj_Pop"] = final_merge["Adj_Pop"].fillna(0)

In [None]:
final_merge["Adj_Pop"] = final_merge["Adj_Pop"].astype(int)

In [None]:
summed_states = final_merge.groupby("STATEAB").sum()

#### Note: Checking the adjusted population sums against the reports on the RDH website:

https://redistrictingdatahub.org/data/ongoing-data-projects/states-that-adjust-the-census-data-for-redistricting/

In [None]:
sum(summed_states["P0010001"])

In [None]:
summed_states

In [None]:
national_baf["ADJ_POP"].value_counts(dropna = False)

In [None]:
national_baf.to_csv("./national_baf_w_pop.csv", index = False)

In [None]:
sum(pop_data[pop_data["_merge"]=="right_only"]["P0010001"])

In [None]:
unadj_state_data_df.to_csv("./unadjusted_data_GEOID_pop.csv", index = False)

## Load in PA and HI

In [None]:
state = "PA"
pa_data = pd.read_csv("./raw-from-source/"+state+"_blocks.csv")

def mod_census(block_id):
    block_id = str(block_id)
    if "A" in block_id or "B" in block_id or "C" in block_id:
        return block_id[:len(block_id)-1]
    else:
        return block_id
    
pa_data["mod_GEOID20"] = pa_data["GEOID20"].apply(lambda x: mod_census(x))

pa_data["mod_GEOID20"].value_counts(dropna = False)

pa_data_mod = pa_data.groupby("mod_GEOID20").sum()

pa_data_mod.reset_index(drop = False, inplace = True)

pa_data_mod.rename(columns = {"mod_GEOID20":"GEOID20"}, inplace = True)

pa_data_mod = pa_data_mod[["GEOID20", "Adj_Pop"]]
adjusted_data_list.append(pa_data_mod)
        

In [None]:
hi_data = gp.read_file("/Users/peterhorton/Documents/RDH/Support/Official_State_Files/Hawaii/hi_pl2020_b_official/HIblocksFPBHPB826.shp")

sum(hi_data["HPB826"])

sum(hi_data["FPB"])

hi_data.rename(columns = {"HPB826":"Adj_Pop"}, inplace = True)
hi_data = hi_data[["GEOID20", "Adj_Pop"]]

adjusted_data_list.append(hi_data)

In [None]:
hi_data.to_csv("./HI_blocks.csv", index = False)