# 2022 Congressional Districts with Total Population from Census PL file 09/30/22

## Background:
We received a data request asking for total populations of the 2022 congressional districts.

Note that some states adjust their redistricting data; the processing for those adjusted states can be found [here](https://github.com/nonpartisan-redistricting-datahub/Processing-Requests/blob/main/Adjusted_Districts_Pop_09_28_22/README.md).

## Approach:

- Concatenate PL data for all of the states
- Join to the BAF available from the RDH
- Group by congressional district, sum the population, and join to the national 2022 congressional district file
- Check file
- Export file

## Links to Download Raw Files 
- [National BAF for 2022 Districts](https://redistrictingdatahub.org/dataset/national-block-assignment-file-for-2022-state-legislative-and-congressional-districts/)
- [National Congressional Districts for 2022](https://redistrictingdatahub.org/dataset/national-congressional-districts-for-2022/)
- 2020 PL data by state is available from [the RDH](https://redistrictingdatahub.org/data/download-data/)

## Processing Steps:
See attached notebook

**Note:** A full "raw-from-source" file is also available upon request. Please email info@redistrictingdatahub.org for more info.


In [11]:
import pandas as pd
import geopandas as gp
import os

# Load the national block assignment file (BAF), keeping only the block ID,
# the state abbreviation, and the district-assignment columns.
baf_columns = ['GEOID20', 'STATE', 'CONG', 'SLDU', 'SLDL', 'FLOTERIAL']
baf = pd.read_csv('./national_baf_leg_cong.csv')[baf_columns]

# Postal abbreviations of the 50 states whose PL files are read below
# (DC and the territories are not in this list).
state_abrvs = [
    'AK', 'AL', 'AR', 'AZ', 'CA', 'CO', 'CT', 'DE', 'FL', 'GA',
    'HI', 'IA', 'ID', 'IL', 'IN', 'KS', 'KY', 'LA', 'MA', 'MD',
    'ME', 'MI', 'MN', 'MO', 'MS', 'MT', 'NC', 'ND', 'NE', 'NH',
    'NJ', 'NM', 'NV', 'NY', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC',
    'SD', 'TN', 'TX', 'UT', 'VA', 'VT', 'WA', 'WI', 'WV', 'WY',
]

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [12]:
def national_pl(states=None, pl_dir='./csv_pl'):
    """Read each state's 2020 PL block-level CSV and stack them into one table.

    Parameters
    ----------
    states : iterable of str, optional
        State postal abbreviations to read. Defaults to the module-level
        ``state_abrvs`` list.
    pl_dir : str, optional
        Directory that holds one ``<st>_pl2020_b/<st>_pl2020_b.csv`` file per
        state, where ``<st>`` is the lower-cased abbreviation.

    Returns
    -------
    pandas.DataFrame
        The ``GEOID20`` and ``P0010001`` columns of every file, read as
        strings. Each state's original row index is preserved (the index is
        not reset), so index labels repeat across states.
    """
    if states is None:
        states = state_abrvs

    # Collect the per-state frames and concatenate once at the end; calling
    # pd.concat inside the loop re-copies the accumulated rows on every pass.
    frames = []
    for state in states:
        print(f"reading in {state}")
        stem = f'{state.lower()}_pl2020_b'
        # dtype=str keeps GEOID20 as text so leading zeros are not lost.
        pl = pd.read_csv(os.path.join(pl_dir, stem, f'{stem}.csv'), dtype=str, low_memory=False)[['GEOID20', 'P0010001']]
        frames.append(pl)

    if not frames:
        # pd.concat raises on an empty list; return an empty table instead.
        return pd.DataFrame(columns=['GEOID20', 'P0010001'])

    return pd.concat(frames, sort=False)


def baf_pl_merge():
    """Join the national PL population table to the block assignment file.

    Side effects: stores the concatenated PL table in the global ``natpl`` and
    rewrites ``GEOID20`` in the global ``baf`` as a zero-padded string.

    Returns the outer merge of ``baf`` and ``natpl`` on ``GEOID20``, so a block
    that appears in only one of the two sources is still kept.
    """
    global natpl
    natpl = national_pl()

    # Put the join key in the same form on both sides: string, left-padded
    # with zeros to 16 characters.
    # NOTE(review): later cells take the first three characters of GEOID20 as
    # the state code, which relies on this 16-character width - confirm before
    # changing it.
    for frame in (natpl, baf):
        frame['GEOID20'] = frame['GEOID20'].astype(str).str.zfill(16)

    return baf.merge(natpl, how='outer', on='GEOID20', indicator=False)

In [13]:
# Build the block-level table of district assignments joined to PL population.
# The call also sets the global `natpl` (the concatenated PL table).
baf_pl = baf_pl_merge()

reading in AK
reading in AL
reading in AR
reading in AZ
reading in CA
reading in CO
reading in CT
reading in DE
reading in FL
reading in GA
reading in HI
reading in IA
reading in ID
reading in IL
reading in IN
reading in KS
reading in KY
reading in LA
reading in MA
reading in MD
reading in ME
reading in MI
reading in MN
reading in MO
reading in MS
reading in MT
reading in NC
reading in ND
reading in NE
reading in NH
reading in NJ
reading in NM
reading in NV
reading in NY
reading in OH
reading in OK
reading in OR
reading in PA
reading in RI
reading in SC
reading in SD
reading in TN
reading in TX
reading in UT
reading in VA
reading in VT
reading in WA
reading in WI
reading in WV
reading in WY


In [14]:
# Confirm which BAF columns were kept.
baf.columns

Index(['GEOID20', 'STATE', 'CONG', 'SLDU', 'SLDL', 'FLOTERIAL'], dtype='object')

In [15]:
# Preview the merged BAF + PL table (P0010001 should now sit next to the district columns).
baf_pl

Unnamed: 0,GEOID20,STATE,CONG,SLDU,SLDL,FLOTERIAL,P0010001
0,0240010001001000,MD,6,1,1C,NO FLOTERIAL,31
1,0240010001001001,MD,6,1,1C,NO FLOTERIAL,4
2,0240010001001002,MD,6,1,1C,NO FLOTERIAL,4
3,0240010001001003,MD,6,1,1C,NO FLOTERIAL,0
4,0240010001001004,MD,6,1,1C,NO FLOTERIAL,0
...,...,...,...,...,...,...,...
8126951,0301119400022066,MT,2,28,56,NO FLOTERIAL,6
8126952,0301119400022067,MT,2,28,56,NO FLOTERIAL,409
8126953,0301119400022068,MT,2,28,56,NO FLOTERIAL,7
8126954,0301119400022069,MT,2,28,56,NO FLOTERIAL,0


In [16]:
# Preview the BAF on its own; GEOID20 is now a zero-padded string.
baf

Unnamed: 0,GEOID20,STATE,CONG,SLDU,SLDL,FLOTERIAL
0,0240010001001000,MD,6,1,1C,NO FLOTERIAL
1,0240010001001001,MD,6,1,1C,NO FLOTERIAL
2,0240010001001002,MD,6,1,1C,NO FLOTERIAL
3,0240010001001003,MD,6,1,1C,NO FLOTERIAL
4,0240010001001004,MD,6,1,1C,NO FLOTERIAL
...,...,...,...,...,...,...
8126951,0301119400022066,MT,2,28,56,NO FLOTERIAL
8126952,0301119400022067,MT,2,28,56,NO FLOTERIAL
8126953,0301119400022068,MT,2,28,56,NO FLOTERIAL
8126954,0301119400022069,MT,2,28,56,NO FLOTERIAL


In [17]:
# Compare row counts of the BAF and the national PL table; they should match
# if both sources cover the same set of blocks.
print(baf.shape)
print(natpl.shape)

(8126956, 6)
(8126956, 2)


In [18]:
# Count the distinct three-character GEOID20 prefixes in the merged table
# (one per state, given the 16-character padded GEOID20).
len(baf_pl['GEOID20'].str.slice(stop=3).value_counts())

50

In [19]:
# Preview the concatenated national PL table.
natpl

Unnamed: 0,GEOID20,P0010001
0,0020130001001000,0
1,0020130001001001,0
2,0020130001001002,0
3,0020130001001003,0
4,0020130001001004,0
...,...,...
53764,0560459513003058,32
53765,0560459513003059,43
53766,0560459513003060,17
53767,0560459513003061,40


In [20]:
def check_state_totals(pl_df=None, merged_df=None):
    """Check that per-state population totals are unchanged by the BAF/PL merge.

    Parameters
    ----------
    pl_df : pandas.DataFrame, optional
        PL table with ``GEOID20`` and ``P0010001``. Defaults to the global
        ``natpl``.
    merged_df : pandas.DataFrame, optional
        Merged BAF/PL table with ``GEOID20`` and ``P0010001``. Defaults to the
        global ``baf_pl``.

    Returns
    -------
    pandas.DataFrame
        One row per ``statefips`` value with a boolean ``P0010001`` column that
        is True where the two totals agree.

    Notes
    -----
    Both input frames are modified in place: a ``statefips`` column (the first
    three characters of ``GEOID20``) is added and ``P0010001`` is cast to int.
    Later cells rely on the integer ``P0010001``. The cast raises if a
    population value is missing, and the final comparison raises if the two
    tables do not contain the same set of ``statefips`` values.
    """
    pl_df = natpl if pl_df is None else pl_df
    merged_df = baf_pl if merged_df is None else merged_df

    totals = []
    for frame in (pl_df, merged_df):
        frame['statefips'] = frame['GEOID20'].str.slice(stop=3)
        frame['P0010001'] = frame['P0010001'].astype(int)
        # Sum only the population column. Summing every column depends on
        # pandas silently dropping the string columns, which pandas >= 2.0 no
        # longer does.
        totals.append(frame.groupby('statefips')[['P0010001']].sum())

    pl_gpby, baf_pl_gpby = totals
    return pl_gpby == baf_pl_gpby
    
# Run the check; every state should show True.
check_state_totals()

Unnamed: 0_level_0,P0010001
statefips,Unnamed: 1_level_1
1,True
2,True
4,True
5,True
6,True
8,True
9,True
10,True
12,True
13,True


In [22]:
# Preview the merged table again; it now carries the statefips column added by the state-total check.
baf_pl

Unnamed: 0,GEOID20,STATE,CONG,SLDU,SLDL,FLOTERIAL,P0010001,statefips
0,0240010001001000,MD,6,1,1C,NO FLOTERIAL,31,024
1,0240010001001001,MD,6,1,1C,NO FLOTERIAL,4,024
2,0240010001001002,MD,6,1,1C,NO FLOTERIAL,4,024
3,0240010001001003,MD,6,1,1C,NO FLOTERIAL,0,024
4,0240010001001004,MD,6,1,1C,NO FLOTERIAL,0,024
...,...,...,...,...,...,...,...,...
8126951,0301119400022066,MT,2,28,56,NO FLOTERIAL,6,030
8126952,0301119400022067,MT,2,28,56,NO FLOTERIAL,409,030
8126953,0301119400022068,MT,2,28,56,NO FLOTERIAL,7,030
8126954,0301119400022069,MT,2,28,56,NO FLOTERIAL,0,030


### Note: the district number is zero-padded to three characters (`zfill(3)`) because the CD IDs in the congressional shapefile (linked from S3 below) are padded to three. If a future version of the file pads to two characters only, change the `zfill` argument to 2.



In [60]:
# Keep every block that has a district assignment or a non-zero population,
# i.e. drop only rows that are both unassigned and unpopulated. The .copy()
# detaches the result so the CD_ID assignment below is not made on a slice
# (which triggers SettingWithCopyWarning).
baf_pl = baf_pl[(~baf_pl['STATE'].isna())|(baf_pl['P0010001']!=0)].copy()
baf_pl['CD_ID'] = baf_pl['STATE'].astype(str) + '-' + baf_pl['CONG'].astype(str).str.upper().str.zfill(3)

# TODO(PETER): add the S3 link to the national congressional district
# shapefile here; the path below is still a placeholder.
cd = gp.read_file(f'zip+ ')
cd['CD_ID'] = cd['STATE'].astype(str) + '-' + cd['DISTRICT'].astype(str).str.upper().str.zfill(3)

# Total population per congressional district. Only the population column is
# summed: summing every column depends on pandas dropping the string columns,
# which pandas >= 2.0 no longer does.
baf_pl_sum = baf_pl.groupby('CD_ID')[['P0010001']].sum()

# The Mississippi districts in the shapefile are numbered 2801-2804; rewrite
# them to the 001-004 form used by the BAF-derived CD_ID. replace() leaves any
# value that is not in the dictionary untouched, whereas map() would turn it
# into NaN.
ms_dict = {'MS-2801':'MS-001', 'MS-2802':'MS-002', 'MS-2803':'MS-003', 'MS-2804':'MS-004'}
cd['CD_ID'] = cd['CD_ID'].replace(ms_dict)
ms_dist_dict = {'2801':'01', '2802':'02', '2803':'03', '2804':'04'}
ms_rows = cd['CD_ID'].str.startswith('MS-')
cd.loc[ms_rows, 'DISTRICT'] = cd.loc[ms_rows, 'DISTRICT'].replace(ms_dist_dict)

# Attach the district totals to the district geometries. The outer join plus
# the _merge indicator exposes CD_IDs found on only one side; rows with no
# STATE (CD_IDs absent from the shapefile) are then dropped.
cd_pop_geo = cd.merge(baf_pl_sum, on="CD_ID", how='outer', indicator=True)
cd_pop_geo = cd_pop_geo[~cd_pop_geo['STATE'].isna()]

In [61]:
# Should be empty: rows without a STATE were dropped at the end of the previous cell.
cd_pop_geo[cd_pop_geo['STATE'].isna()]

Unnamed: 0,DISTRICT,STATE,geometry,CD_ID,P0010001,_merge


In [62]:
def check_state_totals_from_CD(cd_df=None, blocks_df=None):
    """Check that district totals add up to the block-level state totals.

    Parameters
    ----------
    cd_df : DataFrame, optional
        District-level table with ``STATE`` and ``P0010001``. Defaults to the
        global ``cd_pop_geo``.
    blocks_df : DataFrame, optional
        Block-level table with ``STATE`` and ``P0010001``. Defaults to the
        global ``baf_pl``. Its ``P0010001`` column is cast to int in place.

    Returns
    -------
    pandas.DataFrame
        One row per ``STATE`` with a boolean ``P0010001`` column that is True
        where the two totals agree. The comparison raises if the two tables do
        not contain the same set of states.
    """
    cd_df = cd_pop_geo if cd_df is None else cd_df
    blocks_df = baf_pl if blocks_df is None else blocks_df

    # Sum only the population column. Summing every column depends on pandas
    # dropping the string/geometry columns, which pandas >= 2.0 no longer does.
    cd_gpby = cd_df.groupby('STATE')[['P0010001']].sum()

    blocks_df['P0010001'] = blocks_df['P0010001'].astype(int)
    baf_cd_gpby = blocks_df.groupby('STATE')[['P0010001']].sum()

    return cd_gpby == baf_cd_gpby
    
# Run the check; every state should show True.
check_state_totals_from_CD()

Unnamed: 0_level_0,P0010001
STATE,Unnamed: 1_level_1
AK,True
AL,True
AR,True
AZ,True
CA,True
CO,True
CT,True
DE,True
FL,True
GA,True


In [63]:
def check_max_min(joined_cong, threshold=10):
    """Print the states whose district populations differ by more than ``threshold``.

    Parameters
    ----------
    joined_cong : DataFrame
        District-level table with ``STATE`` and ``P0010001`` columns.
    threshold : int, optional
        Largest max-minus-min population gap within a state that is not
        reported. Defaults to 10.

    For each reported state the function prints the state, its largest
    district population ("MAX:"), its smallest ("MIN:"), and a blank line.
    States are reported in order of first appearance. Rows with a missing
    STATE are skipped, and missing populations are ignored when taking the
    max and min.
    """
    # One grouped pass replaces re-filtering the frame for every state;
    # sort=False keeps the states in order of first appearance.
    spread = joined_cong.groupby("STATE", sort=False)["P0010001"].agg(["max", "min"])

    for val, largest, smallest in spread.itertuples(name=None):
        if largest - smallest > threshold:
            print(val)
            print("MAX:", largest)
            print("MIN:", smallest)
            print("")

# List the states whose districts differ in population by more than 10 people.
check_max_min(cd_pop_geo)

AR
MAX: 753219
MIN: 752509

CA
MAX: 782247
MIN: 754875

CO
MAX: 721794
MIN: 721664

HI
MAX: 728876
MIN: 726395

IA
MAX: 797645
MIN: 797551

LA
MAX: 776333
MIN: 776268

MD
MAX: 777845
MIN: 767247

MI
MAX: 775666
MIN: 774544

NE
MAX: 653847
MIN: 653822

NJ
MAX: 779056
MIN: 771744

NM
MAX: 705846
MIN: 705832

NV
MAX: 778140
MIN: 773758

RI
MAX: 549301
MIN: 548078

VA
MAX: 788614
MIN: 779587

WA
MAX: 774871
MIN: 768710

WV
MAX: 897649
MIN: 896067



In [64]:
# Columns shared by both export products; the shapefile version also carries
# the district geometry.
export_columns = ['STATE', 'DISTRICT', 'CD_ID', 'P0010001']
export_df = cd_pop_geo[export_columns]
export_gdf = cd_pop_geo[export_columns + ['geometry']]

In [65]:
# Spot-check the first rows of the tabular export.
export_df.head()

Unnamed: 0,STATE,DISTRICT,CD_ID,P0010001
0,AK,At-Large,AK-AT-LARGE,733391
1,AL,1,AL-001,717754
2,AL,2,AL-002,717755
3,AL,3,AL-003,717754
4,AL,4,AL-004,717754


In [161]:
# Create the output folders. exist_ok=True lets this cell be re-run without
# raising FileExistsError when the folders are already there.
os.makedirs('./national_cd_pop_2022_csv', exist_ok=True)
os.makedirs('./national_cd_pop_2022_shp', exist_ok=True)

In [18]:
# Write the district populations as a plain CSV and, with geometry, as a shapefile.
csv_path = './national_cd_pop_2022_csv/national_cd_pop_2022_csv.csv'
shp_path = './national_cd_pop_2022_shp/national_cd_pop_2022_shp.shp'

export_df.to_csv(csv_path, index=False)
export_gdf.to_file(shp_path)