# 2022 Congressional Districts with Total Population from Census PL file 09/30/22

## Background:
We received a data request asking for total populations of the 2022 congressional districts.

Note that some states adjust their redistricting data, and that processing can be found [here]<https://github.com/nonpartisan-redistricting-datahub/Processing-Requests/blob/main/Adjusted_Districts_Pop_09_28_22/README.md>

## Approach:

- Concatenate PL data for all of the states
- Join to the BAF available from the RDH
- Groupby congressional district, and join to the national 2022 congressional file
- Check file
- Export file

## Links to Download Raw Files 
- [National BAF for 2022 Districts](https://redistrictingdatahub.org/dataset/national-block-assignment-file-for-2022-state-legislative-and-congressional-districts/)
- [National Congressional Districts for 2022](https://redistrictingdatahub.org/dataset/national-congressional-districts-for-2022/)
- 2020 PL data by state is available from [the RDH](https://redistrictingdatahub.org/data/download-data/)

## Processing Steps:
See attached notebook

**Note: A full "raw-from-source" file is also available upon request. Please email info@redistrictingdatahub.org for more info.


In [1]:
import pandas as pd
import geopandas as gp
import os

baf = pd.read_csv('./national_baf.csv')
state_abrvs = ['AK', 'AL', 'AR', 'AZ', 'CA', 'CO', 'CT', 'DE', 'FL', 'GA',
           'HI', 'IA', 'ID', 'IL', 'IN', 'KS', 'KY', 'LA', 'MA', 'MD', 'ME',
           'MI', 'MN', 'MO', 'MS', 'MT', 'NC', 'ND', 'NE', 'NH', 'NJ', 'NM',
           'NV', 'NY', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX',
           'UT', 'VA', 'VT', 'WA', 'WI', 'WV', 'WY']

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [2]:
def national_pl():
    pl_concat = pd.DataFrame()
    for state in state_abrvs:
        print(f"reading in {state}")
        pl = pd.read_csv(f'./csv_pl/{state.lower()}_pl2020_b.csv', dtype='unicode', low_memory=False)[['GEOID20', 'P0010001']]
        pl_concat = pd.concat([pl_concat, pl], sort=False)
        
    return pl_concat   


def baf_pl_merge():
    global natpl
    natpl = national_pl()
    natpl['GEOID20'] = natpl['GEOID20'].astype(str).str.zfill(16)
    baf['GEOID20'] = baf['GEOID20'].astype(str).str.zfill(16)
    baf_pl = baf.merge(natpl, on='GEOID20', how='outer', indicator=False)
    
    return baf_pl

In [3]:
baf_pl = baf_pl_merge()

reading in AK
reading in AL
reading in AR
reading in AZ
reading in CA
reading in CO
reading in CT
reading in DE
reading in FL
reading in GA
reading in HI
reading in IA
reading in ID
reading in IL
reading in IN
reading in KS
reading in KY
reading in LA
reading in MA
reading in MD
reading in ME
reading in MI
reading in MN
reading in MO
reading in MS
reading in MT
reading in NC
reading in ND
reading in NE
reading in NH
reading in NJ
reading in NM
reading in NV
reading in NY
reading in OH
reading in OK
reading in OR
reading in PA
reading in RI
reading in SC
reading in SD
reading in TN
reading in TX
reading in UT
reading in VA
reading in VT
reading in WA
reading in WI
reading in WV
reading in WY


In [4]:
baf_pl

Unnamed: 0,GEOID20,STATEAB,CONG,SLDU,SLDL,FLOTERIAL,P0010001
0,0010010201001000,AL,2,30,69,,21
1,0010010201001001,AL,2,30,69,,34
2,0010010201001002,AL,2,30,69,,29
3,0010010201001003,AL,2,30,69,,17
4,0010010201001004,AL,2,30,69,,0
...,...,...,...,...,...,...,...
8126951,0170978656002000,,,,,,0
8126952,0170979900000001,,,,,,0
8126953,0170979900000002,,,,,,0
8126954,0170979900000003,,,,,,0


In [5]:
print(baf.shape)
print(natpl.shape)

(8126714, 6)
(8126956, 2)


In [6]:
baf_pl[baf_pl['STATEAB'].isna()]

Unnamed: 0,GEOID20,STATEAB,CONG,SLDU,SLDL,FLOTERIAL,P0010001
8126714,0020160001001274,,,,,,0
8126715,0020160001001275,,,,,,0
8126716,0020160001001276,,,,,,0
8126717,0020160001001277,,,,,,0
8126718,0020160001001278,,,,,,0
...,...,...,...,...,...,...,...
8126951,0170978656002000,,,,,,0
8126952,0170979900000001,,,,,,0
8126953,0170979900000002,,,,,,0
8126954,0170979900000003,,,,,,0


In [7]:
baf_pl[(baf_pl['STATEAB'].isna())&(baf_pl['P0010001']!='0')]

Unnamed: 0,GEOID20,STATEAB,CONG,SLDU,SLDL,FLOTERIAL,P0010001


In [8]:
len(baf_pl['GEOID20'].str.slice(stop=3).value_counts())

50

In [9]:
def check_state_totals():
    natpl['statefips'] = natpl['GEOID20'].str.slice(stop=3)
    natpl['P0010001'] = natpl['P0010001'].astype(int)
    pl_gpby = natpl.groupby(['statefips']).sum()
    
    baf_pl['statefips'] = baf_pl['GEOID20'].str.slice(stop=3)
    baf_pl['P0010001'] = baf_pl['P0010001'].astype(int)
    baf_pl_gpby = baf_pl.groupby(['statefips']).sum()
    
    return pl_gpby == baf_pl_gpby
    
check_state_totals()

Unnamed: 0_level_0,P0010001
statefips,Unnamed: 1_level_1
1,True
2,True
4,True
5,True
6,True
8,True
9,True
10,True
12,True
13,True


In [10]:
baf_pl = baf_pl[(~baf_pl['STATEAB'].isna())]#|(baf_pl['P0010001']!=0)]
baf_pl['CD_ID'] = baf_pl['STATEAB'].astype(str) + '-' + baf_pl['CONG'].astype(str).str.upper().str.zfill(3)

cd = gp.read_file(f'zip+s3://data.redistrictingdatahub.org/web_ready_stage/NATIONAL/national_cong_2022.zip')
cd['CD_ID'] = cd['STATE'].astype(str) + '-' + cd['DISTRICT'].astype(str).str.upper().str.zfill(3)

baf_pl_sum = baf_pl.groupby(['CD_ID']).sum()

ms_dict = {'MS-2801':'MS-001', 'MS-2802':'MS-002', 'MS-2803':'MS-003', 'MS-2804':'MS-004'}
cd.loc[(cd['CD_ID'].str.contains('MS-')), 'CD_ID'] = cd.loc[(cd['CD_ID'].str.contains('MS-')), 'CD_ID'].map(ms_dict)
cd_pop_geo = cd.merge(baf_pl_sum, on="CD_ID", how='outer', indicator=True)
#cd_pop_geo.loc[cd_pop_geo['STATE']=='AK', 'P0010001'] = cd_pop_geo.loc[cd_pop_geo['STATE']=='AK', 'P0010001']+232
cd_pop_geo = cd_pop_geo[~cd_pop_geo['STATE'].isna()]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  baf_pl['CD_ID'] = baf_pl['STATEAB'].astype(str) + '-' + baf_pl['CONG'].astype(str).str.upper().str.zfill(3)


In [11]:
cd[cd['STATE']=='MS']

Unnamed: 0,DISTRICT,STATE,geometry,CD_ID
227,2801,MS,"POLYGON ((-9830026.095 3933636.182, -9830032.6...",MS-001
228,2802,MS,"POLYGON ((-10023886.339 3838108.534, -10023897...",MS-002
229,2803,MS,"POLYGON ((-10068852.210 3838874.810, -10068842...",MS-003
230,2804,MS,"POLYGON ((-9988372.388 3633047.692, -9988019.2...",MS-004


In [12]:
cd.loc[(cd['CD_ID'].str.contains('MS-'))]

Unnamed: 0,DISTRICT,STATE,geometry,CD_ID
227,2801,MS,"POLYGON ((-9830026.095 3933636.182, -9830032.6...",MS-001
228,2802,MS,"POLYGON ((-10023886.339 3838108.534, -10023897...",MS-002
229,2803,MS,"POLYGON ((-10068852.210 3838874.810, -10068842...",MS-003
230,2804,MS,"POLYGON ((-9988372.388 3633047.692, -9988019.2...",MS-004


In [13]:
cd_pop_geo

Unnamed: 0,DISTRICT,STATE,geometry,CD_ID,P0010001,_merge
0,At-Large,AK,"MULTIPOLYGON (((-18455563.423 7215576.889, -18...",AK-AT-LARGE,733391,both
1,1,AL,"POLYGON ((-9751458.168 3632414.860, -9751458.1...",AL-001,717754,both
2,2,AL,"POLYGON ((-9563720.783 3799405.395, -9563217.0...",AL-002,717755,both
3,3,AL,"POLYGON ((-9635097.250 3897801.575, -9635008.9...",AL-003,717754,both
4,4,AL,"POLYGON ((-9590022.743 4100677.324, -9589939.0...",AL-004,717754,both
...,...,...,...,...,...,...
430,7,WI,"MULTIPOLYGON (((-9995933.342 5824402.270, -999...",WI-007,736715,both
431,8,WI,"MULTIPOLYGON (((-9910574.113 5503101.483, -991...",WI-008,736714,both
432,1,WV,"POLYGON ((-8938523.270 4535974.462, -8938520.5...",WV-001,896067,both
433,2,WV,"POLYGON ((-9030263.119 4787731.719, -9030173.7...",WV-002,897649,both


In [14]:
def check_state_totals_from_CD():
    cd_gpby = cd_pop_geo.groupby(['STATE']).sum()
    baf_pl['P0010001'] = baf_pl['P0010001'].astype(int)
    baf_cd_gpby = baf_pl.groupby(['STATEAB']).sum()
    
    return cd_gpby ==baf_cd_gpby
    
check_state_totals_from_CD()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  baf_pl['P0010001'] = baf_pl['P0010001'].astype(int)


Unnamed: 0_level_0,P0010001
STATE,Unnamed: 1_level_1
AK,True
AL,True
AR,True
AZ,True
CA,True
CO,True
CT,True
DE,True
FL,True
GA,True


In [15]:
cd_pop_geo.columns

Index(['DISTRICT', 'STATE', 'geometry', 'CD_ID', 'P0010001', '_merge'], dtype='object')

In [16]:
def check_max_min(joined_cong):
    for val in list(joined_cong["STATE"].unique()):
        '''print(val)
        print("MAX:", max(joined_cong[joined_cong["STATE"]==val]["P0010001"]))
        print("MIN:", min(joined_cong[joined_cong["STATE"]==val]["P0010001"]))
        print("")'''
        #print("REPORT")
        if abs(max(joined_cong[joined_cong["STATE"]==val]["P0010001"]) - min(joined_cong[joined_cong["STATE"]==val]["P0010001"]))>10:
            print(val)
            print("MAX:", max(joined_cong[joined_cong["STATE"]==val]["P0010001"]))
            print("MIN:", min(joined_cong[joined_cong["STATE"]==val]["P0010001"]))
            print("")

check_max_min(cd_pop_geo)

AR
MAX: 753219
MIN: 752509

CA
MAX: 782247
MIN: 754875

CO
MAX: 721794
MIN: 721664

HI
MAX: 728876
MIN: 726395

IA
MAX: 797645
MIN: 797551

LA
MAX: 776333
MIN: 776268

MD
MAX: 777845
MIN: 767247

MI
MAX: 775666
MIN: 774544

NE
MAX: 653847
MIN: 653822

NJ
MAX: 779056
MIN: 771744

NM
MAX: 705846
MIN: 705832

NV
MAX: 778140
MIN: 773758

RI
MAX: 549301
MIN: 548078

VA
MAX: 788614
MIN: 779587

WA
MAX: 774871
MIN: 768710

WV
MAX: 897649
MIN: 896067



In [17]:
export_gdf = cd_pop_geo[['STATE','DISTRICT','CD_ID','P0010001','geometry']]
export_df = cd_pop_geo[['STATE','DISTRICT','CD_ID','P0010001']]

In [161]:
os.mkdir('./cd_pop_2022_csv')
os.mkdir('./cd_pop_2022_shp')

In [18]:
export_df.to_csv('./cd_pop_2022_csv/cd_pop_2022_csv.csv')
export_gdf.to_file('./cd_pop_2022_shp/cd_pop_2022_shp.shp')