# Preparing County-Level Choropleth Map Data

To streamline the Streamlit app's computations, we prepare the static data involved in the choropleth map. This involves collecting all that we know about the AP performances, Income, and Population for each county in Georgia, Massachusetts, and Wisconsin, as well as the shape data so we may draw those counties.

In [104]:
import os
import pandas as pd
import numpy as np
import math
from pathlib import Path
import geopandas as gpd
import sys
sys.path.append("..") # Adds higher directory to python modules path.
from data_loaders import *
prefix = '../'

# The Order of Building the Choropleth Data
0. Collect all counties and years under consideration
1. Set up skeleton: all combinations of counties and years
2. Gather all AP performance data per state
3. Fill in the skeleton with the performance data for all three states individually
4. Collect all income and population data for each county
5. Fill in the skeleton with the income and population data
6. Output the enriched skeleton

## All GEOIDs

In [105]:
# Mapping of state abbreviations to FIPS codes
state_fips_codes = {
    'WI': '55',
    'MA': '25',
    'GA': '13'
}
states_of_interest = state_fips_codes.keys()
state_fips_of_interest = [state_fips_codes[state] for state in states_of_interest]

# Load and filter the shapefile
gdf = gpd.read_file('../data/county_shapes/cb_2018_us_county_5m.shp')
gdf_states = gdf[gdf['STATEFP'].isin(state_fips_of_interest)]
gdf_states.reset_index(inplace = True)
fips_to_state = { code: abbr for abbr, code in state_fips_codes.items() }
gdf_states['State_Abbreviation'] = gdf_states['STATEFP'].map(fips_to_state)
gdf_states['GEOID'] = gdf_states['GEOID'].astype(str)
gdf_states.rename(columns = {'NAME' : 'County'}, inplace = True)
gdf_states = gdf_states[['County', 'State_Abbreviation', 'GEOID', 'geometry']]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gdf_states.rename(columns = {'NAME' : 'County'}, inplace = True)


## Cross with all Years

In [106]:
years = [2019, 2020, 2021, 2022]
geoids = gdf_states['GEOID'].unique()
# skeleton = pd.DataFrame([(year, geoid) for year in years for geoid in geoids], columns=['Year', 'GEOID'])
skeleton = gdf_states.merge(pd.DataFrame({'Year': years}), how = 'cross')
skeleton['PassRate'] = None
skeleton['Income'] = None
skeleton['Population'] = None
skeleton.head()

Unnamed: 0,County,State_Abbreviation,GEOID,geometry,Year,PassRate,Income,Population
0,Haralson,GA,13143,"POLYGON ((-85.38651 33.90172, -85.21279 33.899...",2019,,,
1,Haralson,GA,13143,"POLYGON ((-85.38651 33.90172, -85.21279 33.899...",2020,,,
2,Haralson,GA,13143,"POLYGON ((-85.38651 33.90172, -85.21279 33.899...",2021,,,
3,Haralson,GA,13143,"POLYGON ((-85.38651 33.90172, -85.21279 33.899...",2022,,,
4,Kewaunee,WI,55061,"POLYGON ((-87.76532 44.4145, -87.76238 44.6445...",2019,,,


## Get AP Performance for each State

### Georgia

In [107]:
GA_raw = gimmeGA(prefix = prefix)
GA_raw.head()

Unnamed: 0,SCHOOL_DSTRCT_NM,TEST_CMPNT_TYP_NM,NUMBER_TESTS_TAKEN,NOTESTS_3ORHIGHER,Year,Pass Rate,County
0,Appling County,ALL Subjects,92.0,37.0,2019,0.402174,Appling
1,Appling County,Biology,12.0,7.0,2019,0.583333,Appling
2,Appling County,Eng. Literature & Comp,10.0,3.0,2019,0.3,Appling
3,Appling County,Geography: Human,30.0,7.0,2019,0.233333,Appling
4,Appling County,Psychology,23.0,13.0,2019,0.565217,Appling


In [108]:
GA_map_data = GA_raw[GA_raw['TEST_CMPNT_TYP_NM'] == 'ALL Subjects'][['County', 'Year', 'Pass Rate']]
GA_map_data.reset_index(inplace = True)
GA_map_data['State_Abbreviation'] = 'GA'
GA_map_data.rename(columns = {'Pass Rate' : 'PassRate'}, inplace = True)
GA_map_data = GA_map_data[['County', 'State_Abbreviation', 'Year', 'PassRate']]
GA_map_data['PassRate'] *= 100

# Change some problematic names
GA_map_data['County'] = GA_map_data['County'].replace('Savannah-Chatham', 'Chatham')
GA_map_data['County'] = GA_map_data['County'].replace('Griffin-Spalding', 'Spalding')
GA_map_data['County'] = GA_map_data['County'].replace('Thomaston-Upson', 'Upson')
GA_map_data

Unnamed: 0,County,State_Abbreviation,Year,PassRate
0,Appling,GA,2019,40.217391
1,Atkinson,GA,2019,21.621622
2,Baldwin,GA,2019,28.260870
3,Banks,GA,2019,43.243243
4,Barrow,GA,2019,46.895641
...,...,...,...,...
445,Worth,GA,2022,35.135135
446,Fulton,GA,2019,64.797958
447,Fulton,GA,2020,50.884339
448,Fulton,GA,2021,46.999291


### Georgia

In [109]:
GA_raw = gimmeGA(prefix = prefix)
GA_raw = GA_raw[GA_raw['TEST_CMPNT_TYP_NM'] == 'ALL Subjects'][['County', 'Year', 'Pass Rate']]
GA_raw.reset_index(inplace = True)
GA_raw.rename(columns = {'Pass Rate' : 'PassRate'}, inplace = True)
GA_raw['State_Abbreviation'] = 'GA'
GA_raw['PassRate'] *= 100
GA_map_data = GA_raw[['County', 'State_Abbreviation', 'Year', 'PassRate']]
GA_map_data.head()

Unnamed: 0,County,State_Abbreviation,Year,PassRate
0,Appling,GA,2019,40.217391
1,Atkinson,GA,2019,21.621622
2,Baldwin,GA,2019,28.26087
3,Banks,GA,2019,43.243243
4,Barrow,GA,2019,46.895641


### Massachusetts

In [110]:
# MA_map_data
MA_raw = pd.read_excel(prefix + 'data/MA_data/county_passrate_19_22.xlsx', sheet_name='2019-22')
MA_raw.rename(columns = {'Pass Rate' : 'PassRate'}, inplace = True)
MA_raw['State_Abbreviation'] = 'MA'
MA_raw['PassRate'] *= 100
MA_map_data = MA_raw[['County', 'State_Abbreviation', 'Year', 'PassRate']]
MA_map_data

Unnamed: 0,County,State_Abbreviation,Year,PassRate
0,Barnstable,MA,2019,90.843271
1,Berkshire,MA,2019,75.224955
2,Bristol,MA,2019,91.726619
3,Essex,MA,2019,85.053191
4,Franklin,MA,2019,86.610879
5,Hampden,MA,2019,87.232182
6,Hampshire,MA,2019,89.751319
7,Middlesex,MA,2019,70.509199
8,Nantucket,MA,2019,83.823529
9,Norfolk,MA,2019,78.714436


### Wisconsin

In [111]:
WI_raw = pd.read_csv('../data/Wisconsin/Wisconsin_combined.csv')
WI_raw.rename(columns = {
    'COUNTY' : 'County',
    'PERCENT_3_OR_ABOVE' : 'PassRate'
    }, inplace = True)
rename_years = {
    '2018-19' : 2018, 
    '2019-20' : 2019, 
    '2020-21' : 2020, 
    '2021-22' : 2021, 
    '2022-23' : 2022
}
WI_raw['Year'] = WI_raw.apply(lambda row : rename_years[row['Year']], axis = 1)
WI_raw['State_Abbreviation'] = 'WI'
WI_raw['County'] = WI_raw['County'].replace('Saint Croix', 'St. Croix')
WI_map_data = WI_raw[['County', 'State_Abbreviation', 'Year', 'PassRate']]
WI_map_data.head()

Unnamed: 0,County,State_Abbreviation,Year,PassRate
0,Adams,WI,2018,28.947368
1,Ashland,WI,2018,62.5
2,Barron,WI,2018,51.530612
3,Bayfield,WI,2018,64.285714
4,Brown,WI,2018,69.745411


## Add AP Performance to Skeleton

In [112]:
map_data = pd.concat([GA_map_data, MA_map_data, WI_map_data], ignore_index = True) 
merged = skeleton.merge(
    map_data,
    on = ['County', 'State_Abbreviation', 'Year'],
    how = 'left',
    suffixes = ('', '_map_data')
)
merged['PassRate'] = merged['PassRate'].fillna(merged['PassRate_map_data'])
merged = merged.drop(columns=['PassRate_map_data'])
skeleton = merged
skeleton.head()

  merged['PassRate'] = merged['PassRate'].fillna(merged['PassRate_map_data'])


Unnamed: 0,County,State_Abbreviation,GEOID,geometry,Year,PassRate,Income,Population
0,Haralson,GA,13143,"POLYGON ((-85.38651 33.90172, -85.21279 33.899...",2019,,,
1,Haralson,GA,13143,"POLYGON ((-85.38651 33.90172, -85.21279 33.899...",2020,,,
2,Haralson,GA,13143,"POLYGON ((-85.38651 33.90172, -85.21279 33.899...",2021,33.333333,,
3,Haralson,GA,13143,"POLYGON ((-85.38651 33.90172, -85.21279 33.899...",2022,72.727273,,
4,Kewaunee,WI,55061,"POLYGON ((-87.76532 44.4145, -87.76238 44.6445...",2019,61.290323,,


## Get Income and Population Data

### Income

In [113]:
CAINC_raw = pd.read_csv(prefix + 'data/CAINC_Incomes_Counties_2019_2022.csv')
incomes = CAINC_raw[CAINC_raw['Description'] == 'Per capita personal income (dollars) 2/']
incomes = incomes[incomes['GeoName'].str.contains(',')]
def split_county_state(geoName, county = True):
    S = geoName.split(', ')
    return S[0] if county else S[len(S) - 1]
incomes['County'] = incomes.apply(lambda row: split_county_state(row['GeoName'], county = True), axis = 1)
incomes['State_Abbreviation'] = incomes.apply(lambda row: split_county_state(row['GeoName'], county = False), axis = 1)
incomes[['County', 'State_Abbreviation', '2019', '2020', '2021', '2022']]
melted = incomes.melt(
    id_vars = ['County', 'State_Abbreviation'],
    value_vars = ['2019', '2020', '2021', '2022'],
    var_name = 'Year',
    value_name = 'Income'
)
melted['Year'] = melted['Year'].astype(int)
melted['County'] = melted['County'].replace('Saint Croix', 'St. Croix')
incomes = melted
incomes.head(10)

Unnamed: 0,County,State_Abbreviation,Year,Income
0,Autauga,AL,2019,42550
1,Baldwin,AL,2019,47911
2,Barbour,AL,2019,34685
3,Bibb,AL,2019,32104
4,Blount,AL,2019,36561
5,Bullock,AL,2019,27124
6,Butler,AL,2019,37702
7,Calhoun,AL,2019,36747
8,Chambers,AL,2019,33542
9,Cherokee,AL,2019,37831


### Population

In [114]:
population = CAINC_raw[CAINC_raw['Description'] == 'Population (persons) 1/']
population = population[population['GeoName'].str.contains(',')]
def split_county_state(geoName, county = True):
    S = geoName.split(', ')
    return S[0] if county else S[len(S) - 1]
population['County'] = population.apply(lambda row: split_county_state(row['GeoName'], county = True), axis = 1)
population['State_Abbreviation'] = population.apply(lambda row: split_county_state(row['GeoName'], county = False), axis = 1)
population[['County', 'State_Abbreviation', '2019', '2020', '2021', '2022']]
melted = population.melt(
    id_vars = ['County', 'State_Abbreviation'],
    value_vars = ['2019', '2020', '2021', '2022'],
    var_name = 'Year',
    value_name = 'Population'
)
melted['Year'] = melted['Year'].astype(int)
melted['County'] = melted['County'].replace('Saint Croix', 'St. Croix')
population = melted
population.head(10)

Unnamed: 0,County,State_Abbreviation,Year,Population
0,Autauga,AL,2019,58245
1,Baldwin,AL,2019,227079
2,Barbour,AL,2019,25205
3,Bibb,AL,2019,22405
4,Blount,AL,2019,58956
5,Bullock,AL,2019,10455
6,Butler,AL,2019,19097
7,Calhoun,AL,2019,116669
8,Chambers,AL,2019,34914
9,Cherokee,AL,2019,25061


# Add Income and Population data to Skeleton

### Income

In [116]:
merged = skeleton.merge(
    incomes,
    on = ['County', 'State_Abbreviation', 'Year'],
    how = 'left',
    suffixes = ('', '_incomes')
)

merged['Income'] = merged['Income'].fillna(merged['Income_incomes'])
merged = merged.drop(columns=['Income_incomes'])
skeleton = merged
skeleton

Unnamed: 0,County,State_Abbreviation,GEOID,geometry,Year,PassRate,Income,Population
0,Haralson,GA,13143,"POLYGON ((-85.38651 33.90172, -85.21279 33.899...",2019,,38099,
1,Haralson,GA,13143,"POLYGON ((-85.38651 33.90172, -85.21279 33.899...",2020,,40931,
2,Haralson,GA,13143,"POLYGON ((-85.38651 33.90172, -85.21279 33.899...",2021,33.333333,44439,
3,Haralson,GA,13143,"POLYGON ((-85.38651 33.90172, -85.21279 33.899...",2022,72.727273,45136,
4,Kewaunee,WI,55061,"POLYGON ((-87.76532 44.4145, -87.76238 44.6445...",2019,61.290323,48131,
...,...,...,...,...,...,...,...,...
975,Ashland,WI,55003,"MULTIPOLYGON (((-90.46546 47.00259, -90.45745 ...",2022,43.750000,46014,
976,Webster,GA,13307,"POLYGON ((-84.65501 31.96153, -84.64536 31.961...",2019,,37833,
977,Webster,GA,13307,"POLYGON ((-84.65501 31.96153, -84.64536 31.961...",2020,,39904,
978,Webster,GA,13307,"POLYGON ((-84.65501 31.96153, -84.64536 31.961...",2021,,40385,


### Population

In [117]:
merged = skeleton.merge(
    population,
    on = ['County', 'State_Abbreviation', 'Year'],
    how = 'left',
    suffixes = ('', '_populations')
)

merged['Population'] = merged['Population'].fillna(merged['Population_populations'])
merged = merged.drop(columns=['Population_populations'])
skeleton = merged
skeleton

Unnamed: 0,County,State_Abbreviation,GEOID,geometry,Year,PassRate,Income,Population
0,Haralson,GA,13143,"POLYGON ((-85.38651 33.90172, -85.21279 33.899...",2019,,38099,29534
1,Haralson,GA,13143,"POLYGON ((-85.38651 33.90172, -85.21279 33.899...",2020,,40931,30037
2,Haralson,GA,13143,"POLYGON ((-85.38651 33.90172, -85.21279 33.899...",2021,33.333333,44439,30579
3,Haralson,GA,13143,"POLYGON ((-85.38651 33.90172, -85.21279 33.899...",2022,72.727273,45136,31337
4,Kewaunee,WI,55061,"POLYGON ((-87.76532 44.4145, -87.76238 44.6445...",2019,61.290323,48131,20596
...,...,...,...,...,...,...,...,...
975,Ashland,WI,55003,"MULTIPOLYGON (((-90.46546 47.00259, -90.45745 ...",2022,43.750000,46014,16039
976,Webster,GA,13307,"POLYGON ((-84.65501 31.96153, -84.64536 31.961...",2019,,37833,2353
977,Webster,GA,13307,"POLYGON ((-84.65501 31.96153, -84.64536 31.961...",2020,,39904,2352
978,Webster,GA,13307,"POLYGON ((-84.65501 31.96153, -84.64536 31.961...",2021,,40385,2356


## Add to/Clean the Skeleton

In [118]:
skeleton['County_State'] = skeleton['County'] + ', ' + skeleton['State_Abbreviation']
skeleton = skeleton[['County', 'State_Abbreviation', 'Year', 'PassRate', 'Income', 'Population', 'County_State', 'GEOID', 'geometry']]
skeleton.sample(10)

Unnamed: 0,County,State_Abbreviation,Year,PassRate,Income,Population,County_State,GEOID,geometry
188,Newton,GA,2019,35.176282,34978,111445,"Newton, GA",13217,"POLYGON ((-84.04449 33.52578, -84.03212 33.539..."
689,Terrell,GA,2020,,42725,9138,"Terrell, GA",13273,"POLYGON ((-84.59978 31.92017, -84.45331 31.919..."
839,Brooks,GA,2022,15.662651,39500,16253,"Brooks, GA",13027,"POLYGON ((-83.74226 30.74204, -83.74082 30.818..."
605,Langlade,WI,2020,30.15873,47949,19462,"Langlade, WI",55067,"POLYGON ((-89.42482 45.46717, -89.36405 45.469..."
967,Harris,GA,2022,69.781931,58675,36276,"Harris, GA",13145,"POLYGON ((-85.18579 32.87027, -85.10949 32.870..."
787,Paulding,GA,2022,61.964736,48143,178421,"Paulding, GA",13223,"POLYGON ((-85.04983 33.95264, -84.97868 33.951..."
113,Polk,GA,2020,33.125,36460,42923,"Polk, GA",13233,"POLYGON ((-85.42188 34.08082, -85.28332 34.079..."
449,Worth,GA,2020,15.625,38257,20738,"Worth, GA",13321,"POLYGON ((-84.03234 31.71677, -84.00762 31.735..."
236,McIntosh,GA,2019,,39985,11204,"McIntosh, GA",13191,"POLYGON ((-81.66321 31.53867, -81.62222 31.551..."
541,Sumter,GA,2020,29.032258,40077,29516,"Sumter, GA",13261,"POLYGON ((-84.4438 31.96898, -84.43301 32.0419..."


In [119]:
skeleton.to_csv('States_Counties_Map_Data.csv', index = False)