# Preparing County-Level Choropleth Map Data

To streamline the Streamlit app's computations, we precompute the static data used by the choropleth map.

In [1]:
import os
import pandas as pd
import numpy as np
import math
from pathlib import Path
import geopandas as gpd
import sys
# Make the parent directory importable so the data_loaders module below can be found.
sys.path.append("..") # Adds higher directory to python modules path.
# NOTE(review): the star import hides where names come from; gimmeGA and
# gimmeCountyIncomes, called later in this notebook, presumably come from here — confirm.
from data_loaders import *
# Relative path prefix passed to the loader helpers (used as prefix=prefix below).
prefix = '../'

# The Order of Building the Choropleth Data
1. Set up skeleton: need all combinations of counties and years
2. Gather all AP performance data per state
3. Fill in the skeleton with the performance data for all three states individually
4. Fill in the skeleton with the income data for all three states individually
5. Output the filled-in skeleton; any value that was never filled in stays null

## All GEOIDs

In [2]:
# Mapping of state abbreviations to FIPS codes
state_fips_codes = {
    'WI': '55',
    'MA': '25',
    'GA': '13'
}
states_of_interest = list(state_fips_codes)
state_fips_of_interest = list(state_fips_codes.values())
# Reverse lookup (FIPS code -> abbreviation) used to label every county row.
fips_to_state = {code: abbr for abbr, code in state_fips_codes.items()}

# Load the national county shapefile and keep only the counties of the states of interest.
gdf = gpd.read_file('../data/county_shapes/cb_2018_us_county_5m.shp')
in_scope = gdf['STATEFP'].isin(state_fips_of_interest)

# .copy() makes gdf_states an independent frame, so the column assignments below
# do not raise SettingWithCopyWarning; drop=True avoids a throwaway 'index' column.
gdf_states = gdf[in_scope].copy().reset_index(drop=True)
gdf_states['State_Abbreviation'] = gdf_states['STATEFP'].map(fips_to_state)
gdf_states['GEOID'] = gdf_states['GEOID'].astype(str)
gdf_states = gdf_states.rename(columns={'NAME': 'County'})

# Keep only the columns the rest of the notebook uses.
gdf_states = gdf_states[['County', 'State_Abbreviation', 'GEOID', 'geometry']]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gdf_states.rename(columns = {'NAME' : 'County'}, inplace = True)


## Cross with all Years

In [3]:
# Pair every county row with every year so each (county, year) combination
# has exactly one row in the skeleton.
years = [2019, 2020, 2021, 2022]
geoids = gdf_states['GEOID'].unique()
year_frame = pd.DataFrame({'Year': years})
skeleton = gdf_states.merge(year_frame, how='cross')

# Empty placeholder columns; the later merge cells fill them in.
for placeholder in ('PassRate', 'Income'):
    skeleton[placeholder] = None
skeleton.head()

Unnamed: 0,County,State_Abbreviation,GEOID,geometry,Year,PassRate,Income
0,Haralson,GA,13143,"POLYGON ((-85.38651 33.90172, -85.21279 33.899...",2019,,
1,Haralson,GA,13143,"POLYGON ((-85.38651 33.90172, -85.21279 33.899...",2020,,
2,Haralson,GA,13143,"POLYGON ((-85.38651 33.90172, -85.21279 33.899...",2021,,
3,Haralson,GA,13143,"POLYGON ((-85.38651 33.90172, -85.21279 33.899...",2022,,
4,Kewaunee,WI,55061,"POLYGON ((-87.76532 44.4145, -87.76238 44.6445...",2019,,


## Get AP Performance for each State

### Georgia

In [4]:
# Load the raw Georgia AP results through the project loader.
# NOTE(review): gimmeGA is not defined in this notebook — presumably it comes from
# the data_loaders star import; confirm.
GA_raw = gimmeGA(prefix = prefix)
GA_raw.head()

Unnamed: 0,SCHOOL_DSTRCT_NM,TEST_CMPNT_TYP_NM,NUMBER_TESTS_TAKEN,NOTESTS_3ORHIGHER,Year,Pass Rate,County
0,Appling County,ALL Subjects,92.0,37.0,2019,0.402174,Appling
1,Appling County,Biology,12.0,7.0,2019,0.583333,Appling
2,Appling County,Eng. Literature & Comp,10.0,3.0,2019,0.3,Appling
3,Appling County,Geography: Human,30.0,7.0,2019,0.233333,Appling
4,Appling County,Psychology,23.0,13.0,2019,0.565217,Appling


In [5]:
# Rows that carry the pass rate across all subjects.
all_subjects = GA_raw['TEST_CMPNT_TYP_NM'] == 'ALL Subjects'

# District names that differ from the plain county name used elsewhere.
problematic_names = {
    'Savannah-Chatham': 'Chatham',
    'Griffin-Spalding': 'Spalding',
    'Thomaston-Upson': 'Upson',
}

GA_map_data = (
    GA_raw.loc[all_subjects, ['County', 'Year', 'Pass Rate']]
    .reset_index(drop=True)
    .rename(columns={'Pass Rate': 'PassRate'})
    .assign(State_Abbreviation='GA')
    [['County', 'State_Abbreviation', 'Year', 'PassRate']]
)
# Scale the pass rate by 100 (0.402174 -> 40.2174).
GA_map_data['PassRate'] = GA_map_data['PassRate'] * 100
GA_map_data['County'] = GA_map_data['County'].replace(problematic_names)
GA_map_data

Unnamed: 0,County,State_Abbreviation,Year,PassRate
0,Appling,GA,2019,40.217391
1,Atkinson,GA,2019,21.621622
2,Baldwin,GA,2019,28.260870
3,Banks,GA,2019,43.243243
4,Barrow,GA,2019,46.895641
...,...,...,...,...
445,Worth,GA,2022,35.135135
446,Fulton,GA,2019,64.797958
447,Fulton,GA,2020,50.884339
448,Fulton,GA,2021,46.999291


### Georgia

In [6]:
# Reloads the raw Georgia AP results; this repeats the load done in the earlier Georgia cell.
# NOTE(review): gimmeGA presumably comes from the data_loaders star import — confirm.
GA_raw = gimmeGA(prefix = prefix)
GA_raw.head()

Unnamed: 0,SCHOOL_DSTRCT_NM,TEST_CMPNT_TYP_NM,NUMBER_TESTS_TAKEN,NOTESTS_3ORHIGHER,Year,Pass Rate,County
0,Appling County,ALL Subjects,92.0,37.0,2019,0.402174,Appling
1,Appling County,Biology,12.0,7.0,2019,0.583333,Appling
2,Appling County,Eng. Literature & Comp,10.0,3.0,2019,0.3,Appling
3,Appling County,Geography: Human,30.0,7.0,2019,0.233333,Appling
4,Appling County,Psychology,23.0,13.0,2019,0.565217,Appling


In [7]:
# Rebuild the Georgia map data from the all-subject rows of GA_raw.
GA_map_data = GA_raw[GA_raw['TEST_CMPNT_TYP_NM'] == 'ALL Subjects'][['County', 'Year', 'Pass Rate']]
GA_map_data = GA_map_data.reset_index(drop = True).rename(columns = {'Pass Rate' : 'PassRate'})
GA_map_data['State_Abbreviation'] = 'GA'
GA_map_data = GA_map_data[['County', 'State_Abbreviation', 'Year', 'PassRate']]
GA_map_data['PassRate'] *= 100  # scale 0.402174 -> 40.2174

# This cell overwrites GA_map_data, so the district-name fixes must be applied here
# as well. Without them these three districts keep names that the later merge on
# 'County' cannot match against the county names in the skeleton.
GA_map_data['County'] = GA_map_data['County'].replace({
    'Savannah-Chatham': 'Chatham',
    'Griffin-Spalding': 'Spalding',
    'Thomaston-Upson': 'Upson',
})
GA_map_data

Unnamed: 0,County,State_Abbreviation,Year,PassRate
0,Appling,GA,2019,40.217391
1,Atkinson,GA,2019,21.621622
2,Baldwin,GA,2019,28.260870
3,Banks,GA,2019,43.243243
4,Barrow,GA,2019,46.895641
...,...,...,...,...
445,Worth,GA,2022,35.135135
446,Fulton,GA,2019,64.797958
447,Fulton,GA,2020,50.884339
448,Fulton,GA,2021,46.999291


### Massachusetts

In [8]:
# TODO: build MA_map_data (Massachusetts AP performance). Nothing is defined here yet,
# and only the GA and WI frames are merged into the skeleton further down.
# MA_map_data

### Wisconsin

In [9]:
# Load the combined Wisconsin AP results and tag every row with the state.
WI_raw = pd.read_csv('../data/Wisconsin/Wisconsin_combined.csv')
WI_raw['State_Abbreviation'] = 'WI'

# School-year labels mapped to a single integer year (the label's starting year).
# NOTE(review): rows mapped to 2018 have no counterpart in the skeleton's years
# (2019-2022) and are therefore dropped by the later left merge. Confirm that the
# starting-year convention matches how Georgia's 'Year' column is defined.
rename_years = {
    '2018-19' : 2018,
    '2019-20' : 2019,
    '2020-21' : 2020,
    '2021-22' : 2021,
    '2022-23' : 2022
}

# Vectorised lookup instead of a row-wise apply. Unknown labels still fail loudly
# (the original dict lookup raised KeyError) rather than silently becoming NaN.
mapped_years = WI_raw['Year'].map(rename_years)
unmapped = WI_raw.loc[mapped_years.isna(), 'Year'].unique().tolist()
if unmapped:
    raise KeyError(f"Unrecognised school-year labels in Wisconsin data: {unmapped}")
WI_raw['Year'] = mapped_years

WI_raw = WI_raw.rename(columns = {
    'COUNTY' : 'County',
    'PERCENT_3_OR_ABOVE' : 'PassRate'
})

# .copy() makes WI_map_data independent of WI_raw, so the assignment below
# does not raise SettingWithCopyWarning.
WI_map_data = WI_raw[['County', 'State_Abbreviation', 'Year', 'PassRate']].copy()
WI_map_data['County'] = WI_map_data['County'].replace('Saint Croix', 'St. Croix')
WI_map_data


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  WI_map_data['County'] = WI_map_data['County'].replace('Saint Croix', 'St. Croix')


Unnamed: 0,County,State_Abbreviation,Year,PassRate
0,Adams,WI,2018,28.947368
1,Ashland,WI,2018,62.500000
2,Barron,WI,2018,51.530612
3,Bayfield,WI,2018,64.285714
4,Brown,WI,2018,69.745411
...,...,...,...,...
344,Waukesha,WI,2022,75.070348
345,Waupaca,WI,2022,52.826087
346,Waushara,WI,2022,42.222222
347,Winnebago,WI,2022,69.518717


In [10]:
# Display WI_map_data again; nothing is modified in this cell.
WI_map_data

Unnamed: 0,County,State_Abbreviation,Year,PassRate
0,Adams,WI,2018,28.947368
1,Ashland,WI,2018,62.500000
2,Barron,WI,2018,51.530612
3,Bayfield,WI,2018,64.285714
4,Brown,WI,2018,69.745411
...,...,...,...,...
344,Waukesha,WI,2022,75.070348
345,Waupaca,WI,2022,52.826087
346,Waushara,WI,2022,42.222222
347,Winnebago,WI,2022,69.518717


## Add AP Performance to Skeleton

In [11]:
# Attach each state's pass rates to the skeleton. Both merges are left joins, so every
# skeleton row is kept; each state's values arrive in their own suffixed column.
merge_keys = ['County', 'State_Abbreviation', 'Year']
merged = skeleton.merge(GA_map_data, on = merge_keys, how = 'left', suffixes = ('', '_GA'))
merged = merged.merge(WI_map_data, on = merge_keys, how = 'left', suffixes = ('', '_WI'))

# The skeleton's placeholder 'PassRate' column holds None in an object-dtype column.
# Convert it to a numeric column first so that fillna does not operate on an
# object-dtype array, which pandas warns about.
pass_rate = pd.to_numeric(merged['PassRate'])
for state_column in ('PassRate_GA', 'PassRate_WI'):
    # Only fills rows that are still null, so values already present are kept.
    pass_rate = pass_rate.fillna(merged[state_column])
merged['PassRate'] = pass_rate

merged = merged.drop(columns = ['PassRate_GA', 'PassRate_WI'])
skeleton = merged
skeleton.head()

  merged['PassRate'] = merged['PassRate'].fillna(merged['PassRate_GA'])


Unnamed: 0,County,State_Abbreviation,GEOID,geometry,Year,PassRate,Income
0,Haralson,GA,13143,"POLYGON ((-85.38651 33.90172, -85.21279 33.899...",2019,,
1,Haralson,GA,13143,"POLYGON ((-85.38651 33.90172, -85.21279 33.899...",2020,,
2,Haralson,GA,13143,"POLYGON ((-85.38651 33.90172, -85.21279 33.899...",2021,33.333333,
3,Haralson,GA,13143,"POLYGON ((-85.38651 33.90172, -85.21279 33.899...",2022,72.727273,
4,Kewaunee,WI,55061,"POLYGON ((-87.76532 44.4145, -87.76238 44.6445...",2019,61.290323,


## Get Income Data

In [12]:
# Load the county income table and keep only the states of interest.
county_incomes_wide = gimmeCountyIncomes(prefix = prefix)
in_scope = county_incomes_wide['State_Abbreviation'].isin(states_of_interest)

# Reshape from one column per year to one row per (county, year), then align the
# Year dtype and the 'St. Croix' spelling with the other frames in this notebook.
melted = (
    county_incomes_wide[in_scope]
    .melt(
        id_vars = ['County', 'State_Abbreviation'],
        value_vars = ['2018', '2019', '2020', '2021', '2022'],
        var_name = 'Year',
        value_name = 'Income',
    )
    .astype({'Year': int})
    .assign(County = lambda frame: frame['County'].replace('Saint Croix', 'St. Croix'))
)
incomes = melted
incomes.head()

Unnamed: 0,County,State_Abbreviation,Year,Income
0,Appling,GA,2018,33662
1,Atkinson,GA,2018,29212
2,Bacon,GA,2018,31302
3,Baker,GA,2018,37763
4,Baldwin,GA,2018,33125


## Add Income Data to Skeleton

In [13]:
# Left-join the long-format incomes onto the skeleton; the incoming values land in
# 'Income_incomes' because the skeleton already has an 'Income' placeholder column.
income_keys = ['County', 'State_Abbreviation', 'Year']
merged = skeleton.merge(incomes, how = 'left', on = income_keys, suffixes = ('', '_incomes'))

# Fill the placeholder from the joined column, then discard the helper column.
income_filled = merged['Income'].fillna(merged['Income_incomes'])
merged = merged.assign(Income = income_filled).drop(columns = ['Income_incomes'])

skeleton = merged
skeleton


Unnamed: 0,County,State_Abbreviation,GEOID,geometry,Year,PassRate,Income
0,Haralson,GA,13143,"POLYGON ((-85.38651 33.90172, -85.21279 33.899...",2019,,38099
1,Haralson,GA,13143,"POLYGON ((-85.38651 33.90172, -85.21279 33.899...",2020,,40931
2,Haralson,GA,13143,"POLYGON ((-85.38651 33.90172, -85.21279 33.899...",2021,33.333333,44439
3,Haralson,GA,13143,"POLYGON ((-85.38651 33.90172, -85.21279 33.899...",2022,72.727273,45136
4,Kewaunee,WI,55061,"POLYGON ((-87.76532 44.4145, -87.76238 44.6445...",2019,61.290323,48131
...,...,...,...,...,...,...,...
975,Ashland,WI,55003,"MULTIPOLYGON (((-90.46546 47.00259, -90.45745 ...",2022,43.750000,46014
976,Webster,GA,13307,"POLYGON ((-84.65501 31.96153, -84.64536 31.961...",2019,,37833
977,Webster,GA,13307,"POLYGON ((-84.65501 31.96153, -84.64536 31.961...",2020,,39904
978,Webster,GA,13307,"POLYGON ((-84.65501 31.96153, -84.64536 31.961...",2021,,40385


## Add to/Clean the Skeleton

In [14]:
# Final column order for the exported table.
output_columns = [
    'County', 'State_Abbreviation', 'Year', 'PassRate',
    'Income', 'County_State', 'GEOID', 'geometry',
]

# Combined label of the form "<County>, <State_Abbreviation>".
county_state_label = skeleton['County'] + ', ' + skeleton['State_Abbreviation']
skeleton['County_State'] = county_state_label
skeleton = skeleton[output_columns]
skeleton.sample(10)

Unnamed: 0,County,State_Abbreviation,Year,PassRate,Income,County_State,GEOID,geometry
69,Atkinson,GA,2020,25.0,32289,"Atkinson, GA",13003,"POLYGON ((-83.14048 31.4204, -82.95852 31.4172..."
278,Dooly,GA,2021,,44318,"Dooly, GA",13093,"POLYGON ((-84.0276 32.17116, -84.03109 32.1788..."
442,Peach,GA,2021,23.529412,44734,"Peach, GA",13225,"POLYGON ((-84.01661 32.51174, -84.00849 32.521..."
709,Oglethorpe,GA,2020,49.473684,40391,"Oglethorpe, GA",13221,"POLYGON ((-83.30662 33.81144, -83.27593 33.847..."
717,Quitman,GA,2020,,39618,"Quitman, GA",13239,"POLYGON ((-85.14183 31.83926, -85.12984 31.883..."
326,White,GA,2021,64.897959,47944,"White, GA",13311,"POLYGON ((-83.87441 34.67489, -83.86361 34.688..."
109,Price,WI,2020,64.179104,47741,"Price, WI",55099,"POLYGON ((-90.67798 45.49363, -90.67875 45.638..."
75,Berrien,GA,2022,55.913978,39871,"Berrien, GA",13019,"POLYGON ((-83.4325 31.3328, -83.43449 31.35036..."
523,Pickens,GA,2022,42.78607,60616,"Pickens, GA",13227,"POLYGON ((-84.65437 34.54895, -84.52139 34.550..."
674,Miller,GA,2021,,52357,"Miller, GA",13201,"POLYGON ((-84.91742 31.25599, -84.69424 31.251..."


In [15]:
# Write the finished skeleton to a CSV in the current working directory;
# index = False leaves the row index out of the file.
# NOTE(review): the 'geometry' column is written as text in a CSV — confirm the
# consuming app parses it back into geometries.
skeleton.to_csv('States_Counties_Map_Data.csv', index = False)