# CDC Vaccine Data Pipeline

This notebook extracts vaccination provider locations and flu coverage data from CDC APIs,
performs data cleaning and geographic joining, and saves the combined dataset for analysis.

Author: Nathaniel Pearson

Date: August 2025


In [1]:
import time
import json
import warnings
from typing import Dict, List, Optional

import requests
import numpy as np
import pandas as pd
import geopandas as gpd
import censusdata
import us

warnings.filterwarnings('ignore')

In [2]:
# Loading data from CDC Socrata API with paginatio
def load_cdc_data(url, limit=50000):
    all_data = []
    offset = 0
    
    print(f"Loading data from: {url}")
    
    while True:
        params = {
            "$limit": limit,
            "$offset": offset,
            "$order": ":id"
        }
        
        response = requests.get(url, params=params)
        response.raise_for_status()  # Raise error if request fails
        batch_data = response.json()
        if not batch_data:
            break
        all_data.extend(batch_data)
        print(f"  Loaded {len(batch_data)} records (total: {len(all_data)})")
        if len(batch_data) < limit:
            break
        offset += limit
        time.sleep(1)
    
    print(f"Total records: {len(all_data)}")
    
    df = pd.DataFrame(all_data)
    return df


# Loading datasets
provider_url = "https://data.cdc.gov/resource/bugr-bbfr.json"
coverage_url = "https://data.cdc.gov/resource/vh55-3he6.json"
providers_df = load_cdc_data(provider_url)
coverage_df = load_cdc_data(coverage_url)

print(f"\nProviders dataset: {providers_df.shape}")
print(f"Coverage dataset: {coverage_df.shape}")
print("\nProviders columns:")
print(providers_df.columns.tolist())
print("\nCoverage columns:")
print(coverage_df.columns.tolist())

Loading data from: https://data.cdc.gov/resource/bugr-bbfr.json
  Loaded 50000 records (total: 50000)
  Loaded 50000 records (total: 100000)
  Loaded 50000 records (total: 150000)
  Loaded 50000 records (total: 200000)
  Loaded 2652 records (total: 202652)
Total records: 202652
Loading data from: https://data.cdc.gov/resource/vh55-3he6.json
  Loaded 50000 records (total: 50000)
  Loaded 50000 records (total: 100000)
  Loaded 50000 records (total: 150000)
  Loaded 50000 records (total: 200000)
  Loaded 20729 records (total: 220729)
Total records: 220729

Providers dataset: (202652, 28)
Coverage dataset: (220729, 11)

Providers columns:
['provider_location_guid', 'loc_store_no', 'loc_phone', 'loc_name', 'loc_admin_street1', 'loc_admin_street2', 'loc_admin_city', 'loc_admin_state', 'loc_admin_zip', 'sunday_hours', 'monday_hours', 'tuesday_hours', 'wednesday_hours', 'thursday_hours', 'friday_hours', 'saturday_hours', 'insurance_accepted', 'walkins_accepted', 'provider_notes', 'searchable_n

In [3]:
coverage_df

Unnamed: 0,vaccine,geography_type,geography,fips,year_season,month,dimension_type,dimension,coverage_estimate,_95_ci,population_sample_size
0,Seasonal Influenza,Counties,New Haven,09009,2018,1,>=18 Years,Non-Medical Setting,45.5,43.9 to 47.2,
1,Seasonal Influenza,Counties,New Haven,09009,2021,1,>=18 Years,Non-Medical Setting,53.0,46.0 to 60.9,
2,Seasonal Influenza,Counties,New Haven,09009,2020,1,Age,>=18 Years,52.4,50.6 to 54.3,
3,Seasonal Influenza,Counties,New Haven,09009,2021,1,Age,>=18 Years,50.2,45.4 to 55.8,
4,Seasonal Influenza,Counties,New Haven,09009,2018,1,Age,>=18 Years,34.0,32.6 to 35.5,
...,...,...,...,...,...,...,...,...,...,...,...
220724,Seasonal Influenza,States/Local Areas,Maine,23,2017-18,1,>=65 Years,Non-Medical Setting,41.4,36.8 to 46.0,
220725,Seasonal Influenza,States/Local Areas,Maine,23,2020-21,1,>=65 Years,Non-Medical Setting,30.0,26.6 to 33.4,2064
220726,Seasonal Influenza,States/Local Areas,Maine,23,2017-18,1,18-64 Years,Non-Medical Setting,45.6,39.7 to 51.5,
220727,Seasonal Influenza,States/Local Areas,Maine,23,2020-21,1,18-64 Years,Non-Medical Setting,33.7,29.7 to 37.7,2772


In [4]:
providers_df

Unnamed: 0,provider_location_guid,loc_store_no,loc_phone,loc_name,loc_admin_street1,loc_admin_street2,loc_admin_city,loc_admin_state,loc_admin_zip,sunday_hours,...,provider_notes,searchable_name,in_stock,supply_level,quantity_last_updated,latitude,longitude,category,web_address,pre_screen
0,043d87df-3ea9-406a-9d07-a7c28470affc,Not applicable,(609) 654-6884,MEDFORD PHARMACY,639 STOKES RD,SUITE 101,MEDFORD,NJ,08055-3003,Closed,...,We also provide other immunizations as well in...,Flu Shot,False,-1,2023-10-05,39.871761,-74.810930,seasonal,,
1,5a1e2092-4979-4e7e-900a-3c69c8cc2484,MS1001798,(248) 516-1888,CARE PLUS PHARMACY #MS1001798,25290 GREENFIELD RD,,OAK PARK,MI,48237,Closed,...,Our staff is available for off-site clinics an...,Flu Shot (Egg free),False,3,2023-11-07,42.476380,-83.200964,seasonal,,
2,5a1e2092-4979-4e7e-900a-3c69c8cc2484,MS1001798,(248) 516-1888,CARE PLUS PHARMACY #MS1001798,25290 GREENFIELD RD,,OAK PARK,MI,48237,Closed,...,Our staff is available for off-site clinics an...,Flu Shot,True,1,2023-11-07,42.476380,-83.200964,seasonal,,
3,3247ee5c-e3d5-429a-bef2-1b62f1eaba98,2799,904-370-8749,Pathstone #2799,"5244 Edgewood Ct, Suite 2",,Jacksonville,FL,32254,Closed,...,,"Flu Shot (65+, high-dose or adjuvanted)",False,0,2023-11-09,30.324365,-81.737763,seasonal,,
4,3247ee5c-e3d5-429a-bef2-1b62f1eaba98,2799,904-370-8749,Pathstone #2799,"5244 Edgewood Ct, Suite 2",,Jacksonville,FL,32254,Closed,...,,"Flu Shot (65+, high-dose or adjuvanted)",True,0,2023-11-09,30.324365,-81.737763,seasonal,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
202647,c9be709c-6e98-4d5c-8346-55350022ff1c,7915,(973) 962-0490,"CVS Pharmacy, Inc. #7915",115 SKYLINE DRIVE,,RINGWOOD,NJ,7456,10:00 AM - 5:00 PM,...,It is highly recommended to make an appointmen...,"Flu Shot (65+, high-dose or adjuvanted)",False,0,2024-08-01,41.084997,-74.263320,seasonal,{'url': 'https://www.cvs.com/store-locator/det...,{'url': 'https://www.cvs.com/immunizations/get...
202648,c9be709c-6e98-4d5c-8346-55350022ff1c,7915,(973) 962-0490,"CVS Pharmacy, Inc. #7915",115 SKYLINE DRIVE,,RINGWOOD,NJ,7456,10:00 AM - 5:00 PM,...,It is highly recommended to make an appointmen...,"Flu Shot (65+, high-dose or adjuvanted)",False,0,2024-08-01,41.084997,-74.263320,seasonal,{'url': 'https://www.cvs.com/store-locator/det...,{'url': 'https://www.cvs.com/immunizations/get...
202649,7cb1b873-8dd3-4d97-af17-eafc17341c7f,Oct-88,7178387761.0,Walmart Inc #Oct-88,100 N Londonderry Sq,,Palmyra,PA,17078-3904,10:00 AM - 6:00 PM,...,,Flu Shot,False,0,2024-08-01,40.322875,-76.562731,seasonal,{'url': 'https://www.walmart.com/store/2888'},{'url': 'https://www.walmart.com/pharmacy/clin...
202650,5d3f3c55-a8fb-451b-9bb5-046d118684b8,10123,(443) 602-7628,"CVS Pharmacy, Inc. #10123",101 N WOLFE ST.,,BALTIMORE,MD,21231,11:00 AM - 5:00 PM,...,It is highly recommended to make an appointmen...,Flu Shot (Egg free),False,0,2024-08-01,39.293470,-76.589993,seasonal,{'url': 'https://www.cvs.com/store-locator/det...,{'url': 'https://www.cvs.com/immunizations/get...


In [5]:
providers_df.columns

Index(['provider_location_guid', 'loc_store_no', 'loc_phone', 'loc_name',
       'loc_admin_street1', 'loc_admin_street2', 'loc_admin_city',
       'loc_admin_state', 'loc_admin_zip', 'sunday_hours', 'monday_hours',
       'tuesday_hours', 'wednesday_hours', 'thursday_hours', 'friday_hours',
       'saturday_hours', 'insurance_accepted', 'walkins_accepted',
       'provider_notes', 'searchable_name', 'in_stock', 'supply_level',
       'quantity_last_updated', 'latitude', 'longitude', 'category',
       'web_address', 'pre_screen'],
      dtype='object')

# Data Manipulation

In [6]:
# Selecting useful columns from provders_df
joint_df = providers_df[['loc_name','loc_admin_city',"loc_admin_state", 
                        "loc_admin_zip", "insurance_accepted", "walkins_accepted", "supply_level", 
                        "latitude", "longitude",
                         'sunday_hours', 'monday_hours', 'tuesday_hours', 'wednesday_hours', 'thursday_hours', 'friday_hours', 'saturday_hours']]

In [7]:
# Selecting years 2023-2024 for coverage_df. Select 12th month to represent the end of the year
coverage_df_2023_to_2024 = coverage_df[
    coverage_df['year_season'].str.startswith(('2023'))
]
coverage_df_2023_to_2024['month'] = pd.to_numeric(coverage_df_2023_to_2024['month'], errors='coerce')
coverage_df_2023_to_2024 = coverage_df_2023_to_2024[coverage_df_2023_to_2024['month'] == 12]

In [8]:
# Calculating for the total operation hours based on weekday opening times
from datetime import datetime, timedelta

def parse_hours(hours_str):
    if pd.isna(hours_str) or str(hours_str).strip().lower() == 'closed':
        return 0.0
    try:
        open_time_str, close_time_str = [t.strip() for t in hours_str.split('-')]
        open_time = datetime.strptime(open_time_str, '%I:%M %p')
        close_time = datetime.strptime(close_time_str, '%I:%M %p')
        if close_time < open_time:
            close_time += timedelta(days=1)
        duration = (close_time - open_time).seconds / 3600
        return duration
    except Exception:
        return 0.0

days = ['sunday_hours', 'monday_hours', 'tuesday_hours', 'wednesday_hours',
        'thursday_hours', 'friday_hours', 'saturday_hours']

joint_df['weekly_hours_open'] = joint_df[days].applymap(parse_hours).sum(axis=1)
joint_df = joint_df.drop(columns=days)

In [9]:
# Adding county to joint_df
zip_county_df = pd.read_csv('data/zipcode_county.csv')
joint_df['zip5'] = joint_df['loc_admin_zip'].str[:5]
zip_county_df['ZIP'] = zip_county_df['ZIP'].astype(str).str.zfill(5)

joint_df = joint_df.merge(zip_county_df[['ZIP', 'COUNTYNAME','STCOUNTYFP']], left_on='zip5', right_on='ZIP', how='left')

In [10]:
# calculating average coverage by state and merge into joint_df
abbr_to_name = {state.abbr: state.name for state in us.states.STATES}
joint_df['loc_admin_state'] = joint_df['loc_admin_state'].map(abbr_to_name)

coverage_df_2023_to_2024['coverage_estimate'] = pd.to_numeric(
    coverage_df_2023_to_2024['coverage_estimate'].str.extract(r'([\d\.]+)')[0], errors='coerce'
)

state_avg = coverage_df_2023_to_2024.groupby('geography')['coverage_estimate'].mean().reset_index()
state_avg.rename(columns={'coverage_estimate': 'state_coverage'}, inplace=True)
joint_df = joint_df.merge(state_avg, left_on='loc_admin_state', right_on='geography', how='left')
joint_df.drop(columns=['geography'], inplace=True)
joint_df.rename(columns={'ZIP': 'zip', 'COUNTYNAME': 'county_name'}, inplace=True)

In [11]:
# Filtering data to state-only. Calculating the mean coverage by state and dimension
coverage_df_2023_to_2024['coverage_estimate'] = pd.to_numeric(
    coverage_df_2023_to_2024['coverage_estimate'], errors='coerce'
)
state_data = coverage_df_2023_to_2024[coverage_df_2023_to_2024['geography_type'] == 'States/Local Areas']
grouped = state_data.groupby(['geography', 'dimension'])['coverage_estimate'].mean().reset_index()
pivot_df = grouped.pivot(index='geography', columns='dimension', values='coverage_estimate').reset_index()
pivot_df.columns = ['loc_admin_state'] + ['s.' + col.lower().replace(' ', '_').replace(',', '').replace('-', '_') for col in pivot_df.columns[1:]]

# Merge the dimension averages into joint_df by state
joint_df = joint_df.merge(pivot_df, on='loc_admin_state', how='left')

In [12]:
joint_df.to_csv('joint_df_export.csv', index=False)

# County Calculations

In [70]:
county_df = coverage_df[coverage_df['geography_type'] == 'Counties']
county_df['year_season'] = 2022

In [71]:
county_df

Unnamed: 0,vaccine,geography_type,geography,fips,year_season,month,dimension_type,dimension,coverage_estimate,_95_ci,population_sample_size
0,Seasonal Influenza,Counties,New Haven,09009,2022,1,>=18 Years,Non-Medical Setting,45.5,43.9 to 47.2,
1,Seasonal Influenza,Counties,New Haven,09009,2022,1,>=18 Years,Non-Medical Setting,53.0,46.0 to 60.9,
2,Seasonal Influenza,Counties,New Haven,09009,2022,1,Age,>=18 Years,52.4,50.6 to 54.3,
3,Seasonal Influenza,Counties,New Haven,09009,2022,1,Age,>=18 Years,50.2,45.4 to 55.8,
4,Seasonal Influenza,Counties,New Haven,09009,2022,1,Age,>=18 Years,34.0,32.6 to 35.5,
...,...,...,...,...,...,...,...,...,...,...,...
220580,Seasonal Influenza,Counties,Mahoning,39099,2022,1,Age,>=18 Years,34.2,32.6 to 36.0,
220581,Seasonal Influenza,Counties,Mahoning,39099,2022,1,Age,>=18 Years,39.9,37.9 to 42.0,
220582,Seasonal Influenza,Counties,Mahoning,39099,2022,1,Age,>=18 Years,43.7,38.2 to 49.4,
220583,Seasonal Influenza,Counties,Mahoning,39099,2022,1,Age,>=18 Years,41.1,39.3 to 42.8,


In [72]:
# group fips for each county to calculate coverage estimate
county_df['coverage_estimate'] = pd.to_numeric(county_df['coverage_estimate'], errors='coerce')
avg_county_coverage = county_df.groupby('fips')['coverage_estimate'].mean().reset_index()

In [73]:
avg_county_coverage

Unnamed: 0,fips,coverage_estimate
0,01001,42.871429
1,01003,45.171429
2,01005,35.142857
3,01007,40.314286
4,01009,41.685714
...,...,...
3148,56037,38.428571
3149,56039,41.100000
3150,56041,36.957143
3151,56043,37.914286


In [74]:
# creation of columns for what was originally dimensions ">= 18 years" and "non-medical setting"
avg_county_coverage_by_dim = county_df.groupby(['fips', 'dimension'])['coverage_estimate'].mean().reset_index()
df_pivot = avg_county_coverage_by_dim.pivot(index='fips', columns='dimension', values='coverage_estimate').reset_index()

df_pivot = avg_county_coverage.merge(
    df_pivot,
    on='fips',
    how='left'
)


In [75]:
df_pivot

Unnamed: 0,fips,coverage_estimate,>=18 Years,Non-Medical Setting
0,01001,42.871429,41.30,46.80
1,01003,45.171429,41.26,54.95
2,01005,35.142857,34.80,36.00
3,01007,40.314286,37.24,48.00
4,01009,41.685714,39.84,46.30
...,...,...,...,...
3148,56037,38.428571,34.04,49.40
3149,56039,41.100000,40.42,42.80
3150,56041,36.957143,32.40,48.35
3151,56043,37.914286,36.52,41.40


In [76]:
# counting the number of vaccination locations by county. grouping by STCOUNTYFP or fips
county_counts = (
    joint_df.groupby(['STCOUNTYFP'])
    .size()
    .reset_index(name='location_count')
)

In [77]:
county_counts

Unnamed: 0,STCOUNTYFP,location_count
0,1001.0,62
1,1003.0,178
2,1005.0,11
3,1007.0,27
4,1009.0,89
...,...,...
2663,72143.0,20
2664,72145.0,26
2665,72151.0,5
2666,72153.0,5


In [78]:
# ACS 5-year county population estimates 
counties = censusdata.download(
    'acs5',
    2022,
    censusdata.censusgeo([('state', '*'), ('county', '*')]),
    ['B01003_001E']  # Total population
)

# Reset the index to extract state + county FIPS
counties = counties.reset_index()
counties[['state_fips', 'county_fips']] = counties['index'].apply(
    lambda x: pd.Series([x.params()[0][1], x.params()[1][1]])
)
counties = counties.drop(columns='index')

# Rename population column
counties = counties.rename(columns={'B01003_001E': 'population'})

# Load TIGER shapefile to get actual county names
counties_gdf = gpd.read_file(
    r"C:\Users\npear\Desktop\NE Quarter 1 Schoolwork\Projects\CDC Projects\Vaccine Project\tl_2024_us_county\tl_2024_us_county.shp"
)
counties_gdf = counties_gdf[['STATEFP', 'COUNTYFP', 'NAME','GEOID','geometry']]

# Merge population data with actual county names
counties = counties.merge(
    counties_gdf,
    left_on=['state_fips', 'county_fips'],
    right_on=['STATEFP', 'COUNTYFP'],
    how='left'
)

# Map state FIPS codes to state names
state_fips_to_name = {
    '01': 'Alabama', '02': 'Alaska', '04': 'Arizona', '05': 'Arkansas',
    '06': 'California', '08': 'Colorado', '09': 'Connecticut', '10': 'Delaware',
    '11': 'District of Columbia', '12': 'Florida', '13': 'Georgia', '15': 'Hawaii',
    '16': 'Idaho', '17': 'Illinois', '18': 'Indiana', '19': 'Iowa', '20': 'Kansas',
    '21': 'Kentucky', '22': 'Louisiana', '23': 'Maine', '24': 'Maryland',
    '25': 'Massachusetts', '26': 'Michigan', '27': 'Minnesota', '28': 'Mississippi',
    '29': 'Missouri', '30': 'Montana', '31': 'Nebraska', '32': 'Nevada',
    '33': 'New Hampshire', '34': 'New Jersey', '35': 'New Mexico', '36': 'New York',
    '37': 'North Carolina', '38': 'North Dakota', '39': 'Ohio', '40': 'Oklahoma',
    '41': 'Oregon', '42': 'Pennsylvania', '44': 'Rhode Island', '45': 'South Carolina',
    '46': 'South Dakota', '47': 'Tennessee', '48': 'Texas', '49': 'Utah',
    '50': 'Vermont', '51': 'Virginia', '53': 'Washington', '54': 'West Virginia',
    '55': 'Wisconsin', '56': 'Wyoming'
}
counties['state'] = counties['state_fips'].map(state_fips_to_name)

# Keep only state, county name, and population
counties = counties[['state', 'NAME', 'population','state_fips','county_fips']].rename(columns={'NAME': 'county_name'})

In [79]:
# Building 5-digit county FIPS (aka GEOID / STCOUNTYFP)
counties['state_fips'] = counties['state_fips'].astype(str).str.zfill(2)
counties['county_fips'] = counties['county_fips'].astype(str).str.zfill(3)
counties['GEOID'] = counties['state_fips'] + counties['county_fips']

print("unique 3-digit county_fips:", counties['county_fips'].nunique())
print("unique 5-digit GEOID:", counties['GEOID'].nunique())

unique 3-digit county_fips: 330
unique 5-digit GEOID: 3222


In [80]:
# Merge using 5-digit code
counties = counties.merge(counties_gdf[['GEOID']], on='GEOID', how='left') \
                   .rename(columns={'NAME': 'county_name'})

In [81]:
counties

Unnamed: 0,state,county_name,population,state_fips,county_fips,GEOID
0,Alabama,Autauga,58761,01,001,01001
1,Alabama,Baldwin,233420,01,003,01003
2,Alabama,Barbour,24877,01,005,01005
3,Alabama,Bibb,22251,01,007,01007
4,Alabama,Blount,59077,01,009,01009
...,...,...,...,...,...,...
3217,,Vega Baja,54182,72,145,72145
3218,,Vieques,8199,72,147,72147
3219,,Villalba,21984,72,149,72149
3220,,Yabucoa,30313,72,151,72151


In [82]:
# Merging county_counts and counties. Effectively combining the counts of vaccine cites and population data
county_counts['STCOUNTYFP'] = county_counts['STCOUNTYFP'].astype(int).astype(str).str.zfill(5)

merged_df = counties.merge(
    county_counts,
    left_on='GEOID',       
    right_on='STCOUNTYFP',  
    how='right'             
)

In [83]:
merged_df

Unnamed: 0,state,county_name,population,state_fips,county_fips,GEOID,STCOUNTYFP,location_count
0,Alabama,Autauga,58761.0,01,001,01001,01001,62
1,Alabama,Baldwin,233420.0,01,003,01003,01003,178
2,Alabama,Barbour,24877.0,01,005,01005,01005,11
3,Alabama,Bibb,22251.0,01,007,01007,01007,27
4,Alabama,Blount,59077.0,01,009,01009,01009,89
...,...,...,...,...,...,...,...,...
2663,,Vega Alta,35279.0,72,143,72143,72143,20
2664,,Vega Baja,54182.0,72,145,72145,72145,26
2665,,Yabucoa,30313.0,72,151,72151,72151,5
2666,,Yauco,33988.0,72,153,72153,72153,5


In [84]:
# Merging vaccination estimates and df_merged
final_df = merged_df.merge(
    df_pivot,
    left_on='GEOID',
    right_on='fips',
    how='left'
)

In [85]:
final_df

Unnamed: 0,state,county_name,population,state_fips,county_fips,GEOID,STCOUNTYFP,location_count,fips,coverage_estimate,>=18 Years,Non-Medical Setting
0,Alabama,Autauga,58761.0,01,001,01001,01001,62,01001,42.871429,41.30,46.80
1,Alabama,Baldwin,233420.0,01,003,01003,01003,178,01003,45.171429,41.26,54.95
2,Alabama,Barbour,24877.0,01,005,01005,01005,11,01005,35.142857,34.80,36.00
3,Alabama,Bibb,22251.0,01,007,01007,01007,27,01007,40.314286,37.24,48.00
4,Alabama,Blount,59077.0,01,009,01009,01009,89,01009,41.685714,39.84,46.30
...,...,...,...,...,...,...,...,...,...,...,...,...
2663,,Vega Alta,35279.0,72,143,72143,72143,20,,,,
2664,,Vega Baja,54182.0,72,145,72145,72145,26,,,,
2665,,Yabucoa,30313.0,72,151,72151,72151,5,,,,
2666,,Yauco,33988.0,72,153,72153,72153,5,,,,


In [86]:
# clean up the dataframe
final_df = final_df.drop(columns=['STCOUNTYFP','fips'])

In [87]:
final_df = final_df.dropna(axis=0, how='any')

In [88]:
# Reproject to metric CRS for area calculation
counties_gdf = counties_gdf.to_crs(epsg=3395)

# Compute area in sq km and sq miles for counties
counties_gdf['area_sqkm'] = counties_gdf.geometry.area / 10**6
counties_gdf['area_sqmiles'] = counties_gdf['area_sqkm'] * 0.386102

In [91]:
# Prepare counties_gdf for merging
# Keep only the necessary columns
counties_merge = counties_gdf[['GEOID', 'area_sqmiles', 'NAME']].copy()

# Strip whitespace from GEOID columns to avoid join mismatches
final_df['GEOID'] = final_df['GEOID'].astype(str).str.strip()
counties_merge['GEOID'] = counties_merge['GEOID'].astype(str).str.strip()

# Merge on GEOID (inner join)
final_df = final_df.merge(
    counties_merge,
    on='GEOID',
    how='inner'
)

# Compute locations per square mile
final_df['locations_per_sq_mile'] = final_df['location_count'] / final_df['area_sqmiles']

# Locations per 1000 people
final_df['location_per_1000'] = final_df['location_count'] / final_df['population'] * 1000

print(final_df[['NAME', 'location_count', 'area_sqmiles', 'locations_per_sq_mile']].head())

      NAME  location_count  area_sqmiles  locations_per_sq_mile
0  Autauga              62    848.674413               0.073055
1  Baldwin             178   2735.095697               0.065080
2  Barbour              11   1251.799013               0.008787
3     Bibb              27    888.447150               0.030390
4   Blount              89    944.239356               0.094256


In [92]:
final_df

Unnamed: 0,state,county_name,population,state_fips,county_fips,GEOID,location_count,coverage_estimate,>=18 Years,Non-Medical Setting,area_sqmiles,NAME,locations_per_sq_mile,location_per_1000
0,Alabama,Autauga,58761.0,01,001,01001,62,42.871429,41.30,46.80,848.674413,Autauga,0.073055,1.055122
1,Alabama,Baldwin,233420.0,01,003,01003,178,45.171429,41.26,54.95,2735.095697,Baldwin,0.065080,0.762574
2,Alabama,Barbour,24877.0,01,005,01005,11,35.142857,34.80,36.00,1251.799013,Barbour,0.008787,0.442176
3,Alabama,Bibb,22251.0,01,007,01007,27,40.314286,37.24,48.00,888.447150,Bibb,0.030390,1.213429
4,Alabama,Blount,59077.0,01,009,01009,89,41.685714,39.84,46.30,944.239356,Blount,0.094256,1.506508
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2597,Wyoming,Park,29878.0,56,029,56029,22,41.885714,40.24,46.00,13656.704398,Park,0.001611,0.736328
2598,Wyoming,Sheridan,31176.0,56,033,56033,22,43.085714,39.78,51.35,4999.497410,Sheridan,0.004400,0.705671
2599,Wyoming,Sweetwater,42079.0,56,037,56037,29,38.428571,34.04,49.40,18741.386563,Sweetwater,0.001547,0.689180
2600,Wyoming,Teton,23346.0,56,039,56039,14,41.100000,40.42,42.80,8106.622746,Teton,0.001727,0.599674


In [94]:
final_df.to_csv('avg_county_coverage.csv', index=False)