<a href="https://colab.research.google.com/github/pandemic-tracking/viz-gen/blob/main/us_county_vax_demographics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np

from datetime import datetime, timedelta
import pytz

import altair as alt
from altair import datum
alt.data_transformers.disable_max_rows()

from pathlib import Path

pd.set_option("display.precision", 4)

# Read the clock exactly once so the Eastern-time and UTC stamps below describe the
# same instant. datetime.utcnow() is deprecated (Python 3.12+) and returns a naive
# datetime, so the UTC value is taken from an aware "now" instead.
now_utc = datetime.now(pytz.utc)
now_est = now_utc.astimezone(pytz.timezone("US/Eastern"))

# Human-readable and filename-safe renderings of the run time.
now_est_time = now_est.strftime("%Y-%m-%d, %H:%M:%S ET")
now_est_date = now_est.strftime("%Y-%m-%d")
now_est_timestamp = now_est.strftime("%Y%m%d_%H%M%S")
now_utc_timestamp = now_utc.strftime("%Y%m%d_%H%M%S")
print(now_est_time, now_est_date, now_est_timestamp, now_utc_timestamp)

In [None]:
# adapting from https://towardsdatascience.com/consistently-beautiful-visualizations-with-altair-themes-c7f9f889602

def ptc_theme():
    """Build the chart theme dictionary used by this notebook.

    Returns:
        dict: a single top-level ``"config"`` key whose value holds the
        ``title``, ``text``, ``header``, ``axisX``, ``axisY``, ``legend`` and
        ``view`` settings, in that order.
    """
    typeface = 'Arial'
    label_typeface = 'Arial'
    axis_grey = "#808080"
    grid_grey = "#DEDDDD"

    # Palettes kept for reference; they are not wired into the theme (no "range" section below).
    # main_palette = ["#1696d2", "#d2d2d2", "#000000", "#fdbf11",
    #                 "#ec008b", "#55b748", "#5c5859", "#db2b27"]
    # sequential_palette = ["#cfe8f3", "#a2d4ec", "#73bfe2",
    #                       "#46abdb", "#1696d2", "#12719e"]

    title_cfg = {
        "fontSize": 20,
        "font": typeface,
        "anchor": "start",  # left-aligned title
        # NOTE(review): "fontColor" may not be a property the renderer recognises for titles — confirm.
        "fontColor": "#000000",
        "fontWeight": "normal",
    }

    text_cfg = {
        "font": typeface,
        "labelFont": label_typeface,
    }

    header_cfg = {
        "font": typeface,
        "labelFont": label_typeface,
        "titleFont": typeface,
    }

    x_axis_cfg = {
        "domain": False,
        "domainColor": axis_grey,
        "labelColor": axis_grey,
        "domainWidth": 1,
        "grid": False,
        "labelFont": label_typeface,
        "labelFontSize": 12,
        "labelAngle": 0,
        "tickColor": axis_grey,
        "tickSize": 5,
        "titleFont": typeface,
        "titleFontSize": 12,
        "titlePadding": 10,
        "title": "X Axis Title (units)",  # placeholder title
    }

    y_axis_cfg = {
        "domain": False,
        "grid": True,
        "gridColor": grid_grey,
        "gridWidth": 1,
        "labelFont": label_typeface,
        "labelColor": axis_grey,
        "labelFontSize": 12,
        "labelAngle": 0,
        "labelAnchor": "end",
        "labelAlign": "right",
        "ticks": False,  # switched off explicitly even though the domain line is hidden
        "titleFont": typeface,
        "titleFontSize": 12,
        "titlePadding": 10,
        "title": "Y Axis Title (units)",  # placeholder title
        # The title is laid out horizontally and nudged up and to the right,
        # rather than left at its default vertical placement beside the axis.
        "titleAngle": 0,
        "titleY": -10,
        "titleX": 18,
    }

    # "range": {"category": main_palette, "diverging": sequential_palette} is intentionally left out.

    legend_cfg = {
        "labelFont": label_typeface,
        "labelFontSize": 12,
        "symbolType": "circle",
        "symbolSize": 100,
        "titleFont": typeface,
        "titleFontSize": 12,
        "title": "",  # no legend title by default
        "orient": "right",
        "offset": 0,
    }

    view_cfg = {
        "stroke": "transparent",  # no outline drawn around the plotting area
    }

    return {
        "config": {
            "title": title_cfg,
            "text": text_cfg,
            "header": header_cfg,
            "axisX": x_axis_cfg,
            "axisY": y_axis_cfg,
            "legend": legend_cfg,
            "view": view_cfg,
        }
    }

# Register ptc_theme under the name "my_custom_theme", then enable it by that name.
alt.themes.register("my_custom_theme", ptc_theme)
alt.themes.enable("my_custom_theme")

# Get CPR (COVID-19 Community Profile Report) data

In [None]:
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd

# Fetch the HealthData.gov landing page for the COVID-19 Community Profile Report dataset.
# The timeout stops the notebook from hanging on a stalled connection, and
# raise_for_status() reports an HTTP error here rather than as a confusing
# parsing failure in the cells that follow.
req = requests.get('https://www.healthdata.gov/Health/COVID-19-Community-Profile-Report/gqxm-d9w9', timeout=60)
req.raise_for_status()
soup = BeautifulSoup(req.content, 'html.parser')
# NOTE(review): the hard-coded index 6 depends on the order of <script> tags on the page —
# presumably this is the tag holding the attachment list; confirm if no links are found later.
script = soup.find_all('script', type='text/javascript')[6]

In [None]:
# Collect every substring of the script text that names a "Community_Profile_Report_<digits>_Public.xlsx"
# attachment and runs through its download href (ending in "download=true").
re_results = re.findall(r'"name":"Community_Profile_Report_\d*_Public.xlsx","href":"\/api\/views\/gqxm-d9w9\/files\/\w*-\w*-\w*-\w*-\w*.*?download=true', str(script))

In [None]:
# Turn each regex match into an absolute download URL, in reverse-sorted match order.
cpr_url_list = []
for attachment in sorted(re_results, reverse=True):
    # Everything after the href marker is the site-relative download path.
    download_path = attachment.split('"href":"')[-1]
    # First 8 characters after the file-name prefix (presumably the report date as YYYYMMDD).
    report_stamp = attachment.split('Community_Profile_Report_')[-1][:8]
    cpr_url_list.append(
        'https://www.healthdata.gov' + download_path + '&filename=CPR_' + report_stamp + '.xlsx'
    )
cpr_url_list[:10]

In [None]:
# Use the first URL in the list (the list is reverse-sorted, so presumably the most recent report).
# Fail with a clear message rather than a bare IndexError when no links were scraped.
if not cpr_url_list:
    raise RuntimeError('No Community Profile Report download links were found on the HealthData.gov page')
cpr_url = cpr_url_list[0]

# Request both sheets in a single read_excel call so the workbook is downloaded and
# parsed once instead of once per sheet. Passing a list of sheet names returns a dict
# of DataFrames keyed by sheet name; header=1 applies to every sheet read.
cpr_sheets = pd.read_excel(cpr_url, sheet_name=['Counties', 'States'], header=1)

cpr_county_df = cpr_sheets['Counties']
# Left-pad the FIPS code to five characters so it can be compared with string codes such as '22061'.
cpr_county_df['fips_code'] = cpr_county_df['FIPS code'].astype(str).str.zfill(5)

cpr_state_df = cpr_sheets['States']

# County Plots

In [None]:
# Maps each "Region (Division)" label to the list of state abbreviations assigned to it.
census_divisions_dict = {
    # Northeast
    'Northeast (New England)': ['CT', 'ME', 'MA', 'NH', 'RI', 'VT'],
    'Northeast (Mid-Atlantic)': ['NJ', 'NY', 'PA'],
    # Midwest
    'Midwest (East North Central)': ['IN', 'IL', 'MI', 'OH', 'WI'],
    'Midwest (West North Central)': ['IA', 'KS', 'MN', 'MO', 'NE', 'ND', 'SD'],
    # South
    'South (South Atlantic)': ['DE', 'MD', 'DC', 'VA', 'WV', 'NC', 'SC', 'GA', 'FL'],
    'South (East South Central)': ['AL', 'KY', 'MS', 'TN'],
    'South (West South Central)': ['AR', 'LA', 'OK', 'TX'],
    # West
    'West (Mountain)': ['AZ', 'CO', 'ID', 'NM', 'MT', 'UT', 'NV', 'WY'],
    'West (Pacific)': ['AK', 'CA', 'HI', 'OR', 'WA'],
}

In [None]:
# Flatten the divisions of each region into one list of state abbreviations,
# keeping the divisions (and the states within them) in their listed order.
region_northeast = [
    state
    for division in ('Northeast (New England)', 'Northeast (Mid-Atlantic)')
    for state in census_divisions_dict[division]
]
region_midwest = [
    state
    for division in ('Midwest (East North Central)', 'Midwest (West North Central)')
    for state in census_divisions_dict[division]
]
region_south = [
    state
    for division in ('South (South Atlantic)', 'South (East South Central)', 'South (West South Central)')
    for state in census_divisions_dict[division]
]
region_west = [
    state
    for division in ('West (Mountain)', 'West (Pacific)')
    for state in census_divisions_dict[division]
]

In [None]:
def assign_region(row, state_col='state'):
    """Return the region label for the state found in ``row[state_col]``.

    The state is looked up in the Northeast, Midwest, South and West lists in
    that order; ``'Other'`` is returned when it appears in none of them.
    """
    state = row[state_col]
    region_members = (
        ('Northeast', region_northeast),
        ('Midwest', region_midwest),
        ('South', region_south),
        ('West', region_west),
    )
    for label, members in region_members:
        if state in members:
            return label
    return 'Other'
    

In [None]:
# Columns carried over from the CPR county sheet into the plotting table.
county_plot_columns = [
    'State Abbreviation', 'County', 'fips_code', 'SVI score', 'CCVI score', 'County type', 'FEMA region',
    'Cases per 100k - last 7 days', 'Cases - last 7 days',
    '% In Poverty', '% Uninsured',
    '% Non-Hispanic Black', 'People who are fully vaccinated as % of total population', 'Population',
]
county_source = cpr_county_df[county_plot_columns]

In [None]:
# Drop states excluded for low reporting, plus PR because it is not in the 4 census regions.
excluded_states = ['GA','WV','HI','VT','PR']
in_excluded_state = county_source['State Abbreviation'].isin(excluded_states)
county_source = county_source[~in_excluded_state]

In [None]:
# Drop 3 MA counties whose % vaccinated figure is anomalously low.
anomalous_fips = ['25001','25019','25007']
is_anomalous_county = county_source['fips_code'].isin(anomalous_fips)
county_source = county_source[~is_anomalous_county]

In [None]:
from functools import partial

# Bind the column name used by the CPR data so the function can be applied row by row.
assign_reg = partial(assign_region, state_col='State Abbreviation')

# county_source was produced by column selection and boolean filtering, so assigning a
# column on it in place triggers pandas' SettingWithCopyWarning. assign() returns a new
# DataFrame with the extra column instead of writing into a possible slice.
county_source = county_source.assign(region=county_source.apply(assign_reg, axis=1))

In [None]:
# Multi-selection keyed on fips_code: nothing counts as selected when the selection is empty,
# and it starts with county 22061 selected.
select_county = alt.selection_multi(fields=['fips_code'], empty='none', init=[{'fips_code':'22061'}])
# Multi-selection keyed on 'County type', bound to the legend.
select_type = alt.selection_multi(fields=['County type'], bind='legend')

def make_chart(y_col, format='.0%', scale_domain=[0, 0.5], legend=None):
    """Build a layered county scatter of vaccination share against ``y_col``.

    Args:
        y_col: column of ``county_source`` plotted (and titled) on the y axis.
        format: number format string for the y-axis labels.
        scale_domain: two-element y-axis domain.
        legend: legend setting passed to both the colour and the size encodings.

    Returns:
        A 250x250 layered chart: a light-grey vertical rule at x = 0.5 underneath
        one point per county.
    """
    # Vertical reference rule; "line" is a calculated constant field equal to .5.
    midline = alt.Chart(county_source).mark_rule(color='#DEDDDD').encode(
        x=alt.X('line:Q'),
    ).transform_calculate(line='.5')

    x_encoding = alt.X(
        'People who are fully vaccinated as % of total population',
        scale=alt.Scale(domain=[0.1,0.9], clamp=False),
        axis=alt.Axis(format='.0%', values=[.2,.5,.8,], ticks=False, grid=False),
        title=None,
    )
    y_encoding = alt.Y(
        y_col,
        title=y_col,
        scale=alt.Scale(domain=scale_domain),
        axis=alt.Axis(format=format, tickCount=3, ticks=False, grid=True),
    )

    # The six county types share three colours, two types per colour.
    county_type_scale = alt.Scale(
        domain=['Large central metro','Large fringe metro','Medium metro','Small metro','Micropolitan','Non-core'],
        range=['#3b00bc','#3b00bc','#ff9000','#ff9000','green','green'],
    )
    # County types picked via select_type keep their colour; the rest are drawn black.
    color_encoding = alt.condition(
        select_type,
        alt.Color('County type:N', legend=legend, scale=county_type_scale),
        alt.value('black'),
    )
    size_encoding = alt.Size(
        'Population',
        legend=legend,
        scale=alt.Scale(domain=[1,2.5e6], range=[10,150]),
    )

    counties = alt.Chart(county_source).mark_point(
        clip=True, size=50, shape='circle', filled=True, stroke='black', opacity=0.5,
    ).encode(
        x=x_encoding,
        y=y_encoding,
        color=color_encoding,
        size=size_encoding,
        # Fade the points whose county type is not selected.
        opacity=alt.condition(select_type, alt.value(.7), alt.value(0.2)),
        # Only counties in select_county get a visible outline.
        strokeWidth=alt.condition(select_county, alt.value(2), alt.value(0.)),
        strokeOpacity=alt.value(1),
        tooltip=['County','CCVI score','SVI score','% In Poverty', 'County type', 'Cases per 100k - last 7 days','People who are fully vaccinated as % of total population'],
    )

    return alt.layer(midline + counties).properties(height=250, width=250)

def _facet_by_region(chart):
    """Attach both selections to ``chart`` and facet it into one column per region."""
    interactive = chart.add_selection(select_county).add_selection(select_type)
    return interactive.facet(
        alt.Facet('region', sort=['Northeast','South','Midwest','West'], title=None),
        columns=4,
    )


county_chart = _facet_by_region(make_chart('% In Poverty'))

county_chart2 = _facet_by_region(make_chart('% Uninsured'))

county_chart3 = _facet_by_region(make_chart('CCVI score', '.1f', scale_domain=[0,1]))

# NOTE(review): SVI keeps the default '.0%' axis format while CCVI uses '.1f' — confirm this is intended.
county_chart4 = _facet_by_region(make_chart('SVI score', scale_domain=[0,1]))

county_chart5 = _facet_by_region(make_chart('% Non-Hispanic Black', scale_domain=[0,1]))

county_chart

In [None]:
county_chart2

In [None]:
county_chart3

In [None]:
county_chart4

In [None]:
county_chart5

In [None]:
# Stack the poverty chart above the CCVI chart, add the title, then adjust the y-axis title.
viz_title = ['Fully Vaccinated % of U.S. Counties by Poverty and Vulnerability Levels','']
stacked_charts = alt.vconcat(county_chart, county_chart3)
titled_charts = stacked_charts.properties(title=viz_title)
viz_pub = titled_charts.configure_axisY(titleColor='grey', titleX=-10)
viz_pub

In [None]:
county_source[county_source['fips_code'] == '22061']

# Save to Drive

In [None]:
# this is for saving altair charts to png and svg, based on https://colab.research.google.com/github/altair-viz/altair_saver/blob/master/AltairSaver.ipynb#scrollTo=ZiTDBCAM_Ni8
!pip install -q altair_saver
!npm install --silent vega-lite vega-cli canvas

In [None]:
from pathlib import Path
from altair_saver import save

# Local output directory for exported files; created if it does not exist yet.
SAVE_PATH = Path('assets')
SAVE_PATH.mkdir(exist_ok=True)

# Import PyDrive and associated libraries.
# This only needs to be done once per notebook.
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate and create the PyDrive client.
# This only needs to be done once per notebook.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
gdrive = GoogleDrive(gauth)

In [None]:
def assets_to_gdrive(folder_name, localdir_path = SAVE_PATH, parentdir_id='17Kx2uZbQv1r5U1M9x_OXS4lpMU5c6Ym8'):
    """Upload every file in a local directory to a named Google Drive folder.

    Args:
        folder_name: title of the Drive folder to upload into. An existing,
            non-trashed folder with this title under ``parentdir_id`` is reused;
            otherwise a new one is created.
        localdir_path: local directory whose files are uploaded (a ``Path`` or a
            string path).
        parentdir_id: Drive id of the parent folder that is searched and used as
            the parent of a newly created folder.

    Uses the module-level ``gdrive`` client. Sub-directories of ``localdir_path``
    are skipped; only regular files are uploaded.
    """
    # Look for an existing folder with the requested title under the parent folder.
    folder_query = (
        f"'{parentdir_id}' in parents "
        "and mimeType = 'application/vnd.google-apps.folder' and trashed=false"
    )
    folder_id = ''
    for candidate in gdrive.ListFile({'q': folder_query}).GetList():
        # No break: if several folders share the title, the last one listed is used.
        if candidate['title'] == folder_name:
            folder_id = candidate['id']
            print(f'Found pre-existing gdrive folder named "{folder_name}" at',folder_id)

    # Nothing found: create the folder.
    if folder_id == '':
        folder = gdrive.CreateFile(metadata={'title': folder_name,
                                             'parents': [{'id': parentdir_id}],
                                             "mimeType": "application/vnd.google-apps.folder"
                                             })
        folder.Upload()
        folder_id = folder.get('id')
        print(f'Created new gdrive folder named "{folder_name}" at',folder_id)

    # Upload each regular file found directly inside the local directory.
    for asset_file in Path(localdir_path).iterdir():
        if not asset_file.is_file():
            # iterdir() also yields directories, which cannot be opened as upload content.
            continue
        upload = gdrive.CreateFile(metadata={'title': asset_file.name,
                                             'parents': [{'id': folder_id}],
                                             })
        # Hand PyDrive a plain string path: it passes the name on to open() and
        # mimetypes.guess_type(), and the latter rejects Path objects on older Pythons.
        upload.SetContentFile(str(asset_file))
        upload.Upload()
        print('Saved file: ',asset_file.name)


In [None]:
def save_vizassets(chart, save_path, filename, fmts=('html','json','png','svg','pdf')):
    """Save ``chart`` once per format as ``<save_path>/<filename>.<fmt>``.

    Args:
        chart: chart object handed to ``save``.
        save_path: directory the files are written into.
        filename: file name without extension.
        fmts: iterable of format extensions. The default is an immutable tuple so
            it cannot be altered between calls.
    """
    for fmt in fmts:
        save(chart, f'{save_path}/{filename}.{fmt}')

In [None]:
# put your stuff (i.e. dataframes, altair charts, input data files) to save here

In [None]:
# Write both data tables as CSV into SAVE_PATH, then export the published chart
# under a name stamped with the UTC run timestamp.
county_csv_path = SAVE_PATH/'county_source.csv'
cpr_csv_path = SAVE_PATH/'cpr_county_df.csv'
county_source.to_csv(county_csv_path)
cpr_county_df.to_csv(cpr_csv_path)

viz_basename = f'us-county-vax-demographics_{now_utc_timestamp}'
save_vizassets(viz_pub, SAVE_PATH, viz_basename)

In [None]:
# get the colab filename and file id from the runtime's local sessions endpoint
from requests import get

# Query the endpoint once and reuse the first session record, so the name and the id
# are guaranteed to come from the same response. The timeout avoids an indefinite hang.
colab_session = get('http://172.28.0.2:9000/api/sessions', timeout=30).json()[0]
nb_name = colab_session['name'].replace('.ipynb','')
nb_id = colab_session['notebook']['path'].replace('fileId=','')

print(SAVE_PATH, nb_name, now_utc_timestamp, nb_id)

# create a snapshot of this currently running notebook and save to SAVE_PATH
downloaded_nb = gdrive.CreateFile({'id':nb_id})   # replace the id with id of file you want to access
downloaded_nb.GetContentFile(SAVE_PATH/f'{nb_name}_{now_utc_timestamp}.ipynb')

In [None]:
# upload everything in the default local directory (SAVE_PATH) to a gdrive folder
# named after this notebook and the UTC run timestamp
assets_to_gdrive(folder_name=f'{nb_name}_{now_utc_timestamp}')