In [None]:
import pandas as pd
import numpy as np

from datetime import datetime, timedelta
import pytz

import altair as alt
from altair import datum
alt.data_transformers.disable_max_rows()

from pathlib import Path

# Show floats in DataFrame output with 4 digits of precision.
pd.set_option("display.precision", 4)

# Read the clock once, as a timezone-aware UTC datetime, so the Eastern and UTC
# stamps below all describe the same instant. (datetime.utcnow() returns a
# naive datetime and is deprecated as of Python 3.12.)
now_utc = datetime.now(pytz.utc)
now_est = now_utc.astimezone(pytz.timezone("US/Eastern"))

# Human-readable and filename-friendly renderings of the run time.
now_est_time = now_est.strftime("%Y-%m-%d, %H:%M:%S ET")
now_est_date = now_est.strftime("%Y-%m-%d")
now_est_timestamp = now_est.strftime("%Y%m%d_%H%M%S")
now_utc_timestamp = now_utc.strftime("%Y%m%d_%H%M%S")
print(now_est_time, now_est_date, now_est_timestamp, now_utc_timestamp)

In [None]:
# adapting from https://towardsdatascience.com/consistently-beautiful-visualizations-with-altair-themes-c7f9f889602

def ptc_theme():
    """Build the chart theme configuration registered with Altair below.

    Returns:
        dict: A ``{"config": {...}}`` mapping with sections for the chart
        title, text, facet headers, the x and y axes, the legend and the view.
    """
    axis_grey = "#808080"
    grid_grey = "#DEDDDD"
    base_font = 'Arial'
    label_font = 'Arial'

    # Colors (palettes are currently disabled, see the commented "range" entry below)
    # main_palette = ["#1696d2",
    #                 "#d2d2d2",
    #                 "#000000",
    #                 "#fdbf11",
    #                 "#ec008b",
    #                 "#55b748",
    #                 "#5c5859",
    #                 "#db2b27",
    #                ]
    # sequential_palette = ["#cfe8f3",
    #                       "#a2d4ec",
    #                       "#73bfe2",
    #                       "#46abdb",
    #                       "#1696d2",
    #                       "#12719e",
    #                      ]

    title_cfg = {
        "fontSize": 16,
        "font": base_font,
        "anchor": "start",  # left-aligned
        "fontColor": "#000000",
    }

    text_cfg = {
        "font": base_font,
        "labelFont": label_font,
    }

    header_cfg = {
        "font": base_font,
        "labelFont": label_font,
        "titleFont": base_font,
    }

    x_axis_cfg = {
        "domain": False,
        "domainColor": axis_grey,
        "labelColor": axis_grey,
        "domainWidth": 1,
        "grid": False,
        "labelFont": label_font,
        "labelFontSize": 12,
        "labelAngle": 0,
        "tickColor": axis_grey,
        "tickSize": 5,  # the default; listed to show it can be changed
        "titleFont": base_font,
        "titleFontSize": 12,
        "titlePadding": 10,  # a guess, not specified in the styleguide
        "title": "X Axis Title (units)",
    }

    y_axis_cfg = {
        "domain": False,
        "grid": True,
        "gridColor": grid_grey,
        "gridWidth": 1,
        "labelFont": label_font,
        "labelColor": axis_grey,
        "labelFontSize": 12,
        "labelAngle": 0,
        "labelAnchor": "end",
        "labelAlign": "right",
        "ticks": False,  # must be turned off even when there is no "domain"
        "titleFont": base_font,
        "titleFontSize": 12,
        "titlePadding": 10,  # a guess, not specified in the styleguide
        "title": "Y Axis Title (units)",
        # Titles sit vertically to the left of the axis by default; the three
        # settings below move the title to a horizontal position above it.
        "titleAngle": 0,   # horizontal
        "titleY": -10,     # move it up
        "titleX": 18,      # move it right so it aligns with the labels
    }

    legend_cfg = {
        "labelFont": label_font,
        "labelFontSize": 12,
        "symbolType": "circle",
        "symbolSize": 100,  # default
        "titleFont": base_font,
        "titleFontSize": 12,
        "title": "",  # no legend title by default
        "orient": "right",
        "offset": 0,
    }

    view_cfg = {
        # Removes the box otherwise drawn around the area where data is plotted.
        "stroke": "transparent",
    }

    return {
        "config": {
            "title": title_cfg,
            "text": text_cfg,
            "header": header_cfg,
            "axisX": x_axis_cfg,
            "axisY": y_axis_cfg,
            #   "range": {
            #       "category": main_palette,
            #       "diverging": sequential_palette,
            #   },
            "legend": legend_cfg,
            "view": view_cfg,
        }
    }

# Register the theme factory under a name, then make it the active Altair theme.
alt.themes.register("my_custom_theme", ptc_theme)
alt.themes.enable("my_custom_theme")

# Get Data

In [None]:
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd

# Fetch the healthdata.gov landing page for the COVID-19 Community Profile Report.
# The timeout keeps a stalled connection from hanging the notebook, and
# raise_for_status() reports an HTTP error here rather than as a confusing
# parsing failure further down.
req = requests.get('https://www.healthdata.gov/Health/COVID-19-Community-Profile-Report/gqxm-d9w9', timeout=60)
req.raise_for_status()
soup = BeautifulSoup(req.content, 'html.parser')

# The 7th <script type="text/javascript"> tag is the one whose text is searched
# for report download links.
# NOTE(review): this positional index is fragile; re-check it if the page layout changes.
js_scripts = soup.find_all('script', type='text/javascript')
if len(js_scripts) <= 6:
    raise IndexError(
        f'expected at least 7 text/javascript <script> tags on the page, found {len(js_scripts)}'
    )
script = js_scripts[6]

In [None]:
# Each match is the raw '"name":"...","href":"...download=true' fragment for one
# Community_Profile_Report_<digits>_Public.xlsx attachment found in the script text.
cpr_attachment_pattern = re.compile(r'"name":"Community_Profile_Report_\d*_Public.xlsx","href":"\/api\/views\/gqxm-d9w9\/files\/\w*-\w*-\w*-\w*-\w*.*?download=true')
re_results = cpr_attachment_pattern.findall(str(script))

In [None]:
# Turn every regex match into an absolute download URL. Matches are visited in
# reverse-sorted order, so the first URL comes from the match that sorts last
# (presumably the most recent report, if the digits in the name are a date - TODO confirm).
cpr_url_list = []
for match in sorted(re_results, reverse=True):
    href = match.split('"href":"')[-1]
    # First 8 characters after the report-name prefix are reused in the download filename.
    report_stamp = match.split('Community_Profile_Report_')[-1][:8]
    cpr_url_list.append(
        'https://www.healthdata.gov' + href + '&filename=CPR_' + report_stamp + '.xlsx'
    )
cpr_url_list[:10]

In [None]:
# Load the 'Counties' sheet of the first workbook in cpr_url_list; header=1 takes
# the column names from the sheet's second row.
cpr_url = cpr_url_list[0]
cpr_df = pd.read_excel(cpr_url, sheet_name='Counties', header=1)

# Zero-pad FIPS codes to 5 characters. This previously read from `cpr_county_df`,
# a name that is never defined in this notebook (NameError on a fresh kernel);
# the column lives on cpr_df itself.
cpr_df['fips_code'] = cpr_df['FIPS code'].astype(str).str.zfill(5)

In [None]:
# Display the county-level frame for inspection.
cpr_df

In [None]:
# Load the 'States' sheet from the same workbook; header=1 takes the column
# names from the sheet's second row.
cpr_state_df = pd.read_excel(cpr_url, sheet_name='States', header=1)
cpr_state_df

In [None]:
# Load the 'By County' sheet of the Texas DSHS county vaccine workbook.
tx_url = 'https://dshs.texas.gov/immunize/covid19/COVID-19-Vaccine-Data-by-County.xls'
tx_df = pd.read_excel(tx_url, sheet_name='By County', header=0)
tx_df

In [None]:
def _tx_county_key(county_name):
    """Format a Texas county name as '<name> County, TX'."""
    return str(county_name) + ' County, TX'

# 'County' is the column later shared with cpr_df when the two frames are merged.
tx_df['County'] = tx_df['County Name'].map(_tx_county_key)

In [None]:
# List the Texas sheet's column names.
tx_df.columns

In [None]:
# Show the state-level row(s) for Texas.
cpr_state_df[cpr_state_df['State Abbreviation']=='TX']

In [None]:
# Left-join the Texas counts onto the county frame. No `on=` is given, so pandas
# joins on whatever columns the two frames have in common.
cpr_df = cpr_df.merge(tx_df[['County','People Fully Vaccinated']], how='left')

# For Texas rows only, replace the total-population rate with
# (Texas 'People Fully Vaccinated') / (CPR 'Population').
is_texas = cpr_df['State Abbreviation']=='TX'
tx_fully_vaccinated = cpr_df[is_texas]['People Fully Vaccinated']
tx_population = cpr_df[is_texas]['Population']
cpr_df.loc[is_texas, 'People who are fully vaccinated as % of total population'] = tx_fully_vaccinated/tx_population

In [None]:
# Show the ages-12+ fully-vaccinated rate for the Texas county rows.
cpr_df[cpr_df['State Abbreviation']=='TX'][['County','People who are fully vaccinated as % of population - ages 12+']]

In [None]:
# Keep only the columns the chart needs. `.copy()` makes `source` an independent
# frame, so adding columns to it afterwards neither triggers pandas'
# SettingWithCopyWarning nor depends on whether the selection is a view of cpr_df.
source = cpr_df[['County','fips_code','CBSA','State Abbreviation','Population',
                 'People who are fully vaccinated as % of population - ages 12+',
                 'People who are fully vaccinated as % of total population',
                 'People who are fully vaccinated as % of population - ages 65+',
                 'County type'
                 ]].copy()

In [None]:
def calc_jitter(seed):
  """Return a reproducible jitter offset for the given seed.

  The value is one draw from a normal distribution with mean 0 and standard
  deviation 0.25. A private ``RandomState`` seeded with ``seed`` is used rather
  than ``np.random.seed``, so NumPy's global random state is left untouched;
  the value returned for a given seed is the same as the one the globally
  seeded legacy generator produces.

  Args:
    seed: Seed accepted by ``np.random.RandomState``.

  Returns:
    float: The jitter offset.
  """
  return np.random.RandomState(seed).normal(scale=0.25)

In [None]:
# One jitter value per distinct FIPS code, seeded by the code's position in
# the list of unique values.
jitter_dict = {}
for seed, fips in enumerate(source['fips_code'].unique()):
    jitter_dict[fips] = calc_jitter(seed)
jitter_dict

In [None]:
# Look up each row's jitter value by its FIPS code.
source['jitter'] = source['fips_code'].map(lambda fips: jitter_dict[fips])

In [None]:
# Drop county rows for GA, VA, WV, HI and VT (the chart subtitle lists these
# states as excluded from the county-level data). `.copy()` gives an independent
# frame so the column assignments that follow do not raise SettingWithCopyWarning.
source = source[~source['State Abbreviation'].isin(['GA','VA','WV','HI','VT'])].copy()

In [None]:
# Census division name -> list of state abbreviations (DC included).
census_divisions_dict = {
    'Northeast (New England)': ['CT', 'ME', 'MA', 'NH', 'RI', 'VT'],
    'Northeast (Mid-Atlantic)': ['NJ', 'NY', 'PA'],
    'Midwest (East North Central)': ['IN', 'IL', 'MI', 'OH', 'WI'],
    'Midwest (West North Central)': ['IA', 'KS', 'MN', 'MO', 'NE', 'ND', 'SD'],
    'South (South Atlantic)': ['DE', 'MD', 'DC', 'VA', 'WV', 'NC', 'SC', 'GA', 'FL'],
    'South (East South Central)': ['AL', 'KY', 'MS', 'TN'],
    'South (West South Central)': ['AR', 'LA', 'OK', 'TX'],
    'West (Mountain)': ['AZ', 'CO', 'ID', 'NM', 'MT', 'UT', 'NV', 'WY'],
    'West (Pacific)': ['AK', 'CA', 'HI', 'OR', 'WA'],
}

In [None]:
# Flatten the census divisions into one state list per region, keeping the
# division order and the state order within each division.
region_northeast = [
    *census_divisions_dict['Northeast (New England)'],
    *census_divisions_dict['Northeast (Mid-Atlantic)'],
]
region_midwest = [
    *census_divisions_dict['Midwest (East North Central)'],
    *census_divisions_dict['Midwest (West North Central)'],
]
region_south = [
    *census_divisions_dict['South (South Atlantic)'],
    *census_divisions_dict['South (East South Central)'],
    *census_divisions_dict['South (West South Central)'],
]
region_west = [
    *census_divisions_dict['West (Mountain)'],
    *census_divisions_dict['West (Pacific)'],
]

In [None]:
def assign_region(row, state_col='state'):
    """Return the numbered region label for the state found in ``row``.

    Args:
        row: Mapping-like object (for example a DataFrame row) indexable by
            ``state_col``.
        state_col: Key under which the state abbreviation is stored.

    Returns:
        str: '1-Northeast', '3-Midwest', '2-South' or '4-West' when the state
        is in the matching region list, otherwise '5-Other'.
    """
    state = row[state_col]
    labelled_regions = (
        (region_northeast, '1-Northeast'),
        (region_midwest, '3-Midwest'),
        (region_south, '2-South'),
        (region_west, '4-West'),
    )
    for members, label in labelled_regions:
        if state in members:
            return label
    return '5-Other'
    

In [None]:
# States listed region by region (Northeast, Midwest, South, West).
# Note: states_sorted is reassigned further down from the state-level data.
states_sorted = region_northeast+region_midwest+region_south+region_west
states_sorted

In [None]:
# Mirror 'State Abbreviation' into a 'state' column, the column assign_region reads by default.
source['state'] = source['State Abbreviation']

In [None]:
# Label every county row with its region (row-wise apply of assign_region).
source['region'] = source.apply(assign_region, axis=1)

In [None]:
# Histogram of county populations, restricted to the 100K-1M range.
source['Population'].hist(range=[1e5,1e6])

In [None]:
def define_popgroups(x):
  """Bucket a population count into one of four labelled size groups.

  Args:
    x: Population count.

  Returns:
    str: '<10K' below 10,000; '10K-250K' from 10,000 up to (not including)
    250,000; '>1M' strictly above 1,000,000; '250K-1M' for everything else.
  """
  if x < 1e4:
    return '<10K'
  if x < 2.5e5:
    return '10K-250K'
  if x > 1e6:
    return '>1M'
  return '250K-1M'

# Label every county row with its population size bucket.
source['pop_group'] = source['Population'].apply(define_popgroups)

In [None]:
# Display the state-level frame for inspection.
cpr_state_df

In [None]:
# State-level vaccination rates. `.copy()` makes source_state an independent
# frame, so the columns added to it afterwards do not raise pandas'
# SettingWithCopyWarning.
source_state = cpr_state_df[[
    'State Abbreviation',
    'People who are fully vaccinated as % of population - ages 12+',
    'People who are fully vaccinated as % of total population',
    'People who are fully vaccinated as % of population - ages 65+',
]].copy()

In [None]:
# Tag state rows with CBSA='state-level' (the chart's tick layer filters on this
# value) and add the 'state' column that assign_region reads by default.
source_state['CBSA'] = 'state-level'
source_state['state'] = source_state['State Abbreviation']


In [None]:
# Stack county rows and state rows into the single frame the chart is built from.
source_concat = pd.concat([source,source_state])
# Drop PR, GU, MP, AS and VI. `.copy()` gives an independent frame so the
# 'region' assignment below does not raise SettingWithCopyWarning.
source_concat = source_concat[~source_concat['State Abbreviation'].isin(['PR','GU','MP','AS','VI'])].copy()
source_concat['region'] = source_concat.apply(assign_region, axis=1)
source_concat

In [None]:
# Label each state row with its region, then order the states by region label
# (ascending) and, within a region, by total-population vaccination rate (descending).
source_state['region'] = source_state.apply(assign_region, axis=1)
states_by_region_then_rate = source_state.sort_values(
    by=['region', 'People who are fully vaccinated as % of total population'],
    ascending=[True, False],
)
states_sorted = states_by_region_then_rate['state'].to_list()

In [None]:
# Display the state-level chart data for inspection.
source_state

In [None]:
# override DC county with DC state-level data
total_pop_rate_col = 'People who are fully vaccinated as % of total population'
dc_state_rate = source_state[source_state['state'] == 'DC'][total_pop_rate_col].values[0]
is_dc_county = source_concat['fips_code']=='11001'
source_concat.loc[is_dc_county, total_pop_rate_col] = dc_state_rate

In [None]:
# Multi-selections used by the chart: one keyed on 'region' and bound to the
# legend, one keyed on 'County'.
# NOTE(review): the color encoding in rain_viz sets legend=None - confirm there
# is a legend for the 'region' selection to bind to.
select_states = alt.selection_multi(fields=['region'], bind='legend')
select_county = alt.selection_multi(fields=['County'])

# Region colors: color_range[i] is the color for color_domain[i]. The domain
# labels are the strings returned by assign_region.
color_range = ['#FF6B00','#34840F','#36ADA4','#7155D1','#333333']
color_domain = ['1-Northeast','2-South','3-Midwest','4-West','5-Other']

# Two-layer chart faceted into one narrow column per 'State Abbreviation':
# jittered points for the rows of source_concat, plus a horizontal tick for
# rows tagged CBSA == 'state-level'.
rain_viz = alt.layer(
    # Layer 1: points. y = fully-vaccinated share of total population (axis
    # domain 20%-80%), x = the per-FIPS jitter, color = region, size = Population.
    alt.Chart(height=550, width=20).mark_point(filled=True, opacity=0.6, stroke='black', clip=True).encode(
      y=alt.Y('People who are fully vaccinated as % of total population:Q', axis=alt.Axis(format='.0%', tickCount=7, ticks=False, domain=False, title=None), scale=alt.Scale(domain=[0.2,0.8], clamp=False)),
      x=alt.X('jitter:Q',
              title=None,
              axis=alt.Axis(values=[0], ticks=False, grid=False, domain=False, labels=False),
              scale=alt.Scale(domain=[-1,1], clamp=True)),
      color=alt.Color('region:O', scale=alt.Scale(range=color_range, domain=color_domain), legend=None),#alt.Legend(columns=1, symbolLimit=0, title=None)),
      # stroke=alt.Stroke('region', scale=alt.Scale(scheme='set1'), legend=None),#alt.Legend(columns=1, symbolLimit=0, title=None)),
      # Points in the selected region(s) are drawn at 0.5 opacity, others at 0.15.
      opacity=alt.condition(select_states, alt.value(0.5), alt.value(0.15)),

      # Selected counties get a thicker outline (0.3 vs 0.1).
      strokeWidth=alt.condition(select_county, alt.value(0.3), alt.value(0.1)),
      strokeOpacity=alt.value(1),
      tooltip=['County','People who are fully vaccinated as % of total population','People who are fully vaccinated as % of population - ages 12+','People who are fully vaccinated as % of population - ages 65+'],
      size=alt.Size('Population', legend=alt.Legend(orient='none', direction='vertical', legendX=30, legendY=400, fillColor='white', title='County Population'), scale=alt.Scale(domain=[1e4,2.5e6], range=[15,100])),
    ),
    # Layer 2: a black horizontal tick at the same y field, only for the
    # rows whose CBSA equals 'state-level'.
    alt.Chart().mark_tick(orient='horizontal', size=20, thickness=2, opacity=0.7, color='black').encode(
        y=alt.Y('People who are fully vaccinated as % of total population:Q'),
        # stroke=alt.Stroke('region', scale=alt.Scale(scheme='tableau10'), legend=None),
    ).transform_filter(alt.datum.CBSA=='state-level'),
    # alt.Chart().mark_text(dy=-300, size=14, align='center').encode(
    #     #y=alt.value(0.70),
    #     color=alt.Color('region', scale=alt.Scale(scheme='tableau10'), legend=alt.Legend(columns=1, symbolLimit=0, title=None)),
    #     text='state',
    # ).transform_filter(alt.datum.CBSA=='state-level')
# Facet columns follow the order given by states_sorted.
).facet(data=source_concat,
    column=alt.Column('State Abbreviation:O', title=None, sort=states_sorted,
                        header=alt.Header(labelAngle=0, labelAnchor='middle', labelAlign='center', labelFontSize=12, labelPadding=0)),
    # Both selections are attached to the faceted chart, and the data is also
    # filtered by the region selection.
).configure_facet(spacing=2).configure_view(strokeWidth=0).interactive(bind_y=False).add_selection(select_states).add_selection(select_county).transform_filter(select_states
).properties(
    # NOTE(review): the "as of" date in the title is a hard-coded string; confirm
    # it matches the report that was actually loaded.
    title=alt.TitleParams("Fully Vaccinated % of Total Population by Region, State, and County as of Sept 29, 2021", subtitle=["Sources: CDC and Texas DSHS. County-level data excludes VT, VA, GA, WV, HI and some counties",'',''])
).configure_axis(
  labelFontSize=12,
  titleFontSize=16,
  labelColor='grey',
  # gridColor='#ccc',
  gridWidth=1,
  offset=0,
).configure_header(
  titleFontSize=16,
  labelFontSize=16,

).configure_title(
  fontSize=16,
).configure_legend(
  titleFontSize=12,
  labelFontSize=12
)

# Render the chart as the cell output.
rain_viz

# Save to Gdrive

In [None]:
# this is for saving altair charts to png and svg, based on https://colab.research.google.com/github/altair-viz/altair_saver/blob/master/AltairSaver.ipynb#scrollTo=ZiTDBCAM_Ni8
!pip install -q altair_saver
!npm install --silent vega-lite vega-cli canvas

In [None]:
from pathlib import Path
from altair_saver import save

# Local directory that the exported files are written to; created if it does
# not exist yet.
SAVE_PATH = Path('assets')
SAVE_PATH.mkdir(exist_ok=True)

# Import PyDrive and associated libraries.
# This only needs to be done once per notebook.
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate and create the PyDrive client.
# This only needs to be done once per notebook.
# `gdrive` is the client object used by the upload and snapshot cells below.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
gdrive = GoogleDrive(gauth)

In [None]:
def assets_to_gdrive(folder_name, localdir_path = SAVE_PATH, parentdir_id='17Kx2uZbQv1r5U1M9x_OXS4lpMU5c6Ym8'):
  """Upload every file in a local directory to a named Google Drive folder.

  Looks for a non-trashed folder titled ``folder_name`` directly under
  ``parentdir_id`` and creates it when none is found, then uploads each regular
  file in ``localdir_path`` into that folder. Uses the module-level ``gdrive``
  PyDrive client.

  Args:
    folder_name: Title of the destination Drive folder.
    localdir_path: Local directory whose files are uploaded (``Path`` or str).
    parentdir_id: Drive id of the folder that contains the destination folder.

  Returns:
    None. Progress is printed.
  """
  # Accept plain string paths as well as Path objects.
  localdir_path = Path(localdir_path)

  # search gdrive for snapshot folder and save assets there if it already exists.
  # If several folders share the title, the last one listed is used.
  folder_id = ''
  query = f"'{parentdir_id}' in parents and mimeType = 'application/vnd.google-apps.folder' and trashed=false"
  for candidate in gdrive.ListFile({'q': query}).GetList():
    if candidate['title'] == folder_name:
      folder_id = candidate['id']
      print(f'Found pre-existing gdrive folder named "{folder_name}" at',folder_id)

  # if not, create new folder
  if folder_id == '':
    folder = gdrive.CreateFile(metadata={'title': folder_name,
                                      'parents':[{'id': parentdir_id}],
                                      "mimeType": "application/vnd.google-apps.folder"
                                      })
    folder.Upload()
    folder_id = folder.get('id')
    print(f'Created new gdrive folder named "{folder_name}" at',folder_id)

  # upload all files within localdir_path to snapshot folder
  for asset_file in localdir_path.iterdir():
    # A sub-directory cannot be opened as file content; skip it rather than
    # aborting the whole upload.
    if not asset_file.is_file():
      continue
    file1 = gdrive.CreateFile(metadata={'title':asset_file.name,
                                        'parents':[{'id': folder_id}],
                                        })
    # SetContentFile takes a filename; hand it a plain string instead of a
    # Path so it does not rely on path-like support in the libraries it calls.
    file1.SetContentFile(str(asset_file))
    file1.Upload()
    print('Saved file: ',asset_file.name)


In [None]:
def save_vizassets(chart, save_path, filename, fmts=['html','json','png','svg','pdf']):
  """Save ``chart`` once per requested format.

  Args:
    chart: Chart object passed through to ``save``.
    save_path: Directory part of the output location.
    filename: Output file name without extension.
    fmts: File extensions to write; each produces
      ``{save_path}/{filename}.{fmt}``.
  """
  base = f'{save_path}/{filename}'
  for extension in fmts:
    save(chart, f'{base}.{extension}')

In [None]:
# put your stuff (i.e. dataframes, altair charts, input data files) to save here

In [None]:
# examples:
# Write the county frame and the chart data as CSV files into SAVE_PATH, then
# export the chart in every default format under a UTC-timestamped file name.

cpr_df.to_csv(SAVE_PATH/'cpr_df.csv')
source_concat.to_csv(SAVE_PATH/'source_concat.csv')
save_vizassets(rain_viz, SAVE_PATH, f'rainviz_{now_utc_timestamp}')

In [None]:
# get the colab filename
from requests import get
# Query the sessions endpoint once and read both values from the same session
# record (it was previously requested twice, once per value).
colab_session = get('http://172.28.0.2:9000/api/sessions').json()[0]
nb_name = colab_session['name'].replace('.ipynb','')
nb_id = colab_session['notebook']['path'].replace('fileId=','')

print(SAVE_PATH, nb_name, now_utc_timestamp, nb_id)

# create a snapshot of this currently running notebook and save to SAVE_PATH
downloaded_nb = gdrive.CreateFile({'id':nb_id})   # replace the id with id of file you want to access
downloaded_nb.GetContentFile(SAVE_PATH/f'{nb_name}_{now_utc_timestamp}.ipynb')

In [None]:
# upload everything to gdrive
# The destination folder is named after the notebook plus the UTC run timestamp.
assets_to_gdrive(folder_name=f'{nb_name}_{now_utc_timestamp}')