<a href="https://colab.research.google.com/github/pandemic-tracking/viz-gen/blob/main/variant_proportions_flourish_viz.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np

from datetime import datetime, timedelta
import pytz

import altair as alt
from altair import datum
alt.data_transformers.disable_max_rows()

from pathlib import Path

pd.set_option("display.precision", 4)

# Read the clock once as a timezone-aware UTC instant and derive the Eastern
# time from it, so every stamp below describes the same moment.
now_utc = datetime.now(pytz.utc)
now_est = now_utc.astimezone(pytz.timezone("US/Eastern"))

now_est_time = now_est.strftime("%Y-%m-%d, %H:%M:%S ET")
now_est_date = now_est.strftime("%Y-%m-%d")
now_est_timestamp = now_est.strftime("%Y%m%d_%H%M%S")
now_utc_timestamp = now_utc.strftime("%Y%m%d_%H%M%S")
print(now_est_time, now_est_date, now_est_timestamp, now_utc_timestamp)

# Variant Proportions Flourish Visualization

Inputs: 

*   ALLWEEKS_PCTVOC.txt, a file of variant proportions from Eric Rouchka at UofL
*   catchment_zone_numbers.csv, a file of catchment areas and zone numbers

Outputs:
*   voc_flourish.csv, a time series of variant proportions by catchment area. It is plugged into the [Flourish stacked bar chart](https://public.flourish.studio/visualisation/8658207), which is used by the [Flourish map of variant proportions by catchment area](https://public.flourish.studio/story/1131027/)


In [None]:
# Load the tab-separated variant proportions file
df = pd.read_table('/content/ALLWEEKS_PCTVOC.txt')

In [None]:
# Manual patch: add "MISSING DATA" placeholder rows for weeks 44 and 45.
# A week is only patched when df has no row labelled with that week yet, so
# re-running this cell (or running it once the weeks are present in the input)
# cannot add duplicate rows.
patch_sites = [
  '34th Street PS',
  'CCWQTC INF',
  'DRGWQTC INF',
  'FFWQTP INF',
  'HCWQTP INF',
  'MFWQTC',
  'MH08915A CSO140',
  'MH09837 Ashby Lane & Mill Creek',
  'MH23290 W. Indian Trail',
  'MH32985 Wood Road & Terry Road',
  'MH40870 Muddy Forks PS',
  'MH50495 CSO108',
  'MH57350 Preston & South Park',
  'MH57769 Pineland Drive & Oakmont Dr.',
  'MH70101 15th & Wilson',
  'MH71910 CSO146',
  'Shawnee Park B',
]
known_labels = df['VOC/VOI/VUM'].astype(str)
weeks_to_patch = [week for week in (44, 45)
                  if not known_labels.str.endswith(f'_WK{week}').any()]

# Each placeholder row assigns the value 1.0 to MISSING DATA at every site
missing_weeks = {site: [1.0] * len(weeks_to_patch) for site in patch_sites}
missing_weeks['VOC/VOI/VUM'] = [f'MISSING DATA_WK{week}' for week in weeks_to_patch]

if weeks_to_patch:
  df = pd.concat([df, pd.DataFrame(missing_weeks)], ignore_index=True)

In [None]:
# Split each label at '_WK' into the part before it (voc) and the part after it (week)
voc_and_week = df['VOC/VOI/VUM'].str.split('_WK', expand=True)
df[['voc','week']] = voc_and_week

In [None]:
# Show the distinct week values obtained from the split
df['week'].unique()

In [None]:
# From the individual catchment area files: the dates are Tuesdays until week 11, after which they are Mondays
def assign_date(record):
  """Set record['date'] from record['week'] and return the record.

  The week value is converted to an integer and added, as a number of weeks,
  to an anchor date: 2021-02-16 for weeks below 11, 2021-02-15 otherwise.
  The record is modified in place and also returned.
  """
  week_number = int(record['week'])
  # Weeks below 11 use the Tuesday anchor; week 11 and later use the Monday anchor
  anchor_day = '2021-02-16' if week_number < 11 else '2021-02-15'
  record['date'] = pd.Timestamp(anchor_day) + pd.Timedelta(weeks=week_number)
  return record

In [None]:
# Run assign_date on every row to add the date column
df = df.apply(assign_date, axis='columns')

In [None]:
# Order the rows chronologically
df = df.sort_values(by='date')

In [None]:
# Reshape to long form keyed by voc and date; the remaining columns become variable/value pairs
# week for stacked chart
melt_df = df.drop(columns=['VOC/VOI/VUM','week']).melt(id_vars=['voc','date'])

In [None]:
# The melted 'variable' column is renamed to 'location'
melt_df = melt_df.rename(columns={'variable':'location'})

In [None]:
# One row per (date, location) with one column per voc, then move the index back into columns
pivoted_by_voc = melt_df.pivot(columns='voc', index=['date','location'])
flourish_df = pivoted_by_voc.reset_index()

In [None]:
# Drop the outermost level of the column index left over from the pivot
flourish_df.columns = flourish_df.columns.droplevel(0)

In [None]:
# The first two columns hold the reset index (date, location); the remaining
# columns are the variant names produced by the pivot. Check them against the
# expected labels before relabelling, so a change in the variants present in
# the data raises a clear error instead of silently mislabelling columns.
expected_variants = ['Alpha', 'Beta', 'Delta', 'Epsilon', 'Eta', 'Gamma', 'Iota',
       'Kappa', 'Lambda', 'MISSING DATA', 'Mu', 'OTHER', 'Omicron', 'Theta',
       'Zeta']
pivoted_variants = list(flourish_df.columns[2:])
if pivoted_variants != expected_variants:
  raise ValueError(
      'Variant columns do not match the expected labels: '
      f'expected {expected_variants}, got {pivoted_variants}')
flourish_df.columns = ['date', 'location'] + expected_variants

In [None]:
# Maps each site column name from the input file to the display name used in the output
site_rename_dict = {
    '34th Street PS': '34th Street',
    'CCWQTC INF': 'Cedar Creek Treatment Facility',
    'DRGWQTC INF': 'Guthrie Treatment Facility',
    'FFWQTP INF': 'Floyds Fork Treatment Facility',
    'HCWQTP INF': 'Hite Creek Treatment Facility',
    'MFWQTC': 'Morris Forman Treatment Facility',
    'MH08915A CSO140': 'Locust & Lobdell',
    'MH09837 Ashby Lane & Mill Creek': 'Ashby Lane & Mill Creek',
    'MH23290 W. Indian Trail': 'W. Indian Trail',
    'MH32985 Wood Road & Terry Road': 'Wood Road & Terry Road',
    'MH40870 Muddy Forks PS': 'Muddy Forks',
    'MH50495 CSO108': 'Newburg Rd',
    'MH57350 Preston & South Park': 'Preston & South Park',
    'MH57769 Pineland Drive & Oakmont Dr.': 'Pineland & Oakmont',
    'MH70101 15th & Wilson': '15th & Wilson',
    'MH71910 CSO146': 'KY & Swan',
    'Shawnee Park B': 'Shawnee Park',
}

In [None]:
# Apply the display names to the location column only, rather than scanning
# every column of the frame for matching values
flourish_df['location'] = flourish_df['location'].replace(site_rename_dict)

In [None]:
# Load the catchment areas and zone numbers file
catchment_order = pd.read_csv(Path('/content') / 'catchment_zone_numbers.csv')

In [None]:
# Left join: keep every flourish_df row and attach the catchment columns where location equals Address_2
flourish_df = pd.merge(
    flourish_df,
    catchment_order,
    how='left',
    left_on='location',
    right_on='Address_2',
)

In [None]:
# Order rows by date, then by zone number within each date
flourish_df = flourish_df.sort_values(by=['date','ZoneNUM'])

In [None]:
# Columns written to the output file, in order: date and location, the named
# variants, then OTHER and MISSING DATA last
flourish_export_columns = [
    'date', 'location',
    'Alpha', 'Beta', 'Delta', 'Epsilon', 'Eta', 'Gamma', 'Iota',
    'Kappa', 'Lambda', 'Mu', 'Omicron', 'Theta', 'Zeta',
    'OTHER', 'MISSING DATA',
]
flourish_df[flourish_export_columns].to_csv('voc_flourish.csv')

# Save to Drive

In [None]:
# Install the tooling for saving altair charts to png and svg, based on https://colab.research.google.com/github/altair-viz/altair_saver/blob/master/AltairSaver.ipynb#scrollTo=ZiTDBCAM_Ni8
# The first command installs the altair_saver Python package; the second installs the vega-lite, vega-cli and canvas npm packages.
!pip install -q altair_saver
!npm install --silent vega-lite vega-cli canvas

In [None]:
from pathlib import Path
from altair_saver import save

# Local directory that collects the files to be saved; it is created here if
# it does not exist yet (no error when it already exists).
SAVE_PATH = Path('assets')
SAVE_PATH.mkdir(exist_ok=True)

# Import PyDrive and associated libraries.
# This only needs to be done once per notebook.
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate and create the PyDrive client.
# This only needs to be done once per notebook.
# Order of the steps below: Colab user authentication runs first, then the
# application-default credentials are attached to the GoogleAuth object, and
# the `gdrive` client is built from that object.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
gdrive = GoogleDrive(gauth)

In [None]:
def assets_to_gdrive(folder_name, localdir_path = SAVE_PATH, parentdir_id='17Kx2uZbQv1r5U1M9x_OXS4lpMU5c6Ym8'):
  """Upload every file in a local directory to a named Google Drive folder.

  Searches the non-trashed folders directly under `parentdir_id` for one
  titled `folder_name`. If none is found, a new folder with that title is
  created under `parentdir_id`. Each regular file in `localdir_path` is then
  uploaded into that folder; entries that are not files (e.g. subdirectories)
  are skipped. Uses the module-level `gdrive` client.

  Args:
    folder_name: title of the Drive folder to upload into.
    localdir_path: local directory (Path or str) whose files are uploaded.
    parentdir_id: Drive id of the folder that contains `folder_name`.
  """
  localdir_path = Path(localdir_path)

  # search gdrive for snapshot folder and save assets there if it already exists.
  folder_query = f"'{parentdir_id}' in parents and mimeType = 'application/vnd.google-apps.folder' and trashed=false"
  folder_id = ''
  for candidate in gdrive.ListFile({'q': folder_query}).GetList():
    # no early exit: if several folders share the title, the last one listed is used
    if candidate['title'] == folder_name:
      folder_id = candidate['id']
      print(f'Found pre-existing gdrive folder named "{folder_name}" at',folder_id)

  # if not, create new folder
  if folder_id == '':
    folder = gdrive.CreateFile(metadata={'title': folder_name,
                                         'parents':[{'id': parentdir_id}],
                                         "mimeType": "application/vnd.google-apps.folder"
                                         })
    folder.Upload()
    folder_id = folder.get('id')
    print(f'Created new gdrive folder named "{folder_name}" at',folder_id)

  # upload all files within localdir_path to snapshot folder
  for asset_file in localdir_path.iterdir():
    if not asset_file.is_file():
      # a directory cannot be opened as upload content, so leave it out
      print('Skipped non-file: ',asset_file.name)
      continue
    file1 = gdrive.CreateFile(metadata={'title':asset_file.name,
                                        'parents':[{'id': folder_id}],
                                        })
    file1.SetContentFile(str(asset_file))
    file1.Upload()
    print('Saved file: ',asset_file.name)


In [None]:
def save_vizassets(chart, save_path, filename, fmts=['html','json','png','svg','pdf']):
  """Save `chart` once per format as `{save_path}/{filename}.{fmt}`.

  Each target path is passed, together with the chart, to `save`.
  """
  targets = [f'{save_path}/{filename}.{extension}' for extension in fmts]
  for target in targets:
    save(chart, target)

In [None]:
# put your stuff (i.e. dataframes, altair charts, input data files) to save here

In [None]:
# Write the working tables as CSV files into the local assets directory
snapshot_tables = {
    'ALLWEEKS_PCTVOC.csv': df,
    'voc_flourish.csv': flourish_df,
    'catchment_zone_numbers.csv': catchment_order,
}
for csv_name, table in snapshot_tables.items():
  table.to_csv(SAVE_PATH/csv_name)

In [None]:
# get the colab filename
from requests import get
# Query the sessions endpoint once and reuse the first session entry for both
# the notebook name and its file id, instead of issuing the request twice
colab_session = get('http://172.28.0.2:9000/api/sessions').json()[0]
nb_name = colab_session['name'].replace('.ipynb','')
nb_id = colab_session['notebook']['path'].replace('fileId=','')

print(SAVE_PATH, nb_name, now_utc_timestamp, nb_id)

# create a snapshot of this currently running notebook and save to SAVE_PATH
downloaded_nb = gdrive.CreateFile({'id':nb_id})   # replace the id with id of file you want to access
downloaded_nb.GetContentFile(SAVE_PATH/f'{nb_name}_{now_utc_timestamp}.ipynb')

In [None]:
# upload everything to gdrive, into a folder named after the notebook and the UTC timestamp
snapshot_folder = f'{nb_name}_{now_utc_timestamp}'
assets_to_gdrive(folder_name=snapshot_folder)