<a href="https://colab.research.google.com/github/pandemic-tracking/viz-gen/blob/main/gisaid_submission_stats_world_JAG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np

from datetime import datetime, timedelta
import pytz

import altair as alt
from altair import datum
alt.data_transformers.disable_max_rows()

from pathlib import Path

pd.set_option("display.precision", 4)

# Read the clock exactly once as a timezone-aware UTC instant and derive every label
# from it, so the Eastern and UTC stamps always describe the same moment. This also
# avoids `datetime.utcnow()`, which returns a naive datetime and is deprecated.
now_utc = datetime.now(pytz.utc)
now_est = now_utc.astimezone(pytz.timezone("US/Eastern"))

now_est_time = now_est.strftime("%Y-%m-%d, %H:%M:%S ET")  # human-readable label
now_est_date = now_est.strftime("%Y-%m-%d")
now_est_timestamp = now_est.strftime("%Y%m%d_%H%M%S")  # filename-safe
now_utc_timestamp = now_utc.strftime("%Y%m%d_%H%M%S")  # filename-safe, UTC
print(now_est_time, now_est_date, now_est_timestamp, now_utc_timestamp)

In [None]:
# Load the `google.colab.data_table` IPython extension (this module only exists in a Colab runtime).
%load_ext google.colab.data_table

In [None]:
# adapting from https://towardsdatascience.com/consistently-beautiful-visualizations-with-altair-themes-c7f9f889602

def ptc_theme():
    """Build the Altair/Vega-Lite theme configuration for this notebook's charts.

    Returns:
        dict: A ``{"config": {...}}`` mapping with settings for the chart title,
        text marks, headers, the x and y axes, the legend and the view: Arial
        fonts, grey axis labels, light-grey y grid lines, no x grid, and a
        transparent view border.
    """
    axisColor = "#808080"
    gridColor = "#DEDDDD"
    font = 'Arial'
    labelFont = 'Arial'

    # Palettes kept for reference only; they are not applied (no "range" entry below).
    # main_palette = ["#1696d2", "#d2d2d2", "#000000", "#fdbf11",
    #                 "#ec008b", "#55b748", "#5c5859", "#db2b27"]
    # sequential_palette = ["#cfe8f3", "#a2d4ec", "#73bfe2",
    #                       "#46abdb", "#1696d2", "#12719e"]

    return {
        "config": {
            "title": {
                "fontSize": 16,
                "font": font,
                "anchor": "start",  # equivalent of left-aligned.
                # Vega-Lite's title config names this property `color`; the previous
                # `fontColor` key is not part of that config, so it had no effect.
                "color": "#000000",
            },
            "text": {
                "font": font,
                "labelFont": labelFont,
            },
            "header": {
                "font": font,
                "labelFont": labelFont,
                "titleFont": font,
            },
            "axisX": {
                "domain": False,
                "domainColor": axisColor,
                "labelColor": axisColor,
                "domainWidth": 1,
                "grid": False,
                "labelFont": labelFont,
                "labelFontSize": 12,
                "labelAngle": 0,
                "tickColor": axisColor,
                "tickSize": 5,  # default, including it just to show you can change it
                "titleFont": font,
                "titleFontSize": 12,
                "titlePadding": 10,  # guessing, not specified in styleguide
                "title": "X Axis Title (units)",
            },
            "axisY": {
                "domain": False,
                "grid": True,
                "gridColor": gridColor,
                "gridWidth": 1,
                "labelFont": labelFont,
                "labelColor": axisColor,
                "labelFontSize": 12,
                "labelAngle": 0,
                "labelAnchor": "end",
                "labelAlign": "right",
                "ticks": False,  # even if you don't have a "domain" you need to turn these off.
                "titleFont": font,
                "titleFontSize": 12,
                "titlePadding": 10,  # guessing, not specified in styleguide
                "title": "Y Axis Title (units)",
                # titles are by default vertical left of axis so we need to hack this
                "titleAngle": 0,  # horizontal
                "titleY": -10,  # move it up
                "titleX": 18,  # move it to the right so it aligns with the labels
            },
            # "range": {
            #     "category": main_palette,
            #     "diverging": sequential_palette,
            # },
            "legend": {
                "labelFont": labelFont,
                "labelFontSize": 12,
                "symbolType": "circle",  # just 'cause
                "symbolSize": 100,  # default
                "titleFont": font,
                "titleFontSize": 12,
                "title": "",  # set it to no-title by default
                "orient": "right",  # so it's right next to the y-axis
                "offset": 0,  # literally right next to the y-axis.
            },
            "view": {
                # altair uses gridlines to box the area where the data is visualized. This takes that off.
                "stroke": "transparent",
            },
        }
    }

# Register the theme function under a name, then enable that name so it becomes the active Altair theme.
alt.themes.register("my_custom_theme", ptc_theme)
alt.themes.enable("my_custom_theme")

## OWID Vaccination and World Bank Income Classification Data

In [None]:
# World Bank country classification workbook, source:
# https://datahelpdesk.worldbank.org/knowledgebase/articles/906519-world-bank-country-and-lending-groups
# NOTE(review): the file must already exist at this absolute path (presumably uploaded to the Colab runtime) — confirm.
wb_df = pd.read_excel('/content/CLASS.xlsx')

In [None]:
# World Bank economy name -> OWID country name, for the names that differ
# between the two sources.
owid_wb_mismatches = {
    'Bahamas, The': 'Bahamas',
    'Brunei Darussalam': 'Brunei',
    'Cabo Verde': 'Cape Verde',
    'Congo, Dem. Rep.': 'Democratic Republic of Congo',
    'Congo, Rep.': 'Congo',
    'Curaçao': 'Curacao',
    'Czech Republic': 'Czechia',
    "Côte d'Ivoire": "Cote d'Ivoire",
    'Egypt, Arab Rep.': 'Egypt',
    'Faroe Islands': 'Faeroe Islands',
    'Gambia, The': 'Gambia',
    'Hong Kong SAR, China': 'Hong Kong',
    'Iran, Islamic Rep.': 'Iran',
    'Korea, Rep.': 'South Korea',
    'Kyrgyz Republic': 'Kyrgyzstan',
    'Lao PDR': 'Laos',
    'Macao SAR, China': 'Macao',
    'Russian Federation': 'Russia',
    'Slovak Republic': 'Slovakia',
    'St. Kitts and Nevis': 'Saint Kitts and Nevis',
    'St. Lucia': 'Saint Lucia',
    'St. Vincent and the Grenadines': 'Saint Vincent and the Grenadines',
    'Syrian Arab Republic': 'Syria',
    'São Tomé and Principe': 'Sao Tome and Principe',
    'Taiwan, China': 'Taiwan',
    'Timor-Leste': 'Timor',
    'Venezuela, RB': 'Venezuela',
    'West Bank and Gaza': 'Palestine',
    'Yemen, Rep.': 'Yemen',
}

# The replacement is applied to every cell of the frame that exactly equals a key.
wb_owid_df = wb_df.replace(to_replace=owid_wb_mismatches)

## GISAID

In [None]:
# Source: https://www.gisaid.org/index.php?id=208 — the table on that page was copied by hand
# into a workbook and uploaded, so this file is a manual snapshot, not a live download.
pre_gisaid_df = pd.read_excel('/content/gisaid_country_180day_20211216-12EST.xlsx')

In [None]:
# Shorten the GISAID country column name to `Country`, the name used by all later cells.
pre_gisaid_df = pre_gisaid_df.rename(columns={'Country / territory': 'Country'})

### cleanup

In [None]:
# GISAID country name -> OWID country name, for the names that differ
# between the two sources.
owid_gisaid_mismatches = {
    'Cabo Verde': 'Cape Verde',
    'Curaçao': 'Curacao',
    'Czech Republic': 'Czechia',
    "Côte d'Ivoire": "Cote d'Ivoire",
    'Democratic Republic of the Congo': 'Democratic Republic of Congo',
    'Falkland Islands (Malvinas)': 'Falkland Islands',
    'Faroe Islands': 'Faeroe Islands',
    'Holy See': 'Vatican',
    'Micronesia': 'Micronesia (country)',
    'Pitcairn Islands': 'Pitcairn',
    'Republic of the Congo': 'Congo',
    'Réunion': 'Reunion',
    'Saint Martin': 'Saint Martin (French part)',
    'Sint Maarten': 'Sint Maarten (Dutch part)',
    'The Bahamas': 'Bahamas',
    'Timor-Leste': 'Timor',
    'United States of America': 'United States',
    'U.S. Virgin Islands': 'United States Virgin Islands',
    'Wallis and Futuna Islands': 'Wallis and Futuna',
    'West Bank and Gaza': 'Palestine',
    'eSwatini': 'Eswatini',
}

# Drop the 'Slovak Republic' row(s) outright instead of renaming them.
# NOTE(review): presumably a duplicate of a 'Slovakia' row — confirm against the workbook.
pre_gisaid_df = pre_gisaid_df[pre_gisaid_df['Country'] != 'Slovak Republic']

# Apply the renames to every cell of the frame that exactly equals a key.
pre_gisaid_df = pre_gisaid_df.replace(to_replace=owid_gisaid_mismatches)

There are both a "US Virgin Islands" and a "United States Virgin Islands" entry that should be collapsed into one (they have complementary data); the same goes for the Bahamas.

In [None]:
# Rows per country name after renaming; a count above 1 marks a country that still has several rows.
pre_gisaid_df.Country.value_counts()

In [None]:
# Collapse all rows that share a country name into one row by summing each column,
# then move `Country` from the index back into an ordinary column.
# NOTE(review): with pandas defaults, a column that is missing in every row of a
# group sums to 0 rather than staying missing — confirm that is acceptable here.
gisaid_grouped_df = pre_gisaid_df.groupby(by='Country').sum().reset_index()
gisaid_grouped_df

Notes on GISAID data


*   Many country names are changed to match OWID nomenclature.



In [None]:
# Continue on an independent copy so the grouped frame itself is left untouched.
gisaid_df = gisaid_grouped_df.copy()

In [None]:
# checking to make sure there aren't any duplicates: after the groupby, every country should be counted exactly once
gisaid_df.Country.value_counts()

### GISAID x WB

In [None]:
# Attach the World Bank columns to each GISAID country; a left join keeps
# countries that have no World Bank match.
owid_gisaid_wb_df = pd.merge(
    gisaid_df,
    wb_owid_df,
    how='left',
    left_on='Country',
    right_on='Economy',
)

# Give rows without an income group an explicit label.
owid_gisaid_wb_df['Income group'] = owid_gisaid_wb_df['Income group'].fillna('No income data')

# these fell out - may be things to tackle in the renaming
# venezuela has no income group
owid_gisaid_wb_df.loc[owid_gisaid_wb_df['Economy'].isna(), 'Country'].unique()

## Charts

### country sequencing (thanks, Dave!)

In [None]:
# As-of date appended to the overall chart title.
date = '12/16/2021'
# Only countries with more than this many shared sequences are charted.
seq_thresh = 2000

In [None]:
# Chart-specific working copy; the filters below do not touch the merged frame.
df = owid_gisaid_wb_df.copy()

In [None]:
# Inspect the available column names before filtering and charting.
df.columns

In [None]:
# Country names removed from the chart data in the next cell.
exclude = ['Puerto Rico', 'Reunion']

In [None]:
# Keep only the rows whose country is not on the exclusion list.
df = df.loc[~df['Country'].isin(exclude)]

In [None]:
# Keep only countries that shared strictly more sequences than the threshold.
df = df.loc[df['Sequences shared'].gt(seq_thresh)]

In [None]:
# Number of rows left after the exclusion list and the sequence-count threshold.
len(df)

In [None]:
# Give the percentage column an identifier-style name for use in the chart encodings.
df = df.rename(
    columns={'% of cases sequenced and shared': 'percent_cases_sequenced_and_shared'}
)

In [None]:
# Country names ordered from highest to lowest share of cases sequenced; used as
# the row order of the charts.
sort_countries = (
    df.sort_values('percent_cases_sequenced_and_shared', ascending=False)['Country']
    .tolist()
)

In [None]:
# Income-group labels and the colours for them, paired position by position;
# both lists are passed to the colour scale of the charts.
domain = ['High income', 'Upper middle income', 'Lower middle income', 'Low income', 'No income data']
range_ = ['#e66101','#fdb863', '#9D91D2', '#5e3c99','darkgrey']

In [None]:
# Shared base: one bar mark per country row, ordered by `sort_countries`, coloured by
# income group with the explicit domain/range scale; the y axis itself is hidden.
base = alt.Chart(df).mark_bar(orient='vertical').encode(
    y = alt.Y('Country', sort=sort_countries, axis=None),
    color=alt.Color('Income group', scale=alt.Scale(domain=domain, range=range_)),
)

# Same encodings drawn as text marks in a fixed near-black colour; used for the value labels.
base_text = base.mark_text(dx=5, dy=1, size=14, align='left', baseline='middle').encode(color = alt.value('#111'))

# Panel 1: share of cases sequenced and shared — bars plus a one-decimal value label.
gisaid1 = alt.layer(
    base.encode(
      x = alt.X('percent_cases_sequenced_and_shared', axis=None)
    )
    +
    base_text.encode(
      x = alt.X('percent_cases_sequenced_and_shared'),
      text = alt.Text('percent_cases_sequenced_and_shared', format=',.1f'),
    )
).properties(title='% of Cases Sequenced and Shared', height=alt.Step(11), width=300)

# Panel 2: number of sequences shared. The x encoding uses descending sort order and the
# labels are right-aligned with a negative dx.
# NOTE(review): presumably this mirrors the bars so they grow leftwards — confirm in the rendered output.
gisaid2 = alt.layer(
    base.encode(
      x = alt.X('Sequences shared', axis=None, sort=alt.SortOrder('descending')),
      color=alt.Color('Income group', scale=alt.Scale(domain=domain, range=range_)),
    )
    +
    base_text.mark_text(dx=-5, dy=1, size=12, align='right').encode(
      x = 'Sequences shared',
      text = alt.Text('Sequences shared', format=',.0f'), 
    )
).properties(title='Sequences shared', height=alt.Step(11), width=300)

# Panel 3: median days to deposition — bars plus a whole-number value label.
gisaid3 = alt.layer(
    base.encode(
      x = alt.X('Median days to deposition', axis=None),
    )
    +
    base_text.encode(
      x = 'Median days to deposition',
      text = alt.Text('Median days to deposition', format='.0f'),
    )
).properties(title='Median days to deposition', height=alt.Step(11), width=160)

# Narrow column of country names (text marks only), placed between the panels.
middle = base.encode(
    text=alt.Text('Country:O'),
).mark_text(size=14).properties(height=alt.Step(11), width=10)


In [None]:
# Lay the panels out side by side with a country-name column between them, add the
# overall title, and switch off grid, domain line and ticks on every y axis.
viz = (
    alt.vconcat(gisaid2 | middle | gisaid1 | middle | gisaid3)
    .properties(title='Sequences Submitted from Countries to GISAID in Last 180 Days as of ' + date)
    .configure_axisY(grid=False, domain=False, ticks=False)
)
viz

# Save to Drive

In [None]:
# this is for saving altair charts to png and svg, based on https://colab.research.google.com/github/altair-viz/altair_saver/blob/master/AltairSaver.ipynb#scrollTo=ZiTDBCAM_Ni8
!pip install -q altair_saver
!npm install --silent vega-lite vega-cli canvas

In [None]:
from pathlib import Path
from altair_saver import save

# Local output directory (relative to the working directory); created on first run
# and reused if it already exists.
SAVE_PATH = Path('assets')
SAVE_PATH.mkdir(exist_ok=True)

# Import PyDrive and associated libraries.
# This only needs to be done once per notebook.
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate the Colab user, then create the PyDrive client from the
# application-default credentials. `gdrive` is the client used by the cells below.
# This only needs to be done once per notebook.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
gdrive = GoogleDrive(gauth)

In [None]:
def assets_to_gdrive(folder_name, localdir_path = SAVE_PATH, parentdir_id='17Kx2uZbQv1r5U1M9x_OXS4lpMU5c6Ym8'):
  """Upload every file in a local directory to a named Google Drive folder.

  Looks for a non-trashed folder titled `folder_name` directly under
  `parentdir_id` and uses the first match; if there is none, the folder is
  created. Every regular file in `localdir_path` is then uploaded into it.
  Uses the module-level `gdrive` client.

  Args:
    folder_name: Title of the destination Drive folder.
    localdir_path: Local directory (str or pathlib.Path) whose files are uploaded.
    parentdir_id: Drive id of the folder that contains (or will contain) the
      destination folder.

  Returns:
    The Drive id of the destination folder.
  """
  localdir_path = Path(localdir_path)

  # search gdrive for snapshot folder and save assets there if it already exists.
  folder_id = ''
  file_list = gdrive.ListFile({'q': f"'{parentdir_id}' in parents and mimeType = 'application/vnd.google-apps.folder' and trashed=false"}).GetList()
  for candidate in file_list:
    if candidate['title'] == folder_name:
      folder_id = candidate['id']
      print(f'Found pre-existing gdrive folder named "{folder_name}" at',folder_id)
      break  # stop at the first match so one folder is chosen and reported once

  # if not, create new folder
  if folder_id == '':
    folder = gdrive.CreateFile(metadata={'title': folder_name,
                                         'parents': [{'id': parentdir_id}],
                                         "mimeType": "application/vnd.google-apps.folder"
                                         })
    folder.Upload()
    folder_id = folder.get('id')
    print(f'Created new gdrive folder named "{folder_name}" at',folder_id)

  # upload all files within localdir_path to snapshot folder
  for asset_file in localdir_path.iterdir():
    if not asset_file.is_file():
      continue  # sub-directories cannot be uploaded as file content; skip them
    upload = gdrive.CreateFile(metadata={'title': asset_file.name,
                                         'parents': [{'id': folder_id}],
                                         })
    # iterdir() already yields the full path; pass it as a plain string.
    upload.SetContentFile(str(asset_file))
    upload.Upload()
    print('Saved file: ',asset_file.name)

  return folder_id


In [None]:
def save_vizassets(chart, save_path, filename, fmts=('html','json','png','svg','pdf')):
  """Write `chart` to `save_path` once per requested format.

  Args:
    chart: Chart object handed to the `save` function imported from altair_saver.
    save_path: Destination directory (str or Path).
    filename: Output file name without extension.
    fmts: Iterable of file extensions; one file `<save_path>/<filename>.<fmt>`
      is written per entry. The default is an immutable tuple so it cannot be
      modified between calls.
  """
  for fmt in fmts:
    save(chart, f'{save_path}/{filename}.{fmt}')

In [None]:
# put your stuff (i.e. dataframes, altair charts, input data files) to save here

In [None]:
# Write the World Bank table, the renamed GISAID table and the charted table to
# SAVE_PATH as CSV, then render the chart in every default format.
for _csv_name, _frame in (
    ('world_bank_df.csv', wb_df),
    ('gisaid_country_180day_20211216-12EST.csv', pre_gisaid_df),
    ('df.csv', df),
):
    _frame.to_csv(SAVE_PATH/_csv_name)

save_vizassets(viz, SAVE_PATH, f'gisaid_world_stats_{now_utc_timestamp}')

In [None]:
# get the colab filename
from requests import get

# Query the sessions endpoint once and read both fields from the same response, so
# the notebook name and id always describe the same session. Fail loudly on an HTTP
# error instead of trying to parse an error body.
# NOTE(review): the address is hard-coded; presumably the Colab-internal Jupyter server — confirm.
_sessions_response = get('http://172.28.0.2:9000/api/sessions')
_sessions_response.raise_for_status()
_session = _sessions_response.json()[0]
nb_name = _session['name'].replace('.ipynb','')
nb_id = _session['notebook']['path'].replace('fileId=','')

print(SAVE_PATH, nb_name, now_utc_timestamp, nb_id)

# create a snapshot of this currently running notebook and save to SAVE_PATH
downloaded_nb = gdrive.CreateFile({'id':nb_id})   # replace the id with id of file you want to access
downloaded_nb.GetContentFile(SAVE_PATH/f'{nb_name}_{now_utc_timestamp}.ipynb')

In [None]:
# upload everything to gdrive, into a folder named after this notebook and the UTC run timestamp
assets_to_gdrive(folder_name=f'{nb_name}_{now_utc_timestamp}')