<a href="https://colab.research.google.com/github/pandemic-tracking/viz-gen/blob/main/world_first_dose_vax_by_income.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np

from datetime import datetime, timedelta
import pytz

import altair as alt
from altair import datum
# Turn off Altair's max-rows check on chart data for this session.
alt.data_transformers.disable_max_rows()

from pathlib import Path

# Show floats with 4 digits of precision in dataframe output.
pd.set_option("display.precision", 4)

# Read the clock once, as a timezone-aware UTC instant, and derive every
# timestamp string from it. This replaces the naive (and deprecated)
# datetime.utcnow() and keeps the ET and UTC stamps consistent with each other.
now_utc = datetime.now(pytz.utc)
now_est = now_utc.astimezone(pytz.timezone("US/Eastern"))

now_est_time = now_est.strftime("%Y-%m-%d, %H:%M:%S ET")
now_est_date = now_est.strftime("%Y-%m-%d")
now_est_timestamp = now_est.strftime("%Y%m%d_%H%M%S")
# Used further down to name the saved chart files and the Drive snapshot folder.
now_utc_timestamp = now_utc.strftime("%Y%m%d_%H%M%S")
print(now_est_time, now_est_date, now_est_timestamp, now_utc_timestamp)

In [None]:
# adapting from https://towardsdatascience.com/consistently-beautiful-visualizations-with-altair-themes-c7f9f889602

def ptc_theme():
    """Build the chart theme used by this notebook.

    Returns:
        dict: a mapping with a single ``"config"`` key holding the title,
        text, header, x-axis, y-axis, legend and view settings. Fonts are
        Arial throughout; axis labels, ticks and domain lines use a mid grey
        and y gridlines a light grey.
    """
    axis_grey = "#808080"
    grid_grey = "#DEDDDD"
    typeface = 'Arial'
    label_typeface = 'Arial'

    # No colour ranges are configured by the theme. Palettes kept for reference:
    #   main:       #1696d2 #d2d2d2 #000000 #fdbf11 #ec008b #55b748 #5c5859 #db2b27
    #   sequential: #cfe8f3 #a2d4ec #73bfe2 #46abdb #1696d2 #12719e

    title_cfg = dict(
        fontSize=16,
        font=typeface,
        anchor="start",  # equivalent of left-aligned
        fontColor="#000000",
    )
    text_cfg = dict(font=typeface, labelFont=label_typeface)
    header_cfg = dict(font=typeface, labelFont=label_typeface, titleFont=typeface)

    x_axis_cfg = {
        "domain": False,
        "domainColor": axis_grey,
        "labelColor": axis_grey,
        "domainWidth": 1,
        "grid": False,
        "labelFont": label_typeface,
        "labelFontSize": 12,
        "labelAngle": 0,
        "tickColor": axis_grey,
        "tickSize": 5,  # the default; listed to show it can be changed
        "titleFont": typeface,
        "titleFontSize": 12,
        "titlePadding": 10,  # a guess, not specified in the styleguide
        "title": "X Axis Title (units)",
    }

    y_axis_cfg = {
        "domain": False,
        "grid": True,
        "gridColor": grid_grey,
        "gridWidth": 1,
        "labelFont": label_typeface,
        "labelColor": axis_grey,
        "labelFontSize": 12,
        "labelAngle": 0,
        "labelAnchor": "end",
        "labelAlign": "right",
        "ticks": False,  # must be switched off even when there is no domain line
        "titleFont": typeface,
        "titleFontSize": 12,
        "titlePadding": 10,  # a guess, not specified in the styleguide
        "title": "Y Axis Title (units)",
        # Titles sit vertically to the left of the axis by default; the three
        # settings below lay the title flat and nudge it above the labels.
        "titleAngle": 0,
        "titleY": -10,
        "titleX": 18,
    }

    legend_cfg = {
        "labelFont": label_typeface,
        "labelFontSize": 12,
        "symbolType": "circle",
        "symbolSize": 100,  # the default
        "titleFont": typeface,
        "titleFontSize": 12,
        "title": "",  # no legend title unless a chart sets one
        "orient": "right",
        "offset": 0,
    }

    # A transparent stroke removes the box drawn around the data area.
    view_cfg = {"stroke": "transparent"}

    return {
        "config": {
            "title": title_cfg,
            "text": text_cfg,
            "header": header_cfg,
            "axisX": x_axis_cfg,
            "axisY": y_axis_cfg,
            "legend": legend_cfg,
            "view": view_cfg,
        }
    }

# Register the theme function under a name, then enable it by that name so
# charts created afterwards in this notebook pick it up.
alt.themes.register("my_custom_theme", ptc_theme)
alt.themes.enable("my_custom_theme")

## OWID Vaccination and World Bank Income Classification Data

In [None]:
# OWID vaccination data, read from a CSV already present under /content.
# Presumably a dated copy of the upstream file below (the filename carries a date stamp):
# 'https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/vaccinations/vaccinations.csv'
owid_vax_df = pd.read_csv('/content/owid_vax_df_20211216-12EST.csv') 
owid_vax_df

In [None]:
# Population data by country, region, and for the world, was constructed by Our World in Data based on three key sources:
# — HYDE Version 3.2 from the year -10,000 to 1799;
# — Gapminder Version 6 from 1800 to 1949;
# — UN World Population Prospects (2019) estimates from 1950 to 2021;
# — UN World Population Prospects (2019) projections (medium variant scenario) from 2022 to 2100.

# CSV export of the OWID chart linked below; it supplies the 'Entity' and
# 'Share of world population' columns used in the next cells.
# https://ourworldindata.org/grapher/covid-people-vaccinated-marimekko
owid_df = pd.read_csv('/content/covid-people-vaccinated-marimekko-20211216-12EST.csv')
# Show the column names.
owid_df.columns

In [None]:
# Lookup of population share per entity: the distinct (Entity, share) pairs.
# NOTE(review): an Entity with more than one distinct share value (NaN counts
# as a value) keeps several rows here and would duplicate vaccination rows in
# the left merge that follows — TODO confirm against the CSV.
pop_data = owid_df[['Entity','Share of world population']].drop_duplicates()

In [None]:
# Left merge: every vaccination row is kept; locations with no matching
# Entity get NaN in 'Share of world population'.
owid_pop_df = owid_vax_df.merge(pop_data, how='left',left_on='location',right_on='Entity')

In [None]:
# Diagnostic: locations that have no population share after the merge.
owid_pop_df[owid_pop_df['Share of world population'].isna()]['location'].unique()

In [None]:
# World Bank country classification workbook, read from /content. Source page:
# https://datahelpdesk.worldbank.org/knowledgebase/articles/906519-world-bank-country-and-lending-groups
wb_df = pd.read_excel('/content/CLASS.xlsx')

In [None]:
# Maps a World Bank name (key) to the spelling that should appear instead
# (value), so that the 'Economy' column can be joined against OWID's 'location'.
owid_wb_mismatches = {'Bahamas, The': 'Bahamas',
 'Brunei Darussalam': 'Brunei',
 'Cabo Verde': 'Cape Verde',
 'Congo, Dem. Rep.': 'Democratic Republic of Congo',
 'Congo, Rep.': 'Congo',
 'Curaçao': 'Curacao',
 'Czech Republic': 'Czechia',
 "Côte d'Ivoire": "Cote d'Ivoire",
 'Egypt, Arab Rep.': 'Egypt',
 'Faroe Islands': 'Faeroe Islands',
 'Gambia, The': 'Gambia',
 'Hong Kong SAR, China': 'Hong Kong',
 'Iran, Islamic Rep.': 'Iran',
 'Korea, Rep.': 'South Korea',
 'Kyrgyz Republic': 'Kyrgyzstan',
 'Lao PDR': 'Laos',
 'Macao SAR, China': 'Macao',
 'Russian Federation': 'Russia',
 'Slovak Republic': 'Slovakia',
 'St. Kitts and Nevis': 'Saint Kitts and Nevis',
 'St. Lucia': 'Saint Lucia',
 'St. Vincent and the Grenadines': 'Saint Vincent and the Grenadines',
 'Syrian Arab Republic': 'Syria',
 'São Tomé and Principe': 'Sao Tome and Principe',
 'Taiwan, China': 'Taiwan',
 'Timor-Leste': 'Timor',
 'Venezuela, RB': 'Venezuela',
 'West Bank and Gaza': 'Palestine',
 'Yemen, Rep.': 'Yemen'}
# replace() is applied to the whole frame, so any cell in any column that
# exactly equals a key is rewritten, not only the 'Economy' column.
wb_owid_df = wb_df.replace(owid_wb_mismatches)

In [None]:
# Attach the World Bank columns to each vaccination row, matching OWID
# 'location' to the renamed 'Economy'; unmatched locations get NaN.
owid_income_df = owid_pop_df.merge(wb_owid_df,how='left',left_on='location',right_on='Economy')
# these fell out - may be things to tackle in the renaming
# venezuela has no income group
# Diagnostic: locations with no matching World Bank economy.
owid_income_df[owid_income_df['Economy'].isna()]['location'].unique()

In [None]:
# Sanity check: row count plus the earliest and latest dates in the merged data.
merged_dates = pd.to_datetime(owid_income_df['date'])
len(owid_income_df), merged_dates.min(), merged_dates.max()

In [None]:
# Order chronologically within each location so "last" means "most recent".
df = owid_income_df.sort_values(by=['location','date'])
# Keep only rows where both the first-dose and the fully-vaccinated rates are present.
df = df.dropna(subset=['people_vaccinated_per_hundred', 'people_fully_vaccinated_per_hundred'])
# One row per location: the most recent one that survived the filter above.
df = df.drop_duplicates(subset='location',keep='last')
df

In [None]:
# Drop continent-level and world aggregate rows so only the remaining locations are charted.
continents = ['Africa','Antarctica','Asia','Europe','North America', 'Oceania', 'South America', 'World']
is_aggregate = df['location'].isin(continents)
df = df[~is_aggregate]

In [None]:
# Summarize how recent the retained per-location rows are.
vax_dates = pd.to_datetime(df.date)
try:
    # pandas 1.x needs the flag to summarize datetimes numerically.
    date_summary = vax_dates.describe(datetime_is_numeric=True)
except TypeError:
    # pandas 2.0 removed the keyword; a plain describe() is used instead.
    date_summary = vax_dates.describe()
date_summary

In [None]:
# Keep only locations that have both a population share and a first-dose rate.
df = df.dropna(subset=['Share of world population', 'people_vaccinated_per_hundred'])

In [None]:
# Diagnostic: locations with no income group, largest population share first.
df[df['Income group'].isna()].sort_values('Share of world population', ascending=False)

## Charts

### OWID x WB chart with text labels

In [None]:
# Order locations from lowest to highest first-dose rate; the cumulative bar
# edges computed further down follow this row order.
df = df.sort_values(by='people_vaccinated_per_hundred', ascending=True)

In [None]:
# Give the column an identifier-style name so the chart filter can reference
# it as alt.datum.share_of_world_population.
df = df.rename(columns={'Share of world population':'share_of_world_population'})

In [None]:
# Vertical extent of each location's bar: y1 is the running total of
# population share, and y0 is the previous row's y1 (0 for the first row).
df = df.assign(
    y1=lambda frame: frame['share_of_world_population'].cumsum(),
    y0=lambda frame: frame['y1'].shift(fill_value=0),
)

In [None]:
# Label locations with no income classification so they still get a legend entry.
df['Income group'] = df['Income group'].fillna('No income data')

In [None]:
# Inspect the distinct income-group labels present after filling the gaps.
df['Income group'].unique()

In [None]:
# Color scale for the chart: each income-group label in `domain` is paired,
# by position, with the color at the same index in `range_`.
domain = ['High income', 'Upper middle income', 'Lower middle income', 'Low income', 'No income data']
range_ = ['#e66101','#fdb863', '#9D91D2', '#5e3c99','darkgrey']

In [None]:
# Minimum share_of_world_population a location needs for its name to be
# drawn as a text label on the chart (compared with >= in the chart filter).
display_threshold_ = 1.5

In [None]:
# Cap the y axis at the total share of all locations shown.
y_height = df['share_of_world_population'].sum()

# Encodings shared by the bar layer and the label layer.
x_first_dose = alt.X(
    'people_vaccinated_per_hundred:Q',
    title='% of Population with at Least One Dose',
    scale=alt.Scale(domain=(0,100),clamp=True),
)
income_color = alt.Color('Income group', scale=alt.Scale(domain=domain, range=range_))
hover_fields = [
    "location",
    "date",
    alt.Tooltip("people_vaccinated_per_hundred",title='% with at least one dose'),
    alt.Tooltip('share_of_world_population',title='% share of world population'),
    "Income group",
]
y_title = 'Share of World Population in 2021'
y_scale = alt.Scale(domain=(0,y_height))

# Bars for every location; explicit tick values end at the exact total.
vax_total = alt.Chart(df).encode(
    x=x_first_dose,
    y=alt.Y('y0:Q', title=y_title, scale=y_scale, axis=alt.Axis(values=[0,20,40,60,80,y_height])),
    y2='y1',
    color=income_color,
    tooltip=hover_fields,
)

# Same encodings, restricted to locations large enough to label (plus South Africa).
is_labelled = (alt.datum.share_of_world_population >=display_threshold_) | (alt.datum.location == 'South Africa')
vax_total_filtered = alt.Chart(df).encode(
    x=x_first_dose,
    y=alt.Y('y0:Q', title=y_title, scale=y_scale),
    y2='y1',
    color=income_color,
    tooltip=hover_fields,
).transform_filter(is_labelled)

bar_layer = vax_total.mark_rect(stroke='black', strokeWidth=0.15)
# Size and title are set on the label layer before the two layers are combined.
label_layer = (
    vax_total_filtered
    .mark_text(dx=2, dy=5, size=15, align='left', baseline='bottom')
    .encode(text=alt.Text('location'))
    .properties(
        width=1000,
        height=550,
        title='Population With First Dose by Country and Income Group',
    )
)

viz = (
    (bar_layer + label_layer)
    .configure_axisX(domain=True,grid=True,tickCount=10,titleColor='grey')
    .configure_axisY(titleColor='grey')
)
viz

## Save to Drive

In [None]:
# Install what is needed to save Altair charts to png and svg: the altair_saver
# Python package plus the vega-lite, vega-cli and canvas npm packages. Based on
# https://colab.research.google.com/github/altair-viz/altair_saver/blob/master/AltairSaver.ipynb#scrollTo=ZiTDBCAM_Ni8
!pip install -q altair_saver
!npm install --silent vega-lite vega-cli canvas

In [None]:
from pathlib import Path
from altair_saver import save

# Local folder (relative to the working directory) that collects everything
# to be uploaded; created if missing, left alone if it already exists.
SAVE_PATH = Path('assets')
SAVE_PATH.mkdir(exist_ok=True)

# Import PyDrive and associated libraries.
# This only needs to be done once per notebook.
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate and create the PyDrive client.
# This only needs to be done once per notebook.
# Colab authenticates the user first; the resulting application-default
# credentials are then handed to PyDrive to build the `gdrive` client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
gdrive = GoogleDrive(gauth)

In [None]:
def assets_to_gdrive(folder_name, localdir_path = SAVE_PATH, parentdir_id='17Kx2uZbQv1r5U1M9x_OXS4lpMU5c6Ym8'):
    """Upload every regular file in a local folder to a Google Drive folder.

    Looks for a non-trashed Drive folder titled ``folder_name`` directly under
    ``parentdir_id``. If one exists it is reused (when several share the title,
    the last one listed is used); otherwise a new folder is created. Each file
    in ``localdir_path`` is then uploaded into it under its own filename.

    Args:
        folder_name: title of the destination Drive folder.
        localdir_path: local directory whose files are uploaded (str or Path).
        parentdir_id: Drive id of the folder that contains the destination.

    Uses the module-level ``gdrive`` client and prints one line per action.
    """
    folder_query = (
        f"'{parentdir_id}' in parents "
        "and mimeType = 'application/vnd.google-apps.folder' and trashed=false"
    )

    # Search the parent for an existing folder with the requested title.
    folder_id = ''
    for candidate in gdrive.ListFile({'q': folder_query}).GetList():
        if candidate['title'] == folder_name:
            folder_id = candidate['id']
            print(f'Found pre-existing gdrive folder named "{folder_name}" at',folder_id)

    # Nothing found: create the folder under the parent.
    if folder_id == '':
        folder = gdrive.CreateFile(metadata={'title': folder_name,
                                             'parents':[{'id': parentdir_id}],
                                             "mimeType": "application/vnd.google-apps.folder"
                                             })
        folder.Upload()
        folder_id = folder.get('id')
        print(f'Created new gdrive folder named "{folder_name}" at',folder_id)

    # Upload the directory's files. Sub-directories are skipped because they
    # cannot be opened as file content and would abort the whole upload.
    for asset_file in sorted(Path(localdir_path).iterdir()):
        if not asset_file.is_file():
            print('Skipped non-file: ',asset_file.name)
            continue
        upload = gdrive.CreateFile(metadata={'title':asset_file.name,
                                             'parents':[{'id': folder_id}],
                                             })
        # Hand PyDrive a plain string path rather than a Path object.
        upload.SetContentFile(str(asset_file))
        upload.Upload()
        print('Saved file: ',asset_file.name)


In [None]:
def save_vizassets(chart, save_path, filename, fmts=('html','json','png','svg','pdf')):
    """Save one chart in several formats under a common base filename.

    Args:
        chart: the chart object handed to altair_saver's ``save``.
        save_path: destination directory.
        filename: base filename without an extension.
        fmts: iterable of extensions; one file ``<save_path>/<filename>.<fmt>``
            is written per entry. The default is an immutable tuple so it
            cannot be altered between calls.
    """
    for fmt in fmts:
        save(chart, f'{save_path}/{filename}.{fmt}')

In [None]:
# Save the outputs you want to keep (dataframes, Altair charts, input data files) in the cells below.

In [None]:
# Write the raw inputs and the final chart dataframe into SAVE_PATH as CSV.
csv_exports = (
    ('world_bank_df.csv', wb_df),
    ('owid_vax_df_20211216-12EST.csv', owid_vax_df),
    ('covid-people-vaccinated-marimekko-20211216-12EST.csv', owid_df),
    ('df.csv', df),
)
for csv_name, frame in csv_exports:
    frame.to_csv(SAVE_PATH/csv_name)

# Render the chart in every default format, stamped with the run's UTC timestamp.
save_vizassets(viz, SAVE_PATH, f'pop_with_first_dose_by_country_and_income_{now_utc_timestamp}')

In [None]:
# get the colab filename
from requests import get

# Ask the sessions endpoint once and read both fields from the same response,
# so the notebook name and id cannot come from two different answers.
# The first listed session is taken to be this notebook.
sessions_response = get('http://172.28.0.2:9000/api/sessions')
sessions_response.raise_for_status()  # fail clearly on an HTTP error
colab_session = sessions_response.json()[0]
nb_name = colab_session['name'].replace('.ipynb','')
nb_id = colab_session['notebook']['path'].replace('fileId=','')

print(SAVE_PATH, nb_name, now_utc_timestamp, nb_id)

# create a snapshot of this currently running notebook and save to SAVE_PATH
downloaded_nb = gdrive.CreateFile({'id':nb_id})   # Drive file handle for this notebook's id
downloaded_nb.GetContentFile(str(SAVE_PATH/f'{nb_name}_{now_utc_timestamp}.ipynb'))

In [None]:
# upload everything to gdrive
# The destination folder is named after the notebook plus the run's UTC
# timestamp; the local directory and parent folder use the function defaults.
assets_to_gdrive(folder_name=f'{nb_name}_{now_utc_timestamp}')