<a href="https://colab.research.google.com/github/pandemic-tracking/viz-gen/blob/main/country_vaccinations_income_level.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import altair as alt

In [None]:
# Colab-specific IPython extension (google.colab.data_table); this line only works inside a Colab runtime.
%load_ext google.colab.data_table

## OWID Vaccination and World Bank Income Classification Data

In [None]:
# OWID export, downloaded from:
# https://ourworldindata.org/grapher/covid-people-vaccinated-marimekko
# The CSV is read from the Colab /content directory, so it must be uploaded there first.
owid_df = pd.read_csv(filepath_or_buffer='/content/covid-people-vaccinated-marimekko.csv')

In [None]:
# World Bank country and lending groups workbook, from:
# https://datahelpdesk.worldbank.org/knowledgebase/articles/906519-world-bank-country-and-lending-groups
# The workbook is read from the Colab /content directory, so it must be uploaded there first.
wb_df = pd.read_excel(io='/content/CLASS.xlsx')

In [None]:
# World Bank economy name -> OWID entity name, for every economy whose spelling differs
# between the two sources.
owid_wb_mismatches = {
    'Bahamas, The': 'Bahamas',
    'Brunei Darussalam': 'Brunei',
    'Cabo Verde': 'Cape Verde',
    'Congo, Dem. Rep.': 'Democratic Republic of Congo',
    'Congo, Rep.': 'Congo',
    'Curaçao': 'Curacao',
    'Czech Republic': 'Czechia',
    "Côte d'Ivoire": "Cote d'Ivoire",
    'Egypt, Arab Rep.': 'Egypt',
    'Faroe Islands': 'Faeroe Islands',
    'Gambia, The': 'Gambia',
    'Hong Kong SAR, China': 'Hong Kong',
    'Iran, Islamic Rep.': 'Iran',
    'Korea, Rep.': 'South Korea',
    'Kyrgyz Republic': 'Kyrgyzstan',
    'Lao PDR': 'Laos',
    'Macao SAR, China': 'Macao',
    'Russian Federation': 'Russia',
    'Slovak Republic': 'Slovakia',
    'St. Kitts and Nevis': 'Saint Kitts and Nevis',
    'St. Lucia': 'Saint Lucia',
    'St. Vincent and the Grenadines': 'Saint Vincent and the Grenadines',
    'Syrian Arab Republic': 'Syria',
    'São Tomé and Principe': 'Sao Tome and Principe',
    'Taiwan, China': 'Taiwan',
    'Timor-Leste': 'Timor',
    'Venezuela, RB': 'Venezuela',
    'West Bank and Gaza': 'Palestine',
    'Yemen, Rep.': 'Yemen',
}

# Copy of the World Bank table in which any cell equal to one of the keys above is
# replaced by the corresponding OWID spelling.
wb_owid_df = wb_df.replace(to_replace=owid_wb_mismatches)

## GISAID

In [None]:
# GISAID submission-tracker figures from https://www.gisaid.org/index.php?id=208
# (the table on that page was copied by hand into a workbook and uploaded to /content).
gisaid_df = pd.read_excel(io='/content/GISAID_cases_sequenced_by_country_20211015.xlsx')

notes on gisaid data


*   there's both a US Virgin Islands and a United States Virgin Islands that should be collapsed (they have complementary data)
*   changing many names to reflect OWID nomenclature



In [None]:
# GISAID country name -> OWID entity name, for every country whose spelling differs
# between the two sources.
owid_gisaid_mismatches = {
    'Cabo Verde': 'Cape Verde',
    'Curaçao': 'Curacao',
    'Czech Republic': 'Czechia',
    "Côte d'Ivoire": "Cote d'Ivoire",
    'Democratic Republic of the Congo': 'Democratic Republic of Congo',
    'Falkland Islands (Malvinas)': 'Falkland Islands',
    'Faroe Islands': 'Faeroe Islands',
    'Holy See': 'Vatican',
    'Micronesia': 'Micronesia (country)',
    'Pitcairn Islands': 'Pitcairn',
    'Republic of the Congo': 'Congo',
    'Réunion': 'Reunion',
    'Saint Martin': 'Saint Martin (French part)',
    'Sint Maarten': 'Sint Maarten (Dutch part)',
    'Slovak Republic': 'Slovakia',
    'The Bahamas': 'Bahamas',
    'Timor-Leste': 'Timor',
    'United States of America': 'United States',
    'Wallis and Futuna Islands': 'Wallis and Futuna',
    'West Bank and Gaza': 'Palestine',
    'eSwatini': 'Eswatini',
}

# Copy of the GISAID table in which any cell equal to one of the keys above is
# replaced by the corresponding OWID spelling.
gisaid_owid_df = gisaid_df.replace(to_replace=owid_gisaid_mismatches)

In [None]:
# Outer join keeps rows from both sources, so names that exist on only one side
# show up with NaN in the other side's key column.
owid_gisaid_df = pd.merge(
    owid_df,
    gisaid_owid_df,
    how='outer',
    left_on='Entity',
    right_on='Country',
)

# Countries that are in GISAID but have no OWID entity.
gisaid_only = owid_gisaid_df['Entity'].isna()
owid_gisaid_df.loc[gisaid_only, 'Country'].unique()

# in gisaid but not in owid
# bonaire_df = gisaid_df[gisaid_df['Country'].isin(['Bonaire', 'Saba', 'Sint Eustatius'])]
# bonaire_df
# gisaid treats as 3 separate places what OWID treats as Bonaire Sint Eustatius and Saba

In [None]:
# in owid but not in gisaid (disabled check):
# owid_gisaid_df[owid_gisaid_df['Country'].isna()]['Entity'].unique()
# NOTE(review): this snippet previously referenced `owid_gisaid_conflicts_df`, which is not
# defined in the visible notebook; the outer-merge result above is `owid_gisaid_df`.

### GISAID x WB

In [None]:
# Left join: every GISAID country is kept, with World Bank columns attached where the
# (OWID-normalised) names match.
gisaid_wb_df = pd.merge(
    gisaid_owid_df,
    wb_owid_df,
    how='left',
    left_on='Country',
    right_on='Economy',
)

# Countries that found no World Bank match -- candidates for additional renaming rules.
no_income_match = gisaid_wb_df['Income group'].isna()
gisaid_wb_df.loc[no_income_match, 'Country'].unique()

### OWID x WB

In [None]:
# Attach the World Bank classification columns to every OWID row.
# Join against wb_owid_df (the World Bank table with economy names rewritten to OWID
# spellings) rather than the raw wb_df: with the raw table, every economy listed in
# owid_wb_mismatches fails to match its OWID entity and is left without an income group.
df = owid_df.merge(wb_owid_df, how='left', left_on='Entity', right_on='Economy')
df

In [None]:
# One row per entity: order by (Entity, Day) and keep the last row of each entity,
# i.e. the row with the greatest Day value.
df = (
    df
    .sort_values(by=['Entity', 'Day'])
    .drop_duplicates(subset='Entity', keep='last')
)

In [None]:
# Drop the aggregate rows (continents and the world total) so only individual entities remain.
continents = ['Africa','Antarctica','Asia','Europe','North America', 'Oceania', 'South America', 'World']
is_aggregate = df['Entity'].isin(continents)
df = df[~is_aggregate]

In [None]:
#df[(df['Income group'].isna()) & ~(df['people_vaccinated_per_hundred'].isna())]

In [None]:
#df[df['Income group'].isna()].Entity.unique()

In [None]:
# A row is only usable for the chart if it has both a population share and a vaccination rate.
has_population = df['Share of world population'].notna()
has_vaccination = df['people_vaccinated_per_hundred'].notna()
df = df[has_population & has_vaccination]

In [None]:
# Rows still lacking an income group, largest population share first.
missing_income = df['Income group'].isna()
df.loc[missing_income].sort_values(by='Share of world population', ascending=False)

In [None]:
# Highest vaccination rate first; this row order decides the left-to-right bar order
# once the cumulative x positions are computed.
df = df.sort_values('people_vaccinated_per_hundred', ascending=False)

## Charts

### OWID x WB chart

In [None]:
# Bar geometry: each bar spans x0 -> x1, where x1 is the running total of population
# share and x0 is the previous row's x1 (0 for the first row).
right_edges = df['Share of world population'].cumsum()
df['x1'] = right_edges
df['x0'] = right_edges.shift(fill_value=0)

In [None]:
# Give entities without a World Bank match an explicit category so they still get a colour.
df['Income group'] = df['Income group'].fillna('No income data')

In [None]:
# Sanity check: list the distinct income-group labels present in the frame.
df['Income group'].unique()

In [None]:
# Colour assigned to each income group; `domain` and `range_` are the parallel lists
# handed to alt.Scale.
income_palette = {
    'High income': '#2ca02c',
    'Upper middle income': '#1f77b4',
    'Lower middle income': '#ff7f0e',
    'Low income': '#d62728',
    'No income data': '#111111',
}
domain = list(income_palette.keys())
range_ = list(income_palette.values())

In [None]:
# Variable-width bar chart: bar width (x0 -> x1) is the entity's share of world population,
# bar height is the share of people with at least one dose, colour is the income group.
vax_total = alt.Chart(df).mark_rect().encode(
    x=alt.X('x0:Q', title='Share of World Population in 2021'),
    x2='x1',
    y=alt.Y('people_vaccinated_per_hundred:Q', title='% of population with at least one dose'),
    color=alt.Color('Income group', scale=alt.Scale(domain=domain, range=range_)),
    tooltip=[
        "Entity",
        "Day",
        alt.Tooltip("people_vaccinated_per_hundred", title='% with at least one dose'),
        # At this point the column still carries its original OWID name; the snake_case
        # rename to `share_of_world_population` only happens in the facet section further down,
        # so referring to that name here points at a column that does not exist yet.
        alt.Tooltip("Share of world population", title='% share of world population'),
        "Income group",
    ]
).properties(
    width=800,
    height=500
).interactive()
vax_total.save('vax-total.html')

### GISAID x WB chart

In [None]:
# Display the merged GISAID x World Bank frame for inspection.
gisaid_wb_df

In [None]:
# List the column names available for the charts below.
gisaid_wb_df.columns

In [None]:
# Highest sequencing share first; this row order decides the left-to-right bar order
# once the cumulative x positions are computed.
gisaid_wb_df = gisaid_wb_df.sort_values('% of cases sequenced and shared', ascending=False)

#### sequencing capacity by number of cases

In [None]:
# Bar geometry: each bar spans x0 -> x1, where x1 is the running total of reported cases
# and x0 is the previous row's x1 (0 for the first row).
cumulative_cases = gisaid_wb_df['Reported COVID-19 cases'].cumsum()
gisaid_wb_df['x1'] = cumulative_cases
gisaid_wb_df['x0'] = cumulative_cases.shift(fill_value=0)

In [None]:
# Give countries without a World Bank match an explicit category so they still get a colour.
gisaid_wb_df['Income group'] = gisaid_wb_df['Income group'].fillna('No income data')

In [None]:
# Sanity check: list the distinct income-group labels present in the frame.
gisaid_wb_df['Income group'].unique()

In [None]:
# (income group, colour) pairs, split into the parallel lists that alt.Scale expects.
income_colors = [
    ('High income', '#2ca02c'),
    ('Upper middle income', '#1f77b4'),
    ('Lower middle income', '#ff7f0e'),
    ('Low income', '#d62728'),
    ('No income data', '#111111'),
]
domain = [group for group, _ in income_colors]
range_ = [color for _, color in income_colors]

In [None]:
# Variable-width bar chart of sequencing effort: bars span x0 -> x1 on the case axis,
# height is the share of cases sequenced and shared, colour is the income group.
seq_tooltip = ["Country", 'Reported COVID-19 cases', 'Sequences shared', "% of cases sequenced and shared","Income group"]
seq_total = (
    alt.Chart(gisaid_wb_df)
    .mark_rect()
    .encode(
        x=alt.X('x0:Q', title='COVID-19 Cases'),
        x2='x1',
        y=alt.Y('% of cases sequenced and shared:Q', title='% of cases sequenced and shared'),
        color=alt.Color('Income group', scale=alt.Scale(domain=domain, range=range_)),
        tooltip=seq_tooltip,
    )
    .properties(width=800, height=500)
    .interactive()
)
seq_total
# seq_total.save('cases-total-sequenced.html')

#### minimap test

In [None]:
# Recompute the bar edges for the minimap experiments: x1 is the running total of
# reported cases, x0 is the previous row's x1 (0 for the first row).
running_cases = gisaid_wb_df['Reported COVID-19 cases'].cumsum()
gisaid_wb_df['x1'], gisaid_wb_df['x0'] = running_cases, running_cases.shift(fill_value=0)

In [None]:
# Make sure unmatched countries carry the explicit 'No income data' category.
income_filled = gisaid_wb_df['Income group'].fillna('No income data')
gisaid_wb_df['Income group'] = income_filled

In [None]:
# Sanity check: list the distinct income-group labels present in the frame.
gisaid_wb_df['Income group'].unique()

In [None]:
# Income-group categories and their colours, kept as two parallel lists for alt.Scale.
domain = [
    'High income',
    'Upper middle income',
    'Lower middle income',
    'Low income',
    'No income data',
]
range_ = [
    '#2ca02c',  # High income
    '#1f77b4',  # Upper middle income
    '#ff7f0e',  # Lower middle income
    '#d62728',  # Low income
    '#111111',  # No income data
]

In [None]:
# Baseline sequencing chart for the minimap experiments (same encoding as the
# "sequencing capacity by number of cases" chart).
seq_encodings = dict(
    x=alt.X('x0:Q', title='COVID-19 Cases'),
    x2='x1',
    y=alt.Y('% of cases sequenced and shared:Q', title='% of cases sequenced and shared'),
    color=alt.Color('Income group', scale=alt.Scale(domain=domain, range=range_)),
    tooltip=["Country", 'Reported COVID-19 cases', 'Sequences shared', "% of cases sequenced and shared","Income group"],
)
seq_base = alt.Chart(gisaid_wb_df).mark_rect().encode(**seq_encodings)
seq_total = seq_base.properties(width=800, height=500).interactive()
seq_total
# seq_total.save('cases-total-sequenced.html')

In [None]:
# Hover labelling for the sequencing chart, following the layering pattern of Altair's
# "multi-line tooltip" example but bound to gisaid_wb_df.
# Every layer reads from gisaid_wb_df and the selection is keyed on `x0` (a real column);
# the example's `source` frame and its x / y / category fields do not exist in this notebook.

# Shared positional encodings so the layered axes keep a single title each.
x_pos = alt.X('x0:Q', title='COVID-19 Cases')
y_pos = alt.Y('% of cases sequenced and shared:Q', title='% of cases sequenced and shared')

# Create a selection that chooses the nearest point & selects based on its x0 value
nearest = alt.selection(type='single', nearest=True, on='mouseover',
                        fields=['x0'], empty='none')

# The bars themselves
seq_total = alt.Chart(gisaid_wb_df).mark_rect().encode(
    x=x_pos,
    x2='x1',
    y=y_pos,
    color=alt.Color('Income group', scale=alt.Scale(domain=domain, range=range_)),
    tooltip=["Country", 'Reported COVID-19 cases', 'Sequences shared', "% of cases sequenced and shared","Income group"]
)

# Transparent selectors across the chart. This is what tells us
# the x-value of the cursor
selectors = alt.Chart(gisaid_wb_df).mark_point().encode(
    x=x_pos,
    opacity=alt.value(0),
).add_selection(
    nearest
)

# Draw a point at the top-left corner of the selected bar
points = alt.Chart(gisaid_wb_df).mark_point().encode(
    x=x_pos,
    y=y_pos,
    opacity=alt.condition(nearest, alt.value(1), alt.value(0))
)

# Draw the country name next to the selected bar (blank for every other bar)
text = alt.Chart(gisaid_wb_df).mark_text(align='left', dx=5, dy=-5).encode(
    x=x_pos,
    y=y_pos,
    text=alt.condition(nearest, 'Country:N', alt.value(' '))
)

# Draw a rule at the location of the selection
rules = alt.Chart(gisaid_wb_df).mark_rule(color='gray').encode(
    x=x_pos,
).transform_filter(
    nearest
)

# Put the five layers into a chart and bind the data
alt.layer(
    seq_total, selectors, points, rules, text
).properties(
    width=600, height=300
)

# seq_total
# seq_total.save('cases-total-sequenced.html')

In [None]:
# Minimap + detail view with hover labels.
# Everything this cell layers is defined inside the cell and in dependency order
# (`detail` before the layers drawn on top of it), so it runs on a fresh kernel.

# Brush drawn on the minimap; it drives both axes of the detail view.
zoom = alt.selection_interval(encodings=["x", "y"])

minimap = (
    alt.Chart(gisaid_wb_df).mark_rect().add_selection(zoom).encode(
        x=alt.X('x0:Q', title='COVID-19 Cases'),
        x2='x1',
        y=alt.Y('% of cases sequenced and shared:Q', title='% of cases sequenced and shared'),
        color= alt.condition(zoom, 'Income group', alt.value("lightgray")),
        #alt.Color('Income group', scale=alt.Scale(domain=domain, range=range_), ,
        tooltip=["Country", 'Reported COVID-19 cases', 'Sequences shared', "% of cases sequenced and shared","Income group"]
    ).properties(
        width=200,
        height=200,
        title="Minimap -- click and drag to zoom in the detail view",
    )
)

# Detail view: both scale domains follow the `zoom` brush.
detail = (
    alt.Chart(gisaid_wb_df).mark_rect().encode(
        x=alt.X('x0:Q', title='COVID-19 Cases', scale=alt.Scale(domain={"selection": zoom.name, "encoding": "x"})),
        x2='x1',
        y=alt.Y('% of cases sequenced and shared:Q', title='% of cases sequenced and shared', scale=alt.Scale(domain={"selection": zoom.name, "encoding": "y"})),
        color=alt.Color('Income group', scale=alt.Scale(domain=domain, range=range_)),
        tooltip=["Country", 'Reported COVID-19 cases', 'Sequences shared', "% of cases sequenced and shared","Income group"]
    ).properties(width=600, height=400, title="sequencing capacity by number of cases -- detail view")
)

# Create a selection that chooses the nearest point & selects based on its x0 value
# (`x0` is an actual column of gisaid_wb_df).
nearest = alt.selection(type='single', nearest=True, on='mouseover',
                        fields=['x0'], empty='none')

# Transparent selectors across the chart. This is what tells us
# the x-value of the cursor
selectors = alt.Chart(gisaid_wb_df).mark_point().encode(
    x=alt.X('x0:Q', title='COVID-19 Cases'),
    opacity=alt.value(0),
).add_selection(
    nearest
)

# Country name next to the hovered bar (blank for every other bar).
text = alt.Chart(gisaid_wb_df).mark_text(align='left', dx=5, dy=-5).encode(
    x=alt.X('x0:Q', title='COVID-19 Cases'),
    y=alt.Y('% of cases sequenced and shared:Q', title='% of cases sequenced and shared'),
    text=alt.condition(nearest, "Country:N", alt.value(' '))
)

# Layer the hover helpers over the detail view and show that next to the minimap.
detail_labelled = alt.layer(
    detail, selectors, text
).properties(
    width=600, height=400
)

detail_labelled | minimap

In [None]:
# Minimap + detail view: brushing a rectangle on the minimap sets the x and y domains
# of the detail chart.
zoom = alt.selection_interval(encodings=["x", "y"])

seq_tooltip = ["Country", 'Reported COVID-19 cases', 'Sequences shared', "% of cases sequenced and shared","Income group"]

minimap = (
    alt.Chart(gisaid_wb_df).mark_rect().add_selection(zoom).encode(
        x=alt.X('x0:Q', title='COVID-19 Cases'),
        x2='x1',
        y=alt.Y('% of cases sequenced and shared:Q', title='% of cases sequenced and shared'),
        # Bars inside the brush keep their income-group colour; the rest are greyed out.
        color= alt.condition(zoom, 'Income group', alt.value("lightgray")),
        tooltip=seq_tooltip
    ).properties(
        width=200,
        height=200,
        title="Minimap -- click and drag to zoom in the detail view",
    )
)

detail = (
    alt.Chart(gisaid_wb_df).mark_rect().encode(
        x=alt.X('x0:Q', title='COVID-19 Cases', scale=alt.Scale(domain={"selection": zoom.name, "encoding": "x"})),
        x2='x1',
        y=alt.Y('% of cases sequenced and shared:Q', title='% of cases sequenced and shared', scale=alt.Scale(domain={"selection": zoom.name, "encoding": "y"})),
        color=alt.Color('Income group', scale=alt.Scale(domain=domain, range=range_)),
        tooltip=seq_tooltip
    ).interactive(bind_x=False)
    # Single size/title assignment: 600x400 is the size that took effect before as well,
    # because a later .properties() call overrides an earlier one.
    .properties(width=600, height=400, title="sequencing capacity by number of cases -- detail view")
)

detail | minimap

In [None]:
# Countries without a World Bank match have no region; give them an explicit bucket so
# they can still be grouped.
gisaid_wb_df['Region'] = gisaid_wb_df['Region'].fillna('No region data')

#### sequencing capacity by total cases


In [None]:
# Work on an independent copy so the renames below do not touch gisaid_wb_df.
# NOTE: this rebinds `df`, which until now held the OWID x World Bank frame.
df = gisaid_wb_df.copy(deep=True)

In [None]:
# List the column names of the working copy.
df.columns

In [None]:
# Snake-case names so the columns can be referenced as `datum.<name>` in the
# Vega expressions used by the per-region charts.
df = df.rename(columns={
    'Sequences shared': 'sequences_shared',
    'Reported COVID-19 cases': 'reported_covid_cases',
})

In [None]:
# Distinct regions; one chart is built per value below.
df['Region'].unique()

In [None]:
# Facet by hand (one chart per region) so that every panel gets its own x axis instead
# of all panels sharing the overall total.
charts = []
for region in df['Region'].unique():
    region_df = df[df['Region'] == region]
    region_chart = (
        alt.Chart(region_df)
        # Running total of cases in descending order of sequencing share = right bar edge.
        .transform_window(
            x1='sum(reported_covid_cases)',
            sort=[alt.SortField('% of cases sequenced and shared',order='descending')],
        )
        # Left bar edge = right edge minus this country's own case count.
        .transform_calculate(x0='datum.x1 - datum.reported_covid_cases')
        .mark_rect()
        .encode(
            x=alt.X('x0:Q', title='Total COVID Cases'),
            x2='x1',
            y=alt.Y('% of cases sequenced and shared:Q', title='% of cases sequenced and shared',scale=alt.Scale(domain=(0,100))),
            color=alt.Color('Income group', scale=alt.Scale(domain=domain, range=range_)),
            tooltip=["Country", 'reported_covid_cases', 'sequences_shared', "% of cases sequenced and shared","Income group"],
        )
        .properties(title=region, width=400, height=250)
        .interactive()
    )
    charts.append(region_chart)
alt.concat(*charts)

In [None]:
# Arrange the per-region charts in two rows (first four, then the rest) and export.
first_row = alt.concat(*charts[:4])
second_row = alt.concat(*charts[4:])
chart = alt.vconcat(first_row, second_row)
chart.save('cases-sequenced-by-region.html')

#### sequencing capacity by total sequences


In [None]:
# Fresh copy of the GISAID x WB frame with a snake-case sequence-count column, so it can
# be referenced as `datum.sequences_shared` in the Vega expressions below.
df = gisaid_wb_df.copy().rename(columns={'Sequences shared': 'sequences_shared'})

In [None]:
# Distinct regions; one chart is built per value below.
df['Region'].unique()

In [None]:
# Facet by hand (one chart per region) so that every panel gets its own x axis instead
# of all panels sharing the overall total.
charts = []
for region in df['Region'].unique():
  region_df = df[df['Region'] == region]
  region_chart = alt.Chart(region_df).transform_window(
      # Running total of sequences in descending order of sequencing share = right bar edge.
      x1='sum(sequences_shared)',
      sort=[alt.SortField('% of cases sequenced and shared',order='descending')]
  ).transform_calculate(
      # Left bar edge = right edge minus this country's own sequence count.
      x0='datum.x1 - datum.sequences_shared'
  ).mark_rect().encode(
      # The x axis is a cumulative COUNT of shared sequences, so label it as such
      # (the percentage of cases sequenced is what the y axis shows).
      x=alt.X('x0:Q', title='Total sequences shared'),
      x2='x1',
      y=alt.Y('% of cases sequenced and shared:Q', title='% of cases sequenced and shared',scale=alt.Scale(domain=(0,100))),
      color=alt.Color('Income group', scale=alt.Scale(domain=domain, range=range_)),
      tooltip=["Country", 'sequences_shared', 'Reported COVID-19 cases', "% of cases sequenced and shared","Income group"]
  ).properties(
      title=region,
      width=400,
      height=250
  ).interactive()
  charts.append(region_chart)
alt.concat(*charts)

In [None]:
# Arrange the per-region charts in two rows (first four, then the rest) and export.
row_size = 4
first_row = alt.concat(*charts[:row_size])
second_row = alt.concat(*charts[row_size:])
chart = alt.vconcat(first_row, second_row)
chart.save('sequencing_by_region.html')

#### random GISAID x WB viz

In [None]:
# Scatter of case counts against sequencing share, one circle per country, coloured by region.
# (An earlier per-country bar chart is kept below for reference.)
# alt.Chart(gisaid_wb_df).mark_bar().encode(
#     x="Country",
#     y='% of cases sequenced and shared'
# )
scatter_tooltip = ["Country", 'Sequences shared', 'Reported COVID-19 cases', "% of cases sequenced and shared","Income group"]
(
    alt.Chart(gisaid_wb_df)
    .mark_circle(size=60)
    .encode(
        x='Reported COVID-19 cases',
        y='% of cases sequenced and shared',
        color='Region',
        tooltip=scatter_tooltip,
    )
    .interactive()
)

## OWID x WB facets

In [None]:
# By this point `df` has been rebound to a copy of the GISAID frame by the sequencing
# sections above, so renaming on it would silently do nothing and the OWID facet charts
# below would not find their columns. Rebuild the OWID x World Bank frame from its
# sources, repeating the preparation steps from the "OWID x WB" section.
owid_wb = (
    owid_df
    # wb_owid_df is the World Bank table with economy names rewritten to OWID spellings.
    .merge(wb_owid_df, how='left', left_on='Entity', right_on='Economy')
    # One row per entity: the row with the greatest Day value.
    .sort_values(by=['Entity', 'Day'])
    .drop_duplicates(subset='Entity', keep='last')
)
# Drop continent/world aggregates and rows missing either plotted measure.
owid_wb = owid_wb[~owid_wb['Entity'].isin(continents)]
owid_wb = owid_wb.dropna(subset=['Share of world population', 'people_vaccinated_per_hundred'])
owid_wb = owid_wb.sort_values(by='people_vaccinated_per_hundred', ascending=False)
# Bar edges used by the non-faceted chart at the end of the notebook.
owid_wb['x1'] = owid_wb['Share of world population'].cumsum()
owid_wb['x0'] = owid_wb['x1'].shift(fill_value=0)
owid_wb['Income group'] = owid_wb['Income group'].fillna('No income data')

# Snake-case name so the column can be referenced as `datum.share_of_world_population`
# in the Vega expressions below.
df = owid_wb.rename({'Share of world population':'share_of_world_population'}, axis=1)

In [None]:
# Facet by hand (one chart per continent) so that every panel gets its own cumulative
# x axis instead of all panels sharing the world total.
charts = []
for continent in df['Continent'].unique():
    continent_df = df[df['Continent'] == continent]
    continent_chart = (
        alt.Chart(continent_df)
        # Running total of population share in descending order of vaccination = right bar edge.
        .transform_window(
            x1='sum(share_of_world_population)',
            sort=[alt.SortField('people_vaccinated_per_hundred',order='descending')],
        )
        # Left bar edge = right edge minus this entity's own population share.
        .transform_calculate(x0='datum.x1 - datum.share_of_world_population')
        .mark_rect()
        .encode(
            x=alt.X('x0:Q', title='% share of world population',scale=alt.Scale(domain=(0,60))),
            x2='x1',
            y=alt.Y('people_vaccinated_per_hundred:Q', title='% of population with at least one dose',scale=alt.Scale(domain=(0,100))),
            color=alt.Color('Income group', scale=alt.Scale(domain=domain, range=range_)),
            tooltip=[
                "Entity",
                alt.Tooltip("people_vaccinated_per_hundred",title='% with at least one dose'),
                alt.Tooltip("share_of_world_population",title='% share of world population'),
                "Income group",
            ],
        )
        .properties(title=continent, width=400, height=250)
        .interactive()
    )
    charts.append(continent_chart)
alt.concat(*charts)

In [None]:
# Inspect the charts from index 3 onward (the ones placed in the second row below).
charts[3:]

In [None]:
# Two-row grid: the first three charts on top, the remaining ones underneath.
first_row = alt.concat(*charts[:3])
second_row = alt.concat(*charts[3:])
chart = alt.vconcat(first_row, second_row)

In [None]:
# Export the current `chart` object as a standalone HTML file.
chart.save('chart.html')

### by income status

In [None]:
# Distinct income-group labels; the loop below builds one chart per group.
df['Income group'].unique()

In [None]:
# Colour assigned to each continent; the two parallel lists are what alt.Scale expects.
continent_palette = {
    'Europe': '#828CA1',
    'Asia': '#72A9AB',
    'North America': '#EB8582',
    'South America': '#B56A71',
    'Africa': '#B479AE',
    'Oceania': '#A8633C',
}
continent_domain = list(continent_palette.keys())
continent_range_ = list(continent_palette.values())

In [None]:
# One chart per income group, bars coloured by continent.
income_groups = ['High income', 'Upper middle income', 'Lower middle income', 'Low income', 'No income data']

charts = []
for income_group in income_groups:
    group_df = df[df['Income group'] == income_group]
    group_chart = (
        alt.Chart(group_df)
        # Running total of population share in descending order of vaccination = right bar edge.
        .transform_window(
            x1='sum(share_of_world_population)',
            sort=[alt.SortField('people_vaccinated_per_hundred',order='descending')],
        )
        # Left bar edge = right edge minus this entity's own population share.
        .transform_calculate(x0='datum.x1 - datum.share_of_world_population')
        .mark_rect()
        .encode(
            x=alt.X('x0:Q', title='% share of world population'),
            x2='x1',
            y=alt.Y('people_vaccinated_per_hundred:Q', title='% of population with at least one dose',scale=alt.Scale(domain=(0,100))),
            color=alt.Color('Continent', scale=alt.Scale(domain=continent_domain, range=continent_range_)),
            tooltip=["Entity", "people_vaccinated_per_hundred", "Income group"],
        )
        .properties(title=income_group, width=400, height=250)
    )
    charts.append(group_chart)
alt.concat(*charts)

In [None]:
# Two-row grid: the first three income groups on top, the remaining ones underneath.
first_row = alt.concat(*charts[:3])
second_row = alt.concat(*charts[3:])
alt.vconcat(first_row, second_row)

In [None]:
# Single (non-faceted) variable-width chart with the bar edges computed inside Vega:
# x1 is the running total of population share in SortField's default order of
# vaccination rate, and x0 is x1 minus the entity's own share.
(
    alt.Chart(df)
    .transform_window(
        x1='sum(share_of_world_population)',
        sort=[alt.SortField('people_vaccinated_per_hundred')],
    )
    .transform_calculate(x0='datum.x1 - datum.share_of_world_population')
    .mark_rect()
    .encode(
        x=alt.X('x0:Q', title='Share of world population'),
        x2='x1',
        y=alt.Y('people_vaccinated_per_hundred:Q', title='% of population with at least one dose'),
        color='Income group:N',
        tooltip=["Entity", "people_vaccinated_per_hundred", "Income group"],
    )
)
# .facet(
#     facet='Continent:N'
# )

In [None]:
# Same chart, but drawn from the x0 / x1 columns already stored on `df` instead of
# computing the bar edges with Vega transforms.
vax_encodings = dict(
    x=alt.X('x0:Q', title='Share of World Population'),
    x2='x1',
    y=alt.Y('people_vaccinated_per_hundred:Q', title='People Vaccinated Per 100'),
    color='Income group:N',
    tooltip=["Entity", "people_vaccinated_per_hundred", "Income group"],
)
alt.Chart(df).mark_rect().encode(**vax_encodings)