In [16]:
import pandas as pd
from bokeh.plotting import figure, show, ColumnDataSource
from bokeh.models.formatters import DatetimeTickFormatter, NumeralTickFormatter
from bokeh.models.tools import HoverTool
from bokeh.layouts import row

In [17]:
#Download data (as of 2021-06-07)
!curl https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/owid-covid-data.csv -o covid_data.csv

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  2 23.3M    2  629k    0     0   663k      0  0:00:36 --:--:--  0:00:36  663k
 48 23.3M   48 11.3M    0     0  6015k      0  0:00:03  0:00:01  0:00:02 6011k
 97 23.3M   97 22.6M    0     0  7903k      0  0:00:03  0:00:02  0:00:01 7901k
100 23.3M  100 23.3M    0     0  7977k      0  0:00:03  0:00:03 --:--:-- 7977k


In [19]:
def find_rank(data, date, all_df, column):
    '''
        Return the 0-based position of `data` among the distinct values that
        `column` takes on `date`, ordered from largest to smallest
        (tied values share one position).

        data: value of column to find ranking;
        date: date to get ranking from;
        all_df: dataframe containing data of every location;
        column: feature to get ranking;

        Raises ValueError when `data` is not one of the values found on `date`.
    '''
    same_day_rows = all_df[all_df.date == date]
    distinct_desc = sorted(set(same_day_rows[column]), reverse=True)
    return distinct_desc.index(data)
    
def fill_ranks(location_df, all_df, column):
    '''
        Write a 'ranking' value into every row of location_df (modified in place),
        computed by find_rank from that row's `column` value and 'date'.

        location_df: data frame with location data;
        all_df: dataframe containing data of every location;
        column: feature to get ranking;
    '''
    for label in location_df.index:
        row_value = location_df.loc[label, column]
        row_date = location_df.loc[label, 'date']
        location_df.loc[label, 'ranking'] = find_rank(row_value, row_date, all_df, column)
  

In [20]:
def plot_location_data(fig_abs, fig_norm, location_df, column, tooltips):
    '''
        Draw one location's time series as a line on both figures.

        fig_abs: bokeh figure to plot the raw values on;
        fig_norm: bokeh figure to plot the values divided by the last value on;
        location_df: data frame with location data; the `column`, 'location',
                     'date' and 'ranking' columns are read;
        column: data column to plot;
        tooltips: list with tooltips; not used here, kept so existing calls still work;

        Returns None. Nothing is drawn when location_df has no rows or when the
        last value of `column` is 0.
    '''
    # 1. Get data from location dataframe
    location_data = list(location_df[column])

    # An empty frame has no last value to normalize by, and a last value of 0
    # would mean dividing by zero below: skip the location in both cases.
    if not location_data or location_data[-1] == 0:
        return

    last_value = location_data[-1]
    location_data_norm = [value / last_value for value in location_data]
    location = list(location_df.location)
    location_dts = list(location_df.date)
    location_rankings = list(location_df.ranking)

    # 2. Create data source (the keys are the names the hover tooltips refer to)
    src = ColumnDataSource(data=dict(
        x=location_dts,
        Data=location_data,
        Data_norm=location_data_norm,
        Location=location,
        Ranking=location_rankings
    ))

    # 3. Add field with formatted date for hover
    src.add(location_df.date.apply(lambda d: d.strftime('%Y-%m-%d')), 'date_formatted')

    # 4. Plot on both figures from the same source
    fig_abs.line('x', 'Data', source=src, line_width=2, hover_line_color='red')
    fig_norm.line('x', 'Data_norm', source=src, line_width=2, hover_line_color='red')

In [21]:
def plot_covid_time_series(data, column):
    '''
        Show two side-by-side line plots of `column` over time, one line per
        location: the raw values and the normalized values.

        data: dataframe containing data of every location;
        column: feature to plot;
    '''
    normalized_label = column + ' (normalized)'

    # Hover fields; the '@' names are the data source columns set up in plot_location_data
    tooltips = [
        ('Location', '@Location'),
        ('Date', '@date_formatted'),
        (column, '@Data{(0.00a)}'),
        (column + ' (norm)', '@Data_norm'),
        ('Ranking', '@Ranking'),
    ]

    fig_abs = figure(
        title=column,
        x_axis_label='Date',
        y_axis_label=column,
        tooltips=tooltips,
        plot_height=800,
        plot_width=800,
    )
    fig_norm = figure(
        title=normalized_label,
        x_axis_label='Date',
        y_axis_label=normalized_label,
        tooltips=tooltips,
        plot_height=800,
        plot_width=800,
    )

    # One line per location; rankings are filled in first because the hover shows them
    for _, df_location in data.groupby(['location']):
        fill_ranks(df_location, data, column)
        plot_location_data(fig_abs, fig_norm, df_location, column, tooltips)

    # Both figures get the same axis formatting and toolbar behaviour
    for fig in (fig_abs, fig_norm):
        fig.xaxis.formatter = DatetimeTickFormatter(
            hours=["%d %b %Y"],
            days=["%d %b %Y"],
            months=["%d %b %Y"],
            years=["%d %b %Y"],
        )
        fig.yaxis.formatter = NumeralTickFormatter(format='0.0a')
        fig.toolbar.autohide = True

    show(row(fig_abs, fig_norm))

In [22]:
def get_ranks_distribution(location_df, n_locs):
    '''
        For every rank i in 0..n_locs-1, compute the fraction of rows of
        location_df whose 'ranking' equals i.

        location_df: data frame with location data (must have a 'ranking' column);
        n_locs: total number of locations;

        Returns a list of n_locs floats. When location_df has no rows every
        fraction is 0.0 (instead of dividing by zero).
    '''
    # Read the column and the row count once instead of once per rank
    location_rankings = list(location_df['ranking'])
    n_datapts = len(location_rankings)

    if n_datapts == 0:
        return [0.0] * n_locs

    return [
        sum(1 for ranking in location_rankings if ranking == i) / n_datapts
        for i in range(n_locs)
    ]

def get_ranks_agg_distribution(location_df, n_locs):
    '''
        For every rank i in 0..n_locs-1, compute the fraction of rows of
        location_df whose 'ranking' is less than or equal to i.

        location_df: data frame with location data (must have a 'ranking' column);
        n_locs: total number of locations;

        Returns a list of n_locs floats. When location_df has no rows every
        fraction is 0.0 (instead of dividing by zero).
    '''
    # Read the column and the row count once instead of once per rank
    location_rankings = list(location_df['ranking'])
    n_datapts = len(location_rankings)

    if n_datapts == 0:
        return [0.0] * n_locs

    return [
        sum(1 for ranking in location_rankings if ranking <= i) / n_datapts
        for i in range(n_locs)
    ]

def plot_location_rankings(fig_rankings, fig_rankings_agg, location_df, n_locs, tooltips):
    '''
        Draw one location's rank distribution on fig_rankings and its
        cumulative rank distribution on fig_rankings_agg.

        fig_rankings: bokeh figure to plot line with rank distribution;
        fig_rankings_agg: bokeh figure to plot line with rank cumulative distribution;
        location_df: data frame with location data;
        n_locs: total number of locations (one x value per rank 0..n_locs-1);
        tooltips: list with tooltips to appear (not used inside this function);
    '''
    # 1. Compute both distributions for this location
    rank_axis = list(range(n_locs))
    dist = get_ranks_distribution(location_df, n_locs)
    dist_agg = get_ranks_agg_distribution(location_df, n_locs)

    # Repeat the location name(s) found in the frame once per rank so the hover can show it
    names_in_frame = list(set(location_df.location))
    location_labels = names_in_frame * len(rank_axis)

    # 2. Create data source
    src = ColumnDataSource(data=dict(
        x=rank_axis,
        Rankings=dist,
        Rankings_agg=dist_agg,
        Location=location_labels,
    ))

    # 3. Plot on each figure
    line_style = dict(source=src, line_width=2, hover_line_color='red')
    fig_rankings.line('x', 'Rankings', **line_style)
    fig_rankings_agg.line('x', 'Rankings_agg', **line_style)
    
def plot_covid_rankings(data, column):
    '''
        Show two side-by-side plots with one line per location: the
        distribution of the location's rankings for `column`, and the
        cumulative (aggregated) version of that distribution.

        data: dataframe containing data of every location;
        column: feature to get ranking; also used as the figure title;
    '''

    # Hover fields; the '@' names are the data source columns set up in plot_location_rankings
    rankings_tooltips = [
        ('Location','@Location'),
        ('Ranking','@x'),
        ('Proportion','@Rankings'),
        ('Proportion_agg','@Rankings_agg')
    ]

    fig_rankings = figure(title=column, x_axis_label='Ranking', y_axis_label='Proportion',
                          tooltips=rankings_tooltips, plot_height=800, plot_width=800)

    # This figure shows the aggregated proportions, so its title says so
    # (it used to read "(normalized)", which did not match its y axis label).
    fig_rankings_agg = figure(title=column+' (aggregated)', x_axis_label='Ranking', y_axis_label='Proportion (aggregated)',
                              tooltips=rankings_tooltips, plot_height=800, plot_width=800)

    n_locs = len(set(data.location))

    # The group key is not needed, so group by the plain column name
    for _, location_df in data.groupby('location'):
        fill_ranks(location_df, data, column)
        plot_location_rankings(fig_rankings, fig_rankings_agg, location_df, n_locs, rankings_tooltips)

    fig_rankings.toolbar.autohide = True
    fig_rankings_agg.toolbar.autohide = True

    show(row(fig_rankings, fig_rankings_agg))

In [23]:
def plot_covid_data(data, column, remove_regions=True):
    '''
        Show the time series plots and then the ranking plots for `column`.

        data: dataframe containing data of every location;
        column: feature to plot and to get ranking;
        remove_regions: bool to remove general regions from data;
    '''
    # Locations that are aggregates rather than single countries
    aggregate_locations = [
        'International', 'World', 'Europe', 'North America', 'South America',
        'Oceania', 'Asia', 'Africa', 'European Union',
    ]

    if remove_regions:
        is_aggregate = data.location.isin(aggregate_locations)
        data = data[~is_aggregate]

    plot_covid_time_series(data, column)
    plot_covid_rankings(data, column)

In [24]:
# Columns kept from the downloaded file
covid_columns = ['location','date',
                 'total_cases','new_cases',
                 'total_cases_per_million','new_cases_per_million',
                 'total_deaths','new_deaths',
                 'total_deaths_per_million','new_deaths_per_million',
                 'total_tests','new_tests',
                 'total_tests_per_thousand', 'new_tests_per_thousand',
                 'total_vaccinations','new_vaccinations',
                 'people_vaccinated','people_fully_vaccinated',
                 'people_vaccinated_per_hundred','people_fully_vaccinated_per_hundred']

# usecols skips parsing the columns that are never used; indexing with the
# same list afterwards keeps the column order listed above.
data = pd.read_csv('covid_data.csv', usecols=covid_columns)[covid_columns]

# Missing values are replaced by 0 in every column
data = data.fillna(0)

# Parse the whole column in one call instead of one pd.to_datetime call per row
data['date'] = pd.to_datetime(data['date'])

plot_covid_data(data, 'total_deaths', remove_regions=True)