In [1]:
import pandas as pd
from bokeh.plotting import figure, show, ColumnDataSource
from bokeh.models.formatters import DatetimeTickFormatter, NumeralTickFormatter
from bokeh.models.tools import HoverTool
from bokeh.layouts import row
from pathlib import Path

In [2]:
!curl https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/owid-covid-data.csv -o covid_data.csv

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0     0    0     0    0     0      0      0 --:--:--  0:00:01 --:--:--     0
  0 24.3M    0   918    0     0    541      0 13:05:02  0:00:01 13:05:01   541
 22 24.3M   22 5574k    0     0  2123k      0  0:00:11  0:00:02  0:00:09 2122k
 66 24.3M   66 16.2M    0     0  4583k      0  0:00:05  0:00:03  0:00:02 4582k
100 24.3M  100 24.3M    0     0  5715k      0  0:00:04  0:00:04 --:--:-- 5716k


In [3]:
def find_rank(data, date, all_df, column):
    '''
        Return the 0-based position of `data` among the distinct values that
        `column` takes on `date`, ordered from largest to smallest.

        data: value of column to find ranking;
        date: date to get ranking from;
        all_df: dataframe containing data of every location;
        column: feature to get ranking;

        Raises ValueError when `data` is not one of that date's values.
    '''
    # Keep only the rows recorded on the requested date.
    on_date = all_df.date == date
    # Duplicates are collapsed, so tied locations share the same position.
    distinct_values = set(all_df.loc[on_date, column])
    ranked_desc = sorted(distinct_values, reverse=True)
    return ranked_desc.index(data)
    
def fill_ranks(location_df, all_df, columns):
    '''
        Add a `ranking_<column>` column to location_df, in place, for every
        feature listed in columns.

        The ranking of a row is the 0-based position of its value among the
        distinct values the feature takes in all_df on the same date, ordered
        from largest to smallest (the same convention find_rank uses). Tied
        values share a ranking.

        location_df: data frame with location data (modified in place);
        all_df: dataframe containing data of every location;
        columns: list of (numeric) features to get rankings;

        Raises ValueError when a (date, value) pair of location_df does not
        occur in all_df.
    '''
    for column in columns:
        rank_column = 'ranking_' + column

        # Build the (date, value) -> ranking lookup once per feature instead of
        # re-filtering and re-sorting all_df for every single row.
        pairs = all_df[['date', column]].drop_duplicates()
        dense_rank = pairs.groupby('date')[column].rank(method='dense', ascending=False)
        # pandas ranks start at 1; shift so the largest value gets ranking 0.
        lookup = pairs.assign(**{rank_column: dense_rank - 1})

        # A left merge keeps location_df's row order, so the result can be
        # written back positionally whatever index location_df carries.
        matched = location_df[['date', column]].merge(lookup, on=['date', column], how='left')
        ranks = matched[rank_column]
        if ranks.isna().any():
            raise ValueError(
                "location_df holds values of '" + column + "' that are not in all_df for the same date"
            )
        location_df[rank_column] = ranks.to_numpy()
  

In [4]:
def build_and_save_covid_data(raw_data, rankings_columns, remove_regions=True, output_dir='..//data//COVID'):
    '''
        Clean the raw data, add the per-date rankings and write the result to disk.

        Writes one CSV file per location into `<output_dir>/locations`, each
        named exactly after the location (no extension), plus
        `<output_dir>/all_covid.csv` holding every location. The caller's
        raw_data frame is not modified.

        raw_data: dataframe containing data of every location;
        rankings_columns: list of features to get rankings;
        remove_regions: bool to remove general regions from data;
        output_dir: directory that receives the files (created if missing);
    '''
    # 1. Filter regions if needed
    regions = ['International','World','Europe','North America','South America','Oceania','Asia','Africa','European Union']
    if remove_regions:
        raw_data = raw_data[~raw_data.location.isin(regions)]

    # 2. Fill NaN and update type (fillna returns a new frame, so the
    #    assignment below never touches the frame the caller passed in)
    raw_data = raw_data.fillna(0)
    raw_data['date'] = raw_data['date'].apply(pd.to_datetime)

    # 3. Create paths to save files
    covid_data_path = Path(output_dir)
    location_data_path = covid_data_path / 'locations'
    location_data_path.mkdir(parents=True, exist_ok=True)

    # 4. Rank every location against the full data set and save it
    all_df_wrank = []
    # Group by the plain column name: grouping by the one-element list
    # ['location'] yields 1-tuple keys on pandas >= 2.0, which cannot be used
    # to build the file name.
    for location, df_location in raw_data.groupby('location'):
        # Work on an independent copy: fill_ranks adds columns in place.
        df_location = df_location.copy()
        fill_ranks(df_location, raw_data, rankings_columns)
        df_location.to_csv(location_data_path / location, index=False)
        all_df_wrank.append(df_location)

    all_df_wrank = pd.concat(all_df_wrank)
    all_df_wrank.to_csv(covid_data_path / 'all_covid.csv', index=False)

In [5]:
# Load the file fetched by the curl cell and list the columns it provides.
data = pd.read_csv('covid_data.csv')
data.columns

Index(['iso_code', 'continent', 'location', 'date', 'total_cases', 'new_cases',
       'new_cases_smoothed', 'total_deaths', 'new_deaths',
       'new_deaths_smoothed', 'total_cases_per_million',
       'new_cases_per_million', 'new_cases_smoothed_per_million',
       'total_deaths_per_million', 'new_deaths_per_million',
       'new_deaths_smoothed_per_million', 'reproduction_rate', 'icu_patients',
       'icu_patients_per_million', 'hosp_patients',
       'hosp_patients_per_million', 'weekly_icu_admissions',
       'weekly_icu_admissions_per_million', 'weekly_hosp_admissions',
       'weekly_hosp_admissions_per_million', 'new_tests', 'total_tests',
       'total_tests_per_thousand', 'new_tests_per_thousand',
       'new_tests_smoothed', 'new_tests_smoothed_per_thousand',
       'positive_rate', 'tests_per_case', 'tests_units', 'total_vaccinations',
       'people_vaccinated', 'people_fully_vaccinated', 'new_vaccinations',
       'new_vaccinations_smoothed', 'total_vaccinations_per_hun

In [8]:
# Features that each get a `ranking_<feature>` column in the saved files.
fields_of_interest = ['total_cases', 'new_cases',
                      'new_cases_smoothed', 'total_deaths', 'new_deaths',
                      'new_deaths_smoothed', 'total_cases_per_million',
                      'new_cases_per_million', 'new_cases_smoothed_per_million',
                      'total_deaths_per_million', 'new_deaths_per_million',
                      'new_deaths_smoothed_per_million']
# Regions are dropped (remove_regions defaults to True) before ranking and saving.
build_and_save_covid_data(data, fields_of_interest)
# Delete the raw download; the read_csv cell needs the curl cell to run again after this.
! rm covid_data.csv