In [None]:
# default_exp core

# covid

> Utility methods to process the Johns Hopkins University Covid-19 dataset using [nbdev](http://nbdev.fast.ai/).

In [None]:
#hide
from nbdev.showdoc import *

This module has a dependency on the following Python libraries which have been added to the `requirements` field in `settings.ini`:
* `requests`
* `pandas`
* `matplotlib`
* `seaborn`

In [None]:
#export
import typing
from typing import List, Callable
import requests
import datetime
from datetime import date
import io
import os
from io import StringIO
import urllib.request
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

The following cell contains some defaults for `seaborn`:

In [None]:
#export
def setDefaults(figsize=(18,9)):
    ''' Apply the seaborn "dark" style together with this module's font and figure size defaults.

    `figsize` is the default matplotlib figure size as a `(width, height)` tuple.
    '''
    # The style is passed to `sns.set` itself: calling `sns.set_style("dark")` first
    # has no lasting effect because `sns.set` re-applies its own `style` argument,
    # which defaults to "darkgrid".
    sns.set(style="dark",
            rc={'legend.fontsize':14,
                'xtick.labelsize':14,
                'ytick.labelsize':14,
                'axes.labelsize':16,
                'axes.titlesize':18,
                'figure.figsize':figsize,
               })

The following cell contains the names of the time series files published by Johns Hopkins University (JHU).  Note that the format of these names changed overnight on 24.03.20 without prior warning.

In [None]:
#export
# Base url of the raw JHU CSSE data directory on GitHub.
ROOT      = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data'
# Time series file names, all following the naming scheme introduced on 24.03.20.
CONFIRMED = 'time_series_covid19_confirmed_global.csv'
DEATHS    = 'time_series_covid19_deaths_global.csv'
# Previously 'time_series_19-covid-Recovered.csv', i.e. still the old naming scheme.
RECOVERED = 'time_series_covid19_recovered_global.csv'

## 1. Graphing current counts

The following cell contains a couple of utility functions for returning yesterday and today as strings:

In [None]:
#export
def getToday() -> str:
    ''' Return today's local date formatted as `MM-DD-YYYY`. '''
    return date.today().strftime('%m-%d-%Y')

def getYesterday() -> str:
    ''' Return yesterday's local date formatted as `MM-DD-YYYY`. '''
    return (date.today() - datetime.timedelta(days=1)).strftime('%m-%d-%Y')

The following two utility functions are used to process a JHU csv file and turn it into a `pandas` dataframe.  Note that the dataset changed a couple of column names on 24.03.20.

In [None]:
#export
def procDataframe(csv: str) -> pd.DataFrame:
    ''' Convert input csv data or file to a pandas dataframe.

    `csv` is anything `pd.read_csv` accepts: a path, or a file-like object such as
    the `io.StringIO` passed in by `procUrl`.

    Missing province names become empty strings, every other missing value becomes 0,
    and the last-update column (if present) is parsed into timestamps.  Both the old
    (`Province/State`, `Last Update`) and new (`Province_State`, `Last_Update`)
    column spellings are handled; a column that is absent is simply skipped.
    '''
    assert csv, 'csv must be a non-empty path or file-like object'
    df = pd.read_csv(csv)
    # Province names must be filled *before* the blanket fillna(0) below, otherwise
    # they would end up as 0 instead of ''.  Assigning the result back (rather than
    # calling fillna(inplace=True) on the selected column) also works when pandas
    # Copy-on-Write is active.
    for col in ('Province/State', 'Province_State'):
        if col in df.columns:
            df[col] = df[col].fillna('')
    df = df.fillna(0)
    for col in ('Last Update', 'Last_Update'):
        if col in df.columns:
            df[col] = df[col].apply(pd.to_datetime)
    return df

def procUrl(url: str, download: bool, localfile: str=None, force: bool=False, verbose: bool=False) -> pd.DataFrame:
    ''' Optionally download then process csv data or file at url converting it to a pandas dataframe.

    * `download=False`: the csv is fetched into memory and parsed directly.
    * `download=True`: the csv is saved to `localfile` (required in this mode) and parsed
      from disk.  An existing `localfile` is reused unless `force` is set.
    * `verbose`: print what is being done with the local file.

    Raises `requests.HTTPError` if the in-memory fetch returns an error status and
    `ValueError` if `download` is set without a `localfile`.
    '''
    assert url, 'url must be a non-empty string'
    if not download:
        response = requests.get(url)
        # Without this check an error page (e.g. a 404 body) would be parsed as csv.
        response.raise_for_status()
        return procDataframe(io.StringIO(response.content.decode('utf-8')))
    if not localfile:
        raise ValueError('localfile is required when download is True')
    if os.path.exists(localfile) and not force:
        if verbose:
            print(f'"{localfile}" already exists so will not overwrite')
    else:
        if verbose:
            print(f'Downloading "{localfile}" from "{url}"...')
        urllib.request.urlretrieve(url, localfile)
    return procDataframe(localfile)

The following cell contains a utility method that returns a `pandas` dataframe with a given day's daily report, and a second method that plots it by `kind`, which can be one of `["Confirmed","Deaths","Recovered"]`:

In [None]:
#export
def getCountriesDailyReport(day: str, download: bool=False, force: bool=False) -> pd.DataFrame:
    ''' Return the JHU daily report for `day` as a dataframe.

    `day` is the report's file name without the `.csv` extension.  `download` and
    `force` are forwarded to `procUrl`, with `<day>.csv` as the local file name.
    '''
    assert len(day)
    report_file = f'{day}.csv'
    report_url = f'{ROOT}/csse_covid_19_daily_reports/{report_file}'
    return procUrl(report_url, download, report_file, force)

def plotCountriesDailyReport(df: pd.DataFrame, topN: int=10, color: str='y', kind: str='Confirmed') -> None:
    ''' Plot a bar chart of the `topN` countries ranked by their summed `kind` column.

    `df` is a daily report dataframe, `kind` the name of the count column to sum per
    country and `color` the bar colour.
    '''
    # Accept both spellings of the country column, in line with how `procDataframe`
    # treats the province and last-update columns.
    # NOTE(review): presumably reports from before the column rename use
    # 'Country/Region' - confirm against an old daily report.
    country_col = 'Country_Region' if 'Country_Region' in df.columns else 'Country/Region'
    top = df.groupby(country_col)[kind].sum().sort_values(ascending=False)[:topN]
    fig, ax = plt.subplots()
    ax = top.plot(ax=ax, kind='bar', color=color, stacked=False, figsize=(18,9))
    ax.set_ylabel('Count', size=14)
    ax.set_xlabel('Country', size=14)
    # NOTE: the title always shows yesterday's date, whatever day `df` was loaded for.
    ax.set_title(f'Total {kind} by top {topN} countries as of {getYesterday()}', size=18)
    plt.show()

## 2. Graphing time series counts

The following cell contains three utility methods to return time series data for each of `["Confirmed","Deaths","Recovered"]` in a `pandas` dataframe given a url to a corresponding csv file:

In [None]:
#export
def getTimeSeriesConfirmed(download: bool=False, force: bool=False) -> pd.DataFrame:
    ''' Return the JHU confirmed-cases time series as a dataframe (see `procUrl` for `download`/`force`). '''
    return procUrl(
        f'{ROOT}/csse_covid_19_time_series/{CONFIRMED}',
        download,
        'time_series_19-covid-Confirmed.csv',  # local file name used when downloading
        force,
    )

def getTimeSeriesDeaths(download: bool=False, force: bool=False) -> pd.DataFrame:
    ''' Return the JHU deaths time series as a dataframe (see `procUrl` for `download`/`force`). '''
    return procUrl(
        f'{ROOT}/csse_covid_19_time_series/{DEATHS}',
        download,
        'time_series_19-covid-Deaths.csv',  # local file name used when downloading
        force,
    )

def getTimeSeriesRecovered(download: bool=False, force: bool=False) -> pd.DataFrame:
    ''' Return the JHU recovered time series as a dataframe (see `procUrl` for `download`/`force`). '''
    return procUrl(
        f'{ROOT}/csse_covid_19_time_series/{RECOVERED}',
        download,
        'time_series_19-covid-Recovered.csv',  # local file name used when downloading
        force,
    )

The following cell contains methods to aggregate each of `["Confirmed","Deaths","Recovered"]` by country.  Note that `force` and `download` are both set to `True` in all cases.  Note also that at the time of writing `Recovered` isn't supported as a time series dataset in a csv file.

In [None]:
#export
def procTimeSeriesDataframe(r: List) -> pd.DataFrame:
    ''' Build a dataframe from the records in `r` and index it by their parsed `day` value.

    Each record must carry a `day` entry that `pd.to_datetime` can parse; the
    remaining entries become the dataframe's columns.
    '''
    frame = pd.DataFrame(r)
    parsed_days = frame['day'].apply(pd.to_datetime)
    return frame.assign(day=parsed_days).set_index('day', drop=True)

def procTimeSeries(df: pd.DataFrame, kind: str) -> pd.DataFrame:
    ''' Aggregate a time series dataframe by country and return it in long format.

    `df` has one row per province/country with a `Country/Region` column; every column
    from the fifth onwards is treated as one day's count.  The counts of all rows
    belonging to a country are summed per day.

    Returns a dataframe indexed by day with a `country` column and a column named
    `kind` holding the summed count, ordered by country and then by day column.
    '''
    # The first four columns are skipped as metadata; the rest are per-day counts.
    day_cols = df.columns.to_list()[4:]
    # One vectorised sum per country replaces the row-by-row Python accumulation.
    totals = df.groupby('Country/Region')[day_cols].sum()
    r = [
        {'day': day, 'country': country, kind: count}
        for country, counts in zip(totals.index, totals.to_numpy().tolist())
        for day, count in zip(day_cols, counts)
    ]
    return procTimeSeriesDataframe(r)

def procTimeSeriesDeaths() -> pd.DataFrame:
    ''' Download (always overwriting the local copy) and aggregate the deaths time series by country. '''
    return procTimeSeries(getTimeSeriesDeaths(download=True, force=True), 'Deaths')

def procTimeSeriesConfirmed() -> pd.DataFrame:
    ''' Download (always overwriting the local copy) and aggregate the confirmed time series by country. '''
    return procTimeSeries(getTimeSeriesConfirmed(download=True, force=True), 'Confirmed')

# NOTE: the 'Recovered' counterpart is left disabled:
#def procTimeSeriesRecovered() -> pd.DataFrame:
#    return procTimeSeries(getTimeSeriesRecovered(download=True, force=True), 'Recovered')

The following cell contains a utility plotting method for the processed and aggregated time series dataframe:

In [None]:
#export
def plotCountryTimeSeries(df: pd.DataFrame, countries: List, kind: str) -> None:
    ''' Draw one line per entry of `countries` showing column `kind` of `df` against its index.

    `df` needs a `country` column to filter on and a `kind` column to plot.
    '''
    _, axes = plt.subplots()
    for name in countries:
        per_country = df[df['country'] == name]
        axes = per_country.plot(ax=axes, y=kind, kind='line', figsize=(18,9))
    axes.set_ylabel('Count', size=14)
    axes.set_xlabel('Day', size=14)
    axes.set_title(f'{kind} in {countries} as of {getYesterday()}', size=18)
    # Label the drawn lines with the country names, in the order they were plotted.
    axes.legend(axes.get_lines(), countries)
    plt.show()

## 3. Graphing counts and time series via Covid API

The following cell contains a utility method for plotting a sorted stacked bar graph of country data:

In [None]:
#export
def plotCountriesDailyReportFromAPI(normalised=False):
    ''' Plot a stacked bar chart of total confirmed cases and deaths per country from the API summary.

    Only countries with more than 10 total deaths are shown, sorted by total confirmed
    cases in descending order.  With `normalised=True` two alternative country names
    are first mapped to a common name and the totals of rows sharing a name are summed.

    Raises `requests.HTTPError` if the API responds with an error status.
    '''
    url = 'https://api.covid19api.com/summary'
    plotted = ['TotalConfirmed', 'TotalDeaths']
    response = requests.get(url)
    # Fail here rather than on a confusing pandas error when the API is unavailable.
    response.raise_for_status()
    df = pd.DataFrame(response.json().get('Countries'))
    if normalised:
        # Assign the result back instead of replacing inplace on the selected column,
        # which does not update `df` once pandas Copy-on-Write is active.
        df['Country'] = df['Country'].replace(
            {'Iran (Islamic Republic of)': 'Iran', 'Korea, South': 'South Korea'})
        # Only the plotted columns are aggregated; `.sum()` avoids passing the builtin
        # `sum` to `apply`, which depends on deprecated pandas behaviour.
        df = df.groupby('Country')[plotted].sum().reset_index()
    df = df.sort_values(by=['TotalConfirmed'], ascending=False)
    ax = df[df.TotalDeaths > 10].plot(kind='bar', x='Country', y=plotted,
                                      color='yr', stacked=True, figsize=(18, 9))
    ax.set_title('Covid-19 cases and deaths', size=18)
        
def plotCategoryByCountry(category, country, color='y'):
    ''' Plot the API's `Cases` series over `Date` for one `category` in one `country`.

    `category` is lower-cased before it is placed in the request url; `color` is the
    line colour.
    '''
    endpoint = f'https://api.covid19api.com/total/country/{country}/status/{category.lower()}'
    cases = pd.DataFrame(requests.get(endpoint).json())
    cases['Date'] = cases['Date'].apply(pd.to_datetime)
    axes = cases.plot(kind='line', x='Date', y='Cases', color=color, figsize=(18, 9))
    axes.set_title(f'Covid-19 {category} in {country}', size=18)

## 4. Testing

From [the nbdev documentation](http://nbdev.fast.ai/test/):
> Everything that is not an exported cell is considered a test, so you should make sure your notebooks can all run smoothly (and fast) if you want to use this functionality as the CLI. 

In [None]:
from nbdev.export import *
# Run nbdev's notebook conversion; the output below lists the notebooks it converted.
notebook2script()

Converted 00_core.ipynb.
Converted index.ipynb.
