## Setup

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import datetime
import json
import os
import shutil
import sys
from multiprocessing import Pool, cpu_count

import ipynb_py_convert
import nbformat
import numpy as np
from nbconvert import HTMLExporter
from nbconvert.preprocessors import ExecutePreprocessor
from nbconvert.writers import FilesWriter

from itertools import compress

In [5]:
from report_generators.reporters import CountryReport, GermanyReport, USAReport

In [None]:
debug = True

In [6]:
#  Set to false if you do not want multiprocessing enabled
cores = 'auto'

if cores == 'auto':
    cores = max(1, cpu_count())
    # try at most 4 to reduce probability of error message like
    # the one shown at https://github.com/jupyter/jupyter_client/issues/541
    cores = min(cores, 4)


if cores:
    print(f'Using {cores} processes')
    
wwwroot = "wwwroot"

Using 4 processes


In [7]:
%config InlineBackend.figure_formats = ['svg']

from coronavirus import *
from coronavirus import MetadataRegion

pd.set_option('display.float_format', '{:.2f}'.format)  #  Disable pandas scientific notation

Cleaning of cache and copying files has moved to

- `generate-webpage-clean-setup.py` and 
- `generate-webpage-clean-setup.sh`

In [8]:
TITLE_PREFIX = "Tracking plots: "

### Download Datasets

In [9]:
d, c = fetch_deaths(), fetch_cases()

countries = d.index
countries2 = c.index
assert (countries2 == countries).all()

In [10]:
data_US_cases = fetch_cases_US()
data_US_deaths = fetch_deaths_US()

In [11]:
# also fetch data from Germany, so it is available later from the cache
germany = fetch_data_germany()

### Generic Functions

In [12]:
def does_wwwroot_exist(wwwroot):
    if not os.path.exists(wwwroot):
        msg = "To put the html into github repo for webhosting, run "
        msg += '"git clone git@github.com:oscovida/oscovida.github.io.git wwwroot" or similar'
        # os.mkdir(wwwroot)
        raise ValueError(f"directory {wwwroot} missing.")

## Index Page Generation

In [13]:
def create_markdown_index_list(category):
    """Assemble a markdown table like this:
    
    | Country/Region                       | Total cases   | Total deaths   |
    |:-------------------------------------|:--------------|:---------------|
    | [Afghanistan](html/Afghanistan.html) | 1,351         | 43             |
    | [Albania](html/Albania.html)         | 678           | 27             |
    | [Algeria](html/Algeria.html)         | 3,127         | 415            |
    | [Andorra](html/Andorra.html)         | 731           | 40             |
    
    and return as string.
    """
    
    known_categories = ["world", "Germany", "US"]
               
    # gather data
    regions_all = MetadataRegion.get_all_as_dataframe()
    if category in known_categories:
        # select those we are interested in
        regions = regions_all[regions_all['category'] == category]
    elif category in ["all-regions"]:
        regions = regions_all
    else:
        
        raise NotImplementedError(f"category {category} is unknown."+
                                  f" Known values are {known_categories + ['all-regions']}")
    
    # change index to contain URLs and one-line summary in markdown syntax
    def compose_md_url(x):
        one_line_summary, html = x
        if isinstance(html, str):
            return "[" + one_line_summary + "](" + os.path.join('html', html) +")"
        elif repr(html) == 'nan':   # if html was not produced, then variable html is np.nan
            print(f"Missing html for {one_line_summary} - will not add link to html: \n{x}")
            return one_line_summary
        else:
            raise NotImplementedError("Don't know how to proceed: ", one_line_summary, html, x)

    new_index = regions[['one-line-summary', 'html-file']].apply(compose_md_url, axis=1)
    regions2 = regions.set_index(new_index)
    regions2.index.name = "Location"
    
    # select columns
    regions3 = regions2[['max-cases', 'max-deaths', 'cases-last-week']]
    regions4 = regions3.applymap(lambda v: '{:,}'.format(v))  # Thousands comma separator
    
    # rename columns
    rename_dict = {'max-cases' : 'Total cases', 
                   'max-deaths' : 'Total deaths',
                   'cases-last-week' : 'New cases last week'}
    regions5 = regions4.rename(columns=rename_dict)

    return regions5.to_markdown()


In [14]:
def create_markdown_index_page(md_content, title, pelican_file_path, 
                               save_as, wwwroot, slug=None):
    """Create pelican markdown file, like this:
    
    title: Germany
    tags: Data, Plots, Germany
    save-as: germany
    date: 2020-04-11 08:00
    """

    if slug is None:
        slug = save_as
    
    with open(os.path.join(pelican_file_path), "tw") as f:
        f.write(f"title: {title}\n")
        # f.write(f"category: Data\n")  - have stopped using categories (22 April 2020)
        f.write(f"tags: Data, Plots, {title}\n")
        f.write(f"save-as: {save_as}\n")
        f.write(f"slug: {slug}\n")
        date_time = datetime.datetime.now().strftime("%Y/%m/%d %H:%M")
        f.write(f"date: {date_time}\n")
        f.write("\n")
        f.write("\n")
        f.write(md_content)
        f.write("\n")

In [15]:
def create_index_page(sections, rootname, wwwroot):
    """Sections is dictionary: key is title, value is markdown text"""
    md_file = rootname + ".md"
    
    with open(os.path.join(wwwroot, md_file), "tw") as f:
        for section in sections:
            f.write(f"# {section}\n\n")
            f.write(sections[section])
    print(f"Written overview to {md_file}.")
    html_file = rootname + ".html"
    subprocess.check_call(f"pandoc -t html -o {os.path.join(wwwroot, html_file)} " +
                          f"{os.path.join(wwwroot, md_file)}", shell=True)
    return html_file

## Abstract Report Class

### Serial and Parallel Report Functions

In [16]:
def create_html_reports_serial(Reporter, regions, wwwroot, expiry_hours=2, force=False):
    start_time = time.time()
    does_wwwroot_exist(wwwroot)
    skipped = 0

    for i, region in enumerate(regions):
        report = Reporter(region, wwwroot=wwwroot)
        if report.metadata.last_updated_hours_ago() < expiry_hours and not force:
            print(f"Skipping {report.title} - was updated "
                  f"{report.metadata.last_updated_hours_ago():.1f} hours ago")
            skipped += 1
            continue

        report.init_metadata()
    
        try:
            print(f"Processing {i+1}/{len(regions)} [{time.time()-start_time:4.0f}s]")
            report.generate()
        except Exception as e:
            print(f"Error for {report.title}", end='')
            print(e)
            raise e

        sys.stdout.flush()
        
    print(f"Created {len(regions)-skipped} (skipped {skipped}) notebooks and html versions in " + \
          f"{time.time()-start_time} seconds")
    
    sys.stdout.flush()

In [17]:
def create_html_report_parallel(Reporter, regions, wwwroot, pool, expiry_hours=2, force=False):
    processes = pool._processes
    padding = processes - (len(regions) % processes)
    regions = regions + ([None] * padding)
    per_process = int(len(regions)/processes)
    #  Weird way to create an evenly distributed list
    regions_per_process = [[] for p in range(processes)]
    [regions_per_process[p].append(r) for p, r in list(zip(list(range(processes))*per_process, regions))]
    regions_per_process = [list(filter(None.__ne__, process)) for process in regions_per_process]
    
    print(f"Using {processes} processes with tasks:")
    for n in range(pool._processes):
        if len(regions_per_process[n]) > 5:
            print(f"\t{n}: {regions_per_process[4]}")
        else:
            print(f"\t{n}: {regions_per_process[n]}")
    print("")
    sys.stdout.flush()
    
    tasks = ((Reporter, RPP, wwwroot, expiry_hours, force) for RPP in regions_per_process)
    
    res = pool.starmap(create_html_reports_serial, tasks)

In [18]:
def create_html_reports(*, Reporter, regions, wwwroot, cores=None, expiry_hours=2, force=False):
    if cores:
        with Pool(cores) as pool:
            create_html_report_parallel(Reporter, regions, wwwroot, pool, force=force)
    else:
        create_html_reports_serial(Reporter, regions, wwwroot, force=force)

## Country Report Generation

In [19]:
def get_country_list():
    d, c = fetch_deaths(), fetch_cases()

    countries = d.index
    countries2 = c.index
    assert (countries2 == countries).all()
    
    # Here we should identify regions in countries, and process those.
    # Instead, as a quick hack to get started, we'll just take one country
    # and the current "get_country" method will sum over all regions of one country if only 
    # the country name is given.
    
    return sorted(countries.drop_duplicates())
    

In [20]:
countries = get_country_list()

if debug:
    countries = countries[0:8]
    cores=4

In [22]:
create_html_reports(
    Reporter=CountryReport, regions=countries,
    wwwroot=wwwroot, cores=cores, force=True
)

Using 4 processes with tasks:
	0: ['Afghanistan', 'Angola', 'Australia', 'Bahrain', 'Belgium', 'Bolivia', 'Brunei', 'Burundi', 'Canada', 'China', 'Congo (Kinshasa)', 'Cuba', 'Diamond Princess', 'Ecuador', 'Eritrea', 'Fiji', 'Gambia', 'Greece', 'Guinea-Bissau', 'Honduras', 'Indonesia', 'Israel', 'Jordan', 'Kosovo', 'Latvia', 'Libya', 'MS Zaandam', 'Maldives', 'Mauritius', 'Mongolia', 'Namibia', 'Nicaragua', 'Norway', 'Papua New Guinea', 'Poland', 'Russia', 'Saint Vincent and the Grenadines', 'Senegal', 'Singapore', 'South Africa', 'Sudan', 'Syria', 'Thailand', 'Tunisia', 'Ukraine', 'Uzbekistan', 'Western Sahara']
	1: ['Albania', 'Antigua and Barbuda', 'Austria', 'Bangladesh', 'Belize', 'Bosnia and Herzegovina', 'Bulgaria', 'Cabo Verde', 'Central African Republic', 'Colombia', 'Costa Rica', 'Cyprus', 'Djibouti', 'Egypt', 'Estonia', 'Finland', 'Georgia', 'Grenada', 'Guyana', 'Hungary', 'Iran', 'Italy', 'Kazakhstan', 'Kuwait', 'Lebanon', 'Liechtenstein', 'Madagascar', 'Mali', 'Mexico', 'Mo

In [25]:
index_md = create_markdown_index_list("world")

create_markdown_index_page(
    index_md, title=TITLE_PREFIX + " Countries of the world", 
    pelican_file_path="pelican/content/countries.md", save_as="countries", 
    wwwroot=wwwroot
)

## Germany Report Generation

In [19]:
def get_germany_regions_list():
    data_germany = fetch_data_germany()
    land_kreis = data_germany[['Bundesland', 'Landkreis']]
    ordered = land_kreis.sort_values(['Bundesland', 'Landkreis'])
    return ordered.drop_duplicates().values.tolist()

In [21]:
germany_regions = get_germany_regions_list()
wwwroot = "wwwroot"

# data cleaning: on 13 April, we had a Landkreis "LK Göttingen (alt)"
# with only one data point. This causes plots to fail, because there
# is nothing to plot, and then the legend() command failed.
# We assume that the RKI labels unusual data with '(alt)', and remove those.

alt_data_sets = ["(alt)" in r[1].lower() for r in germany_regions]
alt_data_sets[20] = True
if sum(alt_data_sets) > 0:
    bad_datasests = list(compress(germany_regions, alt_data_sets))
    
    print(f"Removing datasets label with '(alt)': {bad_datasests}")

    for bd in bad_datasests:
        c, d, _ = germany_get_region(landkreis=bd[1])
        print(f"\tremoved: {bd} : len(cases)={len(c)}, len(deaths)={len(d)}")

    bad_indecies = list(compress(range(len(alt_data_sets)), alt_data_sets))

    [germany_regions.pop(i) for i in bad_indecies]

Removing datasets label with '(alt)': [['Baden-Württemberg', 'LK Ortenaukreis']]
	removed: ['Baden-Württemberg', 'LK Ortenaukreis'] : len(cases)=83, len(deaths)=83


In [22]:
if debug:
    germany_regions = germany_regions[0:8]
    cores=4

In [24]:
create_html_reports(
    Reporter=Germany, regions=germany_regions,
    wwwroot=wwwroot, cores=cores, force=True
)

Using 4 processes with tasks:
	0: [['Baden-Württemberg', 'LK Alb-Donau-Kreis'], ['Baden-Württemberg', 'LK Böblingen']]
	1: [['Baden-Württemberg', 'LK Biberach'], ['Baden-Württemberg', 'LK Calw']]
	2: [['Baden-Württemberg', 'LK Bodenseekreis'], ['Baden-Württemberg', 'LK Emmendingen']]
	3: [['Baden-Württemberg', 'LK Breisgau-Hochschwarzwald'], ['Baden-Württemberg', 'LK Enzkreis']]

Processing 1/2 [   0s]
wwwroot/ipynb/Germany-Baden-Württemberg-LK-Breisgau-Hochschwarzwald.ipynb
Processing 1/2 [   0s]Written file to Germany-Baden-Württemberg-LK-Breisgau-Hochschwarzwald.ipynb

wwwroot/ipynb/Germany-Baden-Württemberg-LK-Biberach.ipynbProcessing 1/2 [   0s]

Written file to Germany-Baden-Württemberg-LK-Biberach.ipynb
wwwroot/ipynb/Germany-Baden-Württemberg-LK-Bodenseekreis.ipynb
Written file to Germany-Baden-Württemberg-LK-Bodenseekreis.ipynb
Processing 1/2 [   0s]
wwwroot/ipynb/Germany-Baden-Württemberg-LK-Alb-Donau-Kreis.ipynb
Written file to Germany-Baden-Württemberg-LK-Alb-Donau-Kreis.ipy

In [25]:
index_md = create_markdown_index_list(category="Germany")

create_markdown_index_page(
    index_md, title= TITLE_PREFIX + " Germany", 
    pelican_file_path="pelican/content/germany.md", save_as="germany", 
    wwwroot=wwwroot
)

## USA Report Generation

In [26]:
states = get_US_region_list()

In [28]:
states = states[0:8]
cores=4

In [29]:
create_html_reports(
    Reporter=USA, regions=states,
    wwwroot=wwwroot, cores=cores, force=True
)

Using 4 processes with tasks:
	0: ['Alabama', 'Arkansas']
	1: ['Alaska', 'California']
	2: ['American Samoa', 'Colorado']
	3: ['Arizona', 'Connecticut']

Processing 1/2 [   0s]Processing 1/2 [   0s]

wwwroot/ipynb/US-American-Samoa.ipynb
wwwroot/ipynb/US-Arizona.ipynbProcessing 1/2 [   0s]Written file to US-American-Samoa.ipynb


Written file to US-Arizona.ipynbwwwroot/ipynb/US-Alabama.ipynb

Processing 1/2 [   0s]Written file to US-Alabama.ipynb

wwwroot/ipynb/US-Alaska.ipynb
Written file to US-Alaska.ipynb
Written file to wwwroot/html/US-Alaska.html
Written file to wwwroot/html/US-Arizona.html
Processing 2/2 [   2s]
wwwroot/ipynb/US-California.ipynb
Processing 2/2 [   2s]
Written file to US-California.ipynbwwwroot/ipynb/US-Connecticut.ipynb

Written file to US-Connecticut.ipynb
Written file to wwwroot/html/US-American-Samoa.html
Processing 2/2 [   2s]
wwwroot/ipynb/US-Colorado.ipynb
Written file to US-Colorado.ipynb
Written file to wwwroot/html/US-Alabama.html
Processing 2/2 [   2s]


In [30]:
index_md = create_markdown_index_list("US")

create_markdown_index_page(
    index_md, title=TITLE_PREFIX + " United States", 
    pelican_file_path="pelican/content/US.md", save_as="us", 
    wwwroot=wwwroot
)

### HTML Pages for All Regions

In [31]:
index_md = create_markdown_index_list("all-regions")
create_markdown_index_page(
    index_md, title=TITLE_PREFIX + " All regions and countries", 
    pelican_file_path="pelican/content/all-regions.md", save_as="all-regions", 
    wwwroot=wwwroot
)

## Error Reporting

In [32]:
ms = MetadataRegion.get_all()
for name in ms:
    m = MetadataRegion(name)
    dt = m.last_updated_hours_ago()
    if dt > 2:
        print(f"Problem with '{name}', last update: {dt} ago ")