# Notebook to create html pages for countries and Kreise in Germany

New strategy (with pelican)

- put notebooks into wwwroot/ipynb folder
- put html into wwwroot/html folder
- pelican files can then go into wwwroot folder

Advantages:
- cleaner than all in one folder
- github can display all files in each subdirectory (there is a limit of 500 files or so)

In [None]:
import sys

from multiprocessing import Pool, cpu_count

from nbconvert.preprocessors import ExecutePreprocessor
from nbconvert import HTMLExporter
from nbconvert.writers import FilesWriter

import nbformat

import os
import shutil

# Number of worker processes used to build the pages in parallel.
# Set to False (or 0) if you do not want multiprocessing enabled;
# 'auto' picks the CPU count minus two, but never fewer than one.
cores = 'auto'
if cores == 'auto':
    cores = max(1, cpu_count()-2)

# Any falsy value skips this message; the cells that start the page
# generation test `cores` the same way to choose parallel or serial mode.
if cores:
    print(f'Using {cores} processes')

In [None]:
# Render inline figures as SVG.
%config InlineBackend.figure_formats = ['svg']
# Copy the shared module next to this notebook
# (presumably so that the import below uses the current version).
%cp -v ../coronavirus.py .
# NOTE(review): names such as pd, fetch_deaths or clear_cache are never
# imported explicitly in this notebook, so they presumably come from this
# star import -- confirm against coronavirus.py.
from coronavirus import *
pd.set_option('display.float_format', '{:.2f}'.format)  #  Disable pandas scientific notation
# force download of new data
clear_cache()

In [None]:
# Also delete cached data in the place where the notebooks are executed
!rm -rf wwwroot/ipynb/cachedir

In [None]:
# Load the deaths and cases tables once for the cells below.
d = fetch_deaths()
c = fetch_cases()

# Both tables must carry exactly the same index labels, in the same order.
countries, countries2 = d.index, c.index
assert (countries2 == countries).all()

In [None]:
def modify_template(templatefile, output_file_name, mappings, wwwroot):
    """Create concrete *.ipynb file from template.

    - templatefile: path of the template with placeholders to be substituted
    - output_file_name: name to write modified file to (relative to wwwroot)
    - mappings: dictionary with placeholders as keys, and the strings to be
      substituted as values
    - wwwroot: directory in which the output file should be written

    The substitution is a plain text replacement of every occurrence of
    each placeholder; the values are inserted as they are (no escaping).
    Returns None.
    """
    # Notebook files are UTF-8 encoded JSON. Be explicit about the encoding
    # so that names with umlauts are written correctly whatever the locale.
    with open(templatefile, "tr", encoding="utf-8") as f_template:
        template = f_template.read()

    for placeholder, value in mappings.items():
        template = template.replace(placeholder, value)

    with open(os.path.join(wwwroot, output_file_name), "tw", encoding="utf-8") as f:
        f.write(template)
    print(f"Written file to {output_file_name}")


In [None]:
def check_country_name_is_known(name):
    """Assert that `name` is one of the index labels of fetch_deaths().

    Raises AssertionError (listing the known names) otherwise.
    """
    known = fetch_deaths().index
    assert name in known, f"{name} is unknown. Known countries are {sorted(known)}"

def germany_check_region_name_is_known(name):
    """Assert that `name` occurs in the 'Bundesland' column of
    fetch_data_germany().

    Raises AssertionError (listing the known regions) otherwise.
    """
    known_regions = list(fetch_data_germany()['Bundesland'].drop_duplicates())
    assert name in known_regions, \
        f"{name} is unknown. Known regions are {sorted(known_regions)}"

def germany_check_subregion_name_is_known(name):
    """Assert that `name` occurs in the 'Landkreis' column of
    fetch_data_germany().

    Raises AssertionError (listing the known subregions) otherwise.
    """
    known_subregions = list(fetch_data_germany()['Landkreis'].drop_duplicates())
    assert name in known_subregions, \
        f"{name} is unknown. Known regions are {sorted(known_subregions)}"

# Smoke test: both checks must pass for names that are expected to be in
# the German data set (they raise AssertionError otherwise).
germany_check_region_name_is_known("Hamburg") 
germany_check_subregion_name_is_known("SK Hamburg") 

    
def sanitise(name):
    """Return `name` made suitable for use in a URL and as a filename.

    Every space and every comma is replaced by a hyphen; all other
    characters (umlauts included, for now) are kept as they are.
    """
    cleaned = name
    for unwanted in (" ", ","):
        cleaned = cleaned.replace(unwanted, "-")
    return cleaned
    
    
def get_binder_url(notebook):
    """Given a notebook file name, return the mybinder.org URL that opens it.

    - notebook: file name of the notebook inside the repository's ipynb folder

    The name is percent-encoded before it is appended to the query string:
    spaces become %20 as before, and characters such as commas, '&', '#',
    '+' or umlauts are escaped as well so that they cannot be misread as
    part of the URL syntax.
    """
    from urllib.parse import quote

    base = "https://mybinder.org/v2/gh/fangohr/coronavirus/master?filepath=ipynb/"
    return base + quote(notebook)


def create_ipynb_for_country(country, templatename, wwwroot):
    """Creates ipynb file for country, based on templatename.

    - country: country name; must pass check_country_name_is_known()
    - templatename: path of the notebook template with %...% placeholders
    - wwwroot: existing directory; the file is written to its ipynb subfolder

    Returns the name of the file (without directory).
    Raises AssertionError if the country is unknown or if the file does not
    exist after writing.
    """
    # Create the ipynb folder if required. This function is called from
    # several pool workers at the same time, so a separate "exists?" test
    # followed by mkdir could fail in the worker that comes second.
    ipynb_dir = os.path.join(wwwroot, "ipynb")
    try:
        os.mkdir(ipynb_dir)
    except FileExistsError:
        pass

    check_country_name_is_known(country)

    output_file_name = f"{country}.ipynb"
    output_file_path = os.path.join(wwwroot, "ipynb", output_file_name)

    mappings = {
        "%title%" : country,
        "%title2%" : "",
        "%country%" : country,
        "%binderurl%" : get_binder_url(output_file_name),
        "%create_date%" : datetime.datetime.now().strftime("%d/%m/%Y %H:%M:%S")
    }

    modify_template(templatename, os.path.join("ipynb", output_file_name), mappings, wwwroot)
    assert os.path.exists(output_file_path), f"{output_file_path} does not exist"
    return output_file_name

def create_ipynb_for_germany(region, subregion, templatename, wwwroot):
    """Creates ipynb file for region and subregion in Germany, based on
    templatename.

    - region: Bundesland; must pass germany_check_region_name_is_known()
    - subregion: Landkreis; must pass germany_check_subregion_name_is_known()
    - templatename: path of the notebook template with %...% placeholders
    - wwwroot: existing directory; the file is written to its ipynb subfolder

    Returns the name of the file (without directory).
    Raises AssertionError if a name is unknown or if the file does not exist
    after writing.
    """
    germany_check_region_name_is_known(region)
    germany_check_subregion_name_is_known(subregion)

    # Create the ipynb folder if required, as create_ipynb_for_country does,
    # so this function also works when it is the first one to run. Several
    # pool workers may get here at once, hence no separate "exists?" test.
    ipynb_dir = os.path.join(wwwroot, "ipynb")
    try:
        os.mkdir(ipynb_dir)
    except FileExistsError:
        pass

    output_file_name = f"Germany-{sanitise(region)}-{sanitise(subregion)}.ipynb"
    output_file_path = os.path.join(wwwroot, "ipynb", output_file_name)

    mappings = {
        "%title%" : f"Germany: {subregion} ({region})",
        "%title2%" : "",
        "%region%" : region,
        "%subregion%" : subregion,
        "%binderurl%" : get_binder_url(output_file_name),
        "%create_date%" : datetime.datetime.now().strftime("%d/%m/%Y %H:%M:%S")
    }

    modify_template(templatename, os.path.join("ipynb", output_file_name), mappings, wwwroot)
    assert os.path.exists(output_file_path), f"{output_file_path} does not exist"
    return output_file_name


In [None]:
# Shared nbconvert objects used by nb_convert_html().
html_writer = FilesWriter()
html_exporter = HTMLExporter()

# allow_errors=True: a failing cell does not abort the execution of a notebook.
nb_executor = ExecutePreprocessor(allow_errors=True)

In [None]:
def nb_convert_html(nb_path, outdir):
    """Execute the notebook at nb_path and write an HTML version to outdir.

    - nb_path: path of the *.ipynb file to execute
    - outdir: directory for the HTML output

    The writer is given the output path without extension (outdir plus the
    notebook's base name). Uses the notebook-level objects nb_executor,
    html_exporter and html_writer. Returns None.
    """
    filename = os.path.basename(nb_path)
    outpath = os.path.join(outdir, os.path.splitext(filename)[0])

    # Notebook files are UTF-8 encoded JSON: do not rely on the locale.
    with open(nb_path, encoding="utf-8") as f:
        nb = nbformat.read(f, as_version=4)

    # The file is only needed for reading; it is closed again before the
    # notebook is executed and exported.
    nb = nb_executor.preprocess(nb)[0]
    body, resources = html_exporter.from_notebook_node(nb)
    html_writer.write(body, resources, outpath)

In [None]:
def nbconvert_ipynb2html(ipynb_name, wwwroot):
    """Execute a notebook from the "ipynb" folder of wwwroot and store its
    html version in the "html" folder of wwwroot.

    - ipynb_name: name of the notebook, such as "germany.ipynb"
    - wwwroot: directory that contains the "ipynb" and "html" folders

    Returns the name of the html file (i.e. "germany.html"); raises
    AssertionError if that file does not exist afterwards.
    """
    ipynb_dir = os.path.join(wwwroot, "ipynb")
    html_dir = os.path.join(wwwroot, "html")

    # coronavirus.py is copied to run the notebook,
    # requirements.txt is needed for binder
    for support_file in ("../coronavirus.py", "../requirements.txt"):
        shutil.copyfile(support_file, os.path.join(ipynb_dir, support_file))

    # execute notebook and create html copy from it
    nb_convert_html(os.path.join(ipynb_dir, ipynb_name), html_dir)

    # the html file carries the notebook's name with a different extension
    stem = os.path.splitext(ipynb_name)[0]
    output_file_name = stem + ".html"
    assert os.path.exists(os.path.join(html_dir, output_file_name))

    return output_file_name
    

In [None]:
def create_md_index_list(title, links):
    """Return a markdown table with total cases and total deaths per country.

    - title: not used for the table; kept so that existing calls keep working
    - links: dictionary; keys are country names, values are tuples
      (name of html file, name of ipynb file). A country whose name is a key
      is shown as a markdown link to "html/<name of html file>".

    NOTE(review): keys of `links` that do not match a country name in the
    table have no effect, so passing the Germany results yields the plain
    country table -- confirm whether that is intended.
    """
    # Fetch the data here rather than reading the notebook-level names `d`
    # and `c`, which other cells rebind to different data.
    deaths, cases = fetch_deaths(), fetch_cases()
    other_columns = ["Province/State", "Lat", "Long"]

    death_table = deaths.drop(columns=other_columns).sum(axis=1, skipna=True).to_frame(name="Total deaths")
    cases_table = cases.drop(columns=other_columns).sum(axis=1, skipna=True).to_frame(name="Total cases")

    # A country can have several rows (one per province). Sum per country
    # BEFORE joining: joining two tables on a non-unique index pairs every
    # row of a country with every other row of that country, which would
    # multiply the totals by the number of rows.
    death_table = death_table.groupby("Country/Region").sum()
    cases_table = cases_table.groupby("Country/Region").sum()
    index_table = cases_table.join(death_table)

    rename_dict = {
        name: f"[{name}]({os.path.join('html', name_html)})"
        for name, (name_html, _name_ipynb) in links.items()
    }
    index_table = index_table.rename(index=rename_dict)

    # Format as plain integers (disable pandas scientific notation)
    for column in ("Total deaths", "Total cases"):
        index_table[column] = index_table[column].apply('{:d}'.format)

    return index_table.to_markdown()

def test_create_md_index_list():
    """The first data row of the table must start with the link to the
    html page of Afghanistan."""
    table = create_md_index_list(
        "Title", {"Afghanistan" : ("Afghanistan.html", "Afghanistan.ipynb")})
    # line 0 is the table header, line 1 the separator, line 2 the first row
    first_row = table.split("\n")[2]
    first_cell = first_row.split("|")[1]
    assert first_cell == ' [Afghanistan](html/Afghanistan.html) '

# run the test right away
test_create_md_index_list()

In [None]:
def get_country_list():
    """Return the sorted list of distinct index labels of fetch_deaths().

    Asserts that fetch_cases() carries the same index labels.
    """
    deaths_index = fetch_deaths().index
    cases_index = fetch_cases().index
    assert (cases_index == deaths_index).all()

    # Regions within countries should be identified and processed here.
    # As a quick hack to get started, only the country is used: the current
    # "get_country" method sums over all regions of one country if only
    # the country name is given.
    unique_countries = deaths_index.drop_duplicates()
    return sorted(unique_countries)
    

In [None]:
def create_index_page(sections, rootname, wwwroot):
    """Write an overview page as markdown and convert it to html with pandoc.

    - sections: dictionary; key is title, value is markdown text
    - rootname: name of the files to create, without extension
    - wwwroot: directory in which <rootname>.md and <rootname>.html are written

    Returns the name of the html file (without directory).
    Raises subprocess.CalledProcessError if pandoc fails.
    """
    md_file = rootname + ".md"
    html_file = rootname + ".html"
    md_path = os.path.join(wwwroot, md_file)
    html_path = os.path.join(wwwroot, html_file)

    # pandoc reads its input as UTF-8
    with open(md_path, "tw", encoding="utf-8") as f:
        for section_title, section_text in sections.items():
            f.write(f"# {section_title}\n\n")
            f.write(section_text)
    print(f"Written overview to {md_file}.")

    # Pass the arguments as a list (no shell), so that paths with spaces or
    # shell metacharacters are handed to pandoc unchanged.
    subprocess.check_call(["pandoc", "-t", "html", "-o", html_path, md_path])
    return html_file

In [None]:
def get_germany_subregion_list():
    """Return the list of distinct subregions (Kreise), ordered by
    (i) Land, then (ii) Kreis.
    """
    data = fetch_data_germany()
    by_land_then_kreis = data[['Bundesland', 'Landkreis']].sort_values(
        ['Bundesland', 'Landkreis'])
    unique_kreise = by_land_then_kreis['Landkreis'].drop_duplicates()
    return list(unique_kreise)
 

@joblib_memory.cache
def germany_get_bundesland_from_kreis(kreis):
    """Return the 'Bundesland' entry of the first row of
    fetch_data_germany() whose 'Landkreis' equals kreis."""
    data = fetch_data_germany()
    rows_of_kreis = data[data['Landkreis'] == kreis]
    first_row = rows_of_kreis.iloc[0]
    return first_row['Bundesland']

In [None]:
def does_wwwroot_exist(wwwroot):
    """Check that the output directory wwwroot exists.

    Returns None if it does. Raises ValueError otherwise; the message names
    the missing directory and explains how to create it.
    """
    if not os.path.exists(wwwroot):
        msg = f"directory {wwwroot} missing. "
        msg += "To put the html into github repo for webhosting, run "
        msg += '"git clone git@github.com:fangohr/coronavirus.git wwwroot" or similar'
        raise ValueError(msg)

In [None]:
def create_html_for_john_hopkins_countries(countries, wwwroot):
    """Create a notebook and its html version for every country.

    - countries: list of strings with country names
    - wwwroot: existing output directory (ValueError if it is missing)

    Returns dictionary: keys are country names, values are tuples with the
    name of the html file and the name of the notebook. Countries for which
    an error occurred are reported on stdout and left out of the dictionary.
    """

    start_time = time.time()
    does_wwwroot_exist(wwwroot)
    created_files = {}

    for i, country in enumerate(countries):
        try:
            print(f"Processing {i+1}/{len(countries)} [{time.time()-start_time:4.0f}s]")
            ipynb_name = create_ipynb_for_country(country, "template-country.ipynb", wwwroot=wwwroot)
            html_name = nbconvert_ipynb2html(ipynb_name, wwwroot=wwwroot)
            created_files[country] = html_name, ipynb_name
        except Exception as e:
            # Deliberate best effort: one failing country must not stop the
            # others. Report it on one line, name and error kept apart.
            print(f"Error for {country}: {e}")

        sys.stdout.flush()

    # Report what was actually created, not just what was asked for.
    print(f"Created {len(created_files)} of {len(countries)} notebooks and html versions in "
          f"{time.time()-start_time} seconds")

    sys.stdout.flush()
    return created_files

In [None]:
def parallel_html_for_countries(countries, wwwroot, pool):
    """Create the country pages in parallel.

    - countries: list of strings with country names
    - wwwroot: output directory, passed on to every worker
    - pool: multiprocessing pool; its (private) `_processes` attribute is
      read to find out into how many chunks the work is split

    The list is cut into consecutive chunks of equal size (the last one may
    be shorter), and create_html_for_john_hopkins_countries is run on each
    chunk in the pool.

    Returns one dictionary with the combined results of all chunks.
    """
    countries = list(countries)
    if not countries:
        return {}

    processes = pool._processes
    # ceiling division: at most `processes` chunks, none of them empty, and
    # no filler entries that the workers would then try to process
    per_process = -(-len(countries) // processes)
    countries_per_process = [countries[i:i+per_process]
                             for i in range(0, len(countries), per_process)]

    tasks = [(chunk, wwwroot) for chunk in countries_per_process]

    res = pool.starmap(create_html_for_john_hopkins_countries, tasks)
    sys.stdout.flush()

    combined_res = {}
    for r in res:
        combined_res.update(r)

    return combined_res

In [None]:
def create_markdown_index_page(md_content, title, pelican_file_path,
                               save_as, wwwroot):
    """Create pelican markdown file with a metadata header, like this:

    title: Germany
    category: Data
    tags: data, plots
    save-as: germany
    date: 2020/04/11 08:00

    followed by two empty lines, md_content and a final newline.

    - md_content: markdown text for the body of the page
    - title: value for the "title" entry
    - pelican_file_path: path of the file to write
    - save_as: value for the "save-as" entry
    - wwwroot: not used; kept so that existing calls keep working

    The date is the current local time. Returns None.

    NOTE(review): pelican's documented metadata key is spelled "save_as";
    confirm that "save-as" is picked up as intended.
    """
    date_time = datetime.datetime.now().strftime("%Y/%m/%d %H:%M")

    # pelican content files are UTF-8; do not rely on the locale
    with open(pelican_file_path, "tw", encoding="utf-8") as f:
        f.write(f"title: {title}\n")
        f.write("category: Data\n")
        f.write("tags: data, plots\n")
        f.write(f"save-as: {save_as}\n")
        f.write(f"date: {date_time}\n")
        f.write("\n")
        f.write("\n")
        f.write(md_content)
        f.write("\n")


# Create country overview for the world

In [None]:
# Build one notebook and one html page per country: in a process pool when
# `cores` is truthy, serially otherwise. Either way `created_files` maps a
# country name to the tuple (name of html file, name of ipynb file).
wwwroot = "wwwroot"
countries = get_country_list()
if cores:
    with Pool(cores) as pool:
        created_files = parallel_html_for_countries(countries, wwwroot, pool)
else:
    created_files = create_html_for_john_hopkins_countries(countries, wwwroot)

# Turn the results into a markdown table and store it as pelican content.
index_md = create_md_index_list("Countries", created_files)
create_markdown_index_page(index_md, title="World", 
                           pelican_file_path="pelican/content/world.md", save_as="world", 
                           wwwroot=wwwroot)

# Create list of Germany data sets

In [None]:
def create_html_for_Germany(subregions, wwwroot):
    """Create a notebook and its html version for every subregion (Kreis).

    - subregions: list of strings with Landkreis names
    - wwwroot: existing output directory (ValueError if it is missing)

    Returns dictionary: keys are one-line summaries of the form
    "Germany: <Bundesland> : <Kreis>", values are tuples with the name of
    the html file and the name of the notebook. Subregions for which an
    error occurred are reported on stdout and left out of the dictionary.
    """
    does_wwwroot_exist(wwwroot)
    start_time = time.time()
    created_files = {}

    for i, kreis in enumerate(subregions):
        try:
            bundesland = germany_get_bundesland_from_kreis(kreis)
            print(f"Processing {i+1}/{len(subregions)} [{time.time()-start_time:4.0f}s]")
            ipynb_name = create_ipynb_for_germany(region=bundesland, subregion=kreis,
                                                  templatename="template-germany.ipynb", wwwroot=wwwroot)
            html_name = nbconvert_ipynb2html(ipynb_name, wwwroot=wwwroot)
            one_line_summary = f"Germany: {bundesland} : {kreis}"
            created_files[one_line_summary] = html_name, ipynb_name
        except Exception as e:
            # Deliberate best effort: one failing Kreis must not stop the
            # others. Report it on one line, name and error kept apart.
            print(f"Error for {kreis}: {e}")

        sys.stdout.flush()

    # Report what was actually created, not just what was asked for
    # (note the space before the number of seconds).
    print(f"Created {len(created_files)} of {len(subregions)} notebooks and html versions in "
          f"{time.time()-start_time} seconds")
    sys.stdout.flush()

    return created_files

In [None]:
def parallel_html_for_germany(subregions, wwwroot, pool):
    """Create the pages for the German subregions in parallel.

    - subregions: list of strings with Landkreis names
    - wwwroot: output directory, passed on to every worker
    - pool: multiprocessing pool; its (private) `_processes` attribute is
      read to find out into how many chunks the work is split

    The list is cut into consecutive chunks of equal size (the last one may
    be shorter), and create_html_for_Germany is run on each chunk in the
    pool.

    Returns one dictionary with the combined results of all chunks.
    """
    subregions = list(subregions)
    if not subregions:
        return {}

    processes = pool._processes
    # ceiling division: at most `processes` chunks, none of them empty, and
    # no filler entries that the workers would then try to process
    per_process = -(-len(subregions) // processes)
    subregions_per_process = [subregions[i:i+per_process]
                              for i in range(0, len(subregions), per_process)]

    tasks = [(chunk, wwwroot) for chunk in subregions_per_process]

    res = pool.starmap(create_html_for_Germany, tasks)
    sys.stdout.flush()

    combined_res = {}
    for r in res:
        combined_res.update(r)

    return combined_res

In [None]:
wwwroot = "wwwroot"
subregions = get_germany_subregion_list()

# data cleaning: on 13 April, we had a Landkreis "LK Göttingen (alt)"
# with only one data point. This causes plots to fail, because there
# is nothing to plot, and then the legend() command failed.
# We assume that the RKI labels unusual data with '(alt)', and remove those.

alt_data_sets = [x for x in subregions if "(alt)" in x.lower()]
if alt_data_sets:
    print(f"Removing datasets label with '(alt)': {alt_data_sets}")
    for alt in alt_data_sets:
        # Use names of their own here: `c` and `d` hold the world-wide
        # tables that create_md_index_list() reads further down, and must
        # not be replaced by the data of a single Landkreis.
        alt_cases, alt_deaths = germany_get_region(landkreis=alt)
        print(f"  removed: {alt} : len(cases)={len(alt_cases)}, len(deaths)={len(alt_deaths)}")
    subregions = [x for x in subregions if "(alt)" not in x.lower()]

# Build one notebook and one html page per Kreis: in a process pool when
# `cores` is truthy, serially otherwise.
if cores:
    with Pool(cores) as pool:
        created_files = parallel_html_for_germany(subregions, wwwroot, pool)
else:
    created_files = create_html_for_Germany(subregions, wwwroot)

# NOTE(review): create_md_index_list() builds its table from the world-wide
# data and only uses the keys of `created_files` to turn matching country
# names into links. The keys here are "Germany: <Bundesland> : <Kreis>", so
# the page presumably shows the country table without links to the Kreise
# -- confirm whether that is intended.
index_md = create_md_index_list("Landkreise in Germany",
                                created_files)

create_markdown_index_page(index_md, title="Germany",
                           pelican_file_path="pelican/content/germany.md", save_as="germany",
                           wwwroot=wwwroot)

In [None]:
# Look at the '(alt)' data set that is removed from the list of subregions.
# NOTE(review): this rebinds the notebook-level names `c` and `d`, which an
# earlier cell assigned from fetch_cases() / fetch_deaths() -- re-run that
# cell before re-running cells that rely on them.
c, d = germany_get_region(landkreis='LK Göttingen (alt)')

In [None]:
# For comparison: the regular data set for the same Landkreis.
c2, d2 = germany_get_region(landkreis='LK Göttingen')