In [1]:
from pathlib import Path
from urllib.error import HTTPError
import datetime
import itertools

from IPython.display import display
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import re
import requests
import xmltodict

In [4]:
def _tidy_england_deaths(raw_df: pd.DataFrame) -> pd.DataFrame:
    """Turn the raw regional sheet (regions as rows) into a date-indexed frame of new/total deaths."""
    # tidy up the raw dataframe by:
    #   1. removing any fully empty rows and columns
    #   2. dropping the summary columns and the "England" summary row
    #   3. converting type to int
    #   4. transposing so that dates become the index and regions the columns
    deaths_df = (
        raw_df
        .dropna(how="all", axis="index")
        .dropna(how="all", axis="columns")
        .drop(["Up to 01-Mar-20", "Awaiting verification", "Total"], axis="columns")
        .drop(["England"], axis="index")
        .astype(int)
        .T
    )

    # smarten up the index and columns
    deaths_df.index = pd.to_datetime(deaths_df.index, dayfirst=True).rename("date")
    deaths_df = deaths_df.rename(columns={
        "East Of England": "East of England",
        "North East And Yorkshire": "North East and Yorkshire",
    })

    # the sheet holds the deaths per day ("new"); add the running total next to it.
    # The area names are captured up front so the loop does not depend on the
    # columns that are being added while it runs.
    area_names = list(deaths_df.columns)
    deaths_df.columns = pd.MultiIndex.from_product(
        [area_names, ["deaths"], ["new"]],
        names=["area_name", "measure", None],
    )
    for area in area_names:
        deaths_df[area, "deaths", "total"] = deaths_df[area, "deaths", "new"].cumsum()

    return deaths_df.sort_index(axis=1)


def get_england_deaths_df(date: datetime.date) -> pd.DataFrame:
    """
    Return a DataFrame of daily deaths per NHS England region.

    The datasource is from NHS England at:
    https://www.england.nhs.uk/statistics/statistical-work-areas/covid-19-daily-deaths/

    Parameters
    ----------
    date
        The date that appears in the name of the spreadsheet to download.
        Any object that accepts ``strftime``-style format specs works
        (``datetime.date``, ``pd.Timestamp``, ...).

    Returns
    -------
    pd.DataFrame
        Indexed by ``date``, with three-level columns
        ``(area_name, "deaths", "new" | "total")``: ``new`` is the value read
        from the sheet and ``total`` is its cumulative sum.

    Raises
    ------
    Whatever ``pd.read_excel`` raises when the spreadsheet cannot be fetched
    or parsed; ``KeyError`` when the expected summary rows/columns are missing.
    """
    # The upload folder is built from the year and month of `date`; it used to be
    # hard-coded to 2020/04, so any other month requested an April URL.
    # NOTE(review): assumes the file is uploaded in the month given in its name - confirm.
    url = (
        "https://www.england.nhs.uk/statistics/wp-content/uploads/sites/2/"
        f"{date:%Y/%m}/COVID-19-total-announced-deaths-{date:%d-%B-%Y}.xlsx"
    )

    raw_df = pd.read_excel(
        url,
        sheet_name="COVID19 total deaths by region",
        header=15,
        index_col=1,
    )

    return _tidy_england_deaths(raw_df)

In [5]:
# deaths data taken from https://www.england.nhs.uk/statistics/statistical-work-areas/covid-19-daily-deaths/
today = datetime.date.today()
last_week = today - datetime.timedelta(days=7)
date_range = pd.date_range(start=last_week, end=today, freq="D")

# walk backwards from today and keep the first (i.e. most recent) spreadsheet that can be downloaded
for date in date_range[::-1]:
    try:
        england_deaths_df = get_england_deaths_df(date)
    except HTTPError:
        # data not available for the current day, try again with the day before
        continue
    else:
        break
else:
    # without this, the next line would fail with a NameError (or silently show a
    # stale frame left over from a previous run of the cell)
    raise RuntimeError(
        f"No NHS England deaths spreadsheet could be downloaded for any day between "
        f"{last_week} and {today}"
    )

england_deaths_df.tail()

area_name,East of England,East of England,London,London,Midlands,Midlands,North East and Yorkshire,North East and Yorkshire,North West,North West,South East,South East,South West,South West
measure,deaths,deaths,deaths,deaths,deaths,deaths,deaths,deaths,deaths,deaths,deaths,deaths,deaths,deaths
Unnamed: 0_level_2,new,total,new,total,new,total,new,total,new,total,new,total,new,total
date,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3
2020-04-16,68,1546,121,3793,85,2848,95,1715,128,2047,53,1590,29,644
2020-04-17,74,1620,80,3873,103,2951,78,1793,79,2126,71,1661,31,675
2020-04-18,46,1666,77,3950,95,3046,80,1873,91,2217,40,1701,20,695
2020-04-19,44,1710,67,4017,60,3106,56,1929,43,2260,31,1732,19,714
2020-04-20,9,1719,20,4037,21,3127,41,1970,20,2280,20,1752,5,719


In [6]:
# cases data taken from PHE's tracker: https://coronavirus.data.gov.uk

def get_latest_dataset_name():
    """
    Return the name of the latest dataset for the PHE data.

    The public blob container is listed, and among the blobs whose name starts
    with ``data_`` the one with the largest number in its name is returned.
    Blobs whose name contains no digits are ignored.

    Raises
    ------
    requests.HTTPError
        If the container listing request returns an error status.
    ValueError
        If the listing contains no ``data_`` blob with a number in its name.
    """
    blob_url = "https://publicdashacc.blob.core.windows.net/publicdata?restype=container&comp=list"
    response = requests.get(blob_url, timeout=30)
    # fail here with a clear HTTP error rather than later while parsing an error page
    response.raise_for_status()

    listing = xmltodict.parse(response.text)
    blobs = (listing["EnumerationResults"].get("Blobs") or {}).get("Blob", [])
    # xmltodict yields a single dict (not a list) when there is only one <Blob> element
    if isinstance(blobs, dict):
        blobs = [blobs]

    date_pattern = re.compile(r"\d+")
    # pair every candidate name with the first run of digits found in it
    stamped_names = [
        (date_pattern.search(blob["Name"]), blob["Name"])
        for blob in blobs
        if blob["Name"].startswith("data_")
    ]
    stamped_names = [(match[0], name) for match, name in stamped_names if match is not None]
    if not stamped_names:
        raise ValueError(f"No 'data_' dataset with a number in its name found at {blob_url}")

    # the digit runs are compared as strings, exactly as before
    return max(stamped_names, key=lambda stamped: stamped[0])[1]

# to get the latest data as used by the PHE tracker, we first need to find the name of the latest available
# dataset. `get_latest_dataset_name` only considers names starting with "data_", e.g. something like
# data_202004211513.json. The data itself is served from Azure, and can be retrieved once the name
# of the dataset is known.
latest_dataset_name = get_latest_dataset_name()
covid_data_url = f"https://c19pub.azureedge.net/{latest_dataset_name}"

covid_data_response = requests.get(covid_data_url, timeout=30)
# surface a failed download as an HTTP error instead of a confusing JSON decoding error
covid_data_response.raise_for_status()
covid_data = covid_data_response.json()

# extract the relevant data, and parse it into a more user-friendly format, from the PHE dataset.
# Each region ends up as {"daily_total_cases": <one-column DataFrame named after the region, indexed by date>}.
# Other per-region series that were noted here but are currently unused:
# "dailyConfirmedCases", "dailyDeaths" and "dailyTotalDeaths".
england_regional_data = covid_data["regions"]
england_regional_data = {
    region["name"]["value"]: {
        "daily_total_cases": (
            pd.DataFrame(
                # a region without the series gets a single empty placeholder row
                region.get("dailyTotalConfirmedCases", [{"date": None, "value": None}])
            )
            # an explicit unit is required: pandas 2.x rejects the unit-less "datetime64"
            .astype({"date": "datetime64[ns]"})
            .set_index("date")
            # one row per calendar day; days missing from the source repeat the previous total
            .asfreq("D", method="ffill")
            .rename(columns={"value": region["name"]["value"]})
        ),
    }
    for region in england_regional_data.values()
}

# compile a dataframe of cases in England from the PHE JSON data (one column per region)
areas = england_regional_data.keys()
england_cases_df = pd.concat([england_regional_data[area]["daily_total_cases"] for area in areas], axis=1)

# the PHE data divides England up into regions which are very similar but not exactly the same as NHS England regions.
# To make it comparable to other data we are using (for deaths), it would be best to merge regions together to
# match that of the NHS England regions. This isn't 100% accurate - South Cumbria is not quite in the right region,
# for example, but by and large it is comparable.
regions_to_merge = {
    "Midlands": ["West Midlands", "East Midlands"],
    "North East and Yorkshire": ["Yorkshire and The Humber", "North East"]
}

# create the new regions to match the NHS England regions before dropping the old constituent regions
# and doing a bit of house-keeping
for new_region, old_regions in regions_to_merge.items():
    england_cases_df[new_region] = england_cases_df[old_regions].sum(axis=1)

# materialise the flattened names as a list rather than handing pandas a one-shot iterator
merged_away_regions = list(itertools.chain.from_iterable(regions_to_merge.values()))
england_cases_df = england_cases_df.drop(columns=merged_away_regions)
england_cases_df = england_cases_df.fillna(0.)
cols = pd.MultiIndex.from_product([england_cases_df.columns, ["cases"], ["total"]], names=["area_name", "measure", None])
england_cases_df.columns = cols

# the PHE data gives us the daily running total; add in a column for daily new cases
areas = england_cases_df.columns.get_level_values(0)
for area in areas:
    england_cases_df[area, "cases", "new"] = england_cases_df[area, "cases", "total"].diff()

england_cases_df = (
    england_cases_df
    .sort_index(axis=1)
    # after sorting, each area's "new" column sits directly left of its "total" column, so
    # back-filling along the row sets the undefined first-day "new" value to that day's total.
    # (`DataFrame.bfill` replaces the deprecated `fillna(method="bfill")`.)
    .bfill(axis=1)
    .astype(int)
)

england_cases_df

area_name,East of England,East of England,London,London,Midlands,Midlands,North East and Yorkshire,North East and Yorkshire,North West,North West,South East,South East,South West,South West
measure,cases,cases,cases,cases,cases,cases,cases,cases,cases,cases,cases,cases,cases,cases
Unnamed: 0_level_2,new,total,new,total,new,total,new,total,new,total,new,total,new,total
date,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3
2020-01-30,0,0,0,0,0,0,1,1,0,0,0,0,0,0
2020-01-31,0,0,0,0,0,0,0,1,0,0,1,1,0,0
2020-02-01,0,0,0,0,0,0,0,1,0,0,0,1,0,0
2020-02-02,0,0,0,0,0,0,0,1,0,0,0,1,0,0
2020-02-03,1,1,0,0,0,0,1,2,0,0,1,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-04-16,283,7589,474,21297,538,14502,409,12131,571,13830,599,12708,245,4564
2020-04-17,238,7827,395,21692,550,15052,383,12514,529,14359,602,13310,221,4785
2020-04-18,96,7923,251,21943,324,15376,340,12854,338,14697,248,13558,145,4930
2020-04-19,41,7964,115,22058,94,15470,160,13014,90,14787,82,13640,105,5035


In [7]:
# merge the cases and deaths dataframes into one, aligning them on the date index. Dates that are
# present in only one of the two frames end up as NaN in the other frame's columns.
# (`levels=` was dropped: it is only meaningful together with `keys=`, which is not used here.)
england_df = pd.concat([england_cases_df, england_deaths_df], axis=1)
england_df.sort_index(axis=1).tail()

area_name,East of England,East of England,East of England,East of England,London,London,London,London,Midlands,Midlands,...,North West,North West,South East,South East,South East,South East,South West,South West,South West,South West
measure,cases,cases,deaths,deaths,cases,cases,deaths,deaths,cases,cases,...,deaths,deaths,cases,cases,deaths,deaths,cases,cases,deaths,deaths
Unnamed: 0_level_2,new,total,new,total,new,total,new,total,new,total,...,new,total,new,total,new,total,new,total,new,total
date,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3
2020-04-16,283,7589,68.0,1546.0,474,21297,121.0,3793.0,538,14502,...,128.0,2047.0,599,12708,53.0,1590.0,245,4564,29.0,644.0
2020-04-17,238,7827,74.0,1620.0,395,21692,80.0,3873.0,550,15052,...,79.0,2126.0,602,13310,71.0,1661.0,221,4785,31.0,675.0
2020-04-18,96,7923,46.0,1666.0,251,21943,77.0,3950.0,324,15376,...,91.0,2217.0,248,13558,40.0,1701.0,145,4930,20.0,695.0
2020-04-19,41,7964,44.0,1710.0,115,22058,67.0,4017.0,94,15470,...,43.0,2260.0,82,13640,31.0,1732.0,105,5035,19.0,714.0
2020-04-20,3,7967,9.0,1719.0,14,22072,20.0,4037.0,18,15488,...,20.0,2280.0,16,13656,20.0,1752.0,10,5045,5.0,719.0


In [8]:
# Population figures used to normalise the case counts.
#
# NHS England regional figures come from:
# https://www.ons.gov.uk/peoplepopulationandcommunity/populationandmigration/populationestimates/datasets/clinicalcommissioninggroupmidyearpopulationestimates
nhs_england_regions = {
    "London": 8_908_081,
    "North West": 7_012_947,
    "North East and Yorkshire": 8_566_925,
    "Midlands": 10_537_679,
    "East of England": 6_493_188,
    "South East": 8_852_361,
    "South West": 5_605_997,
}

# Country-level figures (Scotland, Wales and Northern Ireland) come from:
# https://www.ons.gov.uk/peoplepopulationandcommunity/populationandmigration/populationestimates
uk_nations = {
    "England": 55_977_178,
    "Wales": 3_136_400,
    "Scotland": 5_454_000,
    "Northern Ireland": 1_876_000,
}

# single lookup table: the regions first, then the nations
regional_populations = {**nhs_england_regions, **uk_nations}

# the UK figure is derived by adding up its four constituent countries
countries = ["England", "Scotland", "Wales", "Northern Ireland"]
uk_population = 0
for country in countries:
    uk_population += regional_populations[country]
regional_populations["UK"] = uk_population

In [None]:
# Interactive line chart with four switchable views (cases, deaths, prevalence, case fatality rate).
#
# NOTE(review): `input_df` is not created by any cell visible above, so this cell relies on state from
# elsewhere. From its use here it has a two-level (area, measure) row index with the measures
# "confirmed_cases" and "deaths", one column per date, and a "UK" area - confirm and define it explicitly.
fig = go.Figure()

areas = input_df.index.droplevel(1).unique()

# country-level aggregates start out "legendonly" in the count views to avoid overshadowing the regional lines
aggregate_areas = {"UK", "England"}


def _area_trace(area_data, area, measure, visible):
    """Build the line for one area and one measure; every measure shares the area's hover text."""
    return go.Scatter(
        x=area_data["date"],
        y=area_data["measure", measure],
        name=area,
        visible=visible,
        text=area_data["text"],
        hovertemplate="%{text}<extra></extra>",
    )


confirmed_cases_traces = []
deaths_traces = []
prevalence_traces = []
case_fatality_rate_traces = []

# add a line to each view for each area
for area in areas:
    # one row per date; the area level of the columns is renamed to the generic label "measure"
    area_data = input_df.T[[area]].reset_index().rename(columns={area: "measure", "index": "date"})
    area_data.columns = area_data.columns.rename(names=[None, None])

    area_data["measure", "delta_cases"] = area_data["measure", "confirmed_cases"].diff().fillna(0).astype(int)
    area_data["measure", "delta_deaths"] = area_data["measure", "deaths"].diff().fillna(0).astype(int)
    area_data["measure", "prevalence"] = area_data["measure", "confirmed_cases"] / regional_populations[area]
    area_data["measure", "case_fatality_rate"] = (
        area_data["measure", "deaths"] / area_data["measure", "confirmed_cases"]
    )

    # hover text: "<day month> | <area>", then the running totals with the day-on-day change in brackets
    area_data["text"] = (
        area_data["date"].dt.strftime("%d %B") + " | " + f"{area}<br>Cases: "
        + area_data["measure", "confirmed_cases"].apply("{:,}".format)
        + " (" + area_data["measure", "delta_cases"].apply("{:+,}".format) + ")<br>Deaths: "
        + area_data["measure", "deaths"].apply("{:,}".format)
        + " (" + area_data["measure", "delta_deaths"].apply("{:+,}".format) + ")"
    )

    # the cases view is the one shown initially; the traces of the other views start hidden
    confirmed_cases_traces.append(
        _area_trace(
            area_data, area, "confirmed_cases",
            visible="legendonly" if area in aggregate_areas else True,
        )
    )
    deaths_traces.append(_area_trace(area_data, area, "deaths", visible=False))
    prevalence_traces.append(_area_trace(area_data, area, "prevalence", visible=False))
    case_fatality_rate_traces.append(_area_trace(area_data, area, "case_fatality_rate", visible=False))

# add all the traces; the order (all cases, all deaths, all prevalence, all case fatality rates)
# is what the buttons' visibility lists below are indexed against
for trace in confirmed_cases_traces + deaths_traces + prevalence_traces + case_fatality_rate_traces:
    fig.add_trace(trace)

# add a vertical line showing when the UK went into "lockdown"...
fig.add_shape(
    type="line",
    xref="x",
    yref="paper",
    x0=datetime.date(2020, 3, 23),
    y0=0,
    x1=datetime.date(2020, 3, 23),
    y1=0.9,
    opacity=0.8,
)

# ...and then annotate it with some text
lockdown_annotation = dict(
    xref="x",
    yref="paper",
    x=datetime.date(2020, 3, 23),
    y=0.7,
    text="Lockdown commenced 23/03/2020",
    font_color="#000",
    showarrow=True,
    align="center",
    arrowhead=2,
    arrowsize=1,
    arrowwidth=1,
    arrowcolor="#636363",
    ax=-20,
    ay=0,
    xanchor="right",
    opacity=0.8,
)

total_cases = input_df.loc["UK", "confirmed_cases"].iloc[-1]
total_deaths = input_df.loc["UK", "deaths"].iloc[-1]
latest_prevalence = total_cases / regional_populations["UK"]
latest_case_fatality_rate = total_deaths / total_cases

# day without zero padding, built from `.day` because the "%-d" directive is platform-specific
latest_date = input_df.columns[-1]
latest_date_text = f"{latest_date.day} {latest_date:%B %Y}"


def _uk_headline_annotation(label, value_text):
    """Build the top-left annotation that states the latest UK-wide figure for one view."""
    return dict(
        xref="paper",
        yref="paper",
        x=0.05,
        y=0.95,
        text=f"Total UK {label} as of {latest_date_text}:<br>{value_text}",
        font_color="#000",
        showarrow=False,
        align="left",
    )


total_cases_annotation = _uk_headline_annotation("Cases", f"{total_cases:,}")
total_deaths_annotation = _uk_headline_annotation("Deaths", f"{total_deaths:,}")
total_prevalence_annotation = _uk_headline_annotation("Period Prevalence", f"{latest_prevalence:.2%}")
total_case_fatality_annotation = _uk_headline_annotation("Case Fatality Rate", f"{latest_case_fatality_rate:.1%}")

# Per-area visibility inside a view, decided by area name so that it does not depend on the
# position of "UK"/"England" in `areas` (the lists are identical to the former positional ones
# when "UK" and "England" are the first two areas).
count_view_visibility = ["legendonly" if area in aggregate_areas else True for area in areas]
rate_view_visibility = [True if area == "UK" else "legendonly" for area in areas]


def _view_button(label, slot, area_visibility, title, yaxis, headline_annotation):
    """Build the button that shows only the traces of the view at position `slot` (0-3)."""
    visible = (
        [False] * (len(areas) * slot)
        + list(area_visibility)
        + [False] * (len(areas) * (3 - slot))
    )
    return {
        "label": label,
        "method": "update",
        "args": [
            {"visible": visible},
            {
                "title": title,
                "yaxis": yaxis,
                "annotations": [lockdown_annotation, headline_annotation],
            },
        ],
    }


fig.update_layout(
    updatemenus=[{
        "type": "buttons",
        "direction": "right",
        "x": 0.505,
        "y": 1.125,
        "buttons": [
            _view_button(
                "Cases", 0, count_view_visibility,
                "Confirmed Covid-19 Cases Per Region",
                # the tick format is given explicitly (as in the initial layout) so that coming
                # back from a percentage view restores thousands separators
                {"title": "Number of Confirmed Cases", "tickformat": ",d"},
                total_cases_annotation,
            ),
            _view_button(
                "Deaths", 1, count_view_visibility,
                "Covid-19 Deaths Per Region",
                {"title": "Number of Deaths", "tickformat": ",d"},
                total_deaths_annotation,
            ),
            _view_button(
                "Prevalence", 2, rate_view_visibility,
                "Covid-19 Prevalence Per Region",
                {"title": "Prevalence", "tickformat": ".2%"},
                total_prevalence_annotation,
            ),
            _view_button(
                "Case Fatality Rate", 3, rate_view_visibility,
                "Covid-19 Case Fatality Rate Per Region",
                {"title": "Case Fatality Rate", "tickformat": ".1%"},
                total_case_fatality_annotation,
            ),
        ],
    }],
    title={
        "text": "Confirmed Covid-19 Cases Per Region",
        "x": 0.45,
    },
    xaxis={
        "title": "Date",
        "tickformat": '%d %b',
        "tickangle": -45,
    },
    yaxis={
        "title": "Number of Confirmed Cases",
        "tickformat": ",d",
    },
    hovermode="closest",
    annotations=[lockdown_annotation, total_cases_annotation],
)

fig

In [None]:
# Interactive bar chart of the day-on-day increase in cases / deaths, switchable between the two.
#
# NOTE(review): `input_df` is not created by any cell visible above, so this cell relies on state from
# elsewhere. From its use here it has a two-level (area, measure) row index with the measures
# "confirmed_cases" and "deaths" and one column per date - confirm and define it explicitly.
fig = go.Figure(
    layout={
        "title": {
            "text": "Daily Number of New Covid-19 Cases Per Region",
            "x": 0.5
        },
        "xaxis": {
            "title": "Date",
            "tickformat": '%d %b',
            "tickangle": -45
        },
        "yaxis": {
            "title": "Daily Number of New Cases",
            "tickformat": ',d',
        },
        "legend": {
            "x": 0,
            "y": 1,
            "bgcolor": "RGBA(0,0,0,0)"
        },
        "hovermode": "closest"
    }
)

new_cases_traces = []
new_deaths_traces = []

areas = input_df.index.droplevel(1).unique()
for area in areas:
    # one row per date; the area level of the columns is renamed to the generic label "measure"
    area_data = input_df.T[[area]].reset_index().rename(columns={area: "measure", "index": "date"})
    area_data.columns = area_data.columns.rename(names=[None, None])

    # daily increase of the running totals; a falling total is shown as 0 new cases/deaths
    area_data["measure", "new_cases"] = (
        area_data["measure", "confirmed_cases"].diff().fillna(0.).astype(int).clip(lower=0)
    )
    # change of the daily increase compared with the previous day (shown in brackets in the hover text)
    area_data["measure", "delta_new_cases"] = (
        area_data["measure", "new_cases"].diff().fillna(0).astype(int)
    )
    area_data["measure", "new_deaths"] = (
        area_data["measure", "deaths"].diff().fillna(0.).astype(int).clip(lower=0)
    )
    area_data["measure", "delta_new_deaths"] = (
        area_data["measure", "new_deaths"].diff().fillna(0).astype(int)
    )

    area_data["text"] = (
        area_data["date"].dt.strftime("%d %B") + " | " + f"{area}<br>New cases: "
        + area_data["measure", "new_cases"].apply("{:,}".format)
        + " (" + area_data["measure", "delta_new_cases"].apply("{:+,}".format) + ")<br>New deaths: "
        + area_data["measure", "new_deaths"].apply("{:,}".format)
        + " (" + area_data["measure", "delta_new_deaths"].apply("{:+,}".format) + ")"
    )

    new_cases_traces.append(
        go.Bar(
            x=area_data["date"],
            y=area_data["measure", "new_cases"],
            name=area,
            # only the UK bars are drawn initially; other areas can be enabled from the legend
            visible=True if area == "UK" else "legendonly",
            text=area_data["text"],
            hovertemplate="%{text}<extra></extra>"
        )
    )

    new_deaths_traces.append(
        go.Bar(
            x=area_data["date"],
            y=area_data["measure", "new_deaths"],
            name=area,
            # the deaths view starts hidden; the "New Deaths" button reveals it
            visible=False,
            text=area_data["text"],
            hovertemplate="%{text}<extra></extra>"
        )
    )

# the order (all new-cases traces, then all new-deaths traces) is what the buttons'
# visibility lists below are indexed against
for trace in new_cases_traces + new_deaths_traces:
    fig.add_trace(trace)

# add a vertical line showing when the UK went into "lockdown"...
fig.add_shape(
    type="line",
    xref="x",
    yref="paper",
    x0=datetime.date(2020, 3, 23),
    y0=0,
    x1=datetime.date(2020, 3, 23),
    y1=0.9,
    opacity=0.8,
)

# ...and then annotate it with some text
fig.add_annotation(
    xref="x",
    yref="paper",
    x=datetime.date(2020, 3, 23),
    y=0.45,
    text="Lockdown commenced 23/03/2020",
    font_color="#000",
    showarrow=True,
    align="center",
    arrowhead=2,
    arrowsize=1,
    arrowwidth=1,
    arrowcolor="#636363",
    ax=-20,
    ay=0,
    xanchor="right",
    opacity=0.8,
)

# Per-area visibility inside a view, decided by name exactly like the initial trace visibility
# above (identical to the former positional list when "UK" is the first area).
area_visibility = [True if area == "UK" else "legendonly" for area in areas]
hidden_view = [False] * len(areas)

fig.update_layout(
    updatemenus=[{
        "type": "buttons",
        "direction": "right",
        "x": 0.225,
        "y": 1.125,
        "buttons": list([
            {
                "label": "New Cases",
                "method": "update",
                "args": [
                    {
                        "visible": area_visibility + hidden_view
                    },
                    {
                        "title": "Daily Number of New Covid-19 Cases Per Region",
                        "yaxis": {"title": "Number of New Confirmed Cases", "tickformat": ",d"},
                    }
                ]
            },
            {
                "label": "New Deaths",
                "method": "update",
                "args": [
                    {
                        # boolean True (this used to be the string "True", which is not a
                        # valid value for a trace's `visible` attribute)
                        "visible": hidden_view + area_visibility,
                    },
                    {
                        "title": "Daily Number of New Covid-19 Deaths Per Region",
                        "yaxis": {"title": "Number of New Deaths", "tickformat": ",d"},
                    }
                ]
            }
        ])
    }]
)

fig