In [1]:
from pathlib import Path

import altair as alt
import numpy as np
import pandas as pd
%config InlineBackend.figure_format = 'retina'

In [2]:
# Fetch the latest COVID deaths CSV from @adamaltmejd's GitHub repository.
DATA_URL = 'https://raw.githubusercontent.com/adamaltmejd/covid/master/data/covid_deaths_latest.csv'

# Both date columns are parsed into datetimes so later cells can filter and
# plot them as dates rather than strings.
date_cols = ['date', 'publication_date']
data = pd.read_csv(DATA_URL, parse_dates=date_cols)

# Peek at the last rows to sanity-check the download.
data.tail(10)

Unnamed: 0,date,N,publication_date,days_since_publication,n_diff,n_diff_pct,delay
1374,2020-04-30,40,2020-05-05,5.0,7.0,0.212121,5-6 Days
1375,2020-05-01,32,2020-05-05,4.0,10.0,0.454545,3-4 Days
1376,2020-05-02,27,2020-05-05,3.0,8.0,0.421053,3-4 Days
1377,2020-05-03,31,2020-05-05,2.0,11.0,0.55,2 Days
1378,2020-05-04,19,2020-05-05,1.0,16.0,5.333333,1 Day
1379,2020-05-05,4,2020-05-05,0.0,4.0,,Same day
1380,2020-04-11,0,2020-04-11,0.0,0.0,0.0,Same day
1381,2020-04-25,0,2020-04-25,0.0,0.0,0.0,Same day
1382,2020-04-26,0,2020-04-26,0.0,0.0,0.0,Same day
1383,2020-05-01,0,2020-05-01,0.0,0.0,0.0,Same day


In [3]:
# Data binning, processing and labeling.
#
# pd.cut uses right-closed intervals by default, so the edges below map
# days_since_publication as (-1, 0] -> 'Same day', (0, 1] -> '1', ...,
# (10, 11] -> '11' and everything above 11 -> '12+'.
#
# The last edge is open-ended (np.inf): with a finite upper edge, any delay
# beyond it falls outside every bin and silently becomes NaN, which would
# happen as the "latest" CSV keeps accumulating older publication dates.
bins = [-1,0,1,2,3,4,5,6,7,8,9,10,11,np.inf]
labels = ['Same day','1','2','3','4','5','6','7','8','9','10','11','12+']

data['lag'] = pd.cut(data['days_since_publication'], bins, labels=labels)
# Delay multiplied by the number of newly reported deaths; summing this per
# publication date and dividing by the summed n_diff gives a weighted mean lag.
data['age'] = data.days_since_publication * data.n_diff

In [14]:
# Bar chart of n_diff per date of death, with segments coloured by reporting lag.
df = data.loc[data['date'] > '2020-03-12']

# Sort the segments of the bars by reporting delay, shortest first.
segment_order = alt.Order('days_since_publication', sort='ascending')

# One colour per lag bucket, listed in the same order as `labels`.
lag_color = alt.Color(
    'lag:O',
    title='Lag in Days',
    sort=labels,
    scale=alt.Scale(scheme='category20c'),
)

alt.Chart(df, height=500, width=1000).mark_bar().encode(
    x=alt.X('monthdate(date)', title='Date Deceased'),
    y=alt.Y('n_diff', title='Deceased'),
    order=segment_order,
    color=lag_color,
    tooltip=['n_diff', 'lag'],
).interactive()

In [5]:
# Aggregate once per publication date: total newly reported deaths (n_diff)
# and the summed delay-weighted deaths (age).
per_publication = data.groupby('publication_date')[['n_diff', 'age']].sum()

# Weighted mean reporting lag = sum(delay * deaths) / sum(deaths).
df = per_publication[['n_diff']].copy()
df['average_lag'] = per_publication['age'] / per_publication['n_diff']
df = df.reset_index()
df = df[df['publication_date'] > '2020-04-02']

# Shared x encoding for the scatter layer.
base = alt.Chart(df).encode(
    alt.X('publication_date', title="Publication Date"),
)

# One circle per publication date, sized by the number of deaths reported that day.
lag = base.mark_circle().encode(
    alt.Y('average_lag:Q', title="Average Lag in Days"),
    alt.Size('n_diff', title="Reported Deaths"),
)

# Centred moving average of the average lag: two rows before through two rows after.
rolling = (
    alt.Chart(df)
    .mark_line(size=4)
    .transform_window(rolling_mean='mean(average_lag)', frame=[-2, 2])
    .encode(x='publication_date', y='rolling_mean:Q')
)

alt.layer(lag, rolling).properties(width=600)

In [6]:
# Bar chart of n_diff per publication date, with segments coloured by
# reporting delay.
df = data[data['publication_date'] > '2020-04-02']

reported = alt.Chart(df, width=700).mark_bar().encode(
    x=alt.X('monthdate(publication_date)', title="Publication Date"),
    y=alt.Y('n_diff', title='Total reported deaths'),
    order=alt.Order(
      # Sort the segments of the bars by this field
      'days_since_publication',
      sort='ascending'
    ),
    color=alt.Color(
        # Explicit ordinal type, consistent with the other lag-coloured charts,
        # instead of relying on dtype inference for the categorical column.
        'lag:O',
        title='Delay in Reporting',
        sort=labels,
        scale=alt.Scale(scheme='category20c'),
    ),
    # n_diff is the value the bar segment actually encodes; N is kept as
    # additional context for the hovered row.
    tooltip=['n_diff', 'N', 'lag']
)

reported

In [7]:
# Total reported deaths per lag bucket, coloured by weekday of publication.
# Rows without a known reporting delay are excluded.
df = data[data['days_since_publication'].notna()]

weekday_color = alt.Color(
    'day(publication_date):N',
    title="Publication Day"
)

alt.Chart(df, width=500).mark_bar().encode(
    alt.X('lag:O', title="Days since publication", sort=labels),
    alt.Y('sum(n_diff):Q', title="Reported deaths"),
    weekday_color,
).interactive()

In [8]:
# Small multiples: one lag histogram per publication date, each annotated with
# the total number of deaths reported on that date.
df = data[data['days_since_publication'].notna()]

# Histogram layer: deaths per lag bucket, coloured by publication weekday.
hist = alt.Chart(df, height=150, width=150).mark_bar().encode(
    alt.X('lag:O', title="Days Since Publication", sort=labels),
    alt.Y('sum(n_diff):Q', title="Reported Deaths"),
    alt.Color('day(publication_date):N', title="Publication Day"),
)

# Text layer: the panel total, right-aligned at a fixed pixel position
# (x=145, y=33) inside each 150x150 panel.
text = alt.Chart(df).mark_text(
    align='right',
    x=145,
    y=33,
    fontSize=28,
).encode(
    text=alt.Text('sum(n_diff)'),
)

# Both layers share the same DataFrame so the layered chart can be faceted.
publication_facet = alt.Facet(
    'publication_date:T',
    title='Publication Date',
    header=alt.Header(labelFontSize=12),
)

alt.layer(hist, text).facet(
    facet=publication_facet,
    columns=7,
).interactive()

In [9]:
# Line per weekly publication snapshot showing N by date.
# Only publication dates that fall on the same weekday as the latest
# publication date are kept, which yields one snapshot per week.
latest_weekday = data['publication_date'].max().weekday()
is_recent = data['date'] > '2020-03-10'
is_weekly_snapshot = data['publication_date'].dt.dayofweek == latest_weekday
df = data[is_recent & is_weekly_snapshot]

snapshot_color = alt.Color('monthdate(publication_date):N', title="Publication Date")

alt.Chart(df, width=600).mark_line().encode(
    alt.X('date', title="Date"),
    alt.Y('N', title="Deceased"),
    snapshot_color,
)

In [76]:
# Linked charts: brushing a range of publication dates in the lower chart
# filters the deaths-by-date chart above to the rows inside that range.
start_date = '2020-03-12'
df = data[data['date'] > start_date]

# Derive the end of the x domain from the data instead of hard-coding it: the
# CSV is the "latest" snapshot, so a fixed end date goes stale as soon as newer
# dates are published.
domain = [start_date, df['date'].max().date().isoformat()]
# Despite its name this is the upper bound of the *y* axis: the largest N in
# the data. Presumably the domain is pinned so the axis does not rescale while
# the brush filters rows.
max_x = df.N.max()
brush = alt.selection(type='interval', encodings=['x'])

deceased = alt.Chart(df, width=600).mark_bar().encode(
    x=alt.X('yearmonthdate(date)', title='Date Deceased', scale=alt.Scale(domain=domain)),
    y=alt.Y('sum(n_diff)', title='Deceased', scale=alt.Scale(domain=[0,max_x])),
    order=alt.Order(
      # Sort the segments of the bars by this field
      'days_since_publication',
      sort='ascending'
    ),
    color=alt.Color(
        'lag:O',
        title='Lag in Days',
        sort=labels,
        scale=alt.Scale(scheme='category20c'),
    ),
    tooltip=['n_diff', 'lag']
).transform_filter(
    brush
)

# Lower chart: total reported deaths per publication date; hosts the brush.
reported = alt.Chart(df, height=100, width=600).mark_bar(size=15).encode(
    x=alt.X('yearmonthdate(publication_date)', title='Publication Date'),
    y='sum(n_diff)'
).add_selection(brush)

chart = (deceased & reported).configure_scale(
    useUnaggregatedDomain=True
)

# Make sure the output directory exists; saving into a missing directory
# raises FileNotFoundError on a fresh checkout.
output_path = Path('charts/filter-publication-date.html')
output_path.parent.mkdir(parents=True, exist_ok=True)
chart.save(str(output_path))
chart