<a href="https://colab.research.google.com/github/nunrib/fch-virus-combat/blob/master/Coronavirus_data_exploration_quickstart.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Plotly + Coronavirus quickstart notebook

This notebook pulls data from the [Corona Data Scraper](https://coronadatascraper.com) and generates plots.

Feel free to copy & modify it. There is also a version available as a [Gist](https://gist.github.com/thatneat/4206ee5f59171165acce87467067dae6).


**⬇ If you just want to see the pretty plots, scroll down ⬇**


### Ideas

* ☑️ plot confirmed cases over time
 * ☑️ log plot
 * ☑️ normalize start date
* ☑️ compare confirmed cases to deaths
* ☑️ look at per capita numbers
* ☑️ make a plot showing recovery (hopefully) catching up to infections (looked at active cases instead)
* ☐ somehow include number of testing kits per capita (or other indicators of response), and see how that relates to the infections/deaths rate
* ☐ make this interactive using dropdowns or something
* ☐ look at doubling rate based on latitude

In [0]:
# In colab, these dependencies are already installed.
# If you're running this locally you may need to install a few packages by uncommenting the following lines and running the cell
# %pip install plotly
# %pip install pandas

import numpy as np
import plotly.io as pio
import plotly.express as px
import pandas as pd
import cufflinks # Monkeypatch pandas DataFrames to have .iplot() functions.
# NaN shorthand taken from numpy directly: the `pandas.np` alias is deprecated
# (pandas warns about it and announces its removal), so go through `np` instead.
nan = np.nan

# Make "plotly_white" the default plotly template for figures created after this point.
pio.templates.default = "plotly_white"


The pandas.np module is deprecated and will be removed from pandas in a future version. Import numpy directly instead



## Load raw data

In [0]:
# Download the Corona Data Scraper timeseries CSV, parsing the `date` column as datetimes.
timeseries = pd.read_csv(
    'https://coronadatascraper.com/timeseries.csv',
    parse_dates=['date'],
)

# Preview: a label followed by the first few rows.
display('Timeseries:', timeseries.head())

'Timeseries:'

Unnamed: 0,name,level,city,county,state,country,population,lat,long,url,aggregate,tz,cases,deaths,recovered,active,tested,growthFactor,date
0,THA,country,,,,THA,68414135.0,13.040833,101.544556,https://github.com/CSSEGISandData/COVID-19,country,Asia/Bangkok,2.0,,,2.0,,,2020-01-22
1,THA,country,,,,THA,68414135.0,13.040833,101.544556,https://github.com/CSSEGISandData/COVID-19,country,Asia/Bangkok,3.0,,,3.0,,1.5,2020-01-23
2,THA,country,,,,THA,68414135.0,13.040833,101.544556,https://github.com/CSSEGISandData/COVID-19,country,Asia/Bangkok,5.0,,,5.0,,1.666667,2020-01-24
3,THA,country,,,,THA,68414135.0,13.040833,101.544556,https://github.com/CSSEGISandData/COVID-19,country,Asia/Bangkok,7.0,,,7.0,,1.4,2020-01-25
4,THA,country,,,,THA,68414135.0,13.040833,101.544556,https://github.com/CSSEGISandData/COVID-19,country,Asia/Bangkok,8.0,,2.0,6.0,,1.142857,2020-01-26


In [0]:
# the source DataFrame includes three-letter country codes, and separate city, county, state and country fields.
# Combine these into one, with the full country name to make them easy to use

# Series indexed by alpha-3 country code whose values are country names. It is
# named 'country' so that joining it onto a frame yields a column of that name.
country_codes = (
    pd.read_csv('https://github.com/lukes/ISO-3166-Countries-with-Regional-Codes/raw/master/all/all.csv')
    .set_index('alpha-3')['name']
    .rename('country')
)

# Replace a few names that are too long or awkward with shorter labels.
country_codes.update(pd.Series({'USA': 'USA', 'GBR': 'UK', 'KOR': 'South Korea'}))

# Preview: a label followed by the first few entries of the mapping.
display('Country code mapping:', country_codes.head())

def get_combined_location(row):
    """Build one comma-separated location label for a row.

    Reads the 'city', 'county', 'state' and 'country' entries of `row`, in
    that order, keeps only those that are plain `str` values (so missing
    values such as NaN or None are dropped), and joins them with ', '.

    Returns:
        The joined label; an empty string when no entry is a `str`.
    """
    parts = []
    for field in ('city', 'county', 'state', 'country'):
        value = row[field]
        # Exact type check: anything that is not a plain str is treated as missing.
        if type(value) is str:
            parts.append(value)
    return ', '.join(parts)

# Move the raw code column to 'country_code', then attach the full country name
# from `country_codes` as a new 'country' column, matched on that code.
# NOTE(review): codes that are not in the alpha-3 index (the data contains
# values such as 'iso1:US') get no country name, so their location label
# ends at the state — confirm whether those codes need translating first.
cleaned_timeseries = (
    timeseries
    .rename(columns={'country': 'country_code'})
    .join(country_codes, on='country_code')
)

# One human-readable label per row, built from city/county/state/country.
cleaned_timeseries['location'] = cleaned_timeseries.apply(get_combined_location, axis=1)
cleaned_timeseries

'Country code mapping:'

alpha-3
AFG       Afghanistan
ALA     Åland Islands
ALB           Albania
DZA           Algeria
ASM    American Samoa
Name: country, dtype: object

Unnamed: 0,name,level,city,county,state,country_code,population,lat,long,url,aggregate,tz,cases,deaths,recovered,active,tested,growthFactor,date,country,location
0,THA,country,,,,THA,68414135.0,13.040833,101.544556,https://github.com/CSSEGISandData/COVID-19,country,Asia/Bangkok,2.0,,,2.0,,,2020-01-22,Thailand,Thailand
1,THA,country,,,,THA,68414135.0,13.040833,101.544556,https://github.com/CSSEGISandData/COVID-19,country,Asia/Bangkok,3.0,,,3.0,,1.500000,2020-01-23,Thailand,Thailand
2,THA,country,,,,THA,68414135.0,13.040833,101.544556,https://github.com/CSSEGISandData/COVID-19,country,Asia/Bangkok,5.0,,,5.0,,1.666667,2020-01-24,Thailand,Thailand
3,THA,country,,,,THA,68414135.0,13.040833,101.544556,https://github.com/CSSEGISandData/COVID-19,country,Asia/Bangkok,7.0,,,7.0,,1.400000,2020-01-25,Thailand,Thailand
4,THA,country,,,,THA,68414135.0,13.040833,101.544556,https://github.com/CSSEGISandData/COVID-19,country,Asia/Bangkok,8.0,,2.0,6.0,,1.142857,2020-01-26,Thailand,Thailand
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54325,"CWhyearnodkoeettCeoCuonutnyty, iso2:US-KS, iso...",county,,CWhyearnodkoeettCeoCuonutnyty,iso2:US-KS,iso1:US,,,,https://public.tableau.com/views/COVID-19Data_...,county,,1549.0,,,,,1.000000,2020-04-05,,"CWhyearnodkoeettCeoCuonutnyty, iso2:US-KS"
54326,"St Clair County, iso2:US-MI, iso1:US",county,,St Clair County,iso2:US-MI,iso1:US,,,,"https://www.michigan.gov/coronavirus/0,9753,7-...",county,,66.0,,,,,,2020-04-04,,"St Clair County, iso2:US-MI"
54327,"St Clair County, iso2:US-MI, iso1:US",county,,St Clair County,iso2:US-MI,iso1:US,,,,"https://www.michigan.gov/coronavirus/0,9753,7-...",county,,66.0,,,,,1.000000,2020-04-05,,"St Clair County, iso2:US-MI"
54328,"St Joseph County, iso2:US-MI, iso1:US",county,,St Joseph County,iso2:US-MI,iso1:US,,,,"https://www.michigan.gov/coronavirus/0,9753,7-...",county,,10.0,,,,,,2020-04-04,,"St Joseph County, iso2:US-MI"


In [0]:
# Sanity check: the rows whose combined location label is exactly 'USA'.
cleaned_timeseries.loc[cleaned_timeseries['location'].eq('USA')]

Unnamed: 0,name,level,city,county,state,country_code,population,lat,long,url,aggregate,tz,cases,deaths,recovered,active,tested,growthFactor,date,country,location


# Add calculated columns
If you want to add extra columns to analyze, this is probably the best place.

In [0]:
# Ratio columns: deaths per confirmed case, and confirmed cases per 100,000 population.
cleaned_timeseries['deaths/cases'] = cleaned_timeseries['deaths'] / cleaned_timeseries['cases']
cleaned_timeseries['cases per 100k capita'] = cleaned_timeseries['cases'] / cleaned_timeseries['population'] * 1e5

# Add diff columns for looking at change rate.
# `diff(periods=k)` compares each row with the row k positions earlier within the
# same location, so the "Nd" labels assume one row per day, in date order.
# NOTE(review): confirm the feed guarantees that ordering and density.
timeseries_by_location = cleaned_timeseries.groupby('location')
for days_shift in [1, 3, 7]:
    for orig_column in ['recovered', 'deaths', 'cases', 'active']:
        cleaned_timeseries[f'{days_shift}d new {orig_column}'] = (
            timeseries_by_location[orig_column].diff(periods=days_shift)
        )

# Hack: add the date as a plain 'YYYY-MM-DD' string that plotly can show in hover text.
# The `.dt` accessor formats the whole column at once instead of calling a
# Python lambda for every row.
cleaned_timeseries['Date'] = cleaned_timeseries['date'].dt.strftime('%Y-%m-%d')

# Reference dates for aligning curves, as {location: date} dicts: the earliest
# date on which a location had more than 100 cases, more than 100 active cases,
# or more than 1 case per 100k population.
day_location_reached_100 = (
    cleaned_timeseries.loc[cleaned_timeseries['cases'].gt(100)]
    .groupby('location')['date']
    .min()
    .to_dict()
)
day_location_reached_100_active = (
    cleaned_timeseries.loc[cleaned_timeseries['active'].gt(100)]
    .groupby('location')['date']
    .min()
    .to_dict()
)
day_location_reached_1_per_100k = (
    cleaned_timeseries.loc[cleaned_timeseries['cases per 100k capita'].gt(1)]
    .groupby('location')['date']
    .min()
    .to_dict()
)

def shift_dates(row, offset_by_location):
    """Return how many days `row['date']` lies after its location's reference date.

    Args:
        row: Mapping-like row providing 'date' and 'location' entries.
        offset_by_location: Mapping from location label to a reference date.

    Returns:
        The difference in days as a float (negative when the row's date is
        before the reference date). When the row's location has no entry in
        `offset_by_location`, a float NaN is returned explicitly rather than
        an implicit None, so the result is always numeric.
    """
    date = row['date']
    location = row['location']
    if location not in offset_by_location:
        return float('nan')
    # Dividing by a one-day Timedelta converts the difference to a number of days.
    return (date - offset_by_location[location]) / pd.Timedelta(days=1)

# Each "days since ..." column holds the row's date relative to the reference
# date of its location, taken from the matching {location: date} dict.
for shifted_column, offsets in [
    ('days since 100 cases', day_location_reached_100),
    ('days since 100 active', day_location_reached_100_active),
    ('days since 1 case/100k people', day_location_reached_1_per_100k),
]:
    cleaned_timeseries[shifted_column] = cleaned_timeseries.apply(
        shift_dates,
        axis='columns',
        offset_by_location=offsets,
    )

print('columns available:')
display(cleaned_timeseries)

columns available:


Unnamed: 0,name,level,city,county,state,country_code,population,lat,long,url,aggregate,tz,cases,deaths,recovered,active,tested,growthFactor,date,country,location,deaths/cases,cases per 100k capita,1d new recovered,1d new deaths,1d new cases,1d new active,3d new recovered,3d new deaths,3d new cases,3d new active,7d new recovered,7d new deaths,7d new cases,7d new active,Date,days since 100 cases,days since 100 active,days since 1 case/100k people
0,THA,country,,,,THA,68414135.0,13.040833,101.544556,https://github.com/CSSEGISandData/COVID-19,country,Asia/Bangkok,2.0,,,2.0,,,2020-01-22,Thailand,Thailand,,0.002923,,,,,,,,,,,,,2020-01-22,-53.0,-54.0,-62.0
1,THA,country,,,,THA,68414135.0,13.040833,101.544556,https://github.com/CSSEGISandData/COVID-19,country,Asia/Bangkok,3.0,,,3.0,,1.500000,2020-01-23,Thailand,Thailand,,0.004385,,,1.0,1.0,,,,,,,,,2020-01-23,-52.0,-53.0,-61.0
2,THA,country,,,,THA,68414135.0,13.040833,101.544556,https://github.com/CSSEGISandData/COVID-19,country,Asia/Bangkok,5.0,,,5.0,,1.666667,2020-01-24,Thailand,Thailand,,0.007308,,,2.0,2.0,,,,,,,,,2020-01-24,-51.0,-52.0,-60.0
3,THA,country,,,,THA,68414135.0,13.040833,101.544556,https://github.com/CSSEGISandData/COVID-19,country,Asia/Bangkok,7.0,,,7.0,,1.400000,2020-01-25,Thailand,Thailand,,0.010232,,,2.0,2.0,,,5.0,5.0,,,,,2020-01-25,-50.0,-51.0,-59.0
4,THA,country,,,,THA,68414135.0,13.040833,101.544556,https://github.com/CSSEGISandData/COVID-19,country,Asia/Bangkok,8.0,,2.0,6.0,,1.142857,2020-01-26,Thailand,Thailand,,0.011693,,,1.0,-1.0,,,5.0,3.0,,,,,2020-01-26,-49.0,-50.0,-58.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54325,"CWhyearnodkoeettCeoCuonutnyty, iso2:US-KS, iso...",county,,CWhyearnodkoeettCeoCuonutnyty,iso2:US-KS,iso1:US,,,,https://public.tableau.com/views/COVID-19Data_...,county,,1549.0,,,,,1.000000,2020-04-05,,"CWhyearnodkoeettCeoCuonutnyty, iso2:US-KS",,,,,0.0,,,,,,,,,,2020-04-05,1.0,,
54326,"St Clair County, iso2:US-MI, iso1:US",county,,St Clair County,iso2:US-MI,iso1:US,,,,"https://www.michigan.gov/coronavirus/0,9753,7-...",county,,66.0,,,,,,2020-04-04,,"St Clair County, iso2:US-MI",,,,,,,,,,,,,,,2020-04-04,,,
54327,"St Clair County, iso2:US-MI, iso1:US",county,,St Clair County,iso2:US-MI,iso1:US,,,,"https://www.michigan.gov/coronavirus/0,9753,7-...",county,,66.0,,,,,1.000000,2020-04-05,,"St Clair County, iso2:US-MI",,,,,0.0,,,,,,,,,,2020-04-05,,,
54328,"St Joseph County, iso2:US-MI, iso1:US",county,,St Joseph County,iso2:US-MI,iso1:US,,,,"https://www.michigan.gov/coronavirus/0,9753,7-...",county,,10.0,,,,,,2020-04-04,,"St Joseph County, iso2:US-MI",,,,,,,,,,,,,,,2020-04-04,,,


## There are a lot of places. Let's filter them down to just a few

In [0]:
import html

from IPython.display import HTML

# Scrollable bullet list of every distinct location label. The labels come from
# a remote CSV, so escape them before embedding them in markup: characters such
# as '<' or '&' in a label must not be interpreted as HTML.
list_content = '</li><li>'.join(
    html.escape(location) for location in cleaned_timeseries.location.unique()
)
HTML(
    f'<ul style="max-height: 30em; overflow-y: scroll"><li>{list_content}</li></ul>'
)

In [0]:
# Labels to plot. Each one has to match a value of the 'location' column exactly.
locations_of_interest = {
    'China',
    'Beijing, China',
    'Hubei, China',
    'Whatcom County, WA, USA',
    'Island County, WA, USA',
    'King County, WA, USA',
    # 'Pierce County, WA, USA',
    'NY, USA',
    'WA, USA',
    'USA',
    'San Francisco County, CA, USA',
    'UK',
    'Italy',
    'Germany',
    'Singapore',
    'Sweden',
    'South Korea',
}

# Fail fast when a requested label does not occur anywhere in the data.
bad_locations = locations_of_interest - set(cleaned_timeseries.location)
if len(bad_locations) > 0:
    raise Exception(f'Bad locations: {bad_locations}')

# Keep only the rows that belong to one of the requested locations.
data_for_locations_of_interest = cleaned_timeseries.loc[
    cleaned_timeseries.location.isin(locations_of_interest)
]

Exception: ignored

# Plotting

## Let's get down to business!

Starting with the basics: confirmed cases over time.

In [0]:
# Two line charts over calendar date, one line per location, log Y axis:
# first the 'cases' column, then the 'active' column.
for plot_title, plotted_column in [
    ('Cases over time', 'cases'),
    ('Active cases over time', 'active'),
]:
    display(
        px.line(
            data_for_locations_of_interest,
            title=plot_title,
            x='date',
            y=plotted_column,
            log_y=True,
            color='location',
        )
    )

## What about deaths over time?

In [0]:
# Line chart of the 'deaths' column over calendar date, one line per location, log Y axis.
display(
    px.line(
        data_frame=data_for_locations_of_interest,
        x='date',
        y='deaths',
        color='location',
        log_y=True,
        title='Deaths over time',
    )
)

## Interesting...
That's a similar pattern to the number of confirmed cases over time. Is there a linear relationship? Let's see.

In [0]:
# Ratio of deaths to confirmed cases over calendar date, one line per location.
# The raw 'deaths' and 'cases' values are added to the hover text.
display(
    px.line(
        data_frame=data_for_locations_of_interest,
        x='date',
        y='deaths/cases',
        color='location',
        hover_data=['deaths', 'cases'],
        title='Deaths per confirmed case over time',
        # log_y=True,
    )
)

That's surprisingly consistent for a particular location. Let's look at that differently.

In [0]:
# Distribution of the deaths/cases ratio for each location.
display(
    px.box(
        data_frame=data_for_locations_of_interest,
        x='location',
        y='deaths/cases',
        title='Deaths/cases boxplot',
    )
)

# The same ratio over calendar date, with one facet column per location.
display(
    px.line(
        data_frame=data_for_locations_of_interest,
        x='date',
        y='deaths/cases',
        facet_col='location',
        title='Deaths/cases by location over time',
    )
)

# Deaths against cases on log-log axes, one line per location.
display(
    px.line(
        data_frame=data_for_locations_of_interest,
        x='cases',
        y='deaths',
        color='location',
        log_x=True,
        log_y=True,
        title='Cases vs. deaths by location',
    )
)

# 3-day change in deaths against active cases on log-log axes; the date string
# is added to the hover text.
display(
    px.line(
        data_frame=data_for_locations_of_interest,
        x='active',
        y='3d new deaths',
        color='location',
        hover_data=['Date'],
        log_x=True,
        log_y=True,
        title='Active cases vs. 3d deaths by location',
    )
)

In [0]:
# Cases aligned on the first day each location exceeded 100 cases. Rows without
# an offset (NaN in 'days since 100 cases') are left out. `Series.notna()`
# replaces the deprecated `pd.np.isnan` alias.
display(
    px.line(
        data_for_locations_of_interest[data_for_locations_of_interest['days since 100 cases'].notna()],
        title='Cases over time<br><i>log Y axis. X axis shifted to match 1st day each location exceeded 100 cases</i>',
        x='days since 100 cases',
        y='cases',
        hover_data=['Date'],
        log_y=True,
        color='location'
    )
)
# Active cases aligned on the first day each location exceeded 100 active cases.
# The filter uses the column that is actually plotted on the X axis
# ('days since 100 active'); filtering on 'days since 100 cases' instead kept
# rows that have no X value for this chart.
display(
    px.line(
        data_for_locations_of_interest[data_for_locations_of_interest['days since 100 active'].notna()],
        title='Active (unresolved) cases over time<br><i>log Y axis. X axis shifted to match 1st day each location exceeded 100 active cases</i>',
        x='days since 100 active',
        y='active',
        hover_data=['Date'],
        log_y=True,
        color='location'
    )
)

In [0]:
# Cases per 100k population, aligned on the first day each location exceeded
# 1 case per 100k. Rows without a per-capita value are left out.
# `Series.notna()` replaces the deprecated `pd.np.isnan` alias.
# NOTE(review): the filter is on the Y column; locations that never exceeded
# 1 case per 100k still have a NaN X value — confirm whether those rows should
# be dropped as well.
display(
    px.line(
        data_for_locations_of_interest[data_for_locations_of_interest['cases per 100k capita'].notna()],
        title='Cases per capita over time<br><i>log Y axis. X axis shifted to match 1st day each location exceeded 1 case / 100k capita</i>',
        x='days since 1 case/100k people',
        y='cases per 100k capita',
        hover_data=['cases', 'Date'],
        log_y=True,
        color='location',
    )
)

# New active cases

This might be a nice, clear way of looking at where the spread of the disease is being effectively prevented. Are we "bending the curve"?

In [0]:
# Size, in days, of the diff window to plot; it selects the matching
# '<N>d new active' column by name.
num_days = 1
y_column = f'{num_days}d new active'

# Change in active cases over calendar date, one line per location.
# Alternative view: restrict the frame to rows with a non-null
# 'days since 100 cases', use that column as x, and/or pass log_y=True.
display(
    px.line(
        data_frame=data_for_locations_of_interest,
        x='date',
        y=y_column,
        color='location',
        hover_data=['cases', 'Date'],
        title=f'{num_days}d change in # of active cases',
    )
)