### Download data from GitHub
https://github.com/CSSEGISandData/COVID-19

In [1]:
!rm time_series_19-covid-*.csv
!wget https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_19-covid-Recovered.csv
!wget https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_19-covid-Deaths.csv
!wget https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_19-covid-Confirmed.csv

--2020-03-14 18:18:23--  https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_19-covid-Recovered.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443...connected.
HTTP request sent, awaiting response...200 OK
Length: 60607 (59K) [text/plain]
Saving to: ‘time_series_19-covid-Recovered.csv’


2020-03-14 18:18:23 (1.60 MB/s) - ‘time_series_19-covid-Recovered.csv’ saved [60607/60607]

--2020-03-14 18:18:23--  https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_19-covid-Deaths.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443...connected.
HTTP

In [2]:
import pandas as pd

# Columns that describe a row rather than hold a per-date value
DEL_COLS = ['Province/State', 'Country/Region', 'Lat', 'Long']

# One dataframe per downloaded CSV
CONFIRMED = pd.read_csv('time_series_19-covid-Confirmed.csv')
RECOVERED = pd.read_csv('time_series_19-covid-Recovered.csv')
DEATHS = pd.read_csv('time_series_19-covid-Deaths.csv')

# Every other column header of the confirmed table is treated as a date label,
# kept in the order in which it appears in the file
DATE_RANGE = list(filter(lambda col: col not in DEL_COLS, CONFIRMED.columns))
print('Latest data found from %s to %s' % (DATE_RANGE[0], DATE_RANGE[-1]))

Latest data found from 1/22/20 to 3/13/20


### Create a map for country ISO codes
Data from https://en.wikipedia.org/wiki/List_of_ISO_3166_country_codes

In [3]:
# Convert country names to ISO codes.
# The table is indexed by its 'ISO 3166' column so that a country label can be
# looked up directly with .loc (presumably that column holds the country name
# -- verify against country_iso_codes.csv).
ISO_CODES = pd.read_csv('country_iso_codes.csv').set_index('ISO 3166')

# Pre-seed the ISO map with problematic labels, i.e. those that are not
# resolved through the ISO_CODES index below.
# NOTE(review): ISO 3166-1 alpha-2 assigns 'GB' to the United Kingdom; 'UK' is
# left untouched here because consumers of the output may rely on it -- confirm.
COUNTRY_ISO_MAP = {
    'US': 'US',
    'Brunei': 'BN',
    'United Kingdom': 'UK',
    'Cruise Ship': '??',
    'Reunion': 'RE',
    'Taiwan*': 'TW',
    'Russia': 'RU',
    'Cote d\'Ivoire': 'CI',
    'Congo (Kinshasa)': 'CD',
    'Korea, South': 'KR',
}

# Resolve every remaining country found in the confirmed table. Labels that
# cannot be resolved are collected instead of aborting on the first one, so a
# single run reports everything that needs a manual entry above.
unmapped_countries = []
for country in CONFIRMED['Country/Region'].unique():
    if country in COUNTRY_ISO_MAP:
        continue
    if country not in ISO_CODES.index:
        unmapped_countries.append(country)
        continue
    # Single-step .loc[row, column] lookup instead of chained indexing
    COUNTRY_ISO_MAP[country] = ISO_CODES.loc[country, 'ISO 3166-1-2']

if unmapped_countries:
    # str() guards against non-string labels such as NaN
    raise ValueError('Needs correction: %s' % ', '.join(map(str, unmapped_countries)))

print('Countries with ISO codes found: %d' % len(COUNTRY_ISO_MAP))

Countries with ISO codes found: 125


### Create a map for region codes (USA, Canada and Australia)

In [4]:
# Region lookup table; its 'Name' and 'Code' columns become the keys and
# values of REGION_MAP (a later row wins if a name is repeated)
REGION_CODES = pd.read_csv('country_region_codes.csv')
REGION_MAP = dict(zip(REGION_CODES['Name'], REGION_CODES['Code']))

### Convert data from Johns Hopkins into time series format

In [5]:
import datetime
import itertools
from pathlib import Path

# Project root, given relative to the current working directory
ROOT = Path('..')

# Names of the key columns in the source tables
COUNTRY_COLUMN = 'Country/Region'
REGION_COLUMN = 'Province/State'

# Labels stored in the 'Event' field of the output records
CONFIRMED_EVENT, RECOVERED_EVENT, DEATH_EVENT = 'Confirmed', 'Recovered', 'Deaths'

def get_event(df: pd.DataFrame):
    ''' Return the event label of the source dataframe that `df` equals.

    `df` is compared by value (`DataFrame.equals`) with the module-level
    CONFIRMED, RECOVERED and DEATHS frames, in that order, and the label of
    the first match is returned. Raises ValueError when none of them matches.
    '''
    if df.equals(CONFIRMED):
        label = CONFIRMED_EVENT
    elif df.equals(RECOVERED):
        label = RECOVERED_EVENT
    elif df.equals(DEATHS):
        label = DEATH_EVENT
    else:
        raise ValueError('Unknown')
    return label
    
def parse_date(datestr: str):
    ''' Parse a month/day/two-digit-year label (e.g. 1/22/20) into a datetime '''
    date_format = '%m/%d/%y'
    parsed = datetime.datetime.strptime(datestr, date_format)
    return parsed

# Unique (region, country) pairs present in the confirmed table, with missing
# values shown as empty strings
region_country_pairs = CONFIRMED.fillna('').apply(
    lambda row: (row[REGION_COLUMN], row[COUNTRY_COLUMN]), axis=1).unique()

# Parse each date header once instead of once per output record
parsed_dates = {date: parse_date(date) for date in DATE_RANGE}

# Build one record per (event, region, country, date)
records = []
for event, df in zip((CONFIRMED_EVENT, RECOVERED_EVENT, DEATH_EVENT), (CONFIRMED, RECOVERED, DEATHS)):
    df = df.copy()

    # Only the region label gets a blank placeholder: groupby would drop rows
    # whose key is NaN, whereas filling the whole frame with '' would also put
    # empty strings into any date column that has a missing value, turning it
    # into an object column that can no longer be summed reliably.
    df[REGION_COLUMN] = df[REGION_COLUMN].fillna('')

    # Parse the country (a KeyError here means COUNTRY_ISO_MAP lacks the label)
    df[COUNTRY_COLUMN] = df[COUNTRY_COLUMN].apply(lambda value: COUNTRY_ISO_MAP[value])

    # Parse the region: keep the part after the last ', ', translate it through
    # REGION_MAP when it is listed there, and normalise 'D.C.' to 'DC'
    df[REGION_COLUMN] = df[REGION_COLUMN].apply(lambda region: region.split(', ')[-1])
    df[REGION_COLUMN] = df[REGION_COLUMN].apply(lambda region: REGION_MAP.get(region, region))
    df[REGION_COLUMN] = df[REGION_COLUMN].apply(lambda region: 'DC' if region == 'D.C.' else region)

    # Collapse duplicate regions (e.g. counties).
    # Only the date columns are summed, so Lat/Long are left out of the result
    # instead of being added together; they are not used below. Missing counts
    # are skipped by sum().
    df = df.groupby(by=[REGION_COLUMN, COUNTRY_COLUMN])[DATE_RANGE].sum().reset_index()

    for _, row in df.iterrows():
        for date in DATE_RANGE:
            records.append({
                'Date': parsed_dates[date],
                'Country': row[COUNTRY_COLUMN],
                'Region': row[REGION_COLUMN],
                'Event': event,
                'Value': row[date]
            })

# Order the rows deterministically and write the result to the project root
sort_order = ['Date', 'Country', 'Region', 'Event']
ts = pd.DataFrame.from_records(records).sort_values(sort_order).set_index('Date', drop=True)
ts.to_csv(ROOT / 'time_series.csv')