In [10]:
import pandas as pd
# Display configuration for every frame rendered below.
# Floats are shown without decimals and with thousands separators.
pd.options.display.float_format = '{0:,.0f}'.format
# Show up to 100 rows before pandas truncates the output. (A previous
# assignment of 20 was immediately overwritten and has been removed.)
pd.options.display.max_rows = 100

In [2]:
# Johns Hopkins source
# https://github.com/CSSEGISandData/COVID-19/tree/master/csse_covid_19_data/csse_covid_19_time_series
# git submodule foreach git pull origin master
# Directory (relative to the notebook) holding the time-series CSV files.
source = 'COVID-19/csse_covid_19_data/csse_covid_19_time_series/'

# Metric name -> CSV file name inside `source`.
files = dict(
    confirmed='time_series_19-covid-Confirmed.csv',
    deaths='time_series_19-covid-Deaths.csv',
    recovered='time_series_19-covid-Recovered.csv',
)

In [3]:
# https://data.humdata.org/dataset/novel-coronavirus-2019-ncov-cases
# source = 'data/'
# files = {
#     'confirmed': 'time_series_2019-ncov-Confirmed.csv',
#     'deaths': 'time_series_2019-ncov-Deaths.csv',
#     'recovered': 'time_series_2019-ncov-Recovered.csv'
# }

In [4]:
# Load all three dataframes, one per entry in `files`.
# Each CSV is reshaped from wide to long form (melt turns every column other
# than 'Country/Region' into a 'variable'/'value' pair) and the values are then
# summed per country and per 'variable', collapsing the province-level rows.
source_df = {}
for key, file in files.items():
    source_df[key] = (
        pd.read_csv(source + file)
        .drop(columns=['Lat', 'Long', 'Province/State'])
        .melt(id_vars=['Country/Region'])
        # groupby already leaves ('Country/Region', 'variable') as the index,
        # so no reset_index()/set_index() round-trip is needed afterwards.
        .groupby(['Country/Region', 'variable'])
        .sum()
    )

In [5]:
# Combine the three per-metric frames on their shared index.
# The 'confirmed' frame is the left side of both joins; the overlapping value
# column of each right-hand frame gets a suffix so the metrics stay apart.
df = (
    source_df['confirmed']
    .join(source_df['deaths'], rsuffix='_deaths')
    .join(source_df['recovered'], rsuffix='_recovered')
    .reset_index()
)

In [6]:
# Set column names.
# Renaming by label instead of by position keeps each name attached to the
# right data even if the column order of the joined frame ever changes.
df = df.rename(columns={
    'Country/Region': 'country',
    'variable': 'date_string',
    'value': 'confirmed',
    'value_deaths': 'deaths',
    'value_recovered': 'recovered',
})
# Fail loudly if the joined frame did not have exactly the expected columns.
assert list(df.columns) == ['country', 'date_string', 'confirmed', 'deaths', 'recovered'], \
    'unexpected columns after join: {}'.format(list(df.columns))

In [7]:
# Turn the textual date labels into real timestamps in a new 'date' column.
df = df.assign(date=pd.to_datetime(df['date_string']))

In [8]:
# Order the rows chronologically, then by country within each date.
df = df.sort_values(['date', 'country'], ascending=True)

In [9]:
# Replace the raw date label with a short display form: day of month followed
# by the abbreviated month name (e.g. '20. Mar').
df = df.assign(date_string=df['date'].dt.strftime('%d. %b'))

In [21]:
# Report the most recent date among the rows whose country is 'Germany'.
print('Last data for germany:', df.loc[df['country'] == 'Germany', 'date'].max())

Last data for germany: 2020-03-20 00:00:00
