# Retrieve, reformat, clean and store data of the Covid-19 pandemic

Data sources:
* Johns Hopkins University (**JHU**) - Center for Systems Science and Engineering (CSSE)
* Berliner Morgenpost (**BMP**)

In [None]:
# Clear the interactive namespace without asking for confirmation (-f).
%reset -f
# Show matplotlib figures inline in the notebook output.
%matplotlib inline

from collections import namedtuple, OrderedDict
from datetime import datetime, timedelta
from pathlib import Path
import unittest

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Notebook-wide display defaults: figure size and a short DataFrame preview.
plt.rc('figure', figsize=(15, 8))
pd.set_option('display.max_rows', 8)
sns.set()

# Bare TestCase instance, kept around for its assert* helpers (e.g. assertEqual).
tc = unittest.TestCase('__init__')

## Retrieving JHU data
* Dashboard: https://www.arcgis.com/apps/opsdashboard/index.html#/bda7594740fd40299423467b48e9ecf6
* Data: https://github.com/CSSEGISandData/COVID-19/tree/master/csse_covid_19_data

The data is available as a git repository, so the transfer is compressed. It is assumed that the `COVID-19` repo has been cloned with `git clone` into the same parent directory as this `covid-19_analysis` project:
```
cd ..
git clone  https://github.com/CSSEGISandData/COVID-19.git
```

In [None]:
# Location of the local clone of the JHU repository, relative to this notebook.
covid_19_jhu_repo = Path('../COVID-19')

# Update the clone: remember the current directory, run `git pull` from
# inside the clone, then change back to the remembered directory.
my_pwd = %pwd
%cd -q {covid_19_jhu_repo}
!git pull
%cd -q {my_pwd}

In [None]:
# Time series of confirmed cases inside the local JHU clone.
_relative_csv = "csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv"
filename_confirmed_global_JHU = covid_19_jhu_repo / _relative_csv
assert filename_confirmed_global_JHU.exists()

confirmed_global_JHU = pd.read_csv(filename_confirmed_global_JHU)
# Peek at the first column labels before the country column becomes the index.
print(confirmed_global_JHU.columns[:5])
confirmed_global_JHU = confirmed_global_JHU.set_index("Country/Region")
confirmed_global_JHU.head(3)

## Reformat JHU data

We are interested in the following countries:
* Austria
* Germany
* France

We want the table to have the data of each country as a column (thus we transpose it).

In addition, the data for France is scattered over several rows (departments), thus we need to sum them up.

In [None]:
COUNTRIES_SELECTED = ["Germany", "Austria", "France"]

# Date columns are labelled "m/d/yy".  Select them by pattern instead of by a
# hard-coded '/20' suffix, which would silently drop every day after 2020.
_is_date_column = confirmed_global_JHU.columns.str.match(r'\d{1,2}/\d{1,2}/\d{2}$')
date_columns = list(confirmed_global_JHU.columns[_is_date_column])

# Keep the selected countries, add up all rows belonging to the same country
# (France appears more than once) and turn the dates into the row index.
cases_JHU = (
    confirmed_global_JHU
    .loc[confirmed_global_JHU.index.isin(COUNTRIES_SELECTED), date_columns]
    .groupby(level=0)
    .sum()
    .transpose()
)
cases_JHU.index = pd.to_datetime(cases_JHU.index, format='%m/%d/%y')
cases_JHU.index.name = 'date'
print(cases_JHU.tail(1))

# Rename '<country>' -> '<country>_JHU' and order the columns like
# COUNTRIES_SELECTED.
_rd = OrderedDict((country, country + '_JHU') for country in COUNTRIES_SELECTED)
cases_JHU = cases_JHU.rename(columns=_rd).reindex(columns=list(_rd.values()))
cases_JHU.tail(4)

## Plotting JHU data

In [None]:
#cases_JHU.plot(style='o-', logy=True)

## Retrieving BMP data

Total and recovered data are taken manually from the following URL:
* https://interaktiv.morgenpost.de/corona-virus-karte-infektionen-deutschland-weltweit/


In [None]:
# Manually maintained BMP figures; the first CSV column is used as the index
# and parsed as dates.
filename_BMP = Path("data_raw_BMP.csv")
_read_options = dict(index_col=[0], parse_dates=[0])
cases_BMP = pd.read_csv(filename_BMP, **_read_options)
cases_BMP.tail(3)

In [None]:
# The BMP file is filled in by hand: its last entry must match the last
# entry of the JHU data, otherwise stop and tell the user what to do.
_last_day_BMP = cases_BMP.index[-1]
_last_day_JHU = cases_JHU.index[-1]
try:
    tc.assertEqual(_last_day_BMP, _last_day_JHU)
except AssertionError:
    print("ERROR: manually add the missing data to %s" % filename_BMP)
    raise

## Calculating infected
    
$$I_t = I_{t-1} + \text{new cases}_t - \text{new recoveries}_t - \text{new deaths}_t$$

In [None]:
# Only consider the days on which at least 100 cases were confirmed.
mask = cases_BMP['confirmed'] >= 100
# Day-to-day changes of the cumulative counts on those days.
c_rd = cases_BMP[['confirmed', 'recovered_alive', 'death']].diff().loc[mask]

# I_t = I_{t-1} + new cases_t - new recoveries_t - new deaths_t, starting from
# zero before the first selected day, i.e. the running sum of the net daily
# change.  np.cumsum propagates NaN like an explicit accumulation loop would,
# and the result is float64 so large counts are not rounded.
_net_change = c_rd['confirmed'] - c_rd['recovered_alive'] - c_rd['death']
N = len(c_rd)
infected = np.cumsum(_net_change.to_numpy(dtype='f8'))

# Days outside the mask stay NaN.
cases_BMP['infected'] = np.nan
cases_BMP.loc[mask, 'infected'] = infected

# "recovered" here means no longer infected: recovered alive plus deceased.
cases_BMP['recovered'] = cases_BMP[['recovered_alive', 'death']].sum(axis=1)

## Plotting BMP data

In [None]:
# Shared plot options: markers joined by lines, log-scaled y axis, legend on.
_kwargs = {'style': 'o-', 'logy': True, 'legend': True}

# Cumulative curves first, then the daily deltas on the same axes.
_totals = cases_BMP.loc[mask, ['confirmed', 'infected']]
ax = _totals.plot(**_kwargs)
for key, _delta in c_rd.items():
    _delta.plot(ax=ax, label=key + ' delta', **_kwargs)

## Join JHU and BMP data, plot and write to file

In [None]:
# BMP columns for Germany, renamed so they are distinguishable from the JHU
# columns once both tables are joined on their date index.
_bmp_columns = {
    'infected': 'Germany_infected_BMP',
    'recovered': 'Germany_recovered_BMP',
}
_bmp_germany = cases_BMP[['infected', 'recovered']].rename(columns=_bmp_columns)
df = cases_JHU.join(_bmp_germany)
df.tail(3)

In [None]:
#df.plot(style='o-', logy=True, alpha=0.5)

In [None]:
# Write the joined table with "\r\n" line endings.
# pandas 1.5 renamed the keyword `line_terminator` to `lineterminator` and
# pandas 2.0 removed the old spelling; older versions only accept the old
# one and reject the new keyword with a TypeError before writing anything.
try:
    df.to_csv('cases.csv', lineterminator="\r\n")
except TypeError:
    df.to_csv('cases.csv', line_terminator="\r\n")