In [1]:
import pandas as pd
from pandas.io.json import json_normalize
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import datetime as dt
%matplotlib qt

## Read the Data
Thanks to https://github.com/pomber/covid19 for preparing the data.

In [2]:
# Fetch the time-series JSON into a DataFrame; later cells iterate over its
# columns, treating each column label as a country.
covid_19 = pd.read_json(
    "https://pomber.github.io/covid19/timeseries.json"
)

## Transform the Data
For easier analysis we need a single DataFrame with the columns (date, country, confirmed, deaths, recovered). This is what the `json_normalize` function does for each country.
We also extract, for each country, the first date on which the number of confirmed cases exceeds a given threshold.

In [3]:
# Build two frames from the raw per-country series:
#   covid_df        - every daily record of every country
#   covid_threshold - per country, the first record with more than `treshold`
#                     confirmed cases (countries that never exceed it are absent)
#
# The per-country pieces are collected in lists and concatenated once at the
# end. Calling pd.concat inside the loop re-copies the growing frame on every
# iteration, and seeding the result with an empty frame can degrade the column
# dtypes.
frame_columns = ['date', 'confirmed', 'deaths', 'recovered', 'country']
treshold=1000  # spelling kept as-is: the plotting cell below reads this name

country_frames = []
threshold_rows = []
for country in covid_19.columns:
    # normalize, set country, convert date
    cdf = pd.json_normalize(covid_19[country])
    cdf['country'] = str(country)
    cdf['date'] = pd.to_datetime(cdf['date'])
    country_frames.append(cdf)

    # first record whose confirmed count is strictly greater than the threshold
    over_threshold = cdf[cdf['confirmed'].gt(treshold)]
    if not over_threshold.empty:
        threshold_rows.append(over_threshold.iloc[[0]])

# The index is intentionally NOT reset: every country keeps its own 0..n-1
# index, exactly as in the original incremental concat.
covid_df = pd.concat(country_frames) if country_frames else pd.DataFrame(columns=frame_columns)
covid_threshold = pd.concat(threshold_rows) if threshold_rows else pd.DataFrame(columns=frame_columns)

## Show the Spread Across the World

In [4]:
# Per date, count the countries whose first over-threshold record falls on that
# date, then accumulate the counts over time.
date_only = covid_threshold.loc[:, ['date', 'country']].copy()
cnt = date_only.groupby(date_only["date"]).count()
cnt['cum_count'] = cnt['country'].cumsum()

# Bar chart of the cumulative number of countries per date.
fig, ax = plt.subplots(figsize=(15, 7))
ax.bar(x=cnt.index, height=cnt['cum_count'])

title_template = "Number of Countries by First Date Over {} Confirmed Cases"
ax.set_title(title_template.format(treshold))

# Major ticks from a weekday locator, labelled as abbreviated month + day.
weekly_ticks = mdates.WeekdayLocator()
tick_labels = mdates.DateFormatter('%b %d')
ax.xaxis.set_major_locator(weekly_ticks)
ax.xaxis.set_major_formatter(tick_labels)

In [5]:
def get_country_data(country):
    """Return the rows of the module-level ``covid_df`` whose 'country' column equals ``country``."""
    is_requested = covid_df['country'].eq(country)
    return covid_df.loc[is_requested]

## Compare Countries

In [21]:
# One row of panels per country:
#   left  - confirmed cases and their 7-row rolling mean
#   right - day-over-day change in confirmed cases and its 7-row rolling mean
countries=['US', 'New Zealand', 'Germany']
# Weekly marker positions for the dotted vertical guide lines.
xcoords = pd.date_range(start='2020-01-17', end=dt.datetime.now().strftime('%Y-%m-%d'), freq='W')

# squeeze=False keeps `axes` two-dimensional even for a single country, so
# axes[row, col] indexing below works for any non-empty `countries` list.
fig, axes = plt.subplots(len(countries), 2, figsize=(15,7), squeeze=False)

# (panel column, series drawn in that panel)
panel_layout = (
    (0, ('confirmed', 'rolling_mean_week')),
    (1, ('diff_confirmed', 'rolling_mean_diff_week')),
)

for yposition, c in enumerate(countries):
    # Work on a private copy so the derived columns never touch covid_df.
    df = get_country_data(c).copy()
    df['diff_confirmed'] = df['confirmed'].diff()
    df['rolling_mean_week'] = df['confirmed'].rolling(7).mean()
    df['rolling_mean_diff_week'] = df['diff_confirmed'].rolling(7).mean()
    # Only the first row of panels carries a legend.
    showLegend = yposition == 0

    for panel_col, series_names in panel_layout:
        panel_ax = axes[yposition, panel_col]
        for series_name in series_names:
            df[['date', series_name]].plot(ax=panel_ax, sharex=True, x='date', legend=showLegend)
        for xc in xcoords:
            panel_ax.axvline(xc, color='grey', linestyle=':')

    # The country name is shown above the left-hand panel only.
    axes[yposition, 0].set_title(c)

plt.show()

## Adding Population and a Percentage View

In [23]:
# Population table; the CSV was downloaded from
# http://api.worldbank.org/v2/en/indicator/SP.POP.TOTL?downloadformat=csv
population = pd.read_csv(
    "data/API_SP.POP.TOTL_DS2_en_csv_v2_1308146.csv",
    skiprows=4,  # the first four lines of the file are skipped before the header row
    header=0,
)

In [24]:
def get_population(p, c, y):
    """Look up a single value for one country in a population table.

    Parameters
    ----------
    p : pandas.DataFrame or None
        Table with a 'Country Name' column and a column named ``y``.
        When None, the module-level ``population`` frame is used.
    c : str
        Country name, compared for equality against 'Country Name'.
    y : str
        Label of the column to read (the notebook passes '2018').

    Returns
    -------
    float or int
        The value of the first matching row as a float, or 0 when no row
        matches ``c``.
    """
    # Use the table that was passed in; the original ignored `p` and always
    # read the global frame.
    table = population if p is None else p
    matches = table.loc[table['Country Name'] == c, y]
    if matches.empty:
        return 0
    # Take the first match explicitly: float() on a Series is deprecated and
    # raises as soon as more than one row matches.
    return float(matches.iloc[0])

In [25]:
def get_percent(h,v):
    """Return ``v`` as a percentage of ``h``; return 0 when ``h`` is falsy (e.g. 0 or None)."""
    if not h:
        return 0
    return (100 * v) / h

In [26]:
# Attach the 2018 population to every row.
# Each distinct country is resolved once and the result is mapped onto the
# rows. The previous row-wise apply filtered the population table once per
# row for the same values.
pop_2018_by_country = {
    name: get_population(population, name, '2018')
    for name in covid_df['country'].unique()
}
covid_df['pop_2018'] = covid_df['country'].map(pop_2018_by_country)

In [27]:
# Preview the first rows of covid_df, which now include the pop_2018 column.
covid_df.head()

Unnamed: 0,date,confirmed,deaths,recovered,country,pop_2018
0,2020-01-22,0,0,0,Afghanistan,37172386.0
1,2020-01-23,0,0,0,Afghanistan,37172386.0
2,2020-01-24,0,0,0,Afghanistan,37172386.0
3,2020-01-25,0,0,0,Afghanistan,37172386.0
4,2020-01-26,0,0,0,Afghanistan,37172386.0


In [28]:
# Latest date present for each country, with groups kept in order of first appearance.
(
    covid_df
    .groupby(['country'], as_index=False, sort=False)[['date']]
    .max()
)

Unnamed: 0,country,date
0,Afghanistan,2020-08-13
1,Albania,2020-08-13
2,Algeria,2020-08-13
3,Andorra,2020-08-13
4,Angola,2020-08-13
...,...,...
183,West Bank and Gaza,2020-08-13
184,Western Sahara,2020-08-13
185,Yemen,2020-08-13
186,Zambia,2020-08-13


In [29]:
# Select, for every country, the row with the latest date.
# Series.argmax() returns a POSITION, so the row is fetched with .iloc.
# The previous .loc lookup treated that position as an index label, which is
# only correct while every group's index happens to be 0..n-1.
country_df = covid_df.groupby('country', as_index=False).apply(lambda x: x.iloc[x.date.argmax()])

### compare confirmed vs. deaths

In [30]:
# Deaths as a percentage of confirmed cases, computed row by row via get_percent.
death_share = country_df.apply(lambda row: get_percent(row.confirmed, row.deaths), axis=1)
country_df['percent_deaths'] = death_share
# Optional filter, currently disabled:
## country_df = country_df[country_df['percent_deaths'] > 0.0001]
# Drop every row that has a missing value in any column.
country_df = country_df.dropna()

In [31]:
# Order countries from the lowest to the highest percent_deaths.
country_df_sort = country_df.sort_values('percent_deaths')

In [32]:
# Horizontal bar chart: one bar per country, bar length = percent_deaths.
country_df_sort.plot(kind='barh', x='country', y='percent_deaths')

<matplotlib.axes._subplots.AxesSubplot at 0x163808d3408>

In [33]:
# Display the per-country frame, including the percent_deaths column.
country_df

204,date,confirmed,deaths,recovered,country,pop_2018,percent_deaths
0,2020-08-13,37424,1363,26714,Afghanistan,37172386.0,3.642048
1,2020-08-13,6971,213,3616,Albania,2866376.0,3.055516
2,2020-08-13,37187,1341,26004,Algeria,42228429.0,3.606099
3,2020-08-13,981,53,858,Andorra,77006.0,5.402650
4,2020-08-13,1815,80,577,Angola,30809762.0,4.407713
...,...,...,...,...,...,...,...
183,2020-08-13,15491,106,9186,West Bank and Gaza,4569087.0,0.684268
184,2020-08-13,10,1,8,Western Sahara,0.0,10.000000
185,2020-08-13,1847,528,949,Yemen,0.0,28.586898
186,2020-08-13,8663,246,7401,Zambia,17351822.0,2.839663


In [34]:
# Write the per-country frame to CSV without the index column.
country_df.to_csv(
    'data/covid-death-confirmed.csv',
    index=False,
)

In [35]:
# Inspect what iterating over a groupby yields: print the type of each item
# followed by the item itself.
for group_item in covid_df.groupby('country'):
    print(type(group_item), group_item)

<class 'tuple'> ('Afghanistan',           date confirmed deaths recovered      country    pop_2018
0   2020-01-22         0      0         0  Afghanistan  37172386.0
1   2020-01-23         0      0         0  Afghanistan  37172386.0
2   2020-01-24         0      0         0  Afghanistan  37172386.0
3   2020-01-25         0      0         0  Afghanistan  37172386.0
4   2020-01-26         0      0         0  Afghanistan  37172386.0
..         ...       ...    ...       ...          ...         ...
200 2020-08-09     37054   1312     25960  Afghanistan  37172386.0
201 2020-08-10     37162   1328     26228  Afghanistan  37172386.0
202 2020-08-11     37269   1344     26415  Afghanistan  37172386.0
203 2020-08-12     37345   1354     26694  Afghanistan  37172386.0
204 2020-08-13     37424   1363     26714  Afghanistan  37172386.0

[205 rows x 6 columns])
<class 'tuple'> ('Albania',           date confirmed deaths recovered  country   pop_2018
0   2020-01-22         0      0         0  Albani

[205 rows x 6 columns])
<class 'tuple'> ("Cote d'Ivoire",           date confirmed deaths recovered        country    pop_2018
0   2020-01-22         0      0         0  Cote d'Ivoire  25069229.0
1   2020-01-23         0      0         0  Cote d'Ivoire  25069229.0
2   2020-01-24         0      0         0  Cote d'Ivoire  25069229.0
3   2020-01-25         0      0         0  Cote d'Ivoire  25069229.0
4   2020-01-26         0      0         0  Cote d'Ivoire  25069229.0
..         ...       ...    ...       ...            ...         ...
200 2020-08-09     16715    105     12926  Cote d'Ivoire  25069229.0
201 2020-08-10     16798    105     13052  Cote d'Ivoire  25069229.0
202 2020-08-11     16847    105     13321  Cote d'Ivoire  25069229.0
203 2020-08-12     16847    105     13321  Cote d'Ivoire  25069229.0
204 2020-08-13     16889    107     13522  Cote d'Ivoire  25069229.0

[205 rows x 6 columns])
<class 'tuple'> ('Croatia',           date confirmed deaths recovered  country   pop_2018

[205 rows x 6 columns])
<class 'tuple'> ('Ireland',           date confirmed deaths recovered  country   pop_2018
0   2020-01-22         0      0         0  Ireland  4867316.0
1   2020-01-23         0      0         0  Ireland  4867316.0
2   2020-01-24         0      0         0  Ireland  4867316.0
3   2020-01-25         0      0         0  Ireland  4867316.0
4   2020-01-26         0      0         0  Ireland  4867316.0
..         ...       ...    ...       ...      ...        ...
200 2020-08-09     26712   1772     23364  Ireland  4867316.0
201 2020-08-10     26768   1772     23364  Ireland  4867316.0
202 2020-08-11     26801   1773     23364  Ireland  4867316.0
203 2020-08-12     26838   1774     23364  Ireland  4867316.0
204 2020-08-13     26929   1774     23364  Ireland  4867316.0

[205 rows x 6 columns])
<class 'tuple'> ('Israel',           date confirmed deaths recovered country   pop_2018
0   2020-01-22         0      0         0  Israel  8882800.0
1   2020-01-23         0      

[205 rows x 6 columns])
<class 'tuple'> ('North Macedonia',           date confirmed deaths recovered          country   pop_2018
0   2020-01-22         0      0         0  North Macedonia  2082958.0
1   2020-01-23         0      0         0  North Macedonia  2082958.0
2   2020-01-24         0      0         0  North Macedonia  2082958.0
3   2020-01-25         0      0         0  North Macedonia  2082958.0
4   2020-01-26         0      0         0  North Macedonia  2082958.0
..         ...       ...    ...       ...              ...        ...
200 2020-08-09     11839    527      7664  North Macedonia  2082958.0
201 2020-08-10     11942    528      8087  North Macedonia  2082958.0
202 2020-08-11     12083    529      8248  North Macedonia  2082958.0
203 2020-08-12     12217    530      8487  North Macedonia  2082958.0
204 2020-08-13     12357    532      8662  North Macedonia  2082958.0

[205 rows x 6 columns])
<class 'tuple'> ('Norway',           date confirmed deaths recovered countr

[205 rows x 6 columns])
<class 'tuple'> ('Turkey',           date confirmed deaths recovered country    pop_2018
0   2020-01-22         0      0         0  Turkey  82319724.0
1   2020-01-23         0      0         0  Turkey  82319724.0
2   2020-01-24         0      0         0  Turkey  82319724.0
3   2020-01-25         0      0         0  Turkey  82319724.0
4   2020-01-26         0      0         0  Turkey  82319724.0
..         ...       ...    ...       ...     ...         ...
200 2020-08-09    240804   5844    223759  Turkey  82319724.0
201 2020-08-10    241997   5858    224970  Turkey  82319724.0
202 2020-08-11    243180   5873    226155  Turkey  82319724.0
203 2020-08-12    244392   5891    227089  Turkey  82319724.0
204 2020-08-13    245635   5912    228057  Turkey  82319724.0

[205 rows x 6 columns])
<class 'tuple'> ('US',           date confirmed  deaths recovered country  pop_2018
0   2020-01-22         1       0         0      US       0.0
1   2020-01-23         1       0   