In [24]:
import pandas as pd
from pandas.io.json import json_normalize
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import datetime as dt
%matplotlib qt

## Read the Data
Thanks to https://github.com/pomber/covid19 for preparing the data.

In [25]:
covid_19 = pd.read_json("https://pomber.github.io/covid19/timeseries.json")

## Transform the Data
What we need for better work is a DataFrame with (date, country, confirmed, deaths, recoverd) columns. This is what the json_normalize function do.
And we extract the first date for a country where the confirmed cases has a given value.

In [26]:
covid_df = pd.DataFrame(columns = ['date', 'confirmed', 'deaths', 'recovered', 'country'])
covid_threshold = pd.DataFrame(columns = ['date', 'confirmed', 'deaths', 'recovered', 'country'])
treshold=1000
for country in covid_19.columns:
    # normalize, set country, convert date
    cdf = pd.json_normalize(covid_19[country])
    cdf['country'] = str(country)
    cdf['date'] = pd.to_datetime(cdf['date'])
    covid_df = pd.concat([covid_df, cdf])
        
    # all indices for confirmed greater than and add to covid_threshold
    index_values = cdf[cdf['confirmed'].gt(treshold)].index
    if len(index_values) > 0:
        covid_threshold = pd.concat([covid_threshold, cdf.iloc[[index_values[0]]]])

## Show the Spreading over the world

In [42]:
date_only=covid_threshold[['date','country']].copy()
cnt = date_only.groupby(date_only["date"]).count()
cnt['cum_count'] = cnt['country'].cumsum()

#plot data
fig, ax = plt.subplots(figsize=(15,7))
ax.bar(cnt.index, cnt['cum_count'])

ax.set_title("Number of Countries by First Date Over {} Confirmed Cases".format(treshold))
#set ticks every week
ax.xaxis.set_major_locator(mdates.WeekdayLocator())
#set major ticks format
ax.xaxis.set_major_formatter(mdates.DateFormatter('%b %d'))

In [28]:
def get_country_data(country):
    return covid_df[covid_df['country'] == country]

## Compare Countries

In [96]:
countries=['US', 'China', 'Germany']
xcoords = pd.date_range(start='2020-01-17', end=dt.datetime.now().strftime('%Y-%m-%d'), freq='W')

fig, axes = plt.subplots(len(countries), 2, figsize=(15,7))
yposition = 0
for c in countries:
    df = pd.DataFrame(get_country_data(c))
    df['diff_confirmed'] = df['confirmed'].diff()
    df['rolling_mean_week'] = df['confirmed'].rolling(7).mean()
    df['rolling_mean_diff_week'] = df['diff_confirmed'].rolling(7).mean()
    showLegend = True if yposition == 0 else False
    
    df[['date','confirmed']].plot(ax=axes[yposition, 0], sharex=True, x='date', legend=showLegend)
    df[['date','rolling_mean_week']].plot(ax=axes[yposition, 0], sharex=True, x='date', legend=showLegend)

    df[['date','diff_confirmed']].plot(ax=axes[yposition, 1], sharex=True, x='date', legend=showLegend)
    df[['date','rolling_mean_diff_week']].plot(ax=axes[yposition, 1], sharex=True, x='date', legend=showLegend)
    
    for xc in xcoords:
        axes[yposition, 0].axvline(xc, color='grey', linestyle=':')
        axes[yposition, 1].axvline(xc, color='grey', linestyle=':')
    
    axes[yposition, 0].set_title(c)
    yposition += 1
    
plt.show()

## Adding Population and Percentual View

In [30]:
# downloaded from http://api.worldbank.org/v2/en/indicator/SP.POP.TOTL?downloadformat=csv
population = pd.read_csv("data/API_SP.POP.TOTL_DS2_en_csv_v2_887275.csv", skiprows=4, header=0)

In [31]:
def get_population(p, c, y):
    rv = population.loc[population['Country Name']==c][y]
    rv = 0 if rv.empty else float(rv)
    return rv

In [32]:
def get_percent(h,v):
    return (100 * v)/h if h else 0

In [33]:
covid_df['pop_2018'] = covid_df.apply (lambda row: get_population(population, row['country'], '2018'), axis=1)

In [34]:
covid_df.head()

Unnamed: 0,date,confirmed,deaths,recovered,country,pop_2018
0,2020-01-22,0,0,0,Afghanistan,37172386.0
1,2020-01-23,0,0,0,Afghanistan,37172386.0
2,2020-01-24,0,0,0,Afghanistan,37172386.0
3,2020-01-25,0,0,0,Afghanistan,37172386.0
4,2020-01-26,0,0,0,Afghanistan,37172386.0


In [35]:
covid_df.groupby(['country'], as_index=False, sort=False)[['date']].max()

Unnamed: 0,country,date
0,Afghanistan,2020-04-11
1,Albania,2020-04-11
2,Algeria,2020-04-11
3,Andorra,2020-04-11
4,Angola,2020-04-11
...,...,...
180,Malawi,2020-04-11
181,South Sudan,2020-04-11
182,Western Sahara,2020-04-11
183,Sao Tome and Principe,2020-04-11


In [36]:
# select the data for the last existing date in the dataframe
country_df = covid_df.groupby('country', as_index=False).apply(lambda x: x.loc[x.date.argmax()])

### compare confirmed vs. deaths

In [37]:
country_df['percent_deaths'] = country_df.apply (lambda x: get_percent(x.confirmed, x.deaths), axis=1)
## country_df = country_df[country_df['percent_deaths'] > 0.0001]
country_df = country_df.dropna()

In [38]:
country_df_sort = country_df.sort_values(by=['percent_deaths'], ascending=True)

In [39]:
country_df_sort.plot.barh(x='country', y='percent_deaths')

<matplotlib.axes._subplots.AxesSubplot at 0x1e8637b8548>

In [40]:
country_df

80,date,confirmed,deaths,recovered,country,pop_2018,percent_deaths
0,2020-04-11,555,18,32,Afghanistan,37172386.0,3.243243
1,2020-04-11,433,23,197,Albania,2866376.0,5.311778
2,2020-04-11,1825,275,460,Algeria,42228429.0,15.068493
3,2020-04-11,601,26,71,Andorra,77006.0,4.326123
4,2020-04-11,19,2,4,Angola,30809762.0,10.526316
...,...,...,...,...,...,...,...
180,2020-04-11,268,2,57,West Bank and Gaza,4569087.0,0.746269
181,2020-04-11,4,0,0,Western Sahara,0.0,0.000000
182,2020-04-11,1,0,0,Yemen,0.0,0.000000
183,2020-04-11,40,2,28,Zambia,17351822.0,5.000000


In [41]:
country_df.to_csv('data/covid-death-confirmed.csv', index=False)

In [55]:
for g in covid_df.groupby('country'):
    print(type(g), g)

<class 'tuple'> ('Afghanistan',          date confirmed deaths recovered      country    pop_2018
0  2020-01-22         0      0         0  Afghanistan  37172386.0
1  2020-01-23         0      0         0  Afghanistan  37172386.0
2  2020-01-24         0      0         0  Afghanistan  37172386.0
3  2020-01-25         0      0         0  Afghanistan  37172386.0
4  2020-01-26         0      0         0  Afghanistan  37172386.0
..        ...       ...    ...       ...          ...         ...
76 2020-04-07       423     14        18  Afghanistan  37172386.0
77 2020-04-08       444     14        29  Afghanistan  37172386.0
78 2020-04-09       484     15        32  Afghanistan  37172386.0
79 2020-04-10       521     15        32  Afghanistan  37172386.0
80 2020-04-11       555     18        32  Afghanistan  37172386.0

[81 rows x 6 columns])
<class 'tuple'> ('Albania',          date confirmed deaths recovered  country   pop_2018
0  2020-01-22         0      0         0  Albania  2866376.0
1 

<class 'tuple'> ('China',          date confirmed deaths recovered country      pop_2018
0  2020-01-22       548     17        28   China  1.392730e+09
1  2020-01-23       643     18        30   China  1.392730e+09
2  2020-01-24       920     26        36   China  1.392730e+09
3  2020-01-25      1406     42        39   China  1.392730e+09
4  2020-01-26      2075     56        49   China  1.392730e+09
..        ...       ...    ...       ...     ...           ...
76 2020-04-07     82718   3335     77410   China  1.392730e+09
77 2020-04-08     82809   3337     77567   China  1.392730e+09
78 2020-04-09     82883   3339     77679   China  1.392730e+09
79 2020-04-10     82941   3340     77791   China  1.392730e+09
80 2020-04-11     83014   3343     77877   China  1.392730e+09

[81 rows x 6 columns])
<class 'tuple'> ('Colombia',          date confirmed deaths recovered   country    pop_2018
0  2020-01-22         0      0         0  Colombia  49648685.0
1  2020-01-23         0      0         

[81 rows x 6 columns])
<class 'tuple'> ('India',          date confirmed deaths recovered country      pop_2018
0  2020-01-22         0      0         0   India  1.352617e+09
1  2020-01-23         0      0         0   India  1.352617e+09
2  2020-01-24         0      0         0   India  1.352617e+09
3  2020-01-25         0      0         0   India  1.352617e+09
4  2020-01-26         0      0         0   India  1.352617e+09
..        ...       ...    ...       ...     ...           ...
76 2020-04-07      5311    150       421   India  1.352617e+09
77 2020-04-08      5916    178       506   India  1.352617e+09
78 2020-04-09      6725    226       620   India  1.352617e+09
79 2020-04-10      7598    246       774   India  1.352617e+09
80 2020-04-11      8446    288       969   India  1.352617e+09

[81 rows x 6 columns])
<class 'tuple'> ('Indonesia',          date confirmed deaths recovered    country     pop_2018
0  2020-01-22         0      0         0  Indonesia  267663435.0
1  2020-01-

[81 rows x 6 columns])
<class 'tuple'> ('Mozambique',          date confirmed deaths recovered     country    pop_2018
0  2020-01-22         0      0         0  Mozambique  29495962.0
1  2020-01-23         0      0         0  Mozambique  29495962.0
2  2020-01-24         0      0         0  Mozambique  29495962.0
3  2020-01-25         0      0         0  Mozambique  29495962.0
4  2020-01-26         0      0         0  Mozambique  29495962.0
..        ...       ...    ...       ...         ...         ...
76 2020-04-07        10      0         1  Mozambique  29495962.0
77 2020-04-08        17      0         1  Mozambique  29495962.0
78 2020-04-09        17      0         1  Mozambique  29495962.0
79 2020-04-10        20      0         2  Mozambique  29495962.0
80 2020-04-11        20      0         2  Mozambique  29495962.0

[81 rows x 6 columns])
<class 'tuple'> ('Namibia',          date confirmed deaths recovered  country   pop_2018
0  2020-01-22         0      0         0  Namibia  24

<class 'tuple'> ('Slovenia',          date confirmed deaths recovered   country   pop_2018
0  2020-01-22         0      0         0  Slovenia  2067372.0
1  2020-01-23         0      0         0  Slovenia  2067372.0
2  2020-01-24         0      0         0  Slovenia  2067372.0
3  2020-01-25         0      0         0  Slovenia  2067372.0
4  2020-01-26         0      0         0  Slovenia  2067372.0
..        ...       ...    ...       ...       ...        ...
76 2020-04-07      1059     36       102  Slovenia  2067372.0
77 2020-04-08      1091     40       120  Slovenia  2067372.0
78 2020-04-09      1124     43       128  Slovenia  2067372.0
79 2020-04-10      1160     45       137  Slovenia  2067372.0
80 2020-04-11      1188     50       148  Slovenia  2067372.0

[81 rows x 6 columns])
<class 'tuple'> ('Somalia',          date confirmed deaths recovered  country    pop_2018
0  2020-01-22         0      0         0  Somalia  15008154.0
1  2020-01-23         0      0         0  Somalia  

[81 rows x 6 columns])
<class 'tuple'> ('Vietnam',          date confirmed deaths recovered  country    pop_2018
0  2020-01-22         0      0         0  Vietnam  95540395.0
1  2020-01-23         2      0         0  Vietnam  95540395.0
2  2020-01-24         2      0         0  Vietnam  95540395.0
3  2020-01-25         2      0         0  Vietnam  95540395.0
4  2020-01-26         2      0         0  Vietnam  95540395.0
..        ...       ...    ...       ...      ...         ...
76 2020-04-07       249      0       123  Vietnam  95540395.0
77 2020-04-08       251      0       126  Vietnam  95540395.0
78 2020-04-09       255      0       128  Vietnam  95540395.0
79 2020-04-10       257      0       144  Vietnam  95540395.0
80 2020-04-11       258      0       144  Vietnam  95540395.0

[81 rows x 6 columns])
<class 'tuple'> ('West Bank and Gaza',          date confirmed deaths recovered             country   pop_2018
0  2020-01-22         0      0         0  West Bank and Gaza  4569087.