In [1]:
%matplotlib qt
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as m_date
import datetime as dt

In [2]:
import warnings
# Install a blanket 'ignore' filter: no category or module is given, so every
# warning raised after this point is suppressed.
# NOTE(review): this also hides pandas/matplotlib deprecation warnings emitted
# by the cells below.
warnings.filterwarnings('ignore')

## Read the Data
Thanks to https://github.com/pomber/covid19 for preparing the data.

In [3]:
# Download the time-series JSON and parse it into a DataFrame.
# The transform cell iterates over `covid_19.columns` as country names and
# feeds each column to `pd.json_normalize`, so each cell presumably holds one
# daily record (date / confirmed / deaths / recovered) -- TODO confirm against the feed.
covid_19 = pd.read_json("https://pomber.github.io/covid19/timeseries.json")

## Transform the Data
What we need for easier analysis is a DataFrame with (date, country, confirmed, deaths, recovered) columns. This is what the json_normalize function does.
We also extract, for each country, the first date on which the number of confirmed cases exceeds a given threshold.

In [5]:
# Build two long-format frames from the raw per-country columns:
#   covid_df        -- every daily record of every country
#   covid_threshold -- per country, the first record whose confirmed count
#                      is greater than `threshold`
record_columns = ['date', 'confirmed', 'deaths', 'recovered', 'country']
threshold = 1000


def _concat_records(frames):
    """Stack per-country frames into one frame with `record_columns` first.

    Returns an empty frame with the expected columns when `frames` is empty.
    Per-country row labels are kept (no ignore_index) so the resulting index
    matches what concatenating the frames one by one would have produced.
    """
    if not frames:
        return pd.DataFrame(columns=record_columns)
    stacked = pd.concat(frames)
    extras = [col for col in stacked.columns if col not in record_columns]
    return stacked.reindex(columns=record_columns + extras)


country_frames = []   # one normalized frame per country
threshold_rows = []   # one single-row frame per country that crossed the threshold
for country in covid_19.columns:
    # normalize, set country, convert date
    cdf = pd.json_normalize(covid_19[country])
    cdf['country'] = str(country)
    cdf['date'] = pd.to_datetime(cdf['date'])
    country_frames.append(cdf)

    # first record (in row order) with confirmed greater than the threshold;
    # selected positionally so it does not depend on the index labels
    over_threshold = cdf[cdf['confirmed'].gt(threshold)]
    if not over_threshold.empty:
        threshold_rows.append(over_threshold.iloc[[0]])

# Concatenate once instead of growing the frames inside the loop (which copies
# all accumulated rows on every iteration).
covid_df = _concat_records(country_frames)
covid_threshold = _concat_records(threshold_rows)

## Show the Spread Across the World

In [6]:
# Keep only the two columns needed here: the date on which each country first
# exceeded the threshold, and the country itself.
date_only = covid_threshold.loc[:, ['date', 'country']].copy()

# Number of countries per crossing date, plus the running total across dates.
cnt = date_only.groupby(date_only["date"]).count()
cnt = cnt.assign(cum_count=cnt['country'].cumsum())

# Bar chart of the running total over time.
fig, ax = plt.subplots(figsize=(15, 7))
ax.bar(cnt.index, cnt['cum_count'])
ax.set_title("Number of Countries by First Date Over {} Confirmed Cases".format(threshold))

# Weekly major ticks, labelled with abbreviated month and day of month.
weekly_ticks = m_date.WeekdayLocator()
tick_labels = m_date.DateFormatter('%b %d')
ax.xaxis.set_major_locator(weekly_ticks)
ax.xaxis.set_major_formatter(tick_labels)

In [6]:
# Display the (date, country) pairs with the most recent threshold-crossing
# date first; the sorted frame is only shown, `date_only` itself is not modified.
date_only.sort_values(by='date', ascending=False)

Unnamed: 0,date,country
775,2022-03-07,Tonga
744,2022-02-04,Kiribati
740,2022-01-31,Solomon Islands
736,2022-01-27,Palau
591,2021-09-04,Grenada
...,...,...
46,2020-03-08,Germany
40,2020-03-02,Iran
38,2020-02-29,Italy
35,2020-02-26,"Korea, South"


In [7]:
def get_country_data(country_name, data=None):
    """Return all rows belonging to one country.

    Parameters
    ----------
    country_name
        Value compared with ``==`` against the ``'country'`` column.
    data : pandas.DataFrame, optional
        Frame to filter; it must have a ``'country'`` column. Defaults to the
        notebook-level ``covid_df``.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the matching rows with their original index
        labels, so callers can add columns without touching the source frame.
        Empty when no row matches.
    """
    frame = covid_df if data is None else data
    return frame[frame['country'] == country_name].copy()

## Compare Countries

In [8]:
# Compare countries: one subplot row per country.
#   left column  -- cumulative `y_property` and its 7-row rolling mean
#   right column -- row-to-row difference of `y_property` and its 7-row rolling mean
countries = ['Germany', "Austria"]
# Positions of the dotted vertical guide lines: weekly ('W') dates from
# 2019-01-01 up to today.
x_axis = pd.date_range(start='2019-01-01', end=dt.datetime.now().strftime('%Y-%m-%d'), freq='W')
y_property = 'deaths'  # properties deaths, confirmed
diff_property = f"diff_{y_property}"  # loop-invariant, so computed once

# squeeze=False keeps `axes` two-dimensional even when only one country is
# listed; otherwise `axes[row, col]` fails for a single-row grid.
_, axes = plt.subplots(len(countries), 2, figsize=(15, 7), squeeze=False)
for row, c in enumerate(countries):
    # work on a copy so the derived columns never leak into covid_df
    df = get_country_data(c).copy()

    df[diff_property] = df[y_property].diff()
    df['rolling_mean_week'] = df[y_property].rolling(7).mean()
    df['rolling_mean_diff_week'] = df[diff_property].rolling(7).mean()

    # only the first row carries legends
    show_legend = row == 0
    left, right = axes[row, 0], axes[row, 1]

    df[['date', y_property]].plot(ax=left, sharex=True, x='date', legend=show_legend)
    df[['date', 'rolling_mean_week']].plot(ax=left, sharex=True, x='date', legend=show_legend)

    df[['date', diff_property]].plot(ax=right, sharex=True, x='date', legend=show_legend)
    df[['date', 'rolling_mean_diff_week']].plot(ax=right, sharex=True, x='date', legend=show_legend)

    for xc in x_axis:
        left.axvline(xc, color='grey', linestyle=':')
        right.axvline(xc, color='grey', linestyle=':')

    left.set_title(c)

plt.show()

## Adding Population and Percentage View

In [10]:
# downloaded from http://api.worldbank.org/v2/en/indicator/SP.POP.TOTL?downloadformat=csv
# The first 4 lines of the file are skipped and the next line is used as the
# header row (presumably the skipped lines are a metadata preamble -- TODO confirm).
# Later cells read the 'Country Name' column and a year column such as '2018'.
population = pd.read_csv("../data/API_SP.POP.TOTL_DS2_en_csv_v2_2252106.csv", skiprows=4, header=0)

In [11]:
def get_population(country_name, y, data=None):
    """Look up one country's value in a given column of the population table.

    Parameters
    ----------
    country_name
        Value compared with ``==`` against the ``'Country Name'`` column.
    y
        Label of the column to read (e.g. ``'2018'``).
    data : pandas.DataFrame, optional
        Table to search; it must have a ``'Country Name'`` column and a column
        ``y``. Defaults to the notebook-level ``population``.

    Returns
    -------
    float or int
        The value of the first matching row converted to ``float``, or ``0``
        when no row matches ``country_name``.
    """
    table = population if data is None else data
    matches = table.loc[table['Country Name'] == country_name, y]
    if matches.empty:
        return 0
    # Take the first match explicitly: calling float() on a Series is
    # deprecated for single-element Series and raises for longer ones.
    return float(matches.iloc[0])

In [12]:
def get_percent(h,v):
    """Return `v` expressed as a percentage of `h`.

    A falsy `h` (for example 0 or None) yields 0 instead of dividing.
    """
    if not h:
        return 0
    return (100 * v)/h

In [13]:
# Look up the 2018 population once per distinct country and broadcast it to
# every row of that country, instead of repeating the table lookup per row.
pop_2018_by_country = {
    name: get_population(name, '2018')
    for name in covid_df['country'].unique()
}
covid_df['pop_2018'] = covid_df['country'].map(pop_2018_by_country)

In [14]:
# Preview the first rows to check the newly added pop_2018 column.
covid_df.head()

Unnamed: 0,date,confirmed,deaths,recovered,country,pop_2018
0,2020-01-22,0,0,0,Afghanistan,37172386.0
1,2020-01-23,0,0,0,Afghanistan,37172386.0
2,2020-01-24,0,0,0,Afghanistan,37172386.0
3,2020-01-25,0,0,0,Afghanistan,37172386.0
4,2020-01-26,0,0,0,Afghanistan,37172386.0


In [15]:
# Display the most recent date available for each country; sort=False keeps
# the countries in their order of first appearance in covid_df.
covid_df.groupby(['country'], as_index=False, sort=False)[['date']].max()

Unnamed: 0,country,date
0,Afghanistan,2022-03-08
1,Albania,2022-03-08
2,Algeria,2022-03-08
3,Andorra,2022-03-08
4,Angola,2022-03-08
...,...,...
193,West Bank and Gaza,2022-03-08
194,Winter Olympics 2022,2022-03-08
195,Yemen,2022-03-08
196,Zambia,2022-03-08


In [16]:
# select the data for the last existing date of each country in the dataframe
# Sorting by date and keeping the last row per country avoids looking up a
# positional argmax result as an index label, which is only correct while every
# country's rows happen to be labelled 0..n-1.
country_df = (
    covid_df
    .sort_values('date', kind='stable', na_position='first')  # missing dates can never be "latest"
    .drop_duplicates(subset='country', keep='last')            # one row per country: its latest date
    .sort_values('country', kind='stable')                     # countries in alphabetical order
    .reset_index(drop=True)                                    # plain 0..N-1 index
)

### compare confirmed vs. deaths

In [17]:
# Deaths as a percentage of confirmed cases, computed row by row.
country_df['percent_deaths'] = country_df.apply(
    lambda row: get_percent(row['confirmed'], row['deaths']), axis=1
)
# Keep only countries above 3 percent, then drop rows with any missing value.
country_df = country_df.loc[country_df['percent_deaths'] > 3.00].dropna()

In [18]:
# Sort ascending by percent_deaths; the result is the input of the horizontal bar chart.
country_df_sort = country_df.sort_values(by=['percent_deaths'], ascending=True)

In [19]:
# Horizontal bar chart: one bar per country, bar length = percent_deaths.
country_df_sort.plot.barh(x='country', y='percent_deaths')

<AxesSubplot:ylabel='country'>

In [20]:
# Display the filtered per-country frame (only rows with percent_deaths above 3).
country_df

776,date,confirmed,deaths,recovered,country,pop_2018,percent_deaths
0,2022-03-08,175353,7630,0,Afghanistan,37172386.0,4.351223
22,2022-03-08,372645,15590,0,Bosnia and Herzegovina,3323929.0,4.183606
26,2022-03-08,1105968,35964,0,Bulgaria,7025037.0,3.251812
28,2022-03-08,601020,19394,0,Burma,0.0,3.226848
53,2022-03-08,841575,35309,0,Ecuador,17084357.0,4.195586
54,2022-03-08,492774,24244,0,Egypt,0.0,4.919902
65,2022-03-08,11963,365,0,Gambia,0.0,3.051074
101,2022-03-08,7384,294,0,Liberia,4818977.0,3.981582
106,2022-03-08,9,2,0,MS Zaandam,0.0,22.222222
108,2022-03-08,85452,2619,0,Malawi,18143315.0,3.064879


In [22]:
# Write the filtered per-country frame to CSV; index=False leaves the row index out of the file.
country_df.to_csv('../data/covid-death-confirmed.csv', index=False)