In [None]:
%matplotlib inline  

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
import matplotlib.animation as animation
import matplotlib.pyplot as plt
from IPython.display import HTML

#  set the path to the ffmpeg utility
import os
import shutil

# Resolve ffmpeg portably instead of relying on one user's absolute path:
#   1. an explicit FFMPEG_PATH environment variable wins,
#   2. otherwise whatever `ffmpeg` executable is found on PATH,
#   3. on Windows only, fall back to the historical hard-coded location
#      (kept so the original author's machine keeps working).
_ffmpeg_path = os.environ.get('FFMPEG_PATH') or shutil.which('ffmpeg')
if _ffmpeg_path is None and os.name == 'nt':
    _legacy_ffmpeg_path = 'C:\\Users\\pjsca\\Documents\\ffmpeg-20200403-52523b6-win64-static\\bin\\ffmpeg.exe'
    if os.path.isfile(_legacy_ffmpeg_path):
        _ffmpeg_path = _legacy_ffmpeg_path

# Only override matplotlib's default when an executable was actually located.
if _ffmpeg_path is not None:
    plt.rcParams['animation.ffmpeg_path'] = _ffmpeg_path

# Initializations

In [None]:
# Plot styling: one format string per (line style, color) combination.
colors = ['r', 'g', 'b', 'y', 'c', 'k', 'm']
styles = ['-', '--', '-.', ':', '_-']

# Colors vary fastest: all colors are used with the first line style
# before moving on to the next line style.
styles_colors = [
    f'{color}{line_style}'
    for line_style in styles
    for color in colors
]

## Load Data

Load data from the known repos

In [None]:
# COVID confirmed cases
# Global confirmed-cases time series published in the CSSEGISandData/COVID-19
# GitHub repository. The file is downloaded on every run (network required).
url_confirmed= 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv'
# The first row of the CSV holds the column names.
df_confirmed = pd.read_csv(url_confirmed, header=0)

# Preview the first rows of the raw table.
df_confirmed.head()
    

In [None]:
# COVID associated death
# Global deaths time series from the same CSSEGISandData/COVID-19 repository
# as the confirmed cases. Downloaded on every run (network required).
url_deaths = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv'
# The first row of the CSV holds the column names.
df_death = pd.read_csv(url_deaths, header=0)

# Preview the first rows of the raw table.
df_death.head()

In [None]:
from io import StringIO

# World population
url_world_pop = 'https://www.worldometers.info/world-population/population-by-country/'

# Fail fast on a hung connection or an HTTP error instead of silently
# trying to parse an error page as if it were the population table.
r = requests.get(url_world_pop, timeout=30)
r.raise_for_status()

# Name the parser explicitly so the result does not depend on which optional
# parser libraries happen to be installed (and to avoid the bs4 warning).
soup = BeautifulSoup(r.content, 'html.parser')
tables = soup.find_all('table')
if not tables:
    raise ValueError(f'no <table> element found at {url_world_pop}')
table = str(tables[0])

# pandas parses the HTML of the first table; StringIO wraps the literal markup.
df_world_pop = pd.read_html(StringIO(table))[0]

# Index by country name so rows can be looked up with .loc[country].
df_world_pop = df_world_pop.set_index('Country (or dependency)')

df_world_pop.head()

# First look at data
Let us have a first look at the loaded data.

In [None]:
# Display the confirmed-cases table as loaded from the CSV (before any regrouping).
df_confirmed

In [None]:
# Display the deaths table as loaded from the CSV (before any regrouping).
df_death

In [None]:
# Preview the first rows of the scraped population table, indexed by country.
df_world_pop.head()

# Rearranging data 

## grouping data by countries instead of provinces/states
As we can see, some countries have their data split by province/state. Let us group everything by country (only the columns holding the daily counts are aggregated, not Lat, Long, ...).

First, set the latitude and longitude aside in another dataframe.

In [None]:
# Keep the coordinates in their own frame ...
df_lat_long = df_confirmed.loc[:, ['Country/Region', 'Lat', 'Long']]
# ... and strip every non-count column except the country name from the cases table.
df_confirmed = df_confirmed.drop(columns=['Province/State', 'Lat', 'Long'])

When grouping, set each country's latitude and longitude to the mean of the known values of its provinces/states.

In [None]:
# One row per country: average the coordinates of all its provinces/states,
# keeping 'Country/Region' as a regular column.
df_lat_long = df_lat_long.groupby('Country/Region', as_index=False).mean()
df_lat_long

In [None]:
# Date columns = every column that is not descriptive metadata.
# The previous fixed offset of 4 assumed that 'Province/State', 'Lat' and 'Long'
# were still present; they have already been dropped from df_confirmed, so that
# offset silently skipped the first three dates. Selecting by name is correct
# whichever metadata columns are still in the frame.
_metadata_columns = ['Province/State', 'Country/Region', 'Lat', 'Long']
columns_date = df_confirmed.columns[~df_confirmed.columns.isin(_metadata_columns)]
columns_date

In [None]:
# Sum the province/state rows of each country; the group key
# 'Country/Region' becomes the (sorted) index of the result.
df_confirmed = df_confirmed.groupby('Country/Region').sum()
df_confirmed

In [None]:
# The province/state label is not needed once data is aggregated per country.
df_death = df_death.drop(columns=['Province/State'])
df_death.head()

Do the same thing to the death dataframe.

In [None]:
# Remove the coordinates, then sum the province/state rows of each country;
# 'Country/Region' becomes the index of the aggregated table.
df_death = (
    df_death
    .drop(columns=['Lat', 'Long'])
    .groupby('Country/Region')
    .sum()
)
df_death

## Correct countries names

In [None]:
# Population-table name -> name used as 'Country/Region' in the COVID tables.
to_replace = {
    'United States' : 'US',
    'DR Congo' : 'Congo (Kinshasa)',
    'Congo' : 'Congo (Brazzaville)',
    'Czech Republic (Czechia)' : 'Czechia',
    'South Korea' : 'Korea, South',
    'Taiwan' : 'Taiwan*'
}

# Report (instead of crashing on) names that are neither present in their
# original nor in their already-renamed form, e.g. after the site renames a country.
_unmatched = [
    old for old, new in to_replace.items()
    if old not in df_world_pop.index and new not in df_world_pop.index
]
if _unmatched:
    print(f'WARNING: countries not found in the population table: {_unmatched}')

# A dict lookup leaves unknown or already-renamed names untouched, so this cell
# can be re-run safely (list.index raised ValueError on the second run).
world_pop_index = [to_replace.get(name, name) for name in df_world_pop.index]

df_world_pop.index = world_pop_index
df_world_pop.head()

## Recompute the Density (P/Km²)
Holy See had 0 km²

In [None]:
# Make the area column floating point before writing a fractional value into it:
# assigning 0.44 into an integer column relies on silent upcasting, which pandas
# has deprecated (FutureWarning from 2.1; announced to become an error).
df_world_pop['Land Area (Km²)'] = df_world_pop['Land Area (Km²)'].astype(float)
df_world_pop.loc['Holy See', 'Land Area (Km²)'] = 0.44 # it was 0!

# Recompute the density from population and (corrected) land area.
df_world_pop['Density (P/Km²)'] = df_world_pop['Population  (2023)'] / df_world_pop['Land Area (Km²)']
df_world_pop.sort_values(by='Density (P/Km²)', ascending=False)

# save data into files, for offline working if necessary

In [None]:
# Make sure the target folder exists; to_csv does not create missing directories.
os.makedirs('./data', exist_ok=True)

# Persist the prepared tables so the analysis can be repeated offline.
df_confirmed.to_csv('./data/confirmed.csv')
df_death.to_csv('./data/death.csv')
df_world_pop.to_csv('./data/world_pop.csv')

# Get the top `number_of_countries` most affected countries (and Portugal!)

In [None]:
number_of_countries=20

# Rank countries by the value in the last (most recent) date column
# and keep the labels of the first `number_of_countries`.
list_top_affected_countries = df_confirmed.sort_values(
                        by=df_confirmed.columns[-1],
                        ascending=False
                    ).head(number_of_countries).index

# Always include Portugal, but only append it when it is not already in the
# selection: a duplicated label would duplicate rows in every later .loc lookup.
if 'Portugal' not in list_top_affected_countries:
    list_top_affected_countries = list_top_affected_countries.append(pd.Index(['Portugal']))

list_top_affected_countries

# Evolution of the absolute number of confirmed cases by country (top affected countries & Portugal)

In [None]:
# One line per selected country over time (dates on the x axis after the transpose),
# on a logarithmic y axis. The title follows `number_of_countries` so it cannot go
# stale when the selection size is changed.
df_confirmed.loc[list_top_affected_countries].T.plot(
    figsize=(15, 10),
    style=styles_colors,
    logy=True,
    title=f'Number of confirmed cases evolution (top {number_of_countries} countries & Portugal)'
)

# Evolution of the absolute number of deaths by country (top affected countries)

In [None]:
# One line per selected country over time (dates on the x axis after the transpose),
# on a logarithmic y axis. The title follows `number_of_countries` so it cannot go
# stale when the selection size is changed.
df_death.loc[list_top_affected_countries].T.plot(
    figsize=(15, 10),
    style=styles_colors,
    logy=True,
    title=f'Number of deaths evolution (top {number_of_countries} countries & Portugal)'
)

# Number of death per confirmed case (top affected countries)

In [None]:
# Deaths per confirmed case for the selected countries, as a percentage.
# Undefined ratios (NaN, e.g. 0/0) are shown as 0.
_deaths_selected = df_death.loc[list_top_affected_countries]
_confirmed_selected = df_confirmed.loc[list_top_affected_countries]
df_death_by_confirmed = _deaths_selected.div(_confirmed_selected).fillna(0).mul(100)

# Transpose so that dates run along the x axis and each country is one line.
df_death_by_confirmed.transpose().plot(
    title='Percentage of death by confirmed case',
    style=styles_colors,
    ylim=(-0.1, 18),
    figsize=(15, 10),
)

# Confirmed/death data relative to the population size (top affected countries & Portugal)

First, the data is normalized by population size.

In [None]:
# Confirmed-case rows of the selected countries (the numerator of the ratio below).
df_confirmed.loc[list_top_affected_countries]

In [None]:
# Population of the selected countries (the denominator of the ratio below).
# NOTE(review): the column label (double space, year 2023) comes from the scraped
# page and presumably changes when the site updates its table — confirm.
df_world_pop.loc[list_top_affected_countries]['Population  (2023)']

In [None]:
# Population of each selected country, aligned on the country index.
_population_selected = df_world_pop.loc[list_top_affected_countries, 'Population  (2023)']

# Divide every date column by the country's population (row-wise alignment).
df_confirmed_by_pop = df_confirmed.loc[list_top_affected_countries].div(_population_selected, axis=0)
df_death_by_pop = df_death.loc[list_top_affected_countries].div(_population_selected, axis=0)

In [None]:
# Confirmed cases per inhabitant over time, one line per country, log y axis.
# The title follows `number_of_countries` instead of a hard-coded 20.
df_confirmed_by_pop.T.plot(
    figsize=(15, 10),
    style=styles_colors,
    title=f'confirmed case by population size ratio (top {number_of_countries})',
    logy=True
)

In [None]:
# Deaths per inhabitant over time, one line per country, log y axis.
# The title follows `number_of_countries` instead of a hard-coded 20.
df_death_by_pop.T.plot(
    figsize=(15, 10),
    style=styles_colors,
    title=f'deaths by population size ratio (top {number_of_countries} countries)',
    logy=True
)

# Confirmed/death data relative to the country size (top affected countries)

First, the data is normalized by the countries' land area.

In [None]:
# Land area of each selected country, aligned on the country index.
_area_selected = df_world_pop.loc[list_top_affected_countries, 'Land Area (Km²)']

# Divide every date column by the country's land area (row-wise alignment).
df_confirmed_by_size = df_confirmed.loc[list_top_affected_countries].div(_area_selected, axis=0)
df_death_by_size = df_death.loc[list_top_affected_countries].div(_area_selected, axis=0)

In [None]:
# Confirmed cases per km² over time, one line per country, log y axis.
df_confirmed_by_size.transpose().plot(
    title="number of confirmed cases relative to the countries' sizes",
    style=styles_colors,
    logy=True,
    figsize=(15, 10),
)

In [None]:
# Deaths per km² over time, one line per country, log y axis with a grid.
df_death_by_size.transpose().plot(
    title="number of deaths relative to the countries' sizes",
    style=styles_colors,
    grid=True,
    logy=True,
    figsize=(15, 10),
)

# Growth rate over time (top affected countries)

In [None]:
# Selected countries as floats; "today" is every column but the first and
# "previous day" is every column but the last, so the two line up position by position.
_confirmed_selected = df_confirmed.loc[list_top_affected_countries].astype(float)
df_today = _confirmed_selected.iloc[:, 1:]
df_previous_day = _confirmed_selected.iloc[:, :-1]

# Work on the raw array of the previous day so the operation is positional
# (the two frames carry different column labels).
_previous_values = df_previous_day.to_numpy()

# Relative day-over-day change plus one; divisions of a positive value by zero
# (+inf) are turned into NaN before the shift.
df_confirmed_growth_rate = (
    df_today
    .sub(_previous_values)
    .div(_previous_values)
    .replace(np.inf, np.nan)
    .add(1)
)
df_confirmed_growth_rate.head()

In [None]:
# Day-over-day growth factor per country; the y axis is restricted to 1–1.1.
df_confirmed_growth_rate.transpose().plot(
    title="growth rate of confirmed cases",
    style=styles_colors,
    ylim=(1, 1.1),
    grid=True,
    figsize=(30, 15),
)

# Growth rate (top affected countries)

In [None]:
# %matplotlib inline

# Window length, in date columns: prepare_data averages the variation over this
# many columns, and animate shifts its frame index by the same amount.
number_of_amortization_days = 10

def animate(i, *args, **kargs):
    """Draw one frame of the "absolute growth vs. total" animation on the global `fig`.

    Parameters
    ----------
    i : int
        Frame number supplied by FuncAnimation. It is shifted by
        `number_of_amortization_days`, so frame 0 already shows that many points.
    *args : tuple
        `(df, df_delta, max_x, max_y, dates, x_label)` as passed through `fargs`:
        totals per country, averaged variations per country, axis upper limits,
        the date labels shared by both frames, and the x-axis label.
    **kargs
        Ignored.

    Raises
    ------
    IndexError
        If the shifted frame number is beyond the last entry of `dates`.
    """
    df, df_delta, max_x, max_y, dates, x_label = args
    i += number_of_amortization_days
    today = dates[i]
    # Include `today` itself so the curves end on the date shown in the title
    # (slicing with dates[:i] stopped one day short).
    shown_dates = dates[:i + 1]

    # Redraw from scratch on an explicit Axes of `fig`, so the frame never lands
    # on whichever figure happens to be pyplot's "current" one.
    fig.clear()
    ax = fig.add_subplot(1, 1, 1)
    ax.set_xscale('log')
    ax.set_yscale('log')
    ax.set_xlim(1, max_x)
    ax.set_ylim(1, max_y)
    ax.set_xlabel(x_label)
    ax.set_ylabel('Absolute growth')
    ax.set_title(f'{today}')

    for idx, country in enumerate(df.index):
        xx = df.loc[country, shown_dates].values
        yy = df_delta.loc[country, shown_dates].values
        # Cycle through the styles so more countries than styles cannot overflow.
        ax.plot(xx, yy, styles_colors[idx % len(styles_colors)])
        # Label each curve at its most recent point.
        ax.annotate(country, (xx[-1], yy[-1]))
    
def prepare_data(df_in, countries=None, window=None):
    """Build the inputs of the growth animation from a per-country time series.

    Parameters
    ----------
    df_in : pandas.DataFrame
        One row per country, one column per date.
    countries : list-like, optional
        Row labels to keep. Defaults to the global `list_top_affected_countries`.
    window : int, optional
        Number of columns over which the variation is averaged.
        Defaults to the global `number_of_amortization_days`.

    Returns
    -------
    tuple
        `(df, df_delta, max_x, max_y, dates)`: the selected rows with 0 and NaN
        replaced by 1, the averaged variation per date, the largest total, the
        largest variation, and the date labels that have a variation.

    Raises
    ------
    ValueError
        If `window` is smaller than 1.
    """
    if countries is None:
        countries = list_top_affected_countries
    if window is None:
        window = number_of_amortization_days
    if window < 1:
        raise ValueError(f'window must be >= 1, got {window}')

    # get the dates
    dates = df_in.columns
    print(dates)

    # Copy the selected rows and replace NaN and 0 by 1, the lower bound of the
    # logarithmic axes used by the animation. (fillna returns a new frame: the
    # result has to be kept, otherwise NaNs are never replaced.)
    df = df_in.loc[countries].copy().fillna(1).replace(0, 1)

    # Averaged variation: (value at day d - value at day d - window) / window,
    # computed for every date at once instead of growing a frame column by column.
    # The series is assumed to be increasing.
    values = df.to_numpy(dtype=float)
    df_delta = pd.DataFrame(
        (values[:, window:] - values[:, :-window]) / window,
        index=df.index,
        columns=dates[window:],
    ).fillna(1)
    dates = df_delta.columns

    # maximum total over the dates that have a variation
    max_x = df[dates].max().max()
    # maximum variation
    max_y = df_delta.max().max()

    print(f'max delta: {max_y}  max confirmed: {max_x}')

    return df, df_delta, max_x, max_y, dates

In [None]:
%matplotlib inline
import matplotlib
# Effectively remove the size cap on animations embedded in the notebook.
matplotlib.rcParams['animation.embed_limit'] = 2**128

df, df_delta, max_x, max_y, dates = prepare_data(df_confirmed)
x_label = 'Number of confirmed cases'

Writer = animation.writers['ffmpeg']
writer = Writer(fps=2, metadata=dict(artist='pcardoso@ualg.pt'))

# animate() adds number_of_amortization_days to the frame number, so the frame
# count must be reduced by the same constant (previously a hard-coded 10) to
# keep the last frame inside `dates`.
n_frames = len(df_delta.columns) - number_of_amortization_days

plt.ioff()
fig = plt.figure(figsize=(10, 10))
ani = animation.FuncAnimation(fig, animate, frames=n_frames, fargs=(df, df_delta, max_x, max_y, dates, x_label), repeat=False, repeat_delay=5)
ani.save('evolution.mp4', writer=writer)
HTML(ani.to_jshtml())

In [None]:
# %matplotlib inline 

df, df_delta, max_x, max_y, dates = prepare_data(df_death)
x_label = 'Number of death'

Writer = animation.writers['ffmpeg']
writer = Writer(fps=2, metadata=dict(artist='pcardoso@ualg.pt'))

# animate() adds number_of_amortization_days to the frame number; using the full
# column count as the number of frames made the last frames index past the end
# of `dates` (IndexError). Reduce the frame count by the same offset.
n_frames = len(df_delta.columns) - number_of_amortization_days

fig = plt.figure(figsize=(10, 10))
ani = animation.FuncAnimation(fig, animate, frames=n_frames, fargs=(df, df_delta, max_x, max_y, dates, x_label), repeat=False, repeat_delay=5)
ani.save('evolution_death.mp4', writer=writer)
HTML(ani.to_jshtml())