## Import Libraries

In [4]:

import os
from pathlib import Path

# Display options
from IPython.display import display, Math, Latex

# Data processing
import numpy as np
import pandas as pd

# Data visualisation
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib.dates as mdates
import seaborn as sns
sns.set()

# Additional resources
import iso3166
from iso3166 import countries, countries_by_name # ISO data for countries
import world_bank_data as wb # World Bank Data for population data

In [5]:
from scipy.stats.mstats import gmean
from numpy import log as ln

In [6]:
notebook_path = os.path.abspath("Notebook.ipynb")
project_path = Path(notebook_path).parent.parent

data_path = os.path.join(project_path, 'data')
external_data_path = os.path.join(data_path, 'external', '')
processed_data_path = os.path.join(data_path, 'processed', '')

In [7]:
{
    'ALB': '2020-03-12',
    'AND': '2020-03-12',
    'AUT': '2020-03-05',
    'BEL': '2020-03-17',
    'BGR': '2020-04-09',
    'BIH': '2020-03-19',
    'CHE': '2020-03-16',
    'CZE': '2020-03-14',
    'DEU': '2020-03-08',
    'DNK': '2020-03-02',
    'ESP': '2020-03-13',
    'EST': '2020-03-28',
    'FIN': '2020-03-15',
    'FRA': '2020-03-16',
    'GBR': '2020-03-22',
    'GRC': '2020-03-22',
    'HRV': '2020-03-22',
    'HUN': '2020-03-27',
    'IRL': '2020-03-25',
    'ISL': '2020-05-04',
    'ITA': '2020-02-22',
    'LUX': '2020-03-16',
    'MDA': '2020-03-23',
    'NLD': '2020-03-11',
    'NOR': '2020-05-04',
    'POL': '2020-03-30',
    'PRT': '2020-04-17',
    'ROU': '2020-03-11',
    'SMR': '2020-04-16',
    'SRB': '2020-03-14',
    'SVK': '2020-03-11',
    'SVN': '2020-03-13',
    'SWE': '2020-05-04',
    'UKR': '2020-03-19'
}

{'ALB': '2020-03-12',
 'AND': '2020-03-12',
 'AUT': '2020-03-05',
 'BEL': '2020-03-17',
 'BGR': '2020-04-09',
 'BIH': '2020-03-19',
 'CHE': '2020-03-16',
 'CZE': '2020-03-14',
 'DEU': '2020-03-08',
 'DNK': '2020-03-02',
 'ESP': '2020-03-13',
 'EST': '2020-03-28',
 'FIN': '2020-03-15',
 'FRA': '2020-03-16',
 'GBR': '2020-03-22',
 'GRC': '2020-03-22',
 'HRV': '2020-03-22',
 'HUN': '2020-03-27',
 'IRL': '2020-03-25',
 'ISL': '2020-05-04',
 'ITA': '2020-02-22',
 'LUX': '2020-03-16',
 'MDA': '2020-03-23',
 'NLD': '2020-03-11',
 'NOR': '2020-05-04',
 'POL': '2020-03-30',
 'PRT': '2020-04-17',
 'ROU': '2020-03-11',
 'SMR': '2020-04-16',
 'SRB': '2020-03-14',
 'SVK': '2020-03-11',
 'SVN': '2020-03-13',
 'SWE': '2020-05-04',
 'UKR': '2020-03-19'}

# COVID-19 Data

## Get Corona Dataset

- Load raw data from GitHub repository -> raw_data_all
- Inspect dataset

### Load Dataset

In [8]:
# Load Dataset (Source: RamiKrispin GitHub)

# Assign repository URL
dataset_url = 'https://raw.githubusercontent.com/RamiKrispin/coronavirus-csv/master/coronavirus_dataset.csv'

# Load dataset to dataframe
raw_data_all = pd.read_csv(dataset_url)

 ### Inspect Dataset

In [9]:
# Print dataframe head
raw_data_all.head()

Unnamed: 0,Province.State,Country.Region,Lat,Long,date,cases,type
0,,Afghanistan,33.0,65.0,2020-01-22,0,confirmed
1,,Afghanistan,33.0,65.0,2020-01-23,0,confirmed
2,,Afghanistan,33.0,65.0,2020-01-24,0,confirmed
3,,Afghanistan,33.0,65.0,2020-01-25,0,confirmed
4,,Afghanistan,33.0,65.0,2020-01-26,0,confirmed


In [10]:
# Print basic information about dataset
print('Size/Shape of the dataset (rows,cols): ', raw_data_all.shape)
print('\n')
print('Missing values per column:')
print(raw_data_all.isnull().sum())
print('\n')
print('Data type of each column:')
print(raw_data_all.dtypes)

Size/Shape of the dataset (rows,cols):  (79968, 7)


Missing values per column:
Province.State    56406
Country.Region        0
Lat                   0
Long                  0
date                  0
cases                 0
type                  0
dtype: int64


Data type of each column:
Province.State     object
Country.Region     object
Lat               float64
Long              float64
date               object
cases               int64
type               object
dtype: object


In [11]:
# Total number of reported Provinces/States
province_list = raw_data_all['Province.State'].unique()
print("The total number of provinces/states with COVID-19 cases = {}".format(province_list.size))

The total number of provinces/states with COVID-19 cases = 83


In [12]:
# Total number of reported Countries/Regions
country_list = raw_data_all['Country.Region'].unique()
print("Total number of countries with COVID-19 cases = {}".format(country_list.size))

Total number of countries with COVID-19 cases = 187


In [13]:
# Date range in 'date'
print("First date in dataset = {}".format(raw_data_all['date'].min()))
print("Last date in dataset = {}".format(raw_data_all['date'].max()))

First date in dataset = 2020-01-22
Last date in dataset = 2020-05-02


In [14]:
# Unique values in 'type'
print("Types of reported cases: {}".format(raw_data_all['type'].unique()))

Types of reported cases: ['confirmed' 'death' 'recovered']


## Pre-Processing

- Merge Province.State into Country.Region column
- Get ISO Data & World Bank Data about countries
- Split dataset into 'confirmed', 'death', 'recovered'

### Inspect Country data

In [15]:
raw_data_all['Country.Region'].unique()

array(['Afghanistan', 'Albania', 'Algeria', 'Andorra', 'Angola',
       'Antigua and Barbuda', 'Argentina', 'Armenia', 'Austria',
       'Azerbaijan', 'Bahamas', 'Bahrain', 'Bangladesh', 'Barbados',
       'Belarus', 'Belgium', 'Belize', 'Benin', 'Bhutan', 'Bolivia',
       'Bosnia and Herzegovina', 'Botswana', 'Brazil', 'Brunei',
       'Bulgaria', 'Burkina Faso', 'Burma', 'Burundi', 'Cabo Verde',
       'Cambodia', 'Cameroon', 'Central African Republic', 'Chad',
       'Chile', 'Colombia', 'Comoros', 'Congo (Brazzaville)',
       'Congo (Kinshasa)', 'Costa Rica', "Cote d'Ivoire", 'Croatia',
       'Cuba', 'Cyprus', 'Czechia', 'Denmark', 'Diamond Princess',
       'Djibouti', 'Dominica', 'Dominican Republic', 'Ecuador', 'Egypt',
       'El Salvador', 'Equatorial Guinea', 'Eritrea', 'Estonia',
       'Eswatini', 'Ethiopia', 'Fiji', 'Finland', 'France', 'Gabon',
       'Gambia', 'Georgia', 'Germany', 'Ghana', 'Greece', 'Grenada',
       'Guatemala', 'Guinea', 'Guinea-Bissau', 'Guyana', 

### Remove Cruise Ships

In [16]:
raw_data_all.shape

(79968, 7)

In [17]:
# Cruise ships
ships = ['Diamond Princess', 'MS Zaandam', 'Grand Princess']

In [18]:
# Remove cruise ship data from dataset
for ship in ships:
    # Create list of index labels
    index_names = raw_data_all[(raw_data_all['Province.State'] == ship) | (raw_data_all['Country.Region'] == ship)].index
    # Delete rows with labels
    raw_data_all.drop(index_names, inplace=True)

In [19]:
raw_data_all.shape

(78948, 7)

### Clean up Provice.State

**Goal:** only Country should remain to specify location

In [20]:
# Create temporary data frame: Select all row where 'Province.State' != NaN
temp_df_notna = raw_data_all[pd.notna(raw_data_all['Province.State'])]

In [21]:
# Provinces: Sum up data in Provinces of China and Australia
temp_df_province = temp_df_notna[(temp_df_notna['Country.Region'] == 'China') | (temp_df_notna['Country.Region'] == 'Australia')]

# Group data by Country.Region, data and type - this removes Province.State column
# Sum cases within groups
# Ungroup to reset index
temp_df_province = temp_df_province.groupby(['Country.Region', 'date', 'type']) \
                                    .sum() \
                                    .reset_index() 

In [22]:
# Rename former Colonies: Select all data where Country is not China, Australia or Canada. Delete original Country name column and replace it with Province names.
# Select all rows where Country is neither China, nor Australia or Canada
# Delete column 'Country.Region' and rename 'Province.State' column to 'Country.Region'
temp_df_colonies = temp_df_notna[(temp_df_notna['Country.Region'] != 'China') 
                                 & (temp_df_notna['Country.Region'] != 'Australia')
                                & (temp_df_notna['Country.Region'] != 'Canada')] \
                                .drop('Country.Region', axis=1) \
                                .rename(columns={'Province.State': 'Country.Region'})

In [23]:
# Province == NaN: Remove Province.State column from all rows where Country is not Canada
# Select all rows where Country != Canada
temp_df_not_canada = raw_data_all[raw_data_all['Country.Region'] != 'Canada']

# Select all rows where Province is NaN
temp_df_not_canada = temp_df_not_canada[temp_df_not_canada['Province.State'].isna()]

# Delete Province column
temp_df_not_canada = temp_df_not_canada.drop('Province.State', axis=1)

In [24]:
# Canada: Select all rows where Country.Region is Canada
temp_df_canada = raw_data_all[raw_data_all['Country.Region'] == 'Canada']

# Group data by Country.Region, data and type - this removes Province.State column
# Sum cases within groups
# Ungroup to reset index
temp_df_canada = temp_df_canada.groupby(['Country.Region', 'date', 'type']) \
                                    .sum() \
                                    .reset_index()

In [25]:
# Print all shapes
print("Shape where Country is not Canada:")
print(temp_df_not_canada.shape)
print("Shape where Country is Canada:")
print(temp_df_canada.shape)
print("Shape of Provinces in China or Australia:")
print(temp_df_province.shape)
print("Shape of Colonies:")
print(temp_df_colonies.shape)

Shape where Country is not Canada:
(55692, 6)
Shape where Country is Canada:
(306, 6)
Shape of Provinces in China or Australia:
(612, 6)
Shape of Colonies:
(7956, 6)


In [26]:
print("Sum of all rows of all temporary dataframes: {}".format(temp_df_not_canada.shape[0] + temp_df_canada.shape[0] + temp_df_province.shape[0] + temp_df_colonies.shape[0]))
print("Number of rows in raw data: {}".format(raw_data_all.shape[0]))

# Join temp dataframes together
df_countries = pd.concat([temp_df_not_canada, temp_df_province, temp_df_colonies, temp_df_canada], axis=0, sort=True)

print("Joined dataset:")
print(df_countries.head())
print(df_countries.shape)
# Delete temporary data frames from memory
del temp_df_not_canada, temp_df_province, temp_df_colonies, temp_df_canada

Sum of all rows of all temporary dataframes: 64566
Number of rows in raw data: 78948
Joined dataset:
  Country.Region   Lat  Long  cases        date       type
0    Afghanistan  33.0  65.0      0  2020-01-22  confirmed
1    Afghanistan  33.0  65.0      0  2020-01-23  confirmed
2    Afghanistan  33.0  65.0      0  2020-01-24  confirmed
3    Afghanistan  33.0  65.0      0  2020-01-25  confirmed
4    Afghanistan  33.0  65.0      0  2020-01-26  confirmed
(64566, 6)


#### Update Country Names to Official Names

In [27]:
# Function returns error at missing country name as string
# The error equals the name of the object in the search
def check_country_names(country):
    try:
        countries.get(country)
    except KeyError as error:
        return eval(str(error))

In [28]:
# Create list of all countries that cannot be found by their names
countries_not_found = [check_country_names(country) for country in df_countries['Country.Region'].unique()]
# Remove all None values (countries whee no error occured) from list
countries_not_found = list(filter(None, countries_not_found))

In [29]:
# Search coordinates of Channel Islands to get official country name
# Import library for reverse geocoding
import reverse_geocoder as rg

# Search for tuple of Lat, Long where Country.Region == Channel Islands
rg.search((49.3723, -2.3644))

Loading formatted geocoded file...


[OrderedDict([('lat', '49.45981'),
              ('lon', '-2.53527'),
              ('name', 'Saint Peter Port'),
              ('admin1', 'St Peter Port'),
              ('admin2', ''),
              ('cc', 'GG')])]

In [30]:
official_country_names = [
    'Bolivia, Plurinational State of',# Bolivia
    'Brunei Darussalam',# Brunei
    'Myanmar',# Burma
    'Congo',# Congo (Brazzaville)
    'Congo, Democratic Republic of the',# Congo (Kinshasa)
    "Côte d'Ivoire",# Cote d'Ivoire
    'Iran, Islamic Republic of',# Iran
    'Korea, Republic of',# Korea, South
    "Lao People's Democratic Republic",# Laos
    'Moldova, Republic of',# Moldova
    'Russian Federation',# Russia
    'Syrian Arab Republic',# Syria
    'Taiwan, Province of China',# Taiwan*
    'Tanzania, United Republic of',# Tanzania
    'United Kingdom of Great Britain and Northern Ireland',# United Kingdom
    'Venezuela, Bolivarian Republic of',# Venezuela
    'Viet Nam',# Vietnam
    'Palestine, State of',# West Bank and Gaza
    'Virgin Islands, British',# British Virgin Islands
    'Guernsey',# Channel Islands: Estimated by coordinates as Saint Peter Port on Guernsey
    'Curaçao',# Curacao
    'Réunion',# Reunion
    'Saint Barthélemy',# Saint Barthelemy
    'Sint Maarten (Dutch part)',# Sint Maarten
    'Saint Martin (French part)'# St Martin
]

In [31]:
# Create a zip object from two lists
# Create a dictionary from zip object
missing_countries_dict = dict(zip(countries_not_found, official_country_names))

In [32]:
for key, value in missing_countries_dict.items():
    df_countries.loc[(df_countries['Country.Region'] == key), 'Country.Region'] = value

### Add Display Name Column for Shorter Names in Figures

In [33]:
display_name_dict = {
    'Bolivia': 'Bolivia, Plurinational State of',
    'Brunei': 'Brunei Darussalam',
    'Congo': 'Congo',
    'DR Congo': 'Congo, Democratic Republic of the',
    'Iran': 'Iran, Islamic Republic of',
    'South Korea': 'Korea, Republic of',
    'Laos': "Lao People's Democratic Republic",
    'Moldova': 'Moldova, Republic of',
    'Russia': 'Russian Federation',
    'Syria': 'Syrian Arab Republic',
    'Taiwan': 'Taiwan, Province of China',
    'Tanzania': 'Tanzania, United Republic of',
    'United Kingdom': 'United Kingdom of Great Britain and Northern Ireland',
    'United States': 'US',
    'Venezuela': 'Venezuela, Bolivarian Republic of',
    'Vietnam': 'Viet Nam',
    'Palestine': 'Palestine, State of',
    'British Virgin Islands': 'Virgin Islands, British',
    'Channel Islands': 'Guernsey',
    'Sint Maarten': 'Sint Maarten (Dutch part)',
    'St Martin': 'Saint Martin (French part)'
}

In [34]:
# Add column from Country.Region, but for countries not found use keys in missing_countries_dict
df_countries['DisplayName'] = df_countries['Country.Region']

for key, value in display_name_dict.items():
    df_countries.loc[(df_countries['Country.Region'] == value), 'DisplayName'] = key

### Add ISO codes to countries

In [35]:
# Get 2-digit country codes
df_countries['iso2Code'] = [countries.get(row).alpha2 for row in df_countries['Country.Region']]

# Get 3-digit country codes
df_countries['iso3Code'] = [countries.get(row).alpha3 for row in df_countries['Country.Region']]

# Get numeric country codes
# For usage with UN dtataset
df_countries['isoNumCode'] = [countries.get(row).numeric for row in df_countries['Country.Region']]

In [36]:
# Load dataset with countries and continents
# Source: https://datahub.io/JohnSnowLabs/country-and-continent-codes-list#python
# Assign filename
continents_url = os.path.join(external_data_path, 'country-and-continent-codes-list-csv_csv.csv')

# Load dataset to dataframe
continents_data = pd.read_csv(continents_url)

In [37]:
# Create dict from continents dataframe
codes = continents_data['Three_Letter_Country_Code']
continents = continents_data['Continent_Name']

continents_dict = dict(zip(codes, continents))

In [38]:
# Map continents dictionary to data frame
df_countries['Continent'] = df_countries['iso3Code'].map(continents_dict)

In [39]:
# List all unique values in 'Continent'
df_countries['Continent'].unique()

array(['Asia', 'Europe', 'Africa', 'North America', 'South America',
       'Oceania', nan], dtype=object)

In [40]:
# Return Country.Region where Continent is NaN
df_countries[df_countries['Continent'].isna()]['Country.Region'].unique()

array(['Kosovo'], dtype=object)

In [41]:
# Add 'Europe' as continent for Kosovo
df_countries.loc[(df_countries['Country.Region'] == 'Kosovo'), 'Continent'] = 'Europe'

In [42]:
#df_countries[df_countries['Country.Region'] == 'Kosovo']
df_countries['Continent'].unique()

array(['Asia', 'Europe', 'Africa', 'North America', 'South America',
       'Oceania'], dtype=object)

In [43]:
# Convert ISO numeric code to number
df_countries['isoNumCode'] = pd.to_numeric(df_countries['isoNumCode'])

In [44]:
df_countries.head()

Unnamed: 0,Country.Region,Lat,Long,cases,date,type,DisplayName,iso2Code,iso3Code,isoNumCode,Continent
0,Afghanistan,33.0,65.0,0,2020-01-22,confirmed,Afghanistan,AF,AFG,4,Asia
1,Afghanistan,33.0,65.0,0,2020-01-23,confirmed,Afghanistan,AF,AFG,4,Asia
2,Afghanistan,33.0,65.0,0,2020-01-24,confirmed,Afghanistan,AF,AFG,4,Asia
3,Afghanistan,33.0,65.0,0,2020-01-25,confirmed,Afghanistan,AF,AFG,4,Asia
4,Afghanistan,33.0,65.0,0,2020-01-26,confirmed,Afghanistan,AF,AFG,4,Asia


In [45]:
# Print basic information about dataset
print('Size/Shape of the dataset (rows,cols): ', df_countries.shape)
print('\n')
print('Missing values per column:')
print(df_countries.isnull().sum())
print('\n')
print('Data type of each column:')
print(df_countries.dtypes)

Size/Shape of the dataset (rows,cols):  (64566, 11)


Missing values per column:
Country.Region    0
Lat               0
Long              0
cases             0
date              0
type              0
DisplayName       0
iso2Code          0
iso3Code          0
isoNumCode        0
Continent         0
dtype: int64


Data type of each column:
Country.Region     object
Lat               float64
Long              float64
cases               int64
date               object
type               object
DisplayName        object
iso2Code           object
iso3Code           object
isoNumCode          int64
Continent          object
dtype: object


---

# Population Data

- Add column with total population

In [46]:
# Load dataset
# Source: UN 2019 Revision of World Population Prospects 
# (https://population.un.org/wpp/Download/Standard/CSV/)
# https://population.un.org/wpp/DefinitionOfProjectionVariants/

pop_filename = os.path.join(external_data_path, 'WPP2019_TotalPopulationBySex.csv')

population_data_raw = pd.read_csv(pop_filename)

## Add Total Population (and Income Levels) to Countries

In [47]:
len(df_countries['isoNumCode'].unique())

211

In [48]:
# population-by-country-2020
population_data = population_data_raw[population_data_raw['Time'] == 2018].groupby(['LocID', 'Location', 'Time', 'PopTotal']).sum().reset_index()
population_data.drop(['Location', 'Time','VarID', 'MidPeriod', 'PopMale', 'PopFemale', 'PopDensity'], axis=1, inplace=True)
population_data.head()

Unnamed: 0,LocID,PopTotal
0,4,37171.922
1,8,2882.735
2,12,42228.415
3,16,55.461
4,20,77.008


In [49]:
# PopTotal in 1000s
# Update value to single units
population_data['PopTotal'] = population_data['PopTotal'] * 1000

In [50]:
# check if all num codes in df_countries present in population_data
print(set(df_countries['isoNumCode']) - set(population_data['LocID']))
len(set(df_countries['isoNumCode']) - set(population_data['LocID']))

{983, 831}


2

In [51]:
# Create dictionary of LocID and PopTotal
# Use set_index to set ID columns as the dataframe index.
population_dict = population_data.set_index('LocID').to_dict()

In [52]:
# Map continents dictionary to data frame
df_countries['TotalPopulation'] = df_countries['isoNumCode'].map(population_dict['PopTotal'])

In [53]:
# List countries where TotalPopulation is NaN
df_countries[df_countries['TotalPopulation'].isnull()]['Country.Region'].unique()

array(['Kosovo', 'Guernsey'], dtype=object)

In [54]:
# Kosovo: 1.907.592 (Source: https://de.wikipedia.org/wiki/Kosovo referencing CIA - The World Factbook July 2018)
df_countries.loc[df_countries['Country.Region'] == 'Kosovo', 'TotalPopulation'] = 1907592

# Channel Islands: 166.000 (Source: https://de.wikipedia.org/wiki/Kanalinseln)
df_countries.loc[df_countries['Country.Region'] == 'Guernsey', 'TotalPopulation'] = 166000

In [55]:
df_countries.head()

Unnamed: 0,Country.Region,Lat,Long,cases,date,type,DisplayName,iso2Code,iso3Code,isoNumCode,Continent,TotalPopulation
0,Afghanistan,33.0,65.0,0,2020-01-22,confirmed,Afghanistan,AF,AFG,4,Asia,37171922.0
1,Afghanistan,33.0,65.0,0,2020-01-23,confirmed,Afghanistan,AF,AFG,4,Asia,37171922.0
2,Afghanistan,33.0,65.0,0,2020-01-24,confirmed,Afghanistan,AF,AFG,4,Asia,37171922.0
3,Afghanistan,33.0,65.0,0,2020-01-25,confirmed,Afghanistan,AF,AFG,4,Asia,37171922.0
4,Afghanistan,33.0,65.0,0,2020-01-26,confirmed,Afghanistan,AF,AFG,4,Asia,37171922.0


## Additional Variables

- Split dataset into 'confirmed', 'death' & 'recovered'
- Rename columns
- Reorder columns
- Calculate cumulative sums
- Calculate relative to 100K for daily cases & cumulative cases

### Calculate Cumulative Sums

In [57]:
# Add Cumulative Sums
# Create list of unique type values
types = list(df_countries['type'].unique())

# Create empty dictionary for dataframes
d = {}

# Create list with grouping variables for cumsum
group_vars = ['Country.Region', 'Lat', 'Long', 'iso2Code',
       'iso3Code', 'isoNumCode', 'TotalPopulation' ]

# Loop through types
for case_type in types:
    # Select rows where 'type' == case_type
    df = df_countries[df_countries['type'] == case_type]
    # Rename column 'cases' to 'daily_reported_cases'
    df = df.rename(columns={'cases': 'daily_reported_cases'})
    # Reset index (drop current index)
    df = df.reset_index(drop=True)
    # Add column & apply function to calculate cumsum
    df['cumulative_cases'] = df.groupby(group_vars)['daily_reported_cases'].apply(lambda x: x.cumsum())
    # Add df to dict
    d['df_{}'.format(case_type)] = df

### Calculate Daily & Cumulative Cases relative to Population Size (per 100K)


Calculating cases per 100K:
$$
\frac{\text{Number of Cases}}{\text{Total Population}} \times 100000
$$

In [58]:
# Function to calculate cases per 100K
def calculate_per_100K(n, popTotal):
    return (n / popTotal) * 100000

In [59]:
# Iterate over all data frames in dictionary
for value in d.values():
    # Calculate daily reported cases per 100K
    value['daily_per_100K'] = value.apply(lambda x: calculate_per_100K(x['daily_reported_cases'], x['TotalPopulation']), axis=1)
    # Calculate cumulative  cases per 100K
    value['cumulative_per_100K'] = value.apply(lambda x: calculate_per_100K(x['cumulative_cases'], x['TotalPopulation']), axis=1)

### Update Column Names and Reorder Columns

| current name  | new name |
|---|---|
| Country.Region  | Country  |
|   | DisplayName  |
| Lat  | Latitude  |
| Long  | Longitude  |
| iso2Code  |  ISO2Code |
| iso3Code  | ISO3Code  |
| isoNumCode  | ISONumCode  |
| TotalPopulation  | TotalPopulation  |
| Continent  | Continent  |
| date  | Date  |
| type  | CaseType  |
| cases  | DailyReportedCases  |
| cumulative_cases  | CumulativeReportedCases  |
| daily_per_100K  | DailyReportedCasesPer100K  |
| cumulative_per_100K  | CumulativeReportedCasesPer100K  |


In [60]:
# Iterate over all data frames in dictionary
for df in d.values():
    df.rename(columns={'Country.Region': 'Country', 'Lat': 'Latitude', 'Long': 'Longitude', 
                   'daily_reported_cases': 'DailyReportedCases', 'date': 'Date', 'type': 'CaseType', 'iso2Code': 'ISO2Code', 
                   'iso3Code': 'ISO3Code', 'isoNumCode': 'ISONumCode', 'cumulative_cases': 'CumulativeReportedCases',
                   'daily_per_100K': 'DailyReportedCasesPer100K', 'cumulative_per_100K': 'CumulativeReportedCasesPer100K'}, inplace=True)


In [61]:
# Drop 'type' column in all data frames
#for df in d.values():
#    df.drop('type', axis=1, inplace=True)

In [62]:
# Reorder Columns
ordered_cols = ['Country', 'DisplayName', 'Latitude', 'Longitude', 
                'ISO2Code', 'ISO3Code', 'ISONumCode','TotalPopulation', 'Continent', 'Date', 
                'CaseType', 'DailyReportedCases', 'CumulativeReportedCases',
       'DailyReportedCasesPer100K', 'CumulativeReportedCasesPer100K']

d['df_confirmed'] = d['df_confirmed'][ordered_cols]
d['df_death'] = d['df_death'][ordered_cols]
d['df_recovered'] = d['df_recovered'][ordered_cols]

### Calculate Growth Factors

In [63]:
group_vars = ['Country', 'DisplayName', 'Latitude', 'Longitude', 'ISO2Code',
       'ISO3Code', 'ISONumCode', 'TotalPopulation']

for value in d.values():
    value['GrowthFactor'] = value.groupby(group_vars)['CumulativeReportedCases'].apply(lambda x: x.pct_change() + 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


### Calculate Doubline Times in Rolling Windows ($\Delta t$ = 7 days)

In [64]:
group_vars = ['Country', 'DisplayName', 'Latitude', 'Longitude', 'ISO2Code',
       'ISO3Code', 'ISONumCode', 'TotalPopulation', 'Continent', 'Date', 'CaseType',
       'DailyReportedCases', 'CumulativeReportedCases',
       'DailyReportedCasesPer100K', 'CumulativeReportedCasesPer100K']

for key, value in d.items():
    grp = pd.DataFrame(value.groupby(group_vars)['GrowthFactor'].sum())
    grp['GF_RollingGeomMean'] = grp.rolling(7).apply(gmean, raw=True)
    grp.reset_index(inplace=True)
    d[key] = grp

del grp

In [65]:
def calculate_doubling_time(geom_mean):
    return ln(2)/ln(geom_mean)

In [66]:
for value in d.values():
    value['DoublingTime'] = value['GF_RollingGeomMean'].apply(lambda x: calculate_doubling_time(x))

---

# World

## Create Date Frame for World

In [89]:
# Create data frame for world-wide case numbers
df_world = df_countries.groupby(['type', 'date']).sum()

# Drop unnecessary numeric columns
cols_to_drop = ['Lat', 'Long', 'isoNumCode']
df_world.drop(cols_to_drop, axis=1, inplace=True)

In [90]:
# Create empty dictionary for world-data frames
d_world = {}

for case_type in df_countries['type'].unique():
    # Select rows where index label'type' == case_type & reset index(ungroup)
    # Add selected rows to data frame in dictionary
    d_world['df_{}'.format(case_type)] = df_world.loc[[case_type], :].reset_index()
    

### Calculate Cumulative Sums World-Wide

In [91]:
# Iterate over all data frames in dictionary
for key, value in d_world.items():
    # Rename cases column
    value.rename(columns={'cases': 'daily_reported_cases'}, inplace=True)
    # Calculate cumsum of cases & assign to new column in df
    value['cumulative_cases'] = value['daily_reported_cases'].cumsum()

### Calculate Daily & Cumulative Cases relative to Population Size (per 100K) World-Wide

In [92]:
# Iterate over all data frames in dictionary
for value in d_world.values():
    # Calculate daily reported cases per 100K
    value['daily_per_100K'] = value.apply(lambda x: calculate_per_100K(x['daily_reported_cases'], x['TotalPopulation']), axis=1)
    # Calculate cumulative  cases per 100K
    value['cumulative_per_100K'] = value.apply(lambda x: calculate_per_100K(x['cumulative_cases'], x['TotalPopulation']), axis=1)

In [99]:
d_world['df_confirmed']

Unnamed: 0,CaseType,TotalPopulation,Date,DailyReportedCases,CumulativeReportedCases,DailyReportedCasesPer100K,CumulativeReportedCasesPer100K
0,confirmed,7.586450e+09,2020-01-22,555,555,0.007316,0.007316
1,confirmed,7.586450e+09,2020-01-23,99,654,0.001305,0.008621
2,confirmed,7.586450e+09,2020-01-24,287,941,0.003783,0.012404
3,confirmed,7.586450e+09,2020-01-25,493,1434,0.006498,0.018902
4,confirmed,7.586450e+09,2020-01-26,684,2118,0.009016,0.027918
...,...,...,...,...,...,...,...
97,confirmed,7.586450e+09,2020-04-28,73468,3096456,0.968411,40.815614
98,confirmed,7.586450e+09,2020-04-29,75097,3171553,0.989883,41.805497
99,confirmed,7.586450e+09,2020-04-30,84566,3256119,1.114698,42.920195
100,confirmed,7.586450e+09,2020-05-01,86923,3343042,1.145767,44.065962


### Update Column Names and Reorder Columns World-Wide

In [96]:
for df in d_world.values():
    # Drop column 'type'
    #df.drop('type', axis=1, inplace=True)
    # Rename columns
    df.rename(columns={'type': 'CaseType','date': 'Date', 'daily_reported_cases': 'DailyReportedCases', 
                       'cumulative_cases': 'CumulativeReportedCases', 'daily_per_100K': 'DailyReportedCasesPer100K', 
                       'cumulative_per_100K': 'CumulativeReportedCasesPer100K'}, inplace=True)

In [98]:
# Move column 'TotalPopulation' before case number columns
for df in d_world.values():
    col_to_move = df['TotalPopulation']
    df.drop(labels=['TotalPopulation'], axis=1, inplace=True)
    df.insert(1, 'TotalPopulation', col_to_move)

---

## Export pre-processed Datasets

In [67]:
# Join data frames in d
df = pd.concat(list(d.values()), ignore_index=True)

# Export dataframe to csv
df.to_csv(os.path.join(processed_data_path, 'covid_countries.csv'), index=False, header=True)

In [None]:
df.col

In [None]:
# Join data frames in d
df_world = pd.concat(list(d_world.values()), ignore_index=True)

# Export d_world


---

In [None]:
# Export data frame to store for use in other notebook
%store d

---

In [68]:
df.head()

Unnamed: 0,Country,DisplayName,Latitude,Longitude,ISO2Code,ISO3Code,ISONumCode,TotalPopulation,Continent,Date,CaseType,DailyReportedCases,CumulativeReportedCases,DailyReportedCasesPer100K,CumulativeReportedCasesPer100K,GrowthFactor,GF_RollingGeomMean,DoublingTime
0,Afghanistan,Afghanistan,33.0,65.0,AF,AFG,4,37171922.0,Asia,2020-01-22,confirmed,0,0,0.0,0.0,0.0,,
1,Afghanistan,Afghanistan,33.0,65.0,AF,AFG,4,37171922.0,Asia,2020-01-23,confirmed,0,0,0.0,0.0,0.0,,
2,Afghanistan,Afghanistan,33.0,65.0,AF,AFG,4,37171922.0,Asia,2020-01-24,confirmed,0,0,0.0,0.0,0.0,,
3,Afghanistan,Afghanistan,33.0,65.0,AF,AFG,4,37171922.0,Asia,2020-01-25,confirmed,0,0,0.0,0.0,0.0,,
4,Afghanistan,Afghanistan,33.0,65.0,AF,AFG,4,37171922.0,Asia,2020-01-26,confirmed,0,0,0.0,0.0,0.0,,


In [70]:
df_eu = df[df['Continent'] == 'Europe']

In [77]:
c6_start_dates = pd.read_csv('c6_start_dates.csv', names=['Code', 'Date'], header=None)

In [81]:
c6_start_dates = c6_start_dates[c6_start_dates['Code'].notna()]

In [85]:
countries_eu = df_eu['ISO3Code'].unique()
countries_c6 = c6_start_dates['Code'].unique()

In [86]:
# check if all num codes in df_countries present in population_data
print(set(df_eu['ISO3Code']) - set(c6_start_dates['Code']))
len(set(df_eu['ISO3Code']) - set(c6_start_dates['Code']))

{'MLT', 'MNE', 'MCO', 'GGY', 'IMN', 'MKD', 'FRO', 'XKX', 'LTU', 'BLR', 'GIB', 'VAT', 'LIE', 'LVA'}


14

In [96]:
# Create dictionary of LocID and PopTotal
# Use set_index to set ID columns as the dataframe index.
eu_codes = list(c6_start_dates['Code'])
eu_dates = list(c6_start_dates['Date'])
c6_dict = dict(zip(eu_codes, eu_dates))

In [98]:
df_eu.head()

Unnamed: 0,Country,DisplayName,Latitude,Longitude,ISO2Code,ISO3Code,ISONumCode,TotalPopulation,Continent,Date,CaseType,DailyReportedCases,CumulativeReportedCases,DailyReportedCasesPer100K,CumulativeReportedCasesPer100K,GrowthFactor,GF_RollingGeomMean,DoublingTime
102,Albania,Albania,41.1533,20.1683,AL,ALB,8,2882735.0,Europe,2020-01-22,confirmed,0,0,0.0,0.0,0.0,0.0,-0.0
103,Albania,Albania,41.1533,20.1683,AL,ALB,8,2882735.0,Europe,2020-01-23,confirmed,0,0,0.0,0.0,0.0,0.0,-0.0
104,Albania,Albania,41.1533,20.1683,AL,ALB,8,2882735.0,Europe,2020-01-24,confirmed,0,0,0.0,0.0,0.0,0.0,-0.0
105,Albania,Albania,41.1533,20.1683,AL,ALB,8,2882735.0,Europe,2020-01-25,confirmed,0,0,0.0,0.0,0.0,0.0,-0.0
106,Albania,Albania,41.1533,20.1683,AL,ALB,8,2882735.0,Europe,2020-01-26,confirmed,0,0,0.0,0.0,0.0,0.0,-0.0


In [99]:
end_date = '2020-04-27'

In [100]:
# Iterate over c6 dict
# for each country get cum cases > start_date & < end_date
# append to df rows
df_eu_resp_list = []

for key, val in c6_dict.items():
    print(key, val)
    mask = (df_eu['ISO3Code'] == key) & (df_eu['Date'] > val) & (df_eu['Date'] < end_date)
    df_eu_resp_list.append(df_eu.loc[mask])

ALB 2020-03-12
AND 2020-03-12
AUT 2020-03-05
BEL 2020-03-17
BGR 2020-04-09
BIH 2020-03-19
CHE 2020-03-16
CZE 2020-03-14
DEU 2020-03-08
DNK 2020-03-02
ESP 2020-03-13
EST 2020-03-28
FIN 2020-03-15
FRA 2020-03-16
GBR 2020-03-22
GRC 2020-03-22
HRV 2020-03-22
HUN 2020-03-27
IRL 2020-03-25
ISL 2020-05-04
ITA 2020-02-22
LUX 2020-03-16
MDA 2020-03-23
NLD 2020-03-11
NOR 2020-05-04
POL 2020-03-30
PRT 2020-04-17
ROU 2020-03-11
SMR 2020-04-16
SRB 2020-03-14
SVK 2020-03-11
SVN 2020-03-13
SWE 2020-05-04
UKR 2020-03-19


In [102]:
df_eu_resp = pd.concat(df_eu_resp_list)

In [103]:
df_eu_resp

Unnamed: 0,Country,DisplayName,Latitude,Longitude,ISO2Code,ISO3Code,ISONumCode,TotalPopulation,Continent,Date,CaseType,DailyReportedCases,CumulativeReportedCases,DailyReportedCasesPer100K,CumulativeReportedCasesPer100K,GrowthFactor,GF_RollingGeomMean,DoublingTime
153,Albania,Albania,41.1533,20.1683,AL,ALB,8,2882735.0,Europe,2020-03-13,confirmed,10,33,0.346893,1.144746,1.434783,,
154,Albania,Albania,41.1533,20.1683,AL,ALB,8,2882735.0,Europe,2020-03-14,confirmed,5,38,0.173446,1.318193,1.151515,,
155,Albania,Albania,41.1533,20.1683,AL,ALB,8,2882735.0,Europe,2020-03-15,confirmed,4,42,0.138757,1.456950,1.105263,,
156,Albania,Albania,41.1533,20.1683,AL,ALB,8,2882735.0,Europe,2020-03-16,confirmed,9,51,0.312204,1.769153,1.214286,1.588306,1.498151
157,Albania,Albania,41.1533,20.1683,AL,ALB,8,2882735.0,Europe,2020-03-17,confirmed,4,55,0.138757,1.907910,1.078431,1.275752,2.846186
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63433,Ukraine,Ukraine,48.3794,31.1656,UA,UKR,804,44246158.0,Europe,2020-04-22,recovered,57,424,0.128825,0.958275,1.155313,1.167973,4.464146
63434,Ukraine,Ukraine,48.3794,31.1656,UA,UKR,804,44246158.0,Europe,2020-04-23,recovered,80,504,0.180807,1.139082,1.188679,1.153043,4.867462
63435,Ukraine,Ukraine,48.3794,31.1656,UA,UKR,804,44246158.0,Europe,2020-04-24,recovered,278,782,0.628303,1.767385,1.551587,1.179650,4.195359
63436,Ukraine,Ukraine,48.3794,31.1656,UA,UKR,804,44246158.0,Europe,2020-04-25,recovered,0,782,0.000000,1.767385,1.000000,1.161019,4.642720


In [None]:
# select variable to plot
# remove all cols except selected var and date and country
# unmelt and transpose
# reset index and remove date


In [114]:
df[(df['CaseType'] == 'death') & (df['Date'] == '2020-04-27')].describe()

Unnamed: 0,Latitude,Longitude,ISONumCode,TotalPopulation,DailyReportedCases,CumulativeReportedCases,DailyReportedCasesPer100K,CumulativeReportedCasesPer100K,GrowthFactor,GF_RollingGeomMean,DoublingTime
count,211.0,211.0,211.0,211.0,211.0,211.0,211.0,211.0,211.0,207.0,207.0
mean,26.622468,28.861186,429.180095,35954740.0,21.725118,1016.436019,0.066646,3.883175,0.817813,0.826301,inf
std,89.785196,281.720252,253.325681,140151800.0,110.002988,5157.081606,0.238199,12.188601,0.409837,0.41912,
min,-255.9695,-1114.9634,4.0,810.0,0.0,0.0,0.0,0.0,0.0,0.0,-11.966579
25%,6.42595,-32.82305,213.0,857426.5,0.0,1.0,0.0,0.023672,1.0,1.0,7.079657
50%,18.1096,16.0,428.0,6859408.0,0.0,10.0,0.0,0.243161,1.0,1.019259,16.636912
75%,40.6482,43.3451,655.5,24983690.0,2.5,88.0,0.022861,1.822714,1.020184,1.049241,43.030661
max,1083.3367,3684.4197,983.0,1427648000.0,1378.0,56259.0,2.378885,121.359223,1.2,1.183385,inf


In [112]:
test = df[(df['Date'] == '2020-04-27')]
value = test['CumulativeReportedCasesPer100K'].max()
df[df['CumulativeReportedCasesPer100K'] == value]

Unnamed: 0,Country,DisplayName,Latitude,Longitude,ISO2Code,ISO3Code,ISONumCode,TotalPopulation,Continent,Date,CaseType,DailyReportedCases,CumulativeReportedCases,DailyReportedCasesPer100K,CumulativeReportedCasesPer100K,GrowthFactor,GF_RollingGeomMean,DoublingTime
17027,San Marino,San Marino,43.9424,12.4578,SM,SMR,674,33784.0,Europe,2020-04-26,confirmed,25,538,73.999526,1592.469808,1.048733,1.022311,31.412754
17028,San Marino,San Marino,43.9424,12.4578,SM,SMR,674,33784.0,Europe,2020-04-27,confirmed,0,538,0.0,1592.469808,1.0,1.021995,31.859698


In [106]:
df[(df['Country'] == 'China') & (df['Date'] == '2020-04-27')]

Unnamed: 0,Country,DisplayName,Latitude,Longitude,ISO2Code,ISO3Code,ISONumCode,TotalPopulation,Continent,Date,CaseType,DailyReportedCases,CumulativeReportedCases,DailyReportedCasesPer100K,CumulativeReportedCasesPer100K,GrowthFactor,GF_RollingGeomMean,DoublingTime
4176,China,China,1083.3367,3684.4197,CN,CHN,156,1427648000.0,Asia,2020-04-27,confirmed,6,83918,0.00042,5.87806,1.000072,1.000172,4028.986129
25698,China,China,1083.3367,3684.4197,CN,CHN,156,1427648000.0,Asia,2020-04-27,death,0,4637,0.0,0.3248,1.0,1.000031,22496.438231
47220,China,China,1083.3367,3684.4197,CN,CHN,156,1427648000.0,Asia,2020-04-27,recovered,97,78374,0.006794,5.489729,1.001239,1.001152,602.138326
