In [103]:
import pandas as pd


## Get data from Johns Hopkins

In [104]:
# First run only: uncomment the next line to create the local COVID-19 clone
# that the pull below (and the CSV path used later) depends on.
#!git clone https://github.com/CSSEGISandData/COVID-19.git 
# Refresh the existing clone with the latest upstream commits.
!cd COVID-19 && git pull origin master

From https://github.com/CSSEGISandData/COVID-19
 * branch            master     -> FETCH_HEAD
Already up to date.


In [137]:
# Load the CSSE "confirmed" time series from the local COVID-19 clone.
df_corona_temp = pd.read_csv(
    "./COVID-19/csse_covid_19_data/csse_covid_19_time_series/time_series_19-covid-Confirmed.csv"
)

### Reshape the data structure

In [138]:
# Collapse the raw table to one row per country and one column per date.
df_corona_temp = (
    df_corona_temp
    .groupby(by="Country/Region")
    # numeric_only=True keeps any text column out of the aggregation; without
    # it, recent pandas versions concatenate or reject string columns.
    .sum(numeric_only=True)
    # Summed coordinates are meaningless, so discard them.
    .drop(columns=['Lat', 'Long'])
    .fillna(0)
)
# Drop the index label so it does not leak into later reset_index() calls.
df_corona_temp.index.name = None
# The remaining column labels are date strings; convert them to timestamps.
df_corona_temp.columns = pd.to_datetime(df_corona_temp.columns)
# Harmonise a few country spellings.
df_corona_temp = df_corona_temp.rename(
    index={'Mainland China': 'China', 'Korea, South': 'South Korea'}
)

In [140]:
# Sanity check: (number of countries, number of date columns).
df_corona_temp.shape

(125, 52)

### Transform columns into rows

In [141]:
# Move the date columns into a second index level: the result is a Series
# indexed by (country, date) holding the case counts.
serie_corona = df_corona_temp.stack()

In [150]:
# Flatten the stacked Series into a plain three-column frame:
# the two index levels become ordinary columns next to the values.
df_corona = pd.DataFrame(serie_corona).reset_index()
df_corona.columns = ['country', 'date', 'cases']

In [151]:
# Preview the first rows of the long-format frame (country, date, cases).
df_corona.head()

Unnamed: 0,country,date,cases
0,Afghanistan,2020-01-22,0
1,Afghanistan,2020-01-23,0
2,Afghanistan,2020-01-24,0
3,Afghanistan,2020-01-25,0
4,Afghanistan,2020-01-26,0


## UN Data on population and density

### Alignment of data

In [157]:
def align_country_names(df, year='2020'):
    '''Return one year of country-level UN data, indexed by CSSE country names.

    Parameters
    ----------
    df : pandas.DataFrame
        UN table containing the columns
        'Region, subregion, country or area *', 'Type' and one column per year.
    year : str, default '2020'
        Label of the year column to return.

    Returns
    -------
    pandas.Series
        Values of the `year` column for rows whose 'Type' is 'Country/Area',
        indexed by country name (index name "Country"). `df` is not modified.

    Raises
    ------
    KeyError
        If one of the expected columns, or `year`, is missing from `df`.
    '''
    # UN spelling -> spelling used by the CSSE case data.
    un_to_csse = {
        'Republic of Korea': 'South Korea',
        'Iran (Islamic Republic of)': 'Iran',
        'United States of America': 'US',
        'China, Taiwan Province of China': 'Taiwan',
        'Russian Federation': 'Russia',
        'Viet Nam': 'Vietnam',
    }
    by_country = (
        df.rename(columns={'Region, subregion, country or area *': 'Country'})
        .set_index("Country")
        .rename(index=un_to_csse)
    )
    # Keep country-level rows only; other 'Type' values are filtered out.
    by_country = by_country[by_country['Type'] == 'Country/Area']
    # Return the requested year only.
    return by_country[year]


### Population

In [158]:
# Read the 'ESTIMATES' sheet of the UN population workbook, skipping the
# first 16 rows (presumably banner rows above the real header; verify
# against the workbook).
df_wp_temp = pd.read_excel(
    "./UN data/population_world.xlsx",
    sheet_name='ESTIMATES',
    skiprows=range(16),
)

In [159]:
# 2020 population per country, re-indexed with CSSE-style country names.
serie_wp = align_country_names(df_wp_temp)
# Display the resulting Series as a sanity check.
serie_wp

Country
Burundi                      11890.8
Comoros                      869.595
Djibouti                     988.002
Eritrea                      3546.43
Ethiopia                      114964
                              ...   
Bermuda                       62.273
Canada                       37742.2
Greenland                     56.772
Saint Pierre and Miquelon      5.795
US                            331003
Name: 2020, Length: 235, dtype: object

### Density data

In [160]:
# Load the UN population-density workbook ('ESTIMATES' sheet, first 16 rows
# skipped) and reduce it to the 2020 country-level Series in one step.
serie_density = align_country_names(
    pd.read_excel(
        "./UN data/WPP2019_POP_F06_POPULATION_DENSITY.xlsx",
        sheet_name='ESTIMATES',
        skiprows=range(16),
    )
)

In [161]:
# Display the 2020 density Series as a sanity check.
serie_density

Country
Burundi                       463.037
Comoros                       467.273
Djibouti                       42.623
Eritrea                       35.1131
Ethiopia                      114.964
                               ...   
Bermuda                       1245.46
Canada                        4.15045
Greenland                    0.138316
Saint Pierre and Miquelon     25.1957
US                            36.1854
Name: 2020, Length: 235, dtype: object

### Country info
Combine population and density, and derive each country's size from them.

In [162]:
# Promote each Series to a single-column frame with a descriptive column name.
df_1 = serie_wp.to_frame(name='population')
df_2 = serie_density.to_frame(name='density')

In [163]:
# Put population and density side by side (joined on the country index) and
# derive the country size as population / density * 1000.
# NOTE(review): the factor 1000 suggests population is expressed in
# thousands - confirm against the UN workbook.
df_country_info = df_1.join(df_2).assign(
    size=lambda info: info['population'] / info['density'] * 1000
)

In [164]:
# Check the combined population / density / size table.
df_country_info

Unnamed: 0_level_0,population,density,size
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Burundi,11890.8,463.037,25680
Comoros,869.595,467.273,1861
Djibouti,988.002,42.623,23180
Eritrea,3546.43,35.1131,101000
Ethiopia,114964,114.964,1e+06
...,...,...,...
Bermuda,62.273,1245.46,50
Canada,37742.2,4.15045,9.09351e+06
Greenland,56.772,0.138316,410450
Saint Pierre and Miquelon,5.795,25.1957,230


### Saving the UN data

In [168]:
# Persist the per-country table (the country index is written as the first
# CSV column) next to the notebook.
df_country_info.to_csv('country_info.csv')

## Join CSSE data with population / density / size

In [166]:
# Attach the static country attributes to every (country, date) row.
# A left join keeps countries that have no match in the UN table.
df_dataset = df_corona.join(
    df_country_info,
    on='country',
    how='left',
)

In [167]:
# Inspect the joined frame: one row per (country, date), with the country's
# population, density and size repeated on each row.
df_dataset

Unnamed: 0,country,date,cases,population,density,size
0,Afghanistan,2020-01-22,0,38928.3,59.6274,652860
1,Afghanistan,2020-01-23,0,38928.3,59.6274,652860
2,Afghanistan,2020-01-24,0,38928.3,59.6274,652860
3,Afghanistan,2020-01-25,0,38928.3,59.6274,652860
4,Afghanistan,2020-01-26,0,38928.3,59.6274,652860
...,...,...,...,...,...,...
6495,Vietnam,2020-03-09,30,97338.6,313.925,310070
6496,Vietnam,2020-03-10,31,97338.6,313.925,310070
6497,Vietnam,2020-03-11,38,97338.6,313.925,310070
6498,Vietnam,2020-03-12,39,97338.6,313.925,310070


In [171]:
## Calculate the age of crisis

In [172]:
# Threshold on the `ratio` column: a country's epidemic is considered started
# on the first date its ratio exceeds this value.
# NOTE(review): with population apparently in thousands, ratio is cases per
# 100,000 inhabitants, so this would mean 0.5 cases per 100k - confirm.
ratio_crisis = 0.5

In [175]:
# Confirmed cases relative to population, scaled by 100.
# NOTE(review): population looks like it is in thousands, which would make
# this "cases per 100,000 inhabitants" rather than a percentage - confirm.
df_dataset['ratio'] = df_dataset['cases'] / df_dataset['population'] * 100

In [176]:
# Inspect the frame with the new `ratio` column.
df_dataset

Unnamed: 0,country,date,cases,population,density,size,ratio
0,Afghanistan,2020-01-22,0,38928.3,59.6274,652860,0
1,Afghanistan,2020-01-23,0,38928.3,59.6274,652860,0
2,Afghanistan,2020-01-24,0,38928.3,59.6274,652860,0
3,Afghanistan,2020-01-25,0,38928.3,59.6274,652860,0
4,Afghanistan,2020-01-26,0,38928.3,59.6274,652860,0
...,...,...,...,...,...,...,...
6495,Vietnam,2020-03-09,30,97338.6,313.925,310070,0.0308203
6496,Vietnam,2020-03-10,31,97338.6,313.925,310070,0.0318476
6497,Vietnam,2020-03-11,38,97338.6,313.925,310070,0.039039
6498,Vietnam,2020-03-12,39,97338.6,313.925,310070,0.0400663


In [177]:
### calculate ref data for crisis

In [178]:
# First date on which each country's ratio exceeds the crisis threshold.
# Select `date` before aggregating: taking the minimum of every column just
# to read one is wasteful and can fail on non-comparable object columns.
# Countries that never cross the threshold are absent from the result.
serie_epidemic_start_date = (
    df_dataset.loc[df_dataset['ratio'] > ratio_crisis]
    .groupby('country')['date']
    .min()
)

In [179]:
# One-column frame (indexed by country) holding the epidemic start date.
df_epidemic_start = serie_epidemic_start_date.to_frame(name='start_date')

In [180]:
### add start date to dataset

In [181]:
# Attach each country's start date to its rows. The right join keeps only
# countries present in df_epidemic_start, i.e. those that crossed the
# threshold; all other countries are dropped from df_dataset here.
# NOTE: this reassigns df_dataset, so the cell is not safe to run twice.
df_dataset = df_dataset.join(df_epidemic_start, how='right', on='country')

In [185]:
# Time elapsed since the country's epidemic start date; negative for dates
# before the start, zero on the start date itself.
df_dataset['age'] = df_dataset['date'] - df_dataset['start_date']

In [186]:
# Inspect the final frame, now including `start_date` and `age`.
df_dataset

Unnamed: 0,country,date,cases,population,density,size,ratio,start_date,age
52,Albania,2020-01-22,0,2877.8,105.029,27400,0,2020-03-12,-50 days
53,Albania,2020-01-23,0,2877.8,105.029,27400,0,2020-03-12,-49 days
54,Albania,2020-01-24,0,2877.8,105.029,27400,0,2020-03-12,-48 days
55,Albania,2020-01-25,0,2877.8,105.029,27400,0,2020-03-12,-47 days
56,Albania,2020-01-26,0,2877.8,105.029,27400,0,2020-03-12,-46 days
...,...,...,...,...,...,...,...,...,...
6443,United Kingdom,2020-03-09,322,67886,280.602,241930,0.474325,2020-03-10,-1 days
6444,United Kingdom,2020-03-10,384,67886,280.602,241930,0.565654,2020-03-10,0 days
6445,United Kingdom,2020-03-11,459,67886,280.602,241930,0.676133,2020-03-10,1 days
6446,United Kingdom,2020-03-12,459,67886,280.602,241930,0.676133,2020-03-10,2 days


## Saving final data

In [187]:
# Write the final dataset next to the notebook; the row index is written as
# the first (unnamed) CSV column.
df_dataset.to_csv('dataset.csv')