In [1]:
import pandas as pd


## Get data from Johns Hopkins

In [2]:
#!git clone https://github.com/CSSEGISandData/COVID-19.git 
!cd COVID-19 && git pull origin master

From https://github.com/CSSEGISandData/COVID-19
 * branch            master     -> FETCH_HEAD
Already up to date.


In [3]:
df_corona_temp = pd.read_csv("./COVID-19/csse_covid_19_data/csse_covid_19_time_series/time_series_19-covid-Confirmed.csv")
df_corona_death_temp = pd.read_csv("./COVID-19/csse_covid_19_data/csse_covid_19_time_series/time_series_19-covid-Deaths.csv")

### retreat data structure

In [4]:
def retreat_struct(df):
    output = df.groupby(by="Country/Region").sum()
    output.drop(['Lat','Long'], axis=1, inplace=True)
    output.fillna(0, inplace=True)
    output.index.name = None
    output.columns = pd.to_datetime(output.columns)
    output.rename(index={'Mainland China':'China', 'Korea, South': 'South Korea'},inplace=True)
    return output


In [5]:
df_corona_temp = retreat_struct(df_corona_temp)
df_corona_death_temp = retreat_struct(df_corona_death_temp)

In [7]:
df_corona_temp.shape, df_corona_death_temp.shape

((147, 54), (147, 54))

### Transform colums as lines

In [8]:
serie_corona = df_corona_temp.stack()
serie_death = df_corona_death_temp.stack()

In [9]:
# back to df 
df_corona = pd.DataFrame(serie_corona)

df_corona.reset_index(inplace=True)
df_corona.columns = ['country', 'date','case']

In [10]:
df_corona = df_corona.join(pd.DataFrame(serie_death),how='left',on=['country','date'])
df_corona.columns = ['country', 'date','case','death']

In [11]:
# Checking data 
df_corona.head()

Unnamed: 0,country,date,case,death
0,Afghanistan,2020-01-22,0,0
1,Afghanistan,2020-01-23,0,0
2,Afghanistan,2020-01-24,0,0
3,Afghanistan,2020-01-25,0,0
4,Afghanistan,2020-01-26,0,0


## UN Data on population and density

### Alignment of data

In [12]:
def align_country_names(df):
    ''' Align country names with CSSE reference'''
    output = df.rename(columns={'Region, subregion, country or area *' : 'Country'})
    output.set_index("Country", inplace=True)
    output.rename(index={'Republic of Korea' : 'South Korea', 'Iran (Islamic Republic of)': 'Iran', 'United States of America' : 'US', 'China, Taiwan Province of China' :'Taiwan','Russian Federation':'Russia','Viet Nam':'Vietnam'}, inplace=True)
    # focus only at country level data
    output = output[output['Type'] == 'Country/Area']
    # return 2020 data olny
    return output['2020']


### Population

In [13]:
df_wp_temp = pd.read_excel("./UN data/population_world.xlsx", sheet_name = 'ESTIMATES', skiprows= range(0,16))

In [14]:
serie_wp = align_country_names(df_wp_temp)
#check data
serie_wp

Country
Burundi                      11890.8
Comoros                      869.595
Djibouti                     988.002
Eritrea                      3546.43
Ethiopia                      114964
                              ...   
Bermuda                       62.273
Canada                       37742.2
Greenland                     56.772
Saint Pierre and Miquelon      5.795
US                            331003
Name: 2020, Length: 235, dtype: object

### Density data

In [15]:
serie_density = align_country_names(pd.read_excel("./UN data/WPP2019_POP_F06_POPULATION_DENSITY.xlsx", sheet_name = 'ESTIMATES', skiprows= range(0,16)))

In [16]:
# Check data
serie_density

Country
Burundi                       463.037
Comoros                       467.273
Djibouti                       42.623
Eritrea                       35.1131
Ethiopia                      114.964
                               ...   
Bermuda                       1245.46
Canada                        4.15045
Greenland                    0.138316
Saint Pierre and Miquelon     25.1957
US                            36.1854
Name: 2020, Length: 235, dtype: object

### Country infos
Crossing population and density and calculating size

In [17]:
df_1 = pd.DataFrame(serie_wp)
df_1.columns = ['population']
df_2 = pd.DataFrame(serie_density)
df_2.columns =['density']

In [18]:
# join two series as df
df_country_info = df_1.join(df_2)
# Calculate size of countries
df_country_info['size'] =  df_country_info['population']/df_country_info['density'] * 1000

In [19]:
# Cehcking data
df_country_info

Unnamed: 0_level_0,population,density,size
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Burundi,11890.8,463.037,25680
Comoros,869.595,467.273,1861
Djibouti,988.002,42.623,23180
Eritrea,3546.43,35.1131,101000
Ethiopia,114964,114.964,1e+06
...,...,...,...
Bermuda,62.273,1245.46,50
Canada,37742.2,4.15045,9.09351e+06
Greenland,56.772,0.138316,410450
Saint Pierre and Miquelon,5.795,25.1957,230


### Saving the UN data

In [20]:
df_country_info.to_csv('country_info.csv')

## Cross data CSSE and pop / density / size

In [21]:
df_dataset = df_corona.join(df_country_info, how='left',on='country')

In [22]:
df_dataset

Unnamed: 0,country,date,case,death,population,density,size
0,Afghanistan,2020-01-22,0,0,38928.3,59.6274,652860
1,Afghanistan,2020-01-23,0,0,38928.3,59.6274,652860
2,Afghanistan,2020-01-24,0,0,38928.3,59.6274,652860
3,Afghanistan,2020-01-25,0,0,38928.3,59.6274,652860
4,Afghanistan,2020-01-26,0,0,38928.3,59.6274,652860
...,...,...,...,...,...,...,...
7933,occupied Palestinian territory,2020-03-11,0,0,,,
7934,occupied Palestinian territory,2020-03-12,0,0,,,
7935,occupied Palestinian territory,2020-03-13,0,0,,,
7936,occupied Palestinian territory,2020-03-14,0,0,,,


In [23]:
## Calculate the age of crisis

In [24]:
ratio_crisis = 0.5
death_crisis = 20

In [25]:
df_dataset['ratio'] = df_dataset['case'] / df_dataset['population'] * 100
df_dataset['ratio_death'] = df_dataset['death'] / df_dataset['population'] * 100

In [26]:
# check data
df_dataset

Unnamed: 0,country,date,case,death,population,density,size,ratio,ratio_death
0,Afghanistan,2020-01-22,0,0,38928.3,59.6274,652860,0,0
1,Afghanistan,2020-01-23,0,0,38928.3,59.6274,652860,0,0
2,Afghanistan,2020-01-24,0,0,38928.3,59.6274,652860,0,0
3,Afghanistan,2020-01-25,0,0,38928.3,59.6274,652860,0,0
4,Afghanistan,2020-01-26,0,0,38928.3,59.6274,652860,0,0
...,...,...,...,...,...,...,...,...,...
7933,occupied Palestinian territory,2020-03-11,0,0,,,,,
7934,occupied Palestinian territory,2020-03-12,0,0,,,,,
7935,occupied Palestinian territory,2020-03-13,0,0,,,,,
7936,occupied Palestinian territory,2020-03-14,0,0,,,,,


### calculate ref data for crisis

In [27]:
serie_epidemic_start_date = df_dataset[df_dataset['ratio'] > ratio_crisis].groupby('country').min()['date']
serie_epidemic_death_start_date = df_dataset[df_dataset['ratio_death'] > death_crisis].groupby('country').min()['date']

In [28]:
# cases
df_epidemic_start = pd.DataFrame(serie_epidemic_start_date)
df_epidemic_start.columns = ['start_date']
# deaths
df_epidemic_death_start = pd.DataFrame(serie_epidemic_death_start_date)
df_epidemic_death_start.columns = ['start_death_date']

### add start date to dataset

In [29]:
df_dataset = df_dataset.join(df_epidemic_start, how='right', on='country')
df_dataset = df_dataset.join(df_epidemic_death_start, how='left', on='country')

In [32]:
df_dataset['age'] = (df_dataset['date'] - df_dataset['start_date']).dt.days
df_dataset['age_death'] = (df_dataset['date'] - df_dataset['start_death_date']).dt.days

In [33]:
#check data
df_dataset

Unnamed: 0,country,date,case,death,population,density,size,ratio,ratio_death,start_date,start_death_date,age,age_death
54,Albania,2020-01-22,0,0,2877.8,105.029,27400,0,0,2020-03-12,NaT,-50,
55,Albania,2020-01-23,0,0,2877.8,105.029,27400,0,0,2020-03-12,NaT,-49,
56,Albania,2020-01-24,0,0,2877.8,105.029,27400,0,0,2020-03-12,NaT,-48,
57,Albania,2020-01-25,0,0,2877.8,105.029,27400,0,0,2020-03-12,NaT,-47,
58,Albania,2020-01-26,0,0,2877.8,105.029,27400,0,0,2020-03-12,NaT,-46,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7663,United Kingdom,2020-03-11,459,8,67886,280.602,241930,0.676133,0.0117845,2020-03-10,NaT,1,
7664,United Kingdom,2020-03-12,459,8,67886,280.602,241930,0.676133,0.0117845,2020-03-10,NaT,2,
7665,United Kingdom,2020-03-13,801,8,67886,280.602,241930,1.17992,0.0117845,2020-03-10,NaT,3,
7666,United Kingdom,2020-03-14,1143,21,67886,280.602,241930,1.6837,0.0309342,2020-03-10,NaT,4,


## Saving final data

In [34]:
df_dataset.to_csv('dataset.csv')