In [1]:
import pandas as pd


## Get data from Johns Hopkins

In [7]:
#!git clone https://github.com/CSSEGISandData/COVID-19.git 
!cd COVID-19 && git pull origin master

Cloning into 'COVID-19'...
remote: Enumerating objects: 5, done.[K
remote: Counting objects: 100% (5/5), done.[K
remote: Compressing objects: 100% (4/4), done.[K
remote: Total 14702 (delta 0), reused 0 (delta 0), pack-reused 14697[K
Receiving objects: 100% (14702/14702), 47.10 MiB | 13.14 MiB/s, done.
Resolving deltas: 100% (7053/7053), done.


In [8]:
df_corona_temp = pd.read_csv("./COVID-19/csse_covid_19_data/csse_covid_19_time_series/time_series_19-covid-Confirmed.csv")
df_corona_death_temp = pd.read_csv("./COVID-19/csse_covid_19_data/csse_covid_19_time_series/time_series_19-covid-Deaths.csv")

### retreat data structure

In [9]:
def retreat_struct(df):
    output = df.groupby(by="Country/Region").sum()
    output.drop(['Lat','Long'], axis=1, inplace=True)
    output.fillna(0, inplace=True)
    output.index.name = None
    output.columns = pd.to_datetime(output.columns)
    output.rename(index={'Mainland China':'China', 'Korea, South': 'South Korea'},inplace=True)
    return output


In [10]:
df_corona_temp = retreat_struct(df_corona_temp)
df_corona_death_temp = retreat_struct(df_corona_death_temp)

In [11]:
df_corona_temp.shape, df_corona_death_temp.shape

((156, 55), (156, 55))

### Transform colums as lines

In [12]:
serie_corona = df_corona_temp.stack()
serie_death = df_corona_death_temp.stack()

In [13]:
# back to df 
df_corona = pd.DataFrame(serie_corona)

df_corona.reset_index(inplace=True)
df_corona.columns = ['country', 'date','case']

In [14]:
df_corona = df_corona.join(pd.DataFrame(serie_death),how='left',on=['country','date'])
df_corona.columns = ['country', 'date','case','death']

In [15]:
# Checking data 
df_corona.head()

Unnamed: 0,country,date,case,death
0,Afghanistan,2020-01-22,0,0
1,Afghanistan,2020-01-23,0,0
2,Afghanistan,2020-01-24,0,0
3,Afghanistan,2020-01-25,0,0
4,Afghanistan,2020-01-26,0,0


## UN Data on population and density

### Alignment of data

In [16]:
def align_country_names(df):
    ''' Align country names with CSSE reference'''
    output = df.rename(columns={'Region, subregion, country or area *' : 'Country'})
    output.set_index("Country", inplace=True)
    output.rename(index={'Republic of Korea' : 'South Korea', 'Iran (Islamic Republic of)': 'Iran', 'United States of America' : 'US', 'China, Taiwan Province of China' :'Taiwan','Russian Federation':'Russia','Viet Nam':'Vietnam'}, inplace=True)
    # focus only at country level data
    output = output[output['Type'] == 'Country/Area']
    # return 2020 data olny
    return output['2020']


### Population

In [23]:
df_wp_temp = pd.read_excel("./UN data/population_world.xlsx", sheet_name = 'ESTIMATES', skiprows= range(0,16))

In [24]:
serie_wp = align_country_names(df_wp_temp)
#check data
serie_wp

Country
Burundi                      11890.8
Comoros                      869.595
Djibouti                     988.002
Eritrea                      3546.43
Ethiopia                      114964
                              ...   
Bermuda                       62.273
Canada                       37742.2
Greenland                     56.772
Saint Pierre and Miquelon      5.795
US                            331003
Name: 2020, Length: 235, dtype: object

### Density data

In [25]:
serie_density = align_country_names(pd.read_excel("./UN data/WPP2019_POP_F06_POPULATION_DENSITY.xlsx", sheet_name = 'ESTIMATES', skiprows= range(0,16)))

In [26]:
# Check data
serie_density

Country
Burundi                       463.037
Comoros                       467.273
Djibouti                       42.623
Eritrea                       35.1131
Ethiopia                      114.964
                               ...   
Bermuda                       1245.46
Canada                        4.15045
Greenland                    0.138316
Saint Pierre and Miquelon     25.1957
US                            36.1854
Name: 2020, Length: 235, dtype: object

### Country infos
Crossing population and density and calculating size

In [27]:
df_1 = pd.DataFrame(serie_wp)
df_1.columns = ['population']
df_2 = pd.DataFrame(serie_density)
df_2.columns =['density']

In [28]:
# join two series as df
df_country_info = df_1.join(df_2)
# Calculate size of countries
df_country_info['size'] =  df_country_info['population']/df_country_info['density'] * 1000

In [29]:
# Cehcking data
df_country_info

Unnamed: 0_level_0,population,density,size
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Burundi,11890.8,463.037,25680
Comoros,869.595,467.273,1861
Djibouti,988.002,42.623,23180
Eritrea,3546.43,35.1131,101000
Ethiopia,114964,114.964,1e+06
...,...,...,...
Bermuda,62.273,1245.46,50
Canada,37742.2,4.15045,9.09351e+06
Greenland,56.772,0.138316,410450
Saint Pierre and Miquelon,5.795,25.1957,230


### Saving the UN data

In [30]:
df_country_info.to_csv('country_info.csv')

## Cross data CSSE and pop / density / size

In [31]:
df_dataset = df_corona.join(df_country_info, how='left',on='country')

In [32]:
df_dataset

Unnamed: 0,country,date,case,death,population,density,size
0,Afghanistan,2020-01-22,0,0,38928.3,59.6274,652860
1,Afghanistan,2020-01-23,0,0,38928.3,59.6274,652860
2,Afghanistan,2020-01-24,0,0,38928.3,59.6274,652860
3,Afghanistan,2020-01-25,0,0,38928.3,59.6274,652860
4,Afghanistan,2020-01-26,0,0,38928.3,59.6274,652860
...,...,...,...,...,...,...,...
8575,occupied Palestinian territory,2020-03-12,0,0,,,
8576,occupied Palestinian territory,2020-03-13,0,0,,,
8577,occupied Palestinian territory,2020-03-14,0,0,,,
8578,occupied Palestinian territory,2020-03-15,0,0,,,


In [33]:
## Calculate the age of crisis

In [34]:
ratio_crisis = 0.5
death_crisis = 10

In [35]:
df_dataset['ratio'] = df_dataset['case'] / df_dataset['population'] * 100
df_dataset['ratio_death'] = df_dataset['death'] / df_dataset['population'] * 100

In [36]:
# check data
df_dataset

Unnamed: 0,country,date,case,death,population,density,size,ratio,ratio_death
0,Afghanistan,2020-01-22,0,0,38928.3,59.6274,652860,0,0
1,Afghanistan,2020-01-23,0,0,38928.3,59.6274,652860,0,0
2,Afghanistan,2020-01-24,0,0,38928.3,59.6274,652860,0,0
3,Afghanistan,2020-01-25,0,0,38928.3,59.6274,652860,0,0
4,Afghanistan,2020-01-26,0,0,38928.3,59.6274,652860,0,0
...,...,...,...,...,...,...,...,...,...
8575,occupied Palestinian territory,2020-03-12,0,0,,,,,
8576,occupied Palestinian territory,2020-03-13,0,0,,,,,
8577,occupied Palestinian territory,2020-03-14,0,0,,,,,
8578,occupied Palestinian territory,2020-03-15,0,0,,,,,


### calculate ref data for crisis

In [37]:
serie_epidemic_start_date = df_dataset[df_dataset['ratio'] > ratio_crisis].groupby('country').min()['date']
serie_epidemic_death_start_date = df_dataset[df_dataset['death'] > death_crisis].groupby('country').min()['date']

In [38]:
serie_epidemic_death_start_date

country
China            2020-01-22
France           2020-03-07
Germany          2020-03-15
Iran             2020-02-24
Italy            2020-02-26
Japan            2020-03-11
Netherlands      2020-03-14
Philippines      2020-03-15
South Korea      2020-02-26
Spain            2020-03-08
Switzerland      2020-03-13
US               2020-03-04
United Kingdom   2020-03-14
Name: date, dtype: datetime64[ns]

In [39]:
# cases
df_epidemic_start = pd.DataFrame(serie_epidemic_start_date)
df_epidemic_start.columns = ['start_date']
# deaths
df_epidemic_death_start = pd.DataFrame(serie_epidemic_death_start_date)
df_epidemic_death_start.columns = ['start_death_date']

### add start date to dataset

In [40]:
df_dataset = df_dataset.join(df_epidemic_start, how='right', on='country')
df_dataset = df_dataset.join(df_epidemic_death_start, how='right', on='country')

In [41]:
df_dataset['age'] = (df_dataset['date'] - df_dataset['start_date']).dt.days
df_dataset['age_death'] = (df_dataset['date'] - df_dataset['start_death_date']).dt.days

In [42]:
#check data
df_dataset

Unnamed: 0,country,date,case,death,population,density,size,ratio,ratio_death,start_date,start_death_date,age,age_death
1540.0,China,2020-01-22,548.0,17.0,1.43932e+06,153.312,9.38821e+06,0.0380734,0.00118111,2020-01-30,2020-01-22,-8.0,0.0
1541.0,China,2020-01-23,643.0,18.0,1.43932e+06,153.312,9.38821e+06,0.0446738,0.00125059,2020-01-30,2020-01-22,-7.0,1.0
1542.0,China,2020-01-24,920.0,26.0,1.43932e+06,153.312,9.38821e+06,0.0639189,0.0018064,2020-01-30,2020-01-22,-6.0,2.0
1543.0,China,2020-01-25,1406.0,42.0,1.43932e+06,153.312,9.38821e+06,0.0976848,0.00291804,2020-01-30,2020-01-22,-5.0,3.0
1544.0,China,2020-01-26,2075.0,56.0,1.43932e+06,153.312,9.38821e+06,0.144165,0.00389072,2020-01-30,2020-01-22,-4.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8301.0,United Kingdom,2020-03-13,802.0,8.0,67886,280.602,241930,1.18139,0.0117845,2020-03-10,2020-03-14,3.0,-1.0
8302.0,United Kingdom,2020-03-14,1144.0,21.0,67886,280.602,241930,1.68518,0.0309342,2020-03-10,2020-03-14,4.0,0.0
8303.0,United Kingdom,2020-03-15,1145.0,21.0,67886,280.602,241930,1.68665,0.0309342,2020-03-10,2020-03-14,5.0,1.0
8304.0,United Kingdom,2020-03-16,1551.0,56.0,67886,280.602,241930,2.28471,0.0824912,2020-03-10,2020-03-14,6.0,2.0


## Saving final data

In [43]:
df_dataset.to_csv('dataset.csv')