In [65]:
import pandas as pd


## Get data from Johns Hopkins

In [66]:
#!git clone https://github.com/CSSEGISandData/COVID-19.git 
!cd COVID-19 && git pull origin master

From https://github.com/CSSEGISandData/COVID-19
 * branch            master     -> FETCH_HEAD
Already up to date.


In [67]:
df_corona_temp = pd.read_csv("./COVID-19/csse_covid_19_data/csse_covid_19_time_series/time_series_19-covid-Confirmed.csv")
df_corona_death_temp = pd.read_csv("./COVID-19/csse_covid_19_data/csse_covid_19_time_series/time_series_19-covid-Deaths.csv")

### retreat data structure

In [68]:
def retreat_struct(df):
    output = df.groupby(by="Country/Region").sum()
    output.drop(['Lat','Long'], axis=1, inplace=True)
    output.fillna(0, inplace=True)
    output.index.name = None
    output.columns = pd.to_datetime(output.columns)
    output.rename(index={'Mainland China':'China', 'Korea, South': 'South Korea'},inplace=True)
    return output


In [69]:
df_corona_temp = retreat_struct(df_corona_temp)
df_corona_death_temp = retreat_struct(df_corona_death_temp)

In [70]:
df_corona_temp.shape, df_corona_death_temp.shape

((155, 58), (155, 58))

### Transform colums as lines

In [71]:
serie_corona = df_corona_temp.stack()
serie_death = df_corona_death_temp.stack()

In [72]:
# back to df 
df_corona = pd.DataFrame(serie_corona)

df_corona.reset_index(inplace=True)
df_corona.columns = ['country', 'date','case']

In [73]:
df_corona = df_corona.join(pd.DataFrame(serie_death),how='left',on=['country','date'])
df_corona.columns = ['country', 'date','case','death']

In [74]:
# Checking data 
df_corona.head()

Unnamed: 0,country,date,case,death
0,Afghanistan,2020-01-22,0,0
1,Afghanistan,2020-01-23,0,0
2,Afghanistan,2020-01-24,0,0
3,Afghanistan,2020-01-25,0,0
4,Afghanistan,2020-01-26,0,0


## UN Data on population and density

### Alignment of data

In [75]:
def align_country_names(df):
    ''' Align country names with CSSE reference'''
    output = df.rename(columns={'Region, subregion, country or area *' : 'Country'})
    output.set_index("Country", inplace=True)
    output.rename(index={'Republic of Korea' : 'South Korea', 'Iran (Islamic Republic of)': 'Iran', 'United States of America' : 'US', 'China, Taiwan Province of China' :'Taiwan','Russian Federation':'Russia','Viet Nam':'Vietnam'}, inplace=True)
    # focus only at country level data
    output = output[output['Type'] == 'Country/Area']
    # return 2020 data olny
    return output['2020']


### Population

In [76]:
df_wp_temp = pd.read_excel("./UN data/population_world.xlsx", sheet_name = 'ESTIMATES', skiprows= range(0,16))

In [77]:
serie_wp = align_country_names(df_wp_temp)
#check data
serie_wp

Country
Burundi                      11890.8
Comoros                      869.595
Djibouti                     988.002
Eritrea                      3546.43
Ethiopia                      114964
                              ...   
Bermuda                       62.273
Canada                       37742.2
Greenland                     56.772
Saint Pierre and Miquelon      5.795
US                            331003
Name: 2020, Length: 235, dtype: object

### Density data

In [78]:
serie_density = align_country_names(pd.read_excel("./UN data/WPP2019_POP_F06_POPULATION_DENSITY.xlsx", sheet_name = 'ESTIMATES', skiprows= range(0,16)))

In [79]:
# Check data
serie_density

Country
Burundi                       463.037
Comoros                       467.273
Djibouti                       42.623
Eritrea                       35.1131
Ethiopia                      114.964
                               ...   
Bermuda                       1245.46
Canada                        4.15045
Greenland                    0.138316
Saint Pierre and Miquelon     25.1957
US                            36.1854
Name: 2020, Length: 235, dtype: object

### Country infos
Crossing population and density and calculating size

In [80]:
df_1 = pd.DataFrame(serie_wp)
df_1.columns = ['population']
df_2 = pd.DataFrame(serie_density)
df_2.columns =['density']

In [81]:
# join two series as df
df_country_info = df_1.join(df_2)
# Calculate size of countries
df_country_info['size'] =  df_country_info['population']/df_country_info['density'] * 1000

In [82]:
# Cehcking data
df_country_info

Unnamed: 0_level_0,population,density,size
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Burundi,11890.8,463.037,25680
Comoros,869.595,467.273,1861
Djibouti,988.002,42.623,23180
Eritrea,3546.43,35.1131,101000
Ethiopia,114964,114.964,1e+06
...,...,...,...
Bermuda,62.273,1245.46,50
Canada,37742.2,4.15045,9.09351e+06
Greenland,56.772,0.138316,410450
Saint Pierre and Miquelon,5.795,25.1957,230


### Saving the UN data

In [83]:
df_country_info.to_csv('country_info.csv')

## Cross data CSSE and pop / density / size

In [84]:
df_dataset = df_corona.join(df_country_info, how='left',on='country')

In [85]:
df_dataset

Unnamed: 0,country,date,case,death,population,density,size
0,Afghanistan,2020-01-22,0,0,38928.3,59.6274,652860
1,Afghanistan,2020-01-23,0,0,38928.3,59.6274,652860
2,Afghanistan,2020-01-24,0,0,38928.3,59.6274,652860
3,Afghanistan,2020-01-25,0,0,38928.3,59.6274,652860
4,Afghanistan,2020-01-26,0,0,38928.3,59.6274,652860
...,...,...,...,...,...,...,...
8985,Zambia,2020-03-15,0,0,18384,24.7299,743390
8986,Zambia,2020-03-16,0,0,18384,24.7299,743390
8987,Zambia,2020-03-17,0,0,18384,24.7299,743390
8988,Zambia,2020-03-18,2,0,18384,24.7299,743390


In [86]:
## Calculate the age of crisis

In [87]:
ratio_crisis = 0.5
death_crisis = 10

In [88]:
df_dataset['ratio'] = df_dataset['case'] / df_dataset['population'] * 100
df_dataset['ratio_death'] = df_dataset['death'] / df_dataset['population'] * 100

In [89]:
# check data
df_dataset

Unnamed: 0,country,date,case,death,population,density,size,ratio,ratio_death
0,Afghanistan,2020-01-22,0,0,38928.3,59.6274,652860,0,0
1,Afghanistan,2020-01-23,0,0,38928.3,59.6274,652860,0,0
2,Afghanistan,2020-01-24,0,0,38928.3,59.6274,652860,0,0
3,Afghanistan,2020-01-25,0,0,38928.3,59.6274,652860,0,0
4,Afghanistan,2020-01-26,0,0,38928.3,59.6274,652860,0,0
...,...,...,...,...,...,...,...,...,...
8985,Zambia,2020-03-15,0,0,18384,24.7299,743390,0,0
8986,Zambia,2020-03-16,0,0,18384,24.7299,743390,0,0
8987,Zambia,2020-03-17,0,0,18384,24.7299,743390,0,0
8988,Zambia,2020-03-18,2,0,18384,24.7299,743390,0.0108791,0


### calculate ref data for crisis

In [90]:
serie_epidemic_start_date = df_dataset[df_dataset['ratio'] > ratio_crisis].groupby('country').min()['date']
serie_epidemic_death_start_date = df_dataset[df_dataset['death'] > death_crisis].groupby('country').min()['date']

In [91]:
serie_epidemic_death_start_date

country
Belgium          2020-03-18
China            2020-01-22
France           2020-03-07
Germany          2020-03-15
Indonesia        2020-03-18
Iran             2020-02-24
Iraq             2020-03-17
Italy            2020-02-26
Japan            2020-03-11
Netherlands      2020-03-14
Philippines      2020-03-15
San Marino       2020-03-18
South Korea      2020-02-26
Spain            2020-03-08
Sweden           2020-03-19
Switzerland      2020-03-13
US               2020-03-04
United Kingdom   2020-03-14
Name: date, dtype: datetime64[ns]

In [92]:
# cases
df_epidemic_start = pd.DataFrame(serie_epidemic_start_date)
df_epidemic_start.columns = ['start_date']
# deaths
df_epidemic_death_start = pd.DataFrame(serie_epidemic_death_start_date)
df_epidemic_death_start.columns = ['start_death_date']

### add start date to dataset

In [93]:
df_dataset = df_dataset.join(df_epidemic_start, how='right', on='country')
df_dataset = df_dataset.join(df_epidemic_death_start, how='right', on='country')

In [94]:
df_dataset['age'] = (df_dataset['date'] - df_dataset['start_date']).dt.days
df_dataset['age_death'] = (df_dataset['date'] - df_dataset['start_death_date']).dt.days

In [95]:
#check data
df_dataset

Unnamed: 0,country,date,case,death,population,density,size,ratio,ratio_death,start_date,start_death_date,age,age_death
870.0,Belgium,2020-01-22,0.0,0.0,11589.6,382.748,30280,0,0,2020-03-06,2020-03-18,-44.0,-56.0
871.0,Belgium,2020-01-23,0.0,0.0,11589.6,382.748,30280,0,0,2020-03-06,2020-03-18,-43.0,-55.0
872.0,Belgium,2020-01-24,0.0,0.0,11589.6,382.748,30280,0,0,2020-03-06,2020-03-18,-42.0,-54.0
873.0,Belgium,2020-01-25,0.0,0.0,11589.6,382.748,30280,0,0,2020-03-06,2020-03-18,-41.0,-53.0
874.0,Belgium,2020-01-26,0.0,0.0,11589.6,382.748,30280,0,0,2020-03-06,2020-03-18,-40.0,-52.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8698.0,United Kingdom,2020-03-18,2642.0,72.0,67886,280.602,241930,3.89182,0.10606,2020-03-10,2020-03-14,8.0,4.0
8699.0,United Kingdom,2020-03-19,2716.0,138.0,67886,280.602,241930,4.00082,0.203282,2020-03-10,2020-03-14,9.0,5.0
,Indonesia,NaT,,,,,,,,NaT,2020-03-18,,
,Iraq,NaT,,,,,,,,NaT,2020-03-17,,


## Saving final data

In [96]:
df_dataset.to_csv('dataset.csv')