# Covid-19 Humanitarian Data Exchange (HDX) Datasets Ingestion

In [1]:
import numpy as np
import pandas as pd
import requests

import matplotlib.pyplot as plt
import seaborn as sns
import re
import sys
import locale

# Locale applied process-wide for the ingestion.
# NOTE(review): 'en_EN' is not a standard locale name (the usual English one is 'en_US.UTF-8');
# locale.setlocale raises locale.Error when the requested locale is not installed — confirm.
locale_ingest_str = 'en_EN.UTF-8'
locale.setlocale(locale.LC_ALL, locale_ingest_str)

# strptime/strftime patterns: the date layout of the raw '*_str' values and the
# standard layout the dates are converted to.
date_format_ingest_raw = '%d/%m/%Y'
date_format_ingest_std = '%Y-%m-%d'

## Datasets Description

- Time series of confirmed cases for every country. 1 row per country, 1 column per day.
- Time series of deaths for every country. 1 row per country, 1 column per day.
- Time series of recovered for every country. 1 row per country, 1 column per day.

## Aggregated Raw Datasets Download

### Confirmed Cases Dataset

The dataset can be previewed ["here"](https://data.humdata.org/dataset/novel-coronavirus-2019-ncov-cases/resource/00fa0e37-961b-4767-a5ce-e7ab4e2c921c).

The dataset can be found ["here"](https://data.humdata.org/hxlproxy/api/data-preview.csv?url=https%3A%2F%2Fraw.githubusercontent.com%2FCSSEGISandData%2FCOVID-19%2Fmaster%2Fcsse_covid_19_data%2Fcsse_covid_19_time_series%2Ftime_series_covid19_confirmed_global.csv&filename=time_series_covid19_confirmed_global.csv).

In [26]:
# Download the raw dataset file and store it under data/raw.
url = 'https://data.humdata.org/hxlproxy/api/data-preview.csv?url=https%3A%2F%2Fraw.githubusercontent.com%2FCSSEGISandData%2FCOVID-19%2Fmaster%2Fcsse_covid_19_data%2Fcsse_covid_19_time_series%2Ftime_series_covid19_confirmed_global.csv&filename=time_series_covid19_confirmed_global.csv'
raw_cases_path = '../../data/raw/Covid19AggCasesHDX.csv'

# A timeout keeps the cell from hanging forever on a stalled connection.
req = requests.get(url, allow_redirects = True, timeout = 60)

# Fail loudly on HTTP errors (e.g. 404 file not found) instead of saving an error page as CSV.
req.raise_for_status()

# An empty payload is checked before opening the target, so a previous good download is not overwritten.
if not req.content:
    raise ValueError('Downloaded dataset is empty: {0}'.format(url))

# The context manager closes the file even if the write fails.
with open(raw_cases_path, 'wb') as f:
    f.write(req.content)

In [32]:
# Column names for renaming.
column_names = [
    'country_str',
    'region_str',
    'date_str',
    'num_cases_str',
]

# Load the raw dataset. With na_filter disabled, empty fields are kept as ''
# instead of being turned into NaN.
df_covid19_agg_cases = pd.read_csv(
    '../../data/raw/Covid19AggCasesHDX.csv',
    encoding = 'UTF-8',
    sep = ',',
    quotechar = '"',
    na_filter = False,
    low_memory = False,
)

In [31]:
# Display the raw frame as loaded (wide layout: one column per date).
df_covid19_agg_cases

Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,4/12/20,4/13/20,4/14/20,4/15/20,4/16/20,4/17/20,4/18/20,4/19/20,4/20/20,4/21/20
0,,Afghanistan,33.000000,65.000000,0,0,0,0,0,0,...,607,665,714,784,840,906,933,996,1026,1092
1,,Albania,41.153300,20.168300,0,0,0,0,0,0,...,446,467,475,494,518,539,548,562,584,609
2,,Algeria,28.033900,1.659600,0,0,0,0,0,0,...,1914,1983,2070,2160,2268,2418,2534,2629,2718,2811
3,,Andorra,42.506300,1.521800,0,0,0,0,0,0,...,638,646,659,673,673,696,704,713,717,717
4,,Angola,-11.202700,17.873900,0,0,0,0,0,0,...,19,19,19,19,19,19,24,24,24,24
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
259,Saint Pierre and Miquelon,France,46.885200,-56.315900,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,1
260,,South Sudan,6.877000,31.307000,0,0,0,0,0,0,...,4,4,4,4,4,4,4,4,4,4
261,,Western Sahara,24.215500,-12.885800,0,0,0,0,0,0,...,6,6,6,6,6,6,6,6,6,6
262,,Sao Tome and Principe,0.186360,6.613081,0,0,0,0,0,0,...,4,4,4,4,4,4,4,4,4,4


In [68]:
# The first four columns identify a row; every remaining column holds one day of the series.
n_id_columns = 4
all_columns = df_covid19_agg_cases.columns.tolist()
idvars, valuevars = all_columns[:n_id_columns], all_columns[n_id_columns:]

In [69]:
# Display the identifier columns kept as-is when melting.
idvars

['Province/State', 'Country/Region', 'Lat', 'Long']

In [70]:
# Display the per-day columns that are unpivoted when melting.
valuevars

['1/22/20',
 '1/23/20',
 '1/24/20',
 '1/25/20',
 '1/26/20',
 '1/27/20',
 '1/28/20',
 '1/29/20',
 '1/30/20',
 '1/31/20',
 '2/1/20',
 '2/2/20',
 '2/3/20',
 '2/4/20',
 '2/5/20',
 '2/6/20',
 '2/7/20',
 '2/8/20',
 '2/9/20',
 '2/10/20',
 '2/11/20',
 '2/12/20',
 '2/13/20',
 '2/14/20',
 '2/15/20',
 '2/16/20',
 '2/17/20',
 '2/18/20',
 '2/19/20',
 '2/20/20',
 '2/21/20',
 '2/22/20',
 '2/23/20',
 '2/24/20',
 '2/25/20',
 '2/26/20',
 '2/27/20',
 '2/28/20',
 '2/29/20',
 '3/1/20',
 '3/2/20',
 '3/3/20',
 '3/4/20',
 '3/5/20',
 '3/6/20',
 '3/7/20',
 '3/8/20',
 '3/9/20',
 '3/10/20',
 '3/11/20',
 '3/12/20',
 '3/13/20',
 '3/14/20',
 '3/15/20',
 '3/16/20',
 '3/17/20',
 '3/18/20',
 '3/19/20',
 '3/20/20',
 '3/21/20',
 '3/22/20',
 '3/23/20',
 '3/24/20',
 '3/25/20',
 '3/26/20',
 '3/27/20',
 '3/28/20',
 '3/29/20',
 '3/30/20',
 '3/31/20',
 '4/1/20',
 '4/2/20',
 '4/3/20',
 '4/4/20',
 '4/5/20',
 '4/6/20',
 '4/7/20',
 '4/8/20',
 '4/9/20',
 '4/10/20',
 '4/11/20',
 '4/12/20',
 '4/13/20',
 '4/14/20',
 '4/15/20',
 '4/16/

In [71]:
# Preview the long layout: one row per (location, date) pair.
# The result is only displayed, not stored.
pd.melt(df_covid19_agg_cases, id_vars = idvars, value_vars = valuevars)

Unnamed: 0,Province/State,Country/Region,Lat,Long,variable,value
0,,Afghanistan,33.000000,65.000000,1/22/20,0
1,,Albania,41.153300,20.168300,1/22/20,0
2,,Algeria,28.033900,1.659600,1/22/20,0
3,,Andorra,42.506300,1.521800,1/22/20,0
4,,Angola,-11.202700,17.873900,1/22/20,0
...,...,...,...,...,...,...
24019,Saint Pierre and Miquelon,France,46.885200,-56.315900,4/21/20,1
24020,,South Sudan,6.877000,31.307000,4/21/20,4
24021,,Western Sahara,24.215500,-12.885800,4/21/20,6
24022,,Sao Tome and Principe,0.186360,6.613081,4/21/20,4


In [57]:
# Display the first four column names wrapped in an outer list.
[df_covid19_agg_cases.columns[:4].tolist()]

[['Province/State', 'Country/Region', 'Lat', 'Long']]

## Standard Datasets Generation

### Aggregated Raw Dataset Validation

Remove invalid bottom rows due to dataset comments. 

In [5]:
# Delete the rows whose key columns ('region_str' or 'date_str') are empty.
# Empty strings are first masked to NaN so that dropna can detect them.
# NOTE(review): df_covid19_agg is not created by any earlier cell visible in this
# notebook — confirm where it is loaded before running on a fresh kernel.
df_covid19_agg = (
    df_covid19_agg
    .mask(df_covid19_agg.eq(''))
    .dropna(subset = ['region_str', 'date_str'])
)

In [6]:
# Every remaining NaN becomes 0.
df_covid19_agg = df_covid19_agg.fillna(0)

In [7]:
# Display the frame after dropping rows with empty keys and replacing NaN with 0.
df_covid19_agg

Unnamed: 0,region_str,date_str,num_cases_str,num_hosp_str,num_icu_str,num_deaths_str,num_recov_str
0,AN,20/2/2020,0,0,0,0,0
1,AR,20/2/2020,0,0,0,0,0
2,AS,20/2/2020,0,0,0,0,0
3,IB,20/2/2020,1,0,0,0,0
4,CN,20/2/2020,1,0,0,0,0
...,...,...,...,...,...,...,...
1173,ML,21/4/2020,105,44,3,2,50
1174,MC,21/4/2020,1695,622,104,123,761
1175,NC,21/4/2020,4899,1894,128,401,1316
1176,PV,21/4/2020,13044,6201,513,1124,7651


In [8]:
# Defining data conversion and extraction functions

# Region extraction function
def data_convert_region(value_str):
    """Return the region value as a string, or None when it is missing.

    Args:
        value_str: Raw cell value of the region column.

    Returns:
        str(value_str), or None if the value is the empty string or cannot
        be converted to a string.
    """
    # TODO: check against a list of provinces
    if value_str == '':
        return None
    try:
        return str(value_str)
    except Exception:
        # Exception rather than a bare except, so KeyboardInterrupt and
        # SystemExit still propagate instead of being reported as bad data.
        return None

# Date conversion function
def data_convert_date(value_str, date_format_origin, date_format_target):
    """Re-format a date string from one strftime layout to another.

    Args:
        value_str: Raw date value; it is converted with str() before parsing.
        date_format_origin: strptime pattern the raw value is expected to follow.
        date_format_target: strftime pattern of the returned string.

    Returns:
        The date rendered with date_format_target, or None when the value
        cannot be parsed with date_format_origin.
    """
    try:
        parsed = pd.to_datetime(str(value_str), format = date_format_origin)
        return parsed.strftime(date_format_target)
    except Exception:
        # Exception rather than a bare except, so KeyboardInterrupt and
        # SystemExit still propagate instead of being reported as bad data.
        return None

# Total conversion function
def data_convert_total(value_str):
    """Normalize a raw counter value to the string form of an integer.

    Args:
        value_str: Raw cell value of a counter column.

    Returns:
        "0" for the empty string, the canonical integer string (for example
        "12") for integer-like values, or None when the value is not an integer.
    """
    if value_str == '':
        return "0"
    try:
        return str(int(str(value_str)))
    except Exception:
        # Exception rather than a bare except, so KeyboardInterrupt and
        # SystemExit still propagate instead of being reported as bad data.
        return None

In [9]:
# Derive the standardized columns from their raw '*_str' counterparts.

df_covid19_agg['region'] = df_covid19_agg['region_str'].apply(data_convert_region)
df_covid19_agg['date'] = df_covid19_agg['date_str'].apply(
    data_convert_date,
    args = (date_format_ingest_raw, date_format_ingest_std),
)
# All counters share the same conversion; the order matches the resulting column order.
for total_column in ('num_cases', 'num_hosp', 'num_icu', 'num_deaths', 'num_recov'):
    df_covid19_agg[total_column] = df_covid19_agg[total_column + '_str'].apply(data_convert_total)

In [10]:
# None values mark rows with standardization problems.
# Check every standardized column and stop at the first one that contains them.
#
# `series == None` is evaluated element-wise and is never True, so a check written
# that way can never fire; missing values have to be detected with isna().
standard_columns = ['region', 'date', 'num_cases', 'num_hosp', 'num_icu', 'num_deaths', 'num_recov']

for std_column in standard_columns:
    ko_count = int(df_covid19_agg[std_column].isna().sum())
    if ko_count != 0:
        # The message names the raw column the bad values came from.
        sys.exit('Found {0} rows with incorrect values on \'{1}_str\' column.'.format(ko_count, std_column))

In [11]:
# Display the frame with the standardized columns added next to the raw '*_str' ones.
df_covid19_agg

Unnamed: 0,region_str,date_str,num_cases_str,num_hosp_str,num_icu_str,num_deaths_str,num_recov_str,region,date,num_cases,num_hosp,num_icu,num_deaths,num_recov
0,AN,20/2/2020,0,0,0,0,0,AN,2020-02-20,0,0,0,0,0
1,AR,20/2/2020,0,0,0,0,0,AR,2020-02-20,0,0,0,0,0
2,AS,20/2/2020,0,0,0,0,0,AS,2020-02-20,0,0,0,0,0
3,IB,20/2/2020,1,0,0,0,0,IB,2020-02-20,1,0,0,0,0
4,CN,20/2/2020,1,0,0,0,0,CN,2020-02-20,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1173,ML,21/4/2020,105,44,3,2,50,ML,2020-04-21,105,44,3,2,50
1174,MC,21/4/2020,1695,622,104,123,761,MC,2020-04-21,1695,622,104,123,761
1175,NC,21/4/2020,4899,1894,128,401,1316,NC,2020-04-21,4899,1894,128,401,1316
1176,PV,21/4/2020,13044,6201,513,1124,7651,PV,2020-04-21,13044,6201,513,1124,7651


### Conversion to Aggregated Standard Dataset  

In [12]:
# Cast the standardized columns to their final types.

df_covid19_agg['region'] = df_covid19_agg['region'].astype(str)
df_covid19_agg['date'] = df_covid19_agg['date'].apply(
    lambda std_date: pd.to_datetime(std_date, format = date_format_ingest_std)
)
# Counters are stored as integer strings at this point; turn them into ints.
for count_column in ['num_cases', 'num_hosp', 'num_icu', 'num_deaths', 'num_recov']:
    df_covid19_agg[count_column] = df_covid19_agg[count_column].apply(int)

In [13]:
# Display the frame after the type conversion.
df_covid19_agg

Unnamed: 0,region_str,date_str,num_cases_str,num_hosp_str,num_icu_str,num_deaths_str,num_recov_str,region,date,num_cases,num_hosp,num_icu,num_deaths,num_recov
0,AN,20/2/2020,0,0,0,0,0,AN,2020-02-20,0,0,0,0,0
1,AR,20/2/2020,0,0,0,0,0,AR,2020-02-20,0,0,0,0,0
2,AS,20/2/2020,0,0,0,0,0,AS,2020-02-20,0,0,0,0,0
3,IB,20/2/2020,1,0,0,0,0,IB,2020-02-20,1,0,0,0,0
4,CN,20/2/2020,1,0,0,0,0,CN,2020-02-20,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1173,ML,21/4/2020,105,44,3,2,50,ML,2020-04-21,105,44,3,2,50
1174,MC,21/4/2020,1695,622,104,123,761,MC,2020-04-21,1695,622,104,123,761
1175,NC,21/4/2020,4899,1894,128,401,1316,NC,2020-04-21,4899,1894,128,401,1316
1176,PV,21/4/2020,13044,6201,513,1124,7651,PV,2020-04-21,13044,6201,513,1124,7651


In [14]:
# Remove the raw columns (they all carry the '_str' suffix) now that their
# standardized counterparts exist. The list is derived from the frame itself:
# `column_names` contains 'country_str', which this frame does not have, and
# omits the hosp/icu/deaths/recov raw columns, so dropping it would raise a KeyError.
raw_columns = [column for column in df_covid19_agg.columns if column.endswith('_str')]
df_covid19_agg.drop(raw_columns, axis = 1, inplace = True)

In [15]:
# Display the frame after the raw columns were removed.
df_covid19_agg

Unnamed: 0,region,date,num_cases,num_hosp,num_icu,num_deaths,num_recov
0,AN,2020-02-20,0,0,0,0,0
1,AR,2020-02-20,0,0,0,0,0
2,AS,2020-02-20,0,0,0,0,0
3,IB,2020-02-20,1,0,0,0,0
4,CN,2020-02-20,1,0,0,0,0
...,...,...,...,...,...,...,...
1173,ML,2020-04-21,105,44,3,2,50
1174,MC,2020-04-21,1695,622,104,123,761
1175,NC,2020-04-21,4899,1894,128,401,1316
1176,PV,2020-04-21,13044,6201,513,1124,7651


In [16]:
# Sort the rows by region, then by date within each region.
sort_keys = ['region', 'date']
df_covid19_agg.sort_values(by = sort_keys, inplace = True)

# Write the standard dataset with aggregated values, without the index column.
agg_standard_path = '../../data/standard/Covid19AggSP.csv'
df_covid19_agg.to_csv(agg_standard_path, index = False)

### Incremental Standard Dataset Generation

In [17]:
# Build the incremental dataset: for every region, the day-over-day difference
# of each aggregated counter. Rows are expected to be ordered by date within
# each region, which the sort in the previous section provides.

# Get list of regions
list_regions = sorted(df_covid19_agg['region'].unique())
count_columns = ['num_cases', 'num_hosp', 'num_icu', 'num_deaths', 'num_recov']

# Collect the per-region frames and concatenate once at the end: concatenating
# inside the loop copies the accumulated data on every iteration, and seeding
# it with an empty frame relies on dtype handling pandas has deprecated.
inc_frames = []
for region in list_regions:
    df_covid19_agg_region = df_covid19_agg[df_covid19_agg['region'] == region]

    # Only the counters are differenced; 'region' and 'date' are re-attached as-is.
    df_covid19_inc_region = df_covid19_agg_region.drop(['region', 'date'], axis = 1).diff(axis = 0)
    df_covid19_inc_region.insert(0, 'date', df_covid19_agg_region['date'])
    df_covid19_inc_region.insert(0, 'region', region)

    # The first row of a region has no previous day to compare with.
    inc_frames.append(df_covid19_inc_region.iloc[1:])

if inc_frames:
    df_covid19_inc = pd.concat(inc_frames)
else:
    # No regions at all: keep an empty frame with the expected columns.
    df_covid19_inc = pd.DataFrame(columns = df_covid19_agg.columns)

# Convert values from float to int (diff() yields floats because of the leading NaN).
df_covid19_inc = df_covid19_inc.astype({column: 'int64' for column in count_columns})

In [18]:
# Display the incremental (day-over-day difference) frame.
df_covid19_inc

Unnamed: 0,region,date,num_cases,num_hosp,num_icu,num_deaths,num_recov
19,AN,2020-02-21,0,0,0,0,0
38,AN,2020-02-22,0,0,0,0,0
57,AN,2020-02-23,0,0,0,0,0
76,AN,2020-02-24,0,0,0,0,0
95,AN,2020-02-25,0,0,0,0,0
...,...,...,...,...,...,...,...
1093,VC,2020-04-17,76,103,4,43,282
1112,VC,2020-04-18,142,73,7,23,241
1131,VC,2020-04-19,81,46,3,14,82
1150,VC,2020-04-20,66,39,2,10,90


In [19]:
# Sort the rows by region, then by date within each region.
inc_sort_keys = ['region', 'date']
df_covid19_inc.sort_values(by = inc_sort_keys, inplace = True)

# Write the standard dataset with incremental values, without the index column.
inc_standard_path = '../../data/standard/Covid19IncSP.csv'
df_covid19_inc.to_csv(inc_standard_path, index = False)