In [2]:
import pandas as pd
import os.path  as osp

In [28]:
from ddf_utils.factory.common import download

In [29]:
codebook_url = 'https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/owid-covid-codebook.csv'
data_url = 'https://covid.ourworldindata.org/data/owid-covid-data.csv'

In [30]:
download(codebook_url, '../source/owid-covid-codebook.csv')

In [31]:
download(data_url, '../source/owid-covid-data.csv')

the http status code is 416, possibly the download was completed.
if you believe it's not completed, please remove the file and try again.


In [6]:
# source_file = '../source/owid-covid-data.xlsx'
source_file = '../source/owid-covid-data.csv'

In [8]:
df = pd.read_csv(source_file)

In [9]:
df.columns

Index(['iso_code', 'continent', 'location', 'date', 'total_cases', 'new_cases',
       'new_cases_smoothed', 'total_deaths', 'new_deaths',
       'new_deaths_smoothed', 'total_cases_per_million',
       'new_cases_per_million', 'new_cases_smoothed_per_million',
       'total_deaths_per_million', 'new_deaths_per_million',
       'new_deaths_smoothed_per_million', 'reproduction_rate', 'icu_patients',
       'icu_patients_per_million', 'hosp_patients',
       'hosp_patients_per_million', 'weekly_icu_admissions',
       'weekly_icu_admissions_per_million', 'weekly_hosp_admissions',
       'weekly_hosp_admissions_per_million', 'new_tests', 'total_tests',
       'total_tests_per_thousand', 'new_tests_per_thousand',
       'new_tests_smoothed', 'new_tests_smoothed_per_thousand',
       'positive_rate', 'tests_per_case', 'tests_units', 'total_vaccinations',
       'people_vaccinated', 'people_fully_vaccinated', 'total_boosters',
       'new_vaccinations', 'new_vaccinations_smoothed',
       't

In [10]:
df.iloc[:5, :5]

Unnamed: 0,iso_code,continent,location,date,total_cases
0,AFG,Asia,Afghanistan,2020-02-24,5.0
1,AFG,Asia,Afghanistan,2020-02-25,5.0
2,AFG,Asia,Afghanistan,2020-02-26,5.0
3,AFG,Asia,Afghanistan,2020-02-27,5.0
4,AFG,Asia,Afghanistan,2020-02-28,5.0


In [11]:
locations = df[['iso_code', 'continent', 'location']].copy()

In [12]:
locations.drop_duplicates()

Unnamed: 0,iso_code,continent,location
0,AFG,Asia,Afghanistan
648,OWID_AFR,,Africa
1307,ALB,Europe,Albania
1954,DZA,Africa,Algeria
2601,AND,Europe,Andorra
...,...,...,...
134599,WLF,Oceania,Wallis and Futuna
134852,OWID_WRL,,World
135533,YEM,Asia,Yemen
136135,ZMB,Africa,Zambia


In [13]:
locations.columns = ['iso_code', 'continent', 'name']
locations['location'] = locations['iso_code'].str.lower()

In [14]:
locations = locations.drop_duplicates()

In [15]:
locations = locations.set_index('location').sort_index()

In [16]:
locations

Unnamed: 0_level_0,iso_code,continent,name
location,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
abw,ABW,North America,Aruba
afg,AFG,Asia,Afghanistan
ago,AGO,Africa,Angola
aia,AIA,North America,Anguilla
alb,ALB,Europe,Albania
...,...,...,...
wsm,WSM,Oceania,Samoa
yem,YEM,Asia,Yemen
zaf,ZAF,Africa,South Africa
zmb,ZMB,Africa,Zambia


In [17]:
locations.to_csv('../../ddf--entities--location.csv')

In [18]:
datapoints = df.drop(['continent', 'location'], axis=1).copy()

In [19]:
datapoints.dtypes['date']

dtype('O')

In [20]:
datapoints['location'] = datapoints['iso_code'].str.lower()

In [21]:
datapoints['date_'] = datapoints['date'].map(lambda x: x.replace('-', ''))

In [22]:
datapoints = datapoints.drop(['iso_code', 'date'], axis=1)

In [23]:
datapoints = datapoints.set_index(['location', 'date_'])

In [24]:
datapoints.iloc[:5, :5]

Unnamed: 0_level_0,Unnamed: 1_level_0,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths
location,date_,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
afg,20200224,5.0,5.0,,,
afg,20200225,5.0,0.0,,,
afg,20200226,5.0,0.0,,,
afg,20200227,5.0,0.0,,,
afg,20200228,5.0,0.0,,,


In [25]:
datapoints.index.names = ['location', 'date']

In [44]:
for c in datapoints.columns:
    fname = osp.join('../../', f'ddf--datapoints--{c}--by--location--date.csv')
    datapoints[c].dropna().to_csv(fname)

In [32]:
codebook = pd.read_csv('../source/owid-covid-codebook.csv')

In [33]:
codebook

Unnamed: 0,column,source,category,description
0,iso_code,International Organization for Standardization,Others,ISO 3166-1 alpha-3 – three-letter country codes
1,continent,Our World in Data,Others,Continent of the geographical location
2,location,Our World in Data,Others,Geographical location
3,date,Our World in Data,Others,Date of observation
4,total_cases,COVID-19 Data Repository by the Center for Sys...,Confirmed cases,Total confirmed cases of COVID-19
...,...,...,...,...
62,human_development_index,United Nations Development Programme (UNDP),Others,A composite index measuring average achievemen...
63,excess_mortality,"Human Mortality Database (2021), World Mortali...",Excess mortality,Percentage difference between the reported num...
64,excess_mortality_cumulative,"Human Mortality Database (2021), World Mortali...",Excess mortality,Percentage difference between the cumulative n...
65,excess_mortality_cumulative_absolute,"Human Mortality Database (2021), World Mortali...",Excess mortality,Cumulative difference between the reported num...


In [35]:
concepts = codebook.copy()

In [36]:
concepts = concepts.set_index('column')

In [37]:
concepts['concept_type'] = 'measure'
concepts.loc['location', 'concept_type'] = 'entity_domain'
concepts.loc['date', 'concept_type'] = 'time'
concepts['name'] = concepts.index

In [48]:
concepts.loc['name', ['name', 'concept_type']] = ['Name', 'string']
concepts.loc['category', ['name', 'concept_type']] = ['Category', 'string']
concepts.loc['description', ['name', 'concept_type']] = ['Description', 'string']
concepts.loc['source', ['name', 'concept_type']] = ['Source', 'string']

In [39]:
concepts.loc['iso_code', 'concept_type'] = 'string'
concepts.loc['continent', 'concept_type'] = 'string'

In [46]:
concepts

Unnamed: 0_level_0,source,category,description,concept_type,name
concept,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
iso_code,International Organization for Standardization,Others,ISO 3166-1 alpha-3 – three-letter country codes,string,iso_code
continent,Our World in Data,Others,Continent of the geographical location,string,continent
location,Our World in Data,Others,Geographical location,entity_domain,location
date,Our World in Data,Others,Date of observation,time,date
total_cases,COVID-19 Data Repository by the Center for Sys...,Confirmed cases,Total confirmed cases of COVID-19,measure,total_cases
...,...,...,...,...,...
excess_mortality_cumulative_absolute,"Human Mortality Database (2021), World Mortali...",Excess mortality,Cumulative difference between the reported num...,measure,excess_mortality_cumulative_absolute
excess_mortality_cumulative_per_million,"Human Mortality Database (2021), World Mortali...",Excess mortality,Cumulative difference between the reported num...,measure,excess_mortality_cumulative_per_million
name,,,,string,Name
category,,,,string,Category


In [40]:
concepts.index.name = 'concept'

In [49]:
concepts.to_csv('../../ddf--concepts.csv')