# WHO GHE (2021-07-01)

In [1]:
# Directory the output dataset is created in; it is handed to
# catalog.Dataset.create_empty() in section 4.
dest_dir = "/tmp/ghe_20210701"

In [2]:
from owid import walden, catalog  # type: ignore
import tempfile
from zipfile import ZipFile
import os
import pandas as pd

from etl.steps.data.converters import convert_walden_metadata

## 1. Locate the dataset in Walden

In [6]:
# Open the Walden catalog, then fetch the single entry for the WHO "ghe"
# dataset dated 2021-07-01.
walden_catalog = walden.Catalog()
raw_dataset = walden_catalog.find_one("who", "2021-07-01", "ghe")

In [8]:
raw_dataset

Dataset(namespace='who', short_name='ghe', name='Global Health Estimates', description='WHO’s Global Health Estimates (GHE) provide the latest available data on death and disability globally, by region and country, and by age, sex and cause. The latest updates include global, regional and country trends from 2000 to 2019 inclusive. By providing key insights on mortality and morbidity trends, these estimates are a powerful tool to support informed decision-making on health policy and resource allocation.', source_name='World Health Organisation', url='https://www.who.int/data/global-health-estimates', date_accessed='2021-09-08', file_extension='zip', license_url='https://www.who.int/about/policies/publishing/data-policy/terms-and-conditions', source_data_url=None, md5='8339082dc1ae9a17275ad51969ebb7a2', publication_year=2021, publication_date='2021-07-01', owid_data_url='https://nyc3.digitaloceanspaces.com/walden/who/2021-07-01/ghe.zip', license_name=None, access_notes='Fetched via API 

## 2. Extract the zip file to a temporary directory

In [9]:
# Create a scratch directory for the extracted archive.
# mkdtemp() creates the directory securely (readable/writable only by the
# current user) and does NOT delete it automatically, which is what we need:
# later cells read from it and the cleanup cell removes it explicitly.
# (Entering and leaving TemporaryDirectory() only to re-create the same path
# with os.mkdir() was racy and dropped the owner-only permissions.)
dirname = tempfile.mkdtemp()
dirname

'/var/folders/0s/2yqr44dj44zcmyzdrf8fvxyc0000gn/T/tmpqb1z5dcw'

In [21]:
# Unpack every member of the downloaded archive into the scratch directory.
# The context manager closes the underlying file handle as soon as the
# extraction finishes instead of leaving it open until garbage collection.
with ZipFile(raw_dataset.local_path) as archive:
    archive.extractall(dirname)

In [22]:
dirname

'/var/folders/0s/2yqr44dj44zcmyzdrf8fvxyc0000gn/T/tmpqb1z5dcw'

In [27]:
# Location of the all-countries CSV inside the extracted archive.
csv_parts = ("who_ghe", "_all_countries.csv")
csv_file = os.path.join(dirname, *csv_parts)

## 3. Load the data frame and prune excess columns

In [28]:
# Load the extracted CSV into a DataFrame; the following cells prune and
# retype its columns in place.
df = pd.read_csv(csv_file)

In [30]:
df.iloc[:1].T

Unnamed: 0,0
Unnamed: 0,0
Unnamed: 0.1,0
COUNTRY_CODE,GRD
GHE_CAUSE_CODE,860
GHE_CAUSE_TITLE,Alcohol use disorders
YEAR,2019
SEX_CODE,BTSX
AGEGROUP_CODE,YEARS45-49
POPULATION,6285.0
DEATHS,0.5


In [31]:
# Discard the two unnamed columns at the front of the file
# (presumably row indexes left over from earlier exports - verify).
df.drop(columns=["Unnamed: 0", "Unnamed: 0.1"], inplace=True)

In [32]:
# Remove every column whose name starts with the "Sys_" prefix.
sys_columns = [name for name in df.columns if name.startswith("Sys_")]
df.drop(columns=sys_columns, inplace=True)

In [33]:
# Remove every column whose name starts with the "FL_" prefix.
fl_columns = [name for name in df.columns if name.startswith("FL_")]
df.drop(columns=fl_columns, inplace=True)

In [34]:
# Normalise all column names to lower case.
df.columns = list(map(str.lower, df.columns))

In [35]:
# The record id column is not needed downstream; drop it.
df.drop(columns="_recordid", inplace=True)

In [36]:
# Store country codes as a categorical column (default, unordered dtype).
df["country_code"] = df["country_code"].astype(pd.CategoricalDtype())

In [37]:
# Store cause titles as a categorical column (default, unordered dtype).
df["ghe_cause_title"] = df["ghe_cause_title"].astype(pd.CategoricalDtype())

In [38]:
# Store sex codes as a categorical column (default, unordered dtype).
df["sex_code"] = df["sex_code"].astype(pd.CategoricalDtype())

In [39]:
# Store age-group codes as a categorical column (default, unordered dtype).
df["agegroup_code"] = df["agegroup_code"].astype(pd.CategoricalDtype())

In [40]:
df.iloc[0]

country_code                         GRD
ghe_cause_code                       860
ghe_cause_title    Alcohol use disorders
year                                2019
sex_code                            BTSX
agegroup_code                 YEARS45-49
population                        6285.0
deaths                               0.5
deaths_rate                     0.000082
deaths_100k                          8.2
daly                                42.1
daly_rate                       0.006695
daly_100k                          669.5
causegroup                             2
level                                  3
Name: 0, dtype: object

## 4. Save as a dataset

In [42]:
raw_dataset

Dataset(namespace='who', short_name='ghe', name='Global Health Estimates', description='WHO’s Global Health Estimates (GHE) provide the latest available data on death and disability globally, by region and country, and by age, sex and cause. The latest updates include global, regional and country trends from 2000 to 2019 inclusive. By providing key insights on mortality and morbidity trends, these estimates are a powerful tool to support informed decision-making on health policy and resource allocation.', source_name='World Health Organisation', url='https://www.who.int/data/global-health-estimates', date_accessed='2021-09-08', file_extension='zip', license_url='https://www.who.int/about/policies/publishing/data-policy/terms-and-conditions', source_data_url=None, md5='8339082dc1ae9a17275ad51969ebb7a2', publication_year=2021, publication_date='2021-07-01', owid_data_url='https://nyc3.digitaloceanspaces.com/walden/who/2021-07-01/ghe.zip', license_name=None, access_notes='Fetched via API 

In [45]:
# Start a new, empty dataset at dest_dir, attach the metadata converted from
# the Walden entry, then call save() on it.
# NOTE(review): presumably save() writes the dataset metadata to dest_dir -
# confirm against the catalog API.
ds = catalog.Dataset.create_empty(dest_dir)
ds.metadata = convert_walden_metadata(raw_dataset)
ds.save()

### Add cause codes

In [47]:
# Build a lookup table from cause code to cause title: keep one row per
# distinct (code, title) pair and index it by the code.
cause_pairs = df[["ghe_cause_code", "ghe_cause_title"]].drop_duplicates()
cause_lookup = cause_pairs.set_index("ghe_cause_code")
ghe_causes = catalog.Table(cause_lookup)

In [48]:
ghe_causes

Unnamed: 0_level_0,ghe_cause_title
ghe_cause_code,Unnamed: 1_level_1
860,Alcohol use disorders
50,Syphilis
1040,Cataracts
1220,Peptic ulcer disease
870,Drug use disorders
...,...
200,Acute hepatitis C
620,Mouth and oropharynx cancers
1550,Falls
1330,Skin diseases


In [50]:
# Describe the cause-code lookup table, then register it with the dataset.
causes_meta = catalog.TableMeta(
    short_name="ghe_causes",
    title="GHE Cause Codes",
    description="Integer codes for common GHE causes and their human readable names",
)
ghe_causes.metadata = causes_meta
ds.add(ghe_causes)

### Add estimates

In [51]:
# The numeric cause code now lives in the ghe_causes table; the estimates
# keep only the cause title.
df.drop(columns="ghe_cause_code", inplace=True)

In [52]:
df.head()

Unnamed: 0,country_code,ghe_cause_title,year,sex_code,agegroup_code,population,deaths,deaths_rate,deaths_100k,daly,daly_rate,daly_100k,causegroup,level
0,GRD,Alcohol use disorders,2019,BTSX,YEARS45-49,6285.0,0.5,8.2e-05,8.2,42.1,0.0066952,669.5,2,3
1,GRD,Syphilis,2019,MLE,YEARS1-4,3729.0,0.0,1e-08,0.0,0.0,5.9e-07,0.1,1,4
2,GRD,Cataracts,2019,FMLE,YEARS15-19,3736.0,0.0,0.0,0.0,0.0,0.0,0.0,2,3
3,GRD,Peptic ulcer disease,2019,BTSX,ALLAges,112002.0,4.7,4.19e-05,4.2,126.1,0.00112614,112.6,2,3
4,GRD,Drug use disorders,2019,MLE,YEARS85PLUS,366.0,0.0,8.524e-05,8.5,0.4,0.00103158,103.2,2,3


In [54]:
# Wrap the pruned DataFrame in a catalog Table so it can carry metadata and
# be added to the dataset.
estimates = catalog.Table(df)

In [56]:
# Index the estimates by their five identifying dimensions.
index_columns = ["country_code", "year", "ghe_cause_title", "sex_code", "agegroup_code"]
estimates.set_index(index_columns, inplace=True)

In [57]:
estimates.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,population,deaths,deaths_rate,deaths_100k,daly,daly_rate,daly_100k,causegroup,level
country_code,year,ghe_cause_title,sex_code,agegroup_code,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
GRD,2019,Alcohol use disorders,BTSX,YEARS45-49,6285.0,0.5,8.2e-05,8.2,42.1,0.0066952,669.5,2,3
GRD,2019,Syphilis,MLE,YEARS1-4,3729.0,0.0,1e-08,0.0,0.0,5.9e-07,0.1,1,4
GRD,2019,Cataracts,FMLE,YEARS15-19,3736.0,0.0,0.0,0.0,0.0,0.0,0.0,2,3
GRD,2019,Peptic ulcer disease,BTSX,ALLAges,112002.0,4.7,4.19e-05,4.2,126.1,0.00112614,112.6,2,3
GRD,2019,Drug use disorders,MLE,YEARS85PLUS,366.0,0.0,8.524e-05,8.5,0.4,0.00103158,103.2,2,3


In [63]:
# Label the estimates table before it is added to the dataset.
estimates.metadata.short_name = "estimates"
estimates.metadata.description = "GHE estimated burden of disease"

In [64]:
# Register the estimates table with the dataset.
# NOTE(review): the saved output of this cell is a KeyboardInterrupt, so the
# last recorded run was interrupted and may not have finished adding the
# table - re-run to completion and confirm.
ds.add(estimates)

KeyboardInterrupt: 

## 5. Cleanup

In [None]:
import shutil

In [None]:
# Delete the temporary extraction directory and everything inside it.
shutil.rmtree(dirname)