# WHO GHE (2021-07-01)

In [1]:
# Output directory for the converted dataset; it is passed to
# catalog.Dataset.create_empty() in the "Save as a dataset" section.
dest_dir = "/tmp/ghe_20210701"

In [2]:
from owid import walden, catalog  # type: ignore
import tempfile
from zipfile import ZipFile
import os
import pandas as pd

from etl.steps.data.converters import convert_walden_metadata

## 1. Locate the dataset in Walden

In [3]:
# Look up the WHO GHE record in the Walden catalog. The three arguments match
# the namespace, version and short_name fields shown in the record's repr.
raw_dataset = walden.Catalog().find_one("who", "2021-07-01", "ghe")

In [4]:
# Display the catalog record to confirm the expected dataset was found.
raw_dataset

Dataset(namespace='who', short_name='ghe', name='Global Health Estimates', description='WHO’s Global Health Estimates (GHE) provide the latest available data on death and disability globally, by region and country, and by age, sex and cause. The latest updates include global, regional and country trends from 2000 to 2019 inclusive. By providing key insights on mortality and morbidity trends, these estimates are a powerful tool to support informed decision-making on health policy and resource allocation.', source_name='World Health Organisation', url='https://www.who.int/data/global-health-estimates', file_extension='zip', date_accessed='2021-09-08', source_data_url=None, license_url='https://www.who.int/about/policies/publishing/data-policy/terms-and-conditions', license_name=None, access_notes='Fetched via API using this notebook: https://gist.github.com/spoonerf/9646dce7452583472dc2ac8ddf210835', is_public=True, version='2021-07-01', publication_year=2021, publication_date='2021-07-0

## 2. Extract the zip file to a temporary directory

In [5]:
# Create a fresh, empty scratch directory to extract the archive into.
# mkdtemp() creates the directory securely in a single step and leaves it on
# disk until it is removed explicitly (see the shutil.rmtree call in the
# Cleanup section), so there is no need to create and delete a
# TemporaryDirectory just to borrow its name and then re-create the path.
dirname = tempfile.mkdtemp()
dirname

'/var/folders/rz/kpg1phc51j5czjqsdmq8fttc0000gn/T/tmpxfz56ng0'

In [6]:
# Unpack the whole Walden archive into the scratch directory. The context
# manager closes the archive's file handle as soon as extraction finishes
# instead of leaving it open until garbage collection.
with ZipFile(raw_dataset.local_path) as archive:
    archive.extractall(dirname)

In [None]:
# Show the path of the directory the archive was extracted into.
dirname

In [None]:
# Path of the CSV loaded in the next section; it is expected at
# who_ghe/_all_countries.csv underneath the extraction directory.
csv_file = os.path.join(dirname, "who_ghe", "_all_countries.csv")

## 3. Load the data frame and prune excess columns

In [None]:
# Load the entire CSV into a data frame using pandas' default parsing options.
df = pd.read_csv(csv_file)

In [None]:
# Peek at the first row, transposed so every column name appears on its own line.
df.iloc[:1].T

In [None]:
# Discard the two unnamed columns that came in with the CSV
# (presumably stray index columns from earlier exports -- confirm).
df.drop(columns=["Unnamed: 0", "Unnamed: 0.1"], inplace=True)

In [None]:
# Remove every column whose name carries the "Sys_" prefix.
df.drop(
    columns=[name for name in df.columns if name.startswith("Sys_")],
    inplace=True,
)

In [None]:
# Remove every column whose name carries the "FL_" prefix.
df.drop(
    columns=[name for name in df.columns if name.startswith("FL_")],
    inplace=True,
)

In [None]:
# Normalise all remaining column names to lower case.
df.columns = list(map(str.lower, df.columns))

In [None]:
# The record id column is not needed in the output; drop it.
df.drop(columns="_recordid", inplace=True)

In [None]:
# Store country codes as a categorical column (default, unordered categories).
df["country_code"] = df["country_code"].astype(pd.CategoricalDtype())

In [None]:
# Store cause titles as a categorical column (default, unordered categories).
df["ghe_cause_title"] = df["ghe_cause_title"].astype(pd.CategoricalDtype())

In [None]:
# Store sex codes as a categorical column (default, unordered categories).
df["sex_code"] = df["sex_code"].astype(pd.CategoricalDtype())

In [None]:
# Store age-group codes as a categorical column (default, unordered categories).
df["agegroup_code"] = df["agegroup_code"].astype(pd.CategoricalDtype())

In [None]:
# Inspect the first record after the column pruning and dtype conversions above.
df.iloc[0]

## 4. Save as a dataset

In [None]:
# Re-display the Walden record whose metadata is converted in the next cell.
raw_dataset

In [None]:
# Create an empty dataset at dest_dir, attach the metadata converted from the
# Walden record, and call save() on it before any tables are added.
ds = catalog.Dataset.create_empty(dest_dir)
ds.metadata = convert_walden_metadata(raw_dataset)
ds.save()

### Add cause codes

In [None]:
# Build the cause lookup table: one row per distinct (code, title) pair,
# indexed by the cause code and wrapped as a catalog Table.
ghe_causes = catalog.Table(
    df[["ghe_cause_code", "ghe_cause_title"]]
    .drop_duplicates()
    .set_index("ghe_cause_code")
)

In [None]:
# Display the cause lookup table for a visual check.
ghe_causes

In [None]:
# Give the cause table its short name, title and description, then add it
# to the dataset.
ghe_causes.metadata = catalog.TableMeta(
    short_name="ghe_causes",
    title="GHE Cause Codes",
    description="Integer codes for common GHE causes and their human readable names",
)
ds.add(ghe_causes)

### Add estimates

In [None]:
# The cause codes were copied into the ghe_causes table above; the estimates
# keep only the cause title, so drop the code column here.
df.drop(columns="ghe_cause_code", inplace=True)

In [None]:
# Preview the first rows of the frame that becomes the estimates table.
df.head()

In [None]:
# Wrap the data frame in a catalog Table so it can carry metadata and be
# added to the dataset.
estimates = catalog.Table(df)

In [None]:
# Index the estimates by country, year, cause, sex and age group.
# inplace=True modifies `estimates` itself instead of returning a new table.
estimates.set_index(
    ["country_code", "year", "ghe_cause_title", "sex_code", "agegroup_code"],
    inplace=True,
)

In [None]:
# Preview the first rows of the indexed estimates table.
estimates.head()

In [None]:
# Set the short name and description on the estimates table's metadata
# before it is added to the dataset.
estimates.metadata.short_name = "estimates"
estimates.metadata.description = "GHE estimated burden of disease"

In [None]:
# Add the estimates table to the dataset created earlier.
ds.add(estimates)

## 5. Cleanup

In [None]:
import shutil

In [None]:
# Delete the scratch directory together with everything extracted into it.
# The dataset was written under dest_dir, a separate path, and is left alone.
shutil.rmtree(dirname)