# WHO Global Health Observatory

## Parameters

In [1]:
dest_dir = "/tmp/gho_20210701"

## Imports

In [2]:
import tempfile
from os import path, listdir
import shutil
import zipfile
from typing import List
import glob
import hashlib

import pandas as pd

from owid import walden, catalog
from etl.steps.data import converters

## 1. Fetch from walden

In [3]:
raw_dataset = walden.Catalog().find_one("who", "2021-07-01", "gho")

In [4]:
raw_dataset

Dataset(namespace='who', short_name='gho', name='Global Health Observatory', description="The GHO data repository is WHO's gateway to health-related statistics for its 194 Member States. It provides access to over 1000 indicators on priority health topics including mortality and burden of diseases, the Millennium Development Goals (child nutrition, child health, maternal and reproductive health, immunization, HIV/AIDS, tuberculosis, malaria, neglected diseases, water and sanitation), non communicable diseases and risk factors, epidemic-prone diseases, health systems, environmental health, violence and injuries, equity among others.", source_name='World Health Organisation', url='https://www.who.int/data/gho/info/gh', date_accessed='2021-09-07', file_extension='zip', license_url='https://www.who.int/about/policies/publishing/data-policy/terms-and-conditions', source_data_url=None, md5='e258158a79803b6b7d472432e8764078', publication_year=2021, publication_date='2021-07-01', owid_data_url

In [5]:
raw_dataset.local_path

'/Users/yc/.owid/walden/who/2021-07-01/gho.zip'

## 2. Unzip

In [6]:
tmp_dir = tempfile.mkdtemp(prefix="etl-")

In [7]:
tmp_dir

'/var/folders/_2/5kwj92cj3t38qrh879gy86680000gq/T/etl-v_sr__8v'

In [8]:
zipfile.ZipFile(raw_dataset.local_path).extractall(tmp_dir)

In [9]:
src_dir = path.join(tmp_dir, "who_gho")

In [10]:
csv_files = {
    path.basename(f)[:-4]: f for f in sorted(glob.glob(path.join(src_dir, "*.csv")))
}

In [11]:
len(csv_files)

2369

In [12]:
list(csv_files.keys())[:5]

['AIR_1', 'AIR_10', 'AIR_11', 'AIR_12', 'AIR_13']

## 3. Make a dataset container

In [13]:
ds = catalog.Dataset.create_empty(dest_dir)
ds.metadata = converters.convert_walden_metadata(raw_dataset)
ds.save()

## 4. Load the set of indicators

In [14]:
ind = pd.read_csv(csv_files["_indicators"])
ind.head()

Unnamed: 0,IndicatorCode,IndicatorName,Language
0,AIR_1,Ambient air pollution attributable deaths,EN
1,AIR_10,Ambient air pollution attributable DALYs per ...,EN
2,AIR_11,Household air pollution attributable deaths,EN
3,AIR_12,Household air pollution attributable deaths in...,EN
4,AIR_13,Household air pollution attributable deaths pe...,EN


In [15]:
del ind["Language"]

In [16]:
ind.columns = ["orig_code", "title"]

In [17]:
ind["code"] = ind.orig_code.apply(lambda s: s.lower())

In [18]:
ind.set_index("code", inplace=True)

In [19]:
ind.head()

Unnamed: 0_level_0,orig_code,title
code,Unnamed: 1_level_1,Unnamed: 2_level_1
air_1,AIR_1,Ambient air pollution attributable deaths
air_10,AIR_10,Ambient air pollution attributable DALYs per ...
air_11,AIR_11,Household air pollution attributable deaths
air_12,AIR_12,Household air pollution attributable deaths in...
air_13,AIR_13,Household air pollution attributable deaths pe...


In [20]:
t = catalog.Table(ind)
t.metadata.short_name = "indicators"
t.metadata.title = "List of all indicators provided in the GHE dataset"
ds.add(t)

## 5. Add each table

In [21]:
def transform_table(df: pd.DataFrame) -> List[catalog.Table]:
    """
    We have have multiple different primary keys here.
    """
    df = df.copy()
    del df["Id"]

    assert len(df["IndicatorCode"].unique()) == 1
    indicator = df["IndicatorCode"].iloc[0].lower()
    del df["IndicatorCode"]

    # prefix the geo code for everything except countries, to avoid confusion
    df["geo"] = [
        get_geo(_type, code)
        for _type, code in zip(df.pop("SpatialDimType"), df.pop("SpatialDim"))
    ]

    tables = []
    for keys, st in df.groupby(
        ["TimeDimType", "Dim1Type", "Dim2Type", "Dim3Type", "DataSourceDimType"],
        dropna=False,
        as_index=False,
    ):
        st = st.copy()

        dims = ["geo"]
        for dim in ["TimeDim", "Dim1", "Dim2", "Dim3", "DataSourceDim"]:
            dim_type = dim + "Type"

            # not all dimensions are used
            if pd.isnull(st[dim]).all():
                del st[dim]
                del st[dim_type]
                continue

            assert len(st[dim_type].unique()) == 1

            col = st[dim_type].dropna().iloc[0].lower()
            del st[dim_type]
            st.rename({dim: col}, axis=1, inplace=True)

            dims.append(col)

        st.set_index(dims, inplace=True)

        # if any rows are all empty, just prune them
        st.dropna(how="all")

        # fix the value column
        if not st.NumericValue.isnull().all():
            st.rename({"NumericValue": indicator}, axis=1, inplace=True)
            del st["Value"]
        else:
            st.rename({"Value": indicator}, axis=1, inplace=True)
            del st["NumericValue"]

        del st["TimeDimensionValue"]
        del st["TimeDimensionBegin"]
        del st["TimeDimensionEnd"]

        for col in ["Low", "High", "Comments"]:
            if not st[col].isnull().all():
                st.rename({col: f"{indicator}_{col.lower()}"}, axis=1, inplace=True)
            else:
                del st[col]

        del st["Date"]

        t = catalog.Table(st)
        t.metadata.short_name = indicator
        tables.append(t)

    if len(tables) > 1:
        # rename each one to make it unique
        for t in tables:
            _hash = hashlib.md5(",".join(t.primary_key).encode("utf8")).hexdigest()
            t.metadata.short_name += "_" + _hash[:4]

    for t in tables:
        t.metadata.title = ind.loc[indicator, "title"]

    return tables


def get_geo(_type, code):
    if pd.isnull(code):
        return None
    if _type == "COUNTRY":
        return code
    return f"{_type.lower()}:{code}"

In [22]:
NA_VALUES = [
    "",
    "Data not available",
    "Not applicable",
    "Not available",
    "Not available.",
]

for indicator, filename in sorted(csv_files.items()):
    if indicator.startswith("_"):
        # skip metadata
        continue

    print(indicator)
    df = pd.read_csv(filename, na_values=NA_VALUES)
    for t in transform_table(df):
        print("  ", t.metadata.short_name, t.primary_key, "-->", [c for c in t.columns])
        ds.add(t)
    print()

AIR_1
   air_1 ['geo', 'year'] --> ['air_1', 'air_1_low', 'air_1_high']

AIR_10
   air_10 ['geo', 'year'] --> ['air_10']

AIR_11
   air_11 ['geo', 'year', 'sex', 'envcause'] --> ['air_11', 'air_11_low', 'air_11_high']

AIR_12
   air_12 ['geo', 'year', 'sex', 'envcause'] --> ['air_12', 'air_12_low', 'air_12_high']

AIR_13
   air_13 ['geo', 'year'] --> ['air_13']

AIR_14
   air_14 ['geo', 'year'] --> ['air_14']

AIR_15
   air_15 ['geo', 'year', 'sex', 'envcause'] --> ['air_15', 'air_15_low', 'air_15_high']

AIR_16
   air_16 ['geo', 'year', 'sex', 'envcause'] --> ['air_16', 'air_16_low', 'air_16_high']

AIR_17
   air_17 ['geo', 'year', 'sex', 'envcause'] --> ['air_17', 'air_17_low', 'air_17_high']

AIR_18
   air_18 ['geo', 'year'] --> ['air_18']

AIR_2
   air_2 ['geo', 'year'] --> ['air_2']

AIR_2_1
   air_2_1 ['geo', 'year'] --> ['air_2_1']

AIR_3
   air_3 ['geo', 'year'] --> ['air_3']

AIR_39
   air_39 ['geo', 'year', 'sex', 'envcause'] --> ['air_39', 'air_39_low', 'air_39_high']

AIR_3

  exec(code_obj, self.user_global_ns, self.user_ns)


   ghe_yldnum ['geo', 'year', 'sex', 'agegroup', 'ghecauses'] --> ['ghe_yldnum']

GHE_YLDRATE
   ghe_yldrate ['geo', 'year', 'sex', 'agegroup', 'ghecauses'] --> ['ghe_yldrate']

GHE_YLLNUM
   ghe_yllnum ['geo', 'year', 'sex', 'agegroup', 'ghecauses'] --> ['ghe_yllnum']

GHE_YLLRATE
   ghe_yllrate ['geo', 'year', 'sex', 'agegroup', 'ghecauses'] --> ['ghe_yllrate']

GOE_Q001
   goe_q001 ['geo', 'year', 'goequestion'] --> ['goe_q001']

GOE_Q002
   goe_q002 ['geo', 'year', 'goequestion'] --> ['goe_q002']

GOE_Q003
   goe_q003 ['geo', 'year', 'goequestion'] --> ['goe_q003']

GOE_Q004
   goe_q004 ['geo', 'year', 'goequestion'] --> ['goe_q004']

GOE_Q006
   goe_q006 ['geo', 'year', 'goequestion'] --> ['goe_q006']

GOE_Q007
   goe_q007 ['geo', 'year', 'goequestion'] --> ['goe_q007']

GOE_Q008
   goe_q008 ['geo', 'year', 'goequestion'] --> ['goe_q008']

GOE_Q009
   goe_q009 ['geo', 'year', 'goequestion'] --> ['goe_q009']

GOE_Q010
   goe_q010 ['geo', 'year', 'goequestion'] --> ['goe_q010']

GOE

  exec(code_obj, self.user_global_ns, self.user_ns)


   mort_300_9b73 ['geo', 'year', 'agegroup', 'childcause'] --> ['mort_300']
   mort_300_f0b6 ['geo', 'year', 'sex', 'agegroup', 'childcause'] --> ['mort_300']

MORT_400
   mort_400 ['geo', 'year', 'sex', 'agegroup', 'ghecauses'] --> ['mort_400']

MORT_500
   mort_500 ['geo', 'year', 'sex', 'agegroup', 'ghecauses'] --> ['mort_500']

MORT_600
   mort_600 ['geo', 'year', 'sex', 'agegroup', 'ghecauses'] --> ['mort_600']

MORT_700
   mort_700 ['geo', 'year', 'sex', 'agegroup', 'ghecauses'] --> ['mort_700']

MORT_MATERNALNUM
   mort_maternalnum ['geo', 'year'] --> ['mort_maternalnum', 'mort_maternalnum_low', 'mort_maternalnum_high']

M_Est_cig_curr
   m_est_cig_curr ['geo', 'year', 'sex'] --> ['m_est_cig_curr', 'm_est_cig_curr_low', 'm_est_cig_curr_high']

M_Est_cig_curr_std
   m_est_cig_curr_std ['geo', 'year', 'sex'] --> ['m_est_cig_curr_std', 'm_est_cig_curr_std_low', 'm_est_cig_curr_std_high']

M_Est_cig_daily
   m_est_cig_daily ['geo', 'year', 'sex'] --> ['m_est_cig_daily', 'm_est_cig_d

  exec(code_obj, self.user_global_ns, self.user_ns)


   ncd_bmi_meanc_7264 ['geo', 'year', 'sex', 'agegroup'] --> ['ncd_bmi_meanc', 'ncd_bmi_meanc_low', 'ncd_bmi_meanc_high', 'ncd_bmi_meanc_comments']
   ncd_bmi_meanc_7264 ['geo', 'year', 'sex', 'agegroup'] --> ['ncd_bmi_meanc', 'ncd_bmi_meanc_low', 'ncd_bmi_meanc_high', 'ncd_bmi_meanc_comments']

NCD_BMI_MINUS2C


  exec(code_obj, self.user_global_ns, self.user_ns)


   ncd_bmi_minus2c ['geo', 'year', 'sex', 'agegroup'] --> ['ncd_bmi_minus2c', 'ncd_bmi_minus2c_low', 'ncd_bmi_minus2c_high', 'ncd_bmi_minus2c_comments']

NCD_BMI_PLUS1C
   ncd_bmi_plus1c ['geo', 'year', 'sex', 'agegroup'] --> ['ncd_bmi_plus1c', 'ncd_bmi_plus1c_low', 'ncd_bmi_plus1c_high', 'ncd_bmi_plus1c_comments']

NCD_BMI_PLUS2C
   ncd_bmi_plus2c ['geo', 'year', 'sex', 'agegroup'] --> ['ncd_bmi_plus2c', 'ncd_bmi_plus2c_low', 'ncd_bmi_plus2c_high', 'ncd_bmi_plus2c_comments']

NCD_CCS_ACE
   ncd_ccs_ace ['geo', 'year'] --> ['ncd_ccs_ace', 'ncd_ccs_ace_comments']

NCD_CCS_ARB
   ncd_ccs_arb ['geo', 'year'] --> ['ncd_ccs_arb']

NCD_CCS_AlcPlan
   ncd_ccs_alcplan ['geo', 'year'] --> ['ncd_ccs_alcplan', 'ncd_ccs_alcplan_comments']

NCD_CCS_Aspirin
   ncd_ccs_aspirin ['geo', 'year'] --> ['ncd_ccs_aspirin', 'ncd_ccs_aspirin_comments']

NCD_CCS_BowelCancer
   ncd_ccs_bowelcancer ['geo', 'year'] --> ['ncd_ccs_bowelcancer', 'ncd_ccs_bowelcancer_comments']

NCD_CCS_BreastCancer
   ncd_ccs_breast

  exec(code_obj, self.user_global_ns, self.user_ns)


   sa_0000001400_archived_f1cb ['geo', 'year', 'alcoholtype', 'archive', 'datasource'] --> ['sa_0000001400_archived']
   sa_0000001400_archived_229f ['geo', 'year', 'alcoholtype', 'archive'] --> ['sa_0000001400_archived']
   sa_0000001400_archived_a4a9 ['geo', 'year', 'alcoholtype', 'datasource'] --> ['sa_0000001400_archived']
   sa_0000001400_archived_0afd ['geo', 'year', 'alcoholtype'] --> ['sa_0000001400_archived']

SA_0000001401
   sa_0000001401 ['geo', 'year'] --> ['sa_0000001401', 'sa_0000001401_comments']

SA_0000001401_ARCHIVED
   sa_0000001401_archived ['geo', 'year'] --> ['sa_0000001401_archived', 'sa_0000001401_archived_comments']

SA_0000001402
   sa_0000001402 ['geo', 'year'] --> ['sa_0000001402', 'sa_0000001402_comments']

SA_0000001402_ARCHIVED
   sa_0000001402_archived ['geo', 'year'] --> ['sa_0000001402_archived', 'sa_0000001402_archived_comments']

SA_0000001403
   sa_0000001403 ['geo', 'year', 'sex'] --> ['sa_0000001403', 'sa_0000001403_comments']

SA_0000001403_ARCH