# WHO Global Health Observatory

## Parameters

In [None]:
# Directory in which the converted dataset is created (passed to
# catalog.Dataset.create_empty further down).
dest_dir = "/tmp/gho_20210701"

## Imports

In [None]:
import tempfile
from os import path, listdir
import shutil
import zipfile
from typing import List
import glob
import hashlib

import pandas as pd

from owid import walden, catalog
from etl.steps.data import converters

## 1. Fetch from walden

In [None]:
# Look up the raw snapshot in the walden catalog.
# NOTE(review): the three arguments are presumably namespace, version and
# short name -- confirm against the walden API.
raw_dataset = walden.Catalog().find_one("who", "2021-07-01", "gho")

In [None]:
# Show the catalog record that was found (notebook cell output).
raw_dataset

In [None]:
# Show the local path of the raw file; this is the zip archive that the
# "Unzip" step opens.
raw_dataset.local_path

## 2. Unzip

In [None]:
# Create a fresh temporary directory (name prefixed with "etl-") that the
# archive gets extracted into.
tmp_dir = tempfile.mkdtemp(prefix="etl-")

In [None]:
# Show the path of the temporary directory (notebook cell output).
tmp_dir

In [None]:
# Unpack the whole archive into tmp_dir. The context manager makes sure the
# zip file handle is closed as soon as extraction finishes (or fails).
with zipfile.ZipFile(raw_dataset.local_path) as archive:
    archive.extractall(tmp_dir)

In [None]:
# The CSV files are looked up in the "who_gho" folder of the extracted archive.
src_dir = path.join(tmp_dir, "who_gho")

In [None]:
# Map every CSV in src_dir, keyed by its file name without the ".csv"
# extension, to its full path. Entries are inserted in sorted path order.
csv_files = {
    path.splitext(path.basename(csv_path))[0]: csv_path
    for csv_path in sorted(glob.glob(path.join(src_dir, "*.csv")))
}

In [None]:
# Show how many CSV files were found (notebook cell output).
len(csv_files)

In [None]:
# Preview the first five keys of the name -> path mapping.
list(csv_files)[:5]

## 3. Make a dataset container

In [None]:
# Create an empty dataset at dest_dir, attach metadata converted from the
# walden record, then call save() on it.
ds = catalog.Dataset.create_empty(dest_dir)
ds.metadata = converters.convert_walden_metadata(raw_dataset)
ds.save()

## 4. Load the set of indicators

In [None]:
# Load the indicator listing from the "_indicators" CSV and preview its
# first rows.
ind = pd.read_csv(csv_files["_indicators"])
ind.head()

In [None]:
# Drop the "Language" column; it is not carried over into the indicators table.
del ind["Language"]

In [None]:
# Rename the two remaining columns.
# NOTE(review): assumes they are, in order, the indicator code and its
# title -- confirm against the preview of the raw file.
ind.columns = ["orig_code", "title"]

In [None]:
from owid.catalog import utils

# Derive the "code" column by applying utils.underscore to each original
# code; transform_table applies the same function to IndicatorCode, so the
# two can be matched up.
ind["code"] = ind.orig_code.apply(utils.underscore)

In [None]:
# Index by the derived code so that titles can be looked up with
# ind.loc[code, "title"].
ind.set_index("code", inplace=True)

In [None]:
# Preview the cleaned-up indicator listing (notebook cell output).
ind.head()

In [None]:
# Store the indicator listing itself as a table named "indicators".
t = catalog.Table(ind)
t.metadata.short_name = "indicators"
# The dataset is the Global Health Observatory, i.e. GHO (the title used to
# say "GHE").
t.metadata.title = "List of all indicators provided in the GHO dataset"
ds.add(t)

## 5. Add each table

In [None]:
def transform_table(df: pd.DataFrame) -> List[catalog.Table]:
    """
    Convert the raw rows of a single indicator into one or more catalog tables.

    The rows of one indicator can use different combinations of dimension
    types, so we can have multiple different primary keys here. The rows are
    grouped by their dimension types and every group becomes its own table,
    indexed by ``geo`` plus the dimensions that group actually uses.

    Parameters
    ----------
    df
        Raw data of one indicator CSV. It must contain exactly one distinct
        ``IndicatorCode``. The frame itself is not modified; a copy is used.

    Returns
    -------
    List[catalog.Table]
        One table per dimension-type group. The short name is the underscored
        indicator code; when there is more than one table, each short name
        additionally gets ``_`` plus the first four hex digits of the MD5 of
        its comma-joined primary key. The title is read from the module-level
        ``ind`` frame.

    Raises
    ------
    AssertionError
        If ``df`` holds more than one indicator code, or if a used dimension
        has more than one type within a group.
    """
    df = df.copy()
    del df["Id"]

    assert len(df["IndicatorCode"].unique()) == 1
    indicator = utils.underscore(df["IndicatorCode"].iloc[0])
    del df["IndicatorCode"]

    # prefix the geo code for everything except countries, to avoid confusion
    df["geo"] = [
        get_geo(_type, code)
        for _type, code in zip(df.pop("SpatialDimType"), df.pop("SpatialDim"))
    ]

    tables = []
    # dropna=False keeps the rows whose dimension types are missing
    for _, st in df.groupby(
        ["TimeDimType", "Dim1Type", "Dim2Type", "Dim3Type", "DataSourceDimType"],
        dropna=False,
        as_index=False,
    ):
        st = st.copy()

        dims = ["geo"]
        for dim in ["TimeDim", "Dim1", "Dim2", "Dim3", "DataSourceDim"]:
            dim_type = dim + "Type"

            # not all dimensions are used
            if pd.isnull(st[dim]).all():
                del st[dim]
                del st[dim_type]
                continue

            assert len(st[dim_type].unique()) == 1

            # the dimension column is named after its (lower-cased) type
            col = st[dim_type].dropna().iloc[0].lower()
            del st[dim_type]
            st.rename({dim: col}, axis=1, inplace=True)

            dims.append(col)

        st.set_index(dims, inplace=True)

        # if any rows are all empty, just prune them; dropna returns a new
        # frame, so the result has to be assigned back to take effect
        st = st.dropna(how="all")

        # fix the value column: prefer the numeric value and fall back to the
        # raw value only when no numeric value is present at all
        if not st.NumericValue.isnull().all():
            st.rename({"NumericValue": indicator}, axis=1, inplace=True)
            del st["Value"]
        else:
            st.rename({"Value": indicator}, axis=1, inplace=True)
            del st["NumericValue"]

        del st["TimeDimensionValue"]
        del st["TimeDimensionBegin"]
        del st["TimeDimensionEnd"]

        # keep the optional columns only when they carry data, and namespace
        # them with the indicator code
        for col in ["Low", "High", "Comments"]:
            if not st[col].isnull().all():
                st.rename({col: f"{indicator}_{col.lower()}"}, axis=1, inplace=True)
            else:
                del st[col]

        del st["Date"]

        t = catalog.Table(st)
        t.metadata.short_name = indicator
        tables.append(t)

    if len(tables) > 1:
        # rename each one to make it unique
        for t in tables:
            _hash = hashlib.md5(",".join(t.primary_key).encode("utf8")).hexdigest()
            t.metadata.short_name += "_" + _hash[:4]

    for t in tables:
        t.metadata.title = ind.loc[indicator, "title"]

    return tables


def get_geo(_type, code):
    """
    Build the geo identifier for a spatial dimension type and code.

    A missing code gives None. Country codes are returned as they are; every
    other type is returned as "<lower-cased type>:<code>".
    """
    if pd.isnull(code):
        return None
    return code if _type == "COUNTRY" else f"{_type.lower()}:{code}"

In [None]:
# Strings that are read as missing values when loading the indicator CSVs.
NA_VALUES = [
    "",
    "Data not available",
    "Not applicable",
    "Not available",
    "Not available.",
]

# Convert every indicator CSV and add the resulting tables to the dataset,
# printing a short summary of each table along the way.
for indicator, filename in sorted(csv_files.items()):
    # names starting with an underscore are metadata files, not indicators
    if not indicator.startswith("_"):
        print(indicator)
        df = pd.read_csv(filename, na_values=NA_VALUES)
        for t in transform_table(df):
            print(
                "  ",
                t.metadata.short_name,
                t.primary_key,
                "-->",
                list(t.columns),
            )
            ds.add(t)
        print()