# WHO Global Health Estimates (GHE), latest version

In [None]:
# Directory in which the output dataset is created
# (passed to catalog.Dataset.create_empty further down).
dest_dir = "/tmp/ghe_latest"

In [None]:
from owid import walden, catalog  # type: ignore
import tempfile
from zipfile import ZipFile
import os
import pandas as pd

from etl.steps.data.converters import convert_walden_metadata

## 1. Locate the dataset in Walden

In [None]:
# Ask the Walden catalog for its latest "who" / "ghe" entry. Later cells read
# its local_path (the zip archive) and convert its metadata for the dataset.
raw_dataset = walden.Catalog().find_latest("who", "ghe")

In [None]:
# Display the Walden record that was just looked up.
raw_dataset

## 2. Extract the zip file to a temporary directory

In [None]:
# Create a private temporary directory that outlives this cell; it is removed
# explicitly in the cleanup step at the end of the notebook.
# mkdtemp() creates the directory atomically and with owner-only permissions,
# unlike letting TemporaryDirectory delete it and then re-creating the same
# path by hand (which is race-prone and uses umask-dependent permissions).
dirname = tempfile.mkdtemp()
dirname

In [None]:
# Unpack the Walden archive into the temporary directory. The context manager
# guarantees the archive's file handle is closed once extraction finishes
# (or fails), instead of leaving it open until garbage collection.
with ZipFile(raw_dataset.local_path) as archive:
    archive.extractall(dirname)

In [None]:
# Path of the CSV inside the extracted archive: <dirname>/who_ghe/_all_countries.csv
csv_file = os.path.join(dirname, "who_ghe", "_all_countries.csv")

## 3. Load the data frame and prune excess columns

In [None]:
# Load the extracted CSV into a DataFrame using pandas' default parsing options.
df = pd.read_csv(csv_file)

In [None]:
# Show the first row transposed, so every column name is listed on its own line.
df.iloc[:1].T

In [None]:
# Remove the two "Unnamed" columns (presumably index columns written out by
# an earlier export -- confirm against the raw file).
df.drop(columns=["Unnamed: 0", "Unnamed: 0.1"], inplace=True)

In [None]:
# Discard every column whose name begins with the "Sys_" prefix.
df.drop(
    columns=[name for name in df.columns if name.startswith("Sys_")],
    inplace=True,
)

In [None]:
# Discard every column whose name begins with the "FL_" prefix.
df.drop(
    columns=[name for name in df.columns if name.startswith("FL_")],
    inplace=True,
)

In [None]:
# Normalise all column names to lower case.
df.columns = list(map(str.lower, df.columns))

In [None]:
# The "_recordid" column is not wanted in the output; remove it in place.
df.drop(columns="_recordid", inplace=True)

In [None]:
# Display the first row to check which columns remain after pruning.
df.iloc[0]

## 4. Save as a dataset

In [None]:
# Display the Walden record again; its metadata is converted in the next cell.
raw_dataset

In [None]:
# Create an empty dataset at dest_dir, give it metadata converted from the
# Walden record, then call save() (presumably persisting the metadata to
# dest_dir -- confirm against catalog.Dataset).
ds = catalog.Dataset.create_empty(dest_dir)
ds.metadata = convert_walden_metadata(raw_dataset)
ds.save()

### Add cause codes

In [None]:
# Build the cause lookup table: one row per distinct (code, title) pair,
# indexed by the cause code, wrapped in a catalog.Table.
ghe_causes = catalog.Table(
    df[["ghe_cause_code", "ghe_cause_title"]]
    .drop_duplicates()
    .set_index("ghe_cause_code")
)

In [None]:
# Display the cause lookup table.
ghe_causes

In [None]:
# Attach table-level metadata (short name, title, description) to the cause
# lookup table, then add the table to the dataset.
ghe_causes.metadata = catalog.TableMeta(
    short_name="ghe_causes",
    title="GHE Cause Codes",
    description="Integer codes for common GHE causes and their human readable names",
)
ds.add(ghe_causes)

### Add estimates

In [None]:
# The cause code now lives in the ghe_causes table; the estimates keep only
# the cause title, so remove the code column in place.
df.drop(columns="ghe_cause_code", inplace=True)

In [None]:
# Preview the first rows of the frame after the cause code was removed.
df.head()

In [None]:
# Wrap the remaining data frame in a catalog.Table.
estimates = catalog.Table(df)

In [None]:
# Index the estimates in place by country, year, cause title, sex and age group.
estimates.set_index(
    ["country_code", "year", "ghe_cause_title", "sex_code", "agegroup_code"],
    inplace=True,
)

In [None]:
# Preview the first rows of the indexed estimates table.
estimates.head()

In [None]:
# Set the short name and description on the table's existing metadata object.
estimates.metadata.short_name = "estimates"
estimates.metadata.description = "GHE estimated burden of disease"

In [None]:
# Add the estimates table to the dataset.
ds.add(estimates)

## 5. Cleanup

In [None]:
import shutil

In [None]:
# Delete the temporary extraction directory and everything inside it.
shutil.rmtree(dirname)