In [67]:
# Directory in which the harmonized output dataset is created; it is passed to
# `catalog.Dataset.create_empty` further down in this notebook.
# NOTE(review): presumably meant to be overridden per run (e.g. as a notebook
# parameter) rather than left pointing at /tmp — confirm.
dest_dir = "/tmp/standard_projections"

In [68]:
from owid import catalog
import json
from pathlib import Path
import pandas as pd

from etl.paths import BASE_DIR as base_path

# Open the WPP 2019 standard projections dataset from the meadow stage.
meadow_dataset_path = base_path / "data/meadow/wpp/2019/standard_projections"
d = catalog.Dataset(meadow_dataset_path.as_posix())

# Show the data files backing this dataset.
d._data_files

['/Users/mojmir/projects/etl/data/meadow/wpp/2019/standard_projections/fertility_by_age.feather',
 '/Users/mojmir/projects/etl/data/meadow/wpp/2019/standard_projections/location_codes.feather',
 '/Users/mojmir/projects/etl/data/meadow/wpp/2019/standard_projections/population_by_age_sex.feather',
 '/Users/mojmir/projects/etl/data/meadow/wpp/2019/standard_projections/total_population.feather',
 '/Users/mojmir/projects/etl/data/meadow/wpp/2019/standard_projections/variant_codes.feather']

## Harmonize country names

In [69]:
from etl.paths import STEP_DIR

# Load the mapping of source location names to harmonized country names.
# TODO: is there a better way? it's hard to obtain notebook's path
# maybe add it as a papermill parameter?
country_mappings_path = (
    STEP_DIR / "data/garden/wpp/2019/standard_projections_country_mappings.json"
)
# Read explicitly as UTF-8: without `encoding`, `open` falls back to the
# platform's locale encoding, which can fail on or mis-decode non-ASCII
# country names.
with open(country_mappings_path, encoding="utf-8") as f:
    country_mappings = json.load(f)

In [75]:
import random

# Create the destination dataset and carry over the source dataset's metadata.
empty_dataset = catalog.Dataset.create_empty(dest_dir)
empty_dataset.metadata = d.metadata

# Upper bound on how many unassigned locations are printed per table.
max_examples = 10

for t in d:
    # skip these since we already have harmonized country names
    if t.metadata.short_name in ("location_codes", "variant_codes"):
        continue

    tc = t.reset_index()

    # Map every location once; entries missing from `country_mappings` become null.
    harmonized_locations = tc.location.map(country_mappings)
    is_mapped = harmonized_locations.notnull()
    unassigned_locations = set(tc.location[~is_mapped])

    # TODO: it would be nice to have ignored locations in `standard_projections_country_mappings.json` so that it
    # is clear during review what have we ignored
    # Clamp the sample size: `random.sample` raises ValueError when asked for
    # more items than the population holds (e.g. a table with < 10 unassigned
    # locations).
    examples = random.sample(
        list(unassigned_locations),
        min(max_examples, len(unassigned_locations)),
    )
    print(
        f"Unassigned {len(unassigned_locations)} locations, examples {examples}"
    )

    # harmonize: keep only rows with a mapping. Take an explicit copy so the
    # column assignment below writes to an independent frame instead of a
    # slice of the original, and reuse the mapping computed above rather than
    # mapping a second time.
    tc = tc[is_mapped].copy()
    tc["location"] = harmonized_locations[is_mapped]

    # remove duplicate countries
    # TODO: make sure our mapping to countries does not result in two different values
    tc = tc.drop_duplicates(subset=t.index.names)

    # Restore the same index columns the source table had.
    tc = tc.set_index(t.index.names)

    # TODO: I would expect `add` to only assign table to dataset, not save it, and then `save` to save everything in dest_dir
    # (similarly to commit), but perhaps that isn't memory efficient in case of large datasets?
    empty_dataset.add(tc)

Unassigned 233 locations, examples ['Asia-Pacific Economic Cooperation (APEC)', 'African Group', 'World Bank Regional Groups (developing only)', 'Land-locked Countries', 'Shanghai Cooperation Organization (SCO)', 'WB region: Europe and Central Asia (excluding high income)', 'State of Palestine', 'ESCAP: WB income groups', 'WHO: Eastern Mediterranean Region (EMRO)', 'United Nations Economic Commission for Africa (UN-ECA)']


In [51]:
# Display the path of the dataset created above as a quick sanity check.
empty_dataset.path

'/tmp/standard_projections'

In [52]:
# Save the dataset to dest_dir.
# TODO: it is confusing that this doesn't save the data itself, but only the metadata; the tables
# have already been handled by the earlier `add` calls
empty_dataset.save()