# Sync open numbers

## Purpose

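Automatically add every DDF dataset published by Open Numbers (the `open-numbers` GitHub organisation) to the ETL. Each dataset repo gets a `data://meadow/open_numbers/latest/...` step in the DAG that depends on a matching `github://open-numbers/...` step.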

## Find all repos

In [61]:
import json

In [62]:
# Ask the GitHub CLI (`gh`) for the repos of the `open-numbers` org (limit 1000),
# returned as JSON containing only each repo's `url` field.
# The one-element tuple unpacking requires the command to print exactly one line.
(repos_json,) = !gh repo list -L 1000 --json url open-numbers

In [63]:
# Keep only DDF dataset repos. The repo name is the last path segment of its URL;
# matching on the name's prefix (rather than "ddf--" anywhere in the URL) keeps this
# filter consistent with the `startswith("ddf--")` check applied to each repo later on.
repos = [
    name
    for name in (r["url"].split("/")[-1] for r in json.loads(repos_json))
    if name.startswith("ddf--")
]

In [64]:
# Display the repo names that passed the filter.
repos

['ddf--worldbank--povcalnet',
 'ddf--oxford--covid_government_response',
 'ddf--gapminder--systema_globalis',
 'ddf--open_numbers--world_development_indicators',
 'ddf--open_numbers--covid_19_geographic_distribution',
 'ddf--world_bank--world_development_indicators',
 'ddf--clio_infra--indicators',
 'ddf--ilo--ilostat',
 'ddf--oecd--dac',
 'ddf--gapminder--fasttrack',
 'ddf--open_numbers--covid_government_response',
 'ddf--unstats--sdg_indicators',
 'ddf--who--tb_burden_estimates',
 'ddf--global_carbon_project--global_carbon_budget',
 'ddf--gapminder--child_mortality',
 'ddf--gapminder--gini',
 'ddf--gapminder--population_historic',
 'ddf--gapminder--gdp_per_capita_cppp',
 'ddf--gapminder--life_expectancy',
 'ddf--unpop--world_population_prospects',
 'ddf--cait--historical_emissions',
 'ddf--gapminder--co2_emission',
 'ddf--gapminder--population',
 'ddf--unfao--faostat',
 'ddf--semio--dag_test',
 'ddf--gapminder--population_by_income_group',
 'ddf--open_numbers',
 'ddf--gapminder--gapm

## For each repo: add a DAG step and symlink its step module

In [65]:
import yaml

In [66]:
import os

In [67]:
from pathlib import Path

In [68]:
from etl import paths

In [69]:
# Read the ETL's DAG definition file and parse its YAML into Python objects.
dag = yaml.safe_load(paths.DAG_FILE.read_text())

In [70]:
# Show the step -> dependencies mapping as loaded from the DAG file.
dag["steps"]

{'data://garden/owid/latest/covid': ['etag://raw.githubusercontent.com/owid/covid-19-data/master/public/data/owid-covid-data.csv'],
 'data://garden/owid/latest/key_indicators': ['data://meadow/hyde/2017/baseline',
  'data://meadow/gapminder/2019-12-10/population',
  'data://reference'],
 'data://garden/who/2021-07-01/ghe': ['data://meadow/who/2021-07-01/ghe',
  'data://reference'],
 'data://meadow/gapminder/2019-12-10/population': ['walden://gapminder/2019-12-10/population'],
 'data://meadow/hyde/2017/baseline': ['walden://hyde/2017/baseline',
  'data://meadow/hyde/2017/general_files'],
 'data://meadow/hyde/2017/general_files': ['walden://hyde/2017/general_files'],
 'data://meadow/who/2021-07-01/ghe': ['walden://who/2021-07-01/ghe'],
 'data://meadow/who/2021-07-01/gho': ['walden://who/2021-07-01/gho'],
 'data://meadow/wpp/2019/standard_projections': ['walden://wpp/2019/standard_projections'],
 'grapher://who/2021-07-01/ghe': ['data://garden/who/2021-07-01/ghe']}

In [71]:
import sh

In [72]:
# Register every Open Numbers repo in the DAG and expose the shared step
# implementation under a per-dataset module name.
ddf_prefix = "ddf--"

# All datasets reuse one generic step module, reached through a relative symlink.
# From the module directory below, four levels up is paths.STEP_DIR, so each link
# resolves to paths.STEP_DIR / "open_numbers.py".
link_target = "../../../../open_numbers.py"

# The module directory is the same for every repo, so create it once up front.
module_path = paths.STEP_DIR / "data/meadow/open_numbers/latest"
module_path.mkdir(parents=True, exist_ok=True)

for repo in repos:
    assert repo.startswith(ddf_prefix), f"unexpected repo name: {repo!r}"
    # Drop the "ddf--" prefix and turn dashes into underscores; the result is used
    # as the module file name, e.g. "gapminder__population".
    short_repo = repo[len(ddf_prefix):].replace("-", "_")

    # modify the DAG: the meadow step depends only on the repo's github step
    github_step = f"github://open-numbers/{repo}"
    meadow_step = f"data://meadow/open_numbers/latest/{short_repo}"
    dag["steps"][meadow_step] = [github_step]

    # symlink the matching python module
    module_file = module_path / f"{short_repo}.py"

    # Remove any existing file or (possibly dangling) symlink first so that
    # re-running this cell simply recreates the link.
    if module_file.is_symlink() or module_file.exists():
        module_file.unlink()
    module_file.symlink_to(link_target)

In [73]:
# Inspect the DAG again; it should now also contain one meadow step per Open Numbers repo.
dag

{'steps': {'data://garden/owid/latest/covid': ['etag://raw.githubusercontent.com/owid/covid-19-data/master/public/data/owid-covid-data.csv'],
  'data://garden/owid/latest/key_indicators': ['data://meadow/hyde/2017/baseline',
   'data://meadow/gapminder/2019-12-10/population',
   'data://reference'],
  'data://garden/who/2021-07-01/ghe': ['data://meadow/who/2021-07-01/ghe',
   'data://reference'],
  'data://meadow/gapminder/2019-12-10/population': ['walden://gapminder/2019-12-10/population'],
  'data://meadow/hyde/2017/baseline': ['walden://hyde/2017/baseline',
   'data://meadow/hyde/2017/general_files'],
  'data://meadow/hyde/2017/general_files': ['walden://hyde/2017/general_files'],
  'data://meadow/who/2021-07-01/ghe': ['walden://who/2021-07-01/ghe'],
  'data://meadow/who/2021-07-01/gho': ['walden://who/2021-07-01/gho'],
  'data://meadow/wpp/2019/standard_projections': ['walden://wpp/2019/standard_projections'],
  'grapher://who/2021-07-01/ghe': ['data://garden/who/2021-07-01/ghe'],


## Save the DAG

In [75]:
# Serialise first and only then open the file: opening with "w" truncates it
# immediately, so an error raised while dumping would otherwise leave an empty
# or partially written DAG file behind.
dag_yaml = yaml.safe_dump(dag)
with open(paths.DAG_FILE.as_posix(), "w") as ostream:
    ostream.write(dag_yaml)