# Sync open numbers

## Purpose

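Automatically add every DDF dataset published by Open Numbers (the `ddf--*` repositories of the `open-numbers` GitHub organisation) to the ETL: each dataset gets an `open_numbers` data step in the DAG that depends on the matching `github://` step.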

## Find all repos

In [None]:
import json

In [None]:
# Ask the GitHub CLI for the repos of the `open-numbers` owner (at most 1000,
# per `-L 1000`), as JSON carrying only each repo's `url` field.
# The one-element tuple unpacking requires the captured output to be exactly
# one line; anything else raises a ValueError here.
(repos_json,) = !gh repo list -L 1000 --json url open-numbers

In [None]:
# Reduce each repo URL to its last path component (the repo name) and keep
# only the DDF datasets. The test is on the *name prefix* rather than on
# "ddf--" appearing anywhere in the URL, so that the result matches what the
# per-repo loop further down expects (names starting with "ddf--").
repo_names = (r["url"].split("/")[-1] for r in json.loads(repos_json))
repos = [name for name in repo_names if name.startswith("ddf--")]

In [None]:
repos

## For each repo

In [None]:
import yaml

In [None]:
import os

In [None]:
from pathlib import Path

In [None]:
from etl import paths

In [None]:
# Read the ETL's DAG file and parse its YAML into plain Python objects.
# `safe_load` is used so that only standard YAML tags are constructed.
dag_text = paths.DAG_FILE.read_text()
dag = yaml.safe_load(dag_text)

In [None]:
dag["steps"]

In [None]:
import sh

In [None]:
# All generated step modules live in one directory; it does not depend on the
# repo, so create it once up front instead of on every iteration.
module_dir = paths.STEP_DIR / "data/open_numbers/open_numbers/latest"
module_dir.mkdir(parents=True, exist_ok=True)

# Relative link target: four levels above `module_dir` is `paths.STEP_DIR`,
# so every symlink resolves to the shared `open_numbers.py` found there.
shared_module = "../../../../open_numbers.py"

ddf_prefix = "ddf--"
for repo in repos:
    # Fail loudly on unexpected names instead of silently slicing garbage.
    if not repo.startswith(ddf_prefix):
        raise ValueError(f"expected a repo name starting with {ddf_prefix!r}, got {repo!r}")

    # Strip the prefix and turn hyphens into underscores,
    # e.g. "ddf--foo-bar" -> "foo_bar".
    short_repo = repo[len(ddf_prefix):].replace("-", "_")

    # modify the DAG: the open_numbers data step depends on the github step
    github_step = f"github://open-numbers/{repo}"
    open_numbers_step = f"data://open_numbers/open_numbers/latest/{short_repo}"
    dag["steps"][open_numbers_step] = [github_step]

    # symlink the matching python module, replacing any previous entry.
    # `exists()` follows symlinks and is False for a dangling link, hence the
    # additional `is_symlink()` check.
    module_file = module_dir / f"{short_repo}.py"
    if module_file.is_symlink() or module_file.exists():
        module_file.unlink()
    module_file.symlink_to(shared_module)

In [None]:
dag

## Save the DAG

In [None]:
# Serialise the DAG to YAML *before* opening the file for writing: opening
# with "w" truncates it immediately, so dumping straight into the stream
# would leave an empty or partial DAG file behind if serialisation failed.
dag_yaml = yaml.safe_dump(dag)
with open(paths.DAG_FILE.as_posix(), "w") as ostream:
    ostream.write(dag_yaml)