## Generate SDG mapping from goals to variables and datasets

We list all `goal -> variable` mappings from https://sdg-tracker.org/. The original `goal -> variable` mapping was in [this spreadsheet](https://docs.google.com/spreadsheets/d/1n0UrpKKS2JVcXSmth_QVYLThlWev6pzEiHP7HaIq9BY/edit#gid=1284188229), but it is no longer up to date. Instead, we get the mappings by scraping the SDG tracker files from GitHub. After that, we enrich them with data from the grapher DB and generate a new CSV file to be used by the ETL.

In [None]:
import requests

# Fetch the recursive file listing of the sdg-tracker.org repository
# (master branch) from the GitHub git-trees API.
tree_resp = requests.get(
    "https://api.github.com/repos/owid/sdg-tracker.org/git/trees/master?recursive=1",
    # Without a timeout, requests can wait forever on a stalled connection.
    timeout=30,
)
# Fail right here on an HTTP error (e.g. API rate limiting) rather than
# later with a confusing KeyError when the payload has no "tree" key.
tree_resp.raise_for_status()
js = tree_resp.json()

In [None]:
from bs4 import BeautifulSoup
from collections import defaultdict

# Maps the `id` attribute of each <div class="indicator"> to the list of
# iframe `src` URLs found inside that div.
charts = defaultdict(list)

for p in js["tree"]:
    # Only entries under pages/ are scraped. Directory entries ("tree") have
    # no raw file content, so requesting them would only waste a round trip.
    if p.get("type") == "tree" or not p["path"].startswith("pages"):
        continue

    resp = requests.get(
        f"https://raw.githubusercontent.com/owid/sdg-tracker.org/master/{p['path']}",
        # Without a timeout, one stalled download would hang the whole loop.
        timeout=30,
    )
    # Best effort: entries that cannot be downloaded are skipped.
    if resp.status_code != 200:
        continue

    soup = BeautifulSoup(resp.text, "html.parser")

    for div_indicator in soup.find_all("div", {"class": "indicator"}):
        # The div id is used as the indicator key, so it must be present.
        if "id" not in div_indicator.attrs:
            raise Exception(
                f'Page {p["path"]} is missing id=[indicator] in <div class="indicator">'
            )

        for iframe_chart in div_indicator.find_all("iframe"):
            charts[div_indicator.attrs["id"]].append(iframe_chart.attrs["src"])

In [None]:
import pandas as pd

# Flatten the indicator -> [chart URLs] mapping into one
# (indicator, chart) row per embedded chart.
rows = []
for indicator_id, chart_urls in charts.items():
    rows.extend((indicator_id, chart_url) for chart_url in chart_urls)

df = pd.DataFrame(rows, columns=["indicator", "chart"])
df.head()

In [None]:
# Derive the chart slug from the iframe URL: the last path segment.
# Any "?query" / "#fragment" suffix and a trailing slash are stripped first,
# since a slug carrying them would not equal a bare slug value.
df["chart_slug"] = (
    df.chart.str.replace(r"[?#].*$", "", regex=True)
    .str.rstrip("/")
    .str.split("/")
    .str.get(-1)
)

In [None]:
from etl.db import get_engine

# Database engine obtained from the project's `etl.db.get_engine` helper.
engine = get_engine()

# For every chart whose slug is listed in df.chart_slug, fetch each variable
# attached to it (via chart_dimensions) together with the dataset that
# variable belongs to. Rows whose dataset is private are excluded.
# The slug is read from the chart's JSON config (`$.slug`).
q = """
select
    c.id as chart_id,
    v.id as variable_id,
    v.name as variable_name,
    d.id as dataset_id,
    d.name as dataset_name,
    c.config->>"$.slug" as chart_slug
from variables as v
join datasets as d on d.id = v.datasetId
join chart_dimensions as cd on cd.variableId = v.id
join charts as c on c.id = cd.chartId
where c.config->>"$.slug" in %(slugs)s
    and d.isPrivate is false
"""
# NOTE(review): the slug list is bound to the `in %(slugs)s` placeholder; this
# presumably relies on the DB driver expanding a Python list into a SQL tuple —
# confirm. An empty list would likely produce invalid SQL.
gf = pd.read_sql(q, engine, params={"slugs": df.chart_slug.tolist()})

In [None]:
# Attach the variable/dataset columns from `gf` to each scraped chart row.
# Inner join: rows without a matching chart_slug on both sides are dropped.
df = df.merge(right=gf, how="inner", on="chart_slug")

In [None]:
from owid.catalog.utils import underscore

# Replace the dataset name with the form
# "dataset_<dataset id>_<underscored dataset name>".
dataset_id_str = df["dataset_id"].astype(str)
dataset_name_underscored = df["dataset_name"].map(underscore)
df["dataset_name"] = "dataset_" + dataset_id_str + "_" + dataset_name_underscored

In [None]:
# arguments for bulk_backport
# The unique dataset ids are sorted so the argument string is identical
# between runs (iteration order of a set of strings is not stable), and an
# empty frame yields an empty string instead of a dangling "-d ".
" ".join(f"-d {dataset_id}" for dataset_id in sorted(set(df.dataset_id)))

In [None]:
# dependencies for DAG file
# One line per unique dataset name, sorted so the printed list is the same
# on every run and diffs cleanly when pasted into the DAG file.
print(
    "\n".join(
        f"- backport://backport/owid/latest/{n}"
        for n in sorted(set(df["dataset_name"]))
    )
)

In [None]:
# Write the final mapping to a CSV file in the current working directory,
# without the DataFrame index column.
df.to_csv(path_or_buf="sdg_sources.csv", index=False)