## Update metadata

Update `wdi.meta.yml` from WDI metadata file. This notebook is intended to be run manually and all changes to the YAML file need to be verified.

In [1]:
from wdi import load_variable_metadata

df_vars = load_variable_metadata()

In [49]:
import ruamel.yaml

yaml_path = "wdi.meta.yml"

with open(yaml_path, "r") as f:
    yml = ruamel.yaml.load(f, Loader=ruamel.yaml.RoundTripLoader)

In [51]:
import re
from typing import Union


def replace_years(s: str, year: Union[int, str]) -> str:
    """replaces all years in string with {year}.

    Example:

        >>> replace_years("GDP (constant 2010 US$)", 2015)
        "GDP (constant 2015 US$)"
    """
    year_regex = re.compile(r"\b([1-2]\d{3})\b")
    s_new = year_regex.sub(str(year), s)
    return s_new


variables = yml["tables"]["wdi"]["variables"]

for indicator_code in df_vars.index:
    if indicator_code in variables:
        var = variables[indicator_code]
    else:
        var = {}
        variables[indicator_code] = var

    # update titles from metadata file
    try:
        var["title"] = df_vars.loc[indicator_code].indicator_name
    except KeyError:
        continue

    # if title contains year, try to update units too
    year_regex = re.compile(r"\b([1-2]\d{3})\b")
    regex_res = year_regex.search(df_vars.loc[indicator_code].indicator_name)
    if regex_res:
        assert len(regex_res.groups()) == 1
        year = regex_res.groups()[0]

        if "unit" in var:
            var["unit"] = replace_years(var["unit"], year)

        if "short_unit" in var:
            var["short_unit"] = replace_years(var["short_unit"], year)

        for k in ["name", "unit", "short_unit"]:
            if var.get("display", {}).get("unit"):
                var["display"]["unit"] = replace_years(var["display"]["unit"], year)

            if var.get("display", {}).get("short_unit"):
                var["display"]["short_unit"] = replace_years(var["display"]["short_unit"], year)

In [43]:
with open(yaml_path, "w") as f:
    ruamel.yaml.dump(yml, f, Dumper=ruamel.yaml.RoundTripDumper, width=120)

## Update Sources

In [3]:
import json

with open("wdi.sources.json", "r") as f:
    sources = json.load(f)

sources = [s for s in sources if not s["name"].startswith("TODO")]

missing_sources = list(set(df_vars["source"]) - {s["rawName"] for s in sources})
missing_sources

[]

In [4]:
import openai
import random

SYSTEM_PROMPT = f"""
You are given list of examples in JSON format you should use for learning. Each example has 
rawName and fields name and dataPublisherSource are derived from rawName.
I'll give you a list of rawNames and you should give me a JSON list of those
rawNames with name and dataPublisherSource fields filled in.

Examples:
{json.dumps(random.sample(sources, 10))}
"""

all_sources = "\n".join(missing_sources)

messages = [
    {
        "role": "system",
        "content": SYSTEM_PROMPT,
    },
    {
        "role": "user",
        "content": all_sources,
    },
]

# 10 missing sources / 5 examples -> 2min

response = openai.ChatCompletion.create(
    model="gpt-4",
    # model="gpt-3.5-turbo",
    temperature=0,
    messages=messages,
)
print(f"Cost GPT4: ${response['usage']['total_tokens'] / 1000 * 0.03:.2f}")
r = json.loads(response["choices"][0]["message"]["content"])
print(json.dumps(r, ensure_ascii=False, indent=2))

Cost GPT4: $0.08


[{'rawName': 'International Labour Organization. “Labour Market-related SDG Indicators database (ILOSDG)” ILOSTAT. Accessed December 6, 2022. https://ilostat.ilo.org/data/.',
  'name': 'International Labour Organization (via World Bank)',
  'dataPublisherSource': 'Labour Market-related SDG Indicators Database - ILOSTAT'},
 {'rawName': 'World Bank, World Development Indicators database. Estimates are based on employment, population, GDP, and PPP data obtained from International Labour Organization, United Nations Population Division, Eurostat, OECD, and World Bank.',
  'name': 'World Bank',
  'dataPublisherSource': 'World Development Indicators Database - World Bank'},
 {'rawName': 'World Bank, World Development Indicators database. Estimates are based on data obtained from International Labour Organization and United Nations Population Division.',
  'name': 'World Bank',
  'dataPublisherSource': 'World Development Indicators Database - World Bank'},
 {'rawName': 'Derived from total pop