## Update metadata

Update `wdi.meta.yml` from WDI metadata file. This notebook is intended to be run manually and all changes to the YAML file need to be verified.

In [1]:
# TODO: remove this
from myml.nbinit import *

In [2]:
from wdi import load_variable_metadata
df_vars = load_variable_metadata()

In [49]:
import ruamel.yaml

yaml_path = "wdi.meta.yml"

with open(yaml_path, "r") as f:
    yml = ruamel.yaml.load(f, Loader=ruamel.yaml.RoundTripLoader)

In [51]:
import re
from typing import Union

def replace_years(s: str, year: Union[int, str]) -> str:
    """replaces all years in string with {year}.

    Example:

        >>> replace_years("GDP (constant 2010 US$)", 2015)
        "GDP (constant 2015 US$)"
    """
    year_regex = re.compile(r"\b([1-2]\d{3})\b")
    s_new = year_regex.sub(str(year), s)
    return s_new

variables = yml['tables']['wdi']['variables']

for indicator_code in df_vars.index:
    if indicator_code in variables:
        var = variables[indicator_code]
    else:
        var = {}
        variables[indicator_code] = var

    # update titles from metadata file
    try:
        var['title'] = df_vars.loc[indicator_code].indicator_name
    except KeyError:
        continue    

    # if title contains year, try to update units too
    year_regex = re.compile(r"\b([1-2]\d{3})\b")
    regex_res = year_regex.search(df_vars.loc[indicator_code].indicator_name)
    if regex_res:
        assert len(regex_res.groups()) == 1
        year = regex_res.groups()[0]

        if 'unit' in var:
            var['unit'] = replace_years(var['unit'], year)

        if 'short_unit' in var:
            var['short_unit'] = replace_years(var['short_unit'], year)

        for k in ["name", "unit", "short_unit"]:
            if var.get('display', {}).get('unit'):
                var['display']['unit'] = replace_years(var['display']['unit'], year)

            if var.get('display', {}).get('short_unit'):
                var['display']['short_unit'] = replace_years(var['display']['short_unit'], year)            

In [43]:
with open(yaml_path, "w") as f:
    ruamel.yaml.dump(yml, f, Dumper=ruamel.yaml.RoundTripDumper, width=120)

## Update Sources

In [3]:
with open('wdi.sources.json', 'r') as f:
    sources = json.load(f)

sources = [s for s in sources if not s["name"].startswith("TODO")]

missing_sources = list(set(df_vars['source']) - {s['rawName'] for s in sources})
missing_sources

['International Labour Organization. “ILO modelled estimates database” ILOSTAT. Accessed June 2021. https://ilostat.ilo.org/data/.',
 'UNESCO Institute for Statistics (http://uis.unesco.org/). Data as of March 2021.',
 'International Labour Organization. “Labour Force Statistics database (LFS)” ILOSTAT. Accessed December 6, 2022. https://ilostat.ilo.org/data/.',
 'World Bank, World Development Indicators database. Estimates are based on data obtained from International Labour Organization, ILOSTAT at https://ilostat.ilo.org/data/.',
 "World Bank staff estimates using the World Bank's total population and age/sex distributions of the United Nations Population Division's World Population Prospects: 2022 Revision.",
 "World Bank staff estimates based on age distributions of United Nations Population Division's World Population Prospects: 2022 Revision.",
 'International Labour Organization. “Education and Mismatch Indicators database (EMI)” ILOSTAT. Accessed December 6, 2022. https://ilos

In [4]:
print(json.dumps([{
    "rawName": rn,
    "name": f"TODO {rn}",
    "dataPublisherSource": "TODO",
}for rn in missing_sources], ensure_ascii=False))

[{"rawName": "International Labour Organization. “ILO modelled estimates database” ILOSTAT. Accessed June 2021. https://ilostat.ilo.org/data/.", "name": "TODO International Labour Organization. “ILO modelled estimates database” ILOSTAT. Accessed June 2021. https://ilostat.ilo.org/data/.", "dataPublisherSource": "TODO"}, {"rawName": "UNESCO Institute for Statistics (http://uis.unesco.org/). Data as of March 2021.", "name": "TODO UNESCO Institute for Statistics (http://uis.unesco.org/). Data as of March 2021.", "dataPublisherSource": "TODO"}, {"rawName": "International Labour Organization. “Labour Force Statistics database (LFS)” ILOSTAT. Accessed December 6, 2022. https://ilostat.ilo.org/data/.", "name": "TODO International Labour Organization. “Labour Force Statistics database (LFS)” ILOSTAT. Accessed December 6, 2022. https://ilostat.ilo.org/data/.", "dataPublisherSource": "TODO"}, {"rawName": "World Bank, World Development Indicators database. Estimates are based on data obtained fro

In [9]:
import openai
import random

SYSTEM_PROMPT = f"""
You are given list of examples in JSON format you should use for learning. Each example has 
rawName and fields name and dataPublisherSource are derived from rawName.
I'll give you a list of rawNames and you should give me a JSON list of those
rawNames with name and dataPublisherSource fields filled in.

Examples:
{json.dumps(random.sample(sources, 10))}
"""

all_sources = "\n".join(missing_sources[:10])

messages = [
    {
        "role": "system",
        "content": SYSTEM_PROMPT,
    },
    {
        "role": "user",
        "content": all_sources,
    },
]

# 10 missing sources / 5 examples -> 2min

response = openai.ChatCompletion.create(
    model="gpt-4",
    # model="gpt-3.5-turbo",
    temperature=0,
    messages=messages,
)
print(f"Cost GPT4: ${response['usage']['total_tokens'] / 1000 * 0.03:.2f}")
r = json.loads(response['choices'][0]['message']["content"])
r

Cost GPT4: $0.06


[{'rawName': 'International Labour Organization. “ILO modelled estimates database” ILOSTAT. Accessed June 2021. https://ilostat.ilo.org/data/.',
  'name': 'International Labour Organization (via World Bank)',
  'dataPublisherSource': 'ILO modelled estimates database - ILOSTAT'},
 {'rawName': 'UNESCO Institute for Statistics (http://uis.unesco.org/). Data as of March 2021.',
  'name': 'UNESCO Institute for Statistics (via World Bank)',
  'dataPublisherSource': 'UNESCO Institute for Statistics'},
 {'rawName': 'International Labour Organization. “Labour Force Statistics database (LFS)” ILOSTAT. Accessed December 6, 2022. https://ilostat.ilo.org/data/.',
  'name': 'International Labour Organization (via World Bank)',
  'dataPublisherSource': 'Labour Force Statistics database - ILOSTAT'},
 {'rawName': 'World Bank, World Development Indicators database. Estimates are based on data obtained from International Labour Organization, ILOSTAT at https://ilostat.ilo.org/data/.',
  'name': 'World Ba

In [10]:
r

[{'rawName': 'International Labour Organization. “ILO modelled estimates database” ILOSTAT. Accessed June 2021. https://ilostat.ilo.org/data/.',
  'name': 'International Labour Organization (via World Bank)',
  'dataPublisherSource': 'ILO modelled estimates database - ILOSTAT'},
 {'rawName': 'UNESCO Institute for Statistics (http://uis.unesco.org/). Data as of March 2021.',
  'name': 'UNESCO Institute for Statistics (via World Bank)',
  'dataPublisherSource': 'UNESCO Institute for Statistics'},
 {'rawName': 'International Labour Organization. “Labour Force Statistics database (LFS)” ILOSTAT. Accessed December 6, 2022. https://ilostat.ilo.org/data/.',
  'name': 'International Labour Organization (via World Bank)',
  'dataPublisherSource': 'Labour Force Statistics database - ILOSTAT'},
 {'rawName': 'World Bank, World Development Indicators database. Estimates are based on data obtained from International Labour Organization, ILOSTAT at https://ilostat.ilo.org/data/.',
  'name': 'World Ba

In [11]:
json.dumps(r)

'[{"rawName": "International Labour Organization. \\u201cILO modelled estimates database\\u201d ILOSTAT. Accessed June 2021. https://ilostat.ilo.org/data/.", "name": "International Labour Organization (via World Bank)", "dataPublisherSource": "ILO modelled estimates database - ILOSTAT"}, {"rawName": "UNESCO Institute for Statistics (http://uis.unesco.org/). Data as of March 2021.", "name": "UNESCO Institute for Statistics (via World Bank)", "dataPublisherSource": "UNESCO Institute for Statistics"}, {"rawName": "International Labour Organization. \\u201cLabour Force Statistics database (LFS)\\u201d ILOSTAT. Accessed December 6, 2022. https://ilostat.ilo.org/data/.", "name": "International Labour Organization (via World Bank)", "dataPublisherSource": "Labour Force Statistics database - ILOSTAT"}, {"rawName": "World Bank, World Development Indicators database. Estimates are based on data obtained from International Labour Organization, ILOSTAT at https://ilostat.ilo.org/data/.", "name": "