## Update metadata

Update `wdi.meta.yml` from the WDI metadata file. This notebook is intended to be run manually, and all changes to the YAML file need to be verified before they are committed.

In [1]:
# TODO: remove this star import once every name the notebook needs is imported explicitly
from myml.nbinit import *

In [2]:
from wdi import load_variable_metadata
# Variable metadata table. Later cells look rows up by indicator code via `.loc`
# and read its `indicator_name` and `source` columns.
df_vars = load_variable_metadata()

In [41]:
import ruamel.yaml

yaml_path = "wdi.meta.yml"

# Load in round-trip mode so the document can be written back after editing.
# The module-level `ruamel.yaml.load(..., Loader=RoundTripLoader)` helper is
# deprecated and was removed in ruamel.yaml 0.18; the `YAML` class (available
# since ruamel.yaml 0.15) is the supported equivalent and returns the same
# round-trip data structure.
with open(yaml_path, "r") as f:
    yml = ruamel.yaml.YAML(typ="rt").load(f)

In [42]:
import re
from typing import Union

def replace_years(s: str, year: Union[int, str]) -> str:
    """Substitute every four-digit year found in ``s`` with ``year``.

    A "year" is a standalone run of four digits (delimited by word boundaries)
    whose first digit is 1 or 2, i.e. 1000-2999.

    Args:
        s: Text that may contain one or more years.
        year: Replacement year; converted with ``str`` before substitution.

    Returns:
        A copy of ``s`` with every matched year replaced.

    Example:

        >>> replace_years("GDP (constant 2010 US$)", 2015)
        'GDP (constant 2015 US$)'
    """
    replacement = str(year)
    return re.sub(r"\b([1-2]\d{3})\b", replacement, s)

variables = yml['tables']['wdi']['variables']

# Same pattern that `replace_years` substitutes; compiled once rather than per indicator.
title_year_regex = re.compile(r"\b([1-2]\d{3})\b")

for indicator_code, var in variables.items():
    # update titles from metadata file; indicators without a metadata row are left untouched
    try:
        title = df_vars.loc[indicator_code].indicator_name
    except KeyError:
        continue
    var['title'] = title

    # if title contains year, try to update units too
    year_match = title_year_regex.search(title)
    if year_match is None:
        continue
    year = year_match.group(1)

    for key in ('unit', 'short_unit'):
        if key in var:
            var[key] = replace_years(var[key], year)

    # NOTE(review): the previous version wrapped the display updates in
    # `for k in ["name", "unit", "short_unit"]` but never used `k`, so the same two
    # updates simply ran three times. Only `unit` and `short_unit` were ever rewritten;
    # that behaviour is kept here. Confirm whether `display.name` should be updated too.
    display = var.get('display') or {}  # tolerate a missing or explicitly null `display`
    for key in ('unit', 'short_unit'):
        if display.get(key):
            display[key] = replace_years(display[key], year)

In [43]:
# Write the updated metadata back to the same file it was loaded from, using the
# round-trip dumper and wrapping lines at 120 columns.
# NOTE(review): the module-level `ruamel.yaml.dump` helper is deprecated in newer
# ruamel.yaml releases; consider migrating to the `YAML` API and checking the diff.
with open(yaml_path, "w") as out_file:
    ruamel.yaml.dump(yml, stream=out_file, Dumper=ruamel.yaml.RoundTripDumper, width=120)

## Update Sources

In [31]:
# `json` is not imported anywhere else in the visible notebook (it presumably leaks in
# through the `myml.nbinit` star import, which is marked for removal), so import it here.
import json

with open('wdi.sources.json', 'r') as f:
    sources = json.load(f)

# Entries whose name still starts with "TODO" are unfinished placeholders; drop them so
# their raw names are reported as missing again below.
sources = [s for s in sources if not s["name"].startswith("TODO")]

# Raw source strings used by the variables but not yet covered by a finished entry.
known_raw_names = {s['rawName'] for s in sources}
missing_sources = set(df_vars['source']) - known_raw_names
missing_sources

{'(1) United Nations Population Division. World Population Prospects: 2022 Revision, or derived from male and female life expectancy at birth from sources such as: (2) Census reports and other statistical publications from national statistical offices, (3) Eurostat: Demographic Statistics, (4) United Nations Statistical Division. Population and Vital Statistics Reprot (various years), (5) U.S. Census Bureau: International Database, and (6) Secretariat of the Pacific Community: Statistics and Demography Programme.',
 '(1) United Nations Population Division. World Population Prospects: 2022 Revision. (2) Census reports and other statistical publications from national statistical offices, (3) Eurostat: Demographic Statistics, (4) United Nations Statistical Division. Population and Vital Statistics Reprot (various years), (5) U.S. Census Bureau: International Database, and (6) Secretariat of the Pacific Community: Statistics and Demography Programme.',
 '(1) United Nations Population Divis

In [30]:
# Render one JSON skeleton per missing source, with "TODO" placeholders for the
# fields that still have to be filled in.
json.dumps(
    [
        dict(rawName=raw_name, name=f"TODO {raw_name}", dataPublisherSource="TODO")
        for raw_name in missing_sources
    ]
)

'[{"rawName": "International Labour Organization. \\u201cILO modelled estimates database\\u201d ILOSTAT. Accessed January 2021. https://ilostat.ilo.org/data/.", "name": "TODO International Labour Organization. \\u201cILO modelled estimates database\\u201d ILOSTAT. Accessed January 2021. https://ilostat.ilo.org/data/.", "dataPublisherSource": "TODO"}, {"rawName": "(1) United Nations Population Division. World Population Prospects: 2022 Revision. (2) University of California, Berkeley, and Max Planck Institute for Demographic Research. The Human Mortality Database.", "name": "TODO (1) United Nations Population Division. World Population Prospects: 2022 Revision. (2) University of California, Berkeley, and Max Planck Institute for Demographic Research. The Human Mortality Database.", "dataPublisherSource": "TODO"}, {"rawName": "(1) United Nations Population Division. World Population Prospects: 2022 Revision. (2) Census reports and other statistical publications from national statistical 

In [20]:
import openai
import random

# Imported explicitly so the cell does not rely on the `myml.nbinit` star import.
import json

N_EXAMPLES = 10   # number of finished sources embedded in the prompt as few-shot examples
SAMPLE_SEED = 0   # fixed seed so re-running the cell sends the same prompt

# GPT-4 bills prompt and completion tokens at different rates (USD per 1K tokens,
# 8K-context list prices when this notebook was written; verify before relying on them).
GPT4_PROMPT_USD_PER_1K = 0.03
GPT4_COMPLETION_USD_PER_1K = 0.06

# A private RNG keeps the sample reproducible without touching the global `random` state;
# `min(...)` avoids a ValueError when fewer than N_EXAMPLES finished sources exist.
example_sources = random.Random(SAMPLE_SEED).sample(sources, min(N_EXAMPLES, len(sources)))

SYSTEM_PROMPT = f"""
You are given list of examples in JSON format you should use for learning. Each example has 
rawName and fields name and dataPublisherSource are derived from rawName.
I'll give you a list of rawNames and you should give me a JSON list of those
rawNames with name and dataPublisherSource fields filled in.

Examples:
{json.dumps(example_sources)}
"""

# One raw source name per line; sorted because set iteration order is not stable across runs.
all_sources = "\n".join(sorted(missing_sources))

messages = [
    {
        "role": "system",
        "content": SYSTEM_PROMPT,
    },
    {
        "role": "user",
        "content": all_sources,
    },
]

# Original author's note: 23 missing sources at the time of writing.

response = openai.ChatCompletion.create(
    model="gpt-4",
    # model="gpt-3.5-turbo",
    temperature=0,
    messages=messages,
)

# Price prompt and completion tokens separately instead of applying the prompt rate
# to `total_tokens`, which underestimates the cost.
usage = response['usage']
cost_usd = (
    usage['prompt_tokens'] * GPT4_PROMPT_USD_PER_1K
    + usage['completion_tokens'] * GPT4_COMPLETION_USD_PER_1K
) / 1000
print(f"Cost GPT4: ${cost_usd:.2f}")

# The model is expected to answer with a bare JSON list; this raises if it does not.
json.loads(response['choices'][0]['message']["content"])