## Update metadata

Update `wdi.meta.yml` from WDI metadata file. This notebook is intended to be run manually and all changes to the YAML file need to be verified.

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from owid.catalog import Dataset
from etl.paths import DATA_DIR
import os

version = os.getcwd().rsplit('/', 1)[1]
ds_meadow = Dataset(DATA_DIR / 'meadow/worldbank_wdi' / version / 'wdi')
tb = ds_meadow['wdi']
indicator_codes = [tb[col].m.title for col in tb.columns]

In [3]:
%load_ext autoreload
%autoreload 2

from wdi import load_variable_metadata

df_vars = load_variable_metadata(indicator_codes)
df_vars.head()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
[2m2025-01-27 07:04:32[0m [[32m[1minfo     [0m] [1mwdi.missing_metadata          [0m [36mn_indicators[0m=[35m79[0m


Unnamed: 0_level_0,topic,indicator_name,short_definition,long_definition,unit_of_measure,periodicity,base_period,other_notes,aggregation_method,limitations_and_exceptions,notes_from_original_source,general_comments,source,statistical_concept_and_methodology,development_relevance,related_source_links,other_web_links,related_indicators,license_type,unit
indicator_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
ag_con_fert_pt_zs,Environment: Agricultural production,Fertilizer consumption (% of fertilizer produc...,,Fertilizer consumption measures the quantity o...,,Annual,,The world and regional aggregate series do not...,Weighted average,The FAO has revised the time series for fertil...,,,"Food and Agriculture Organization, electronic ...",Fertilizer consumption measures the quantity o...,"Factors such as the green revolution, has led ...",,,,CC BY-4.0,
ag_con_fert_zs,Environment: Agricultural production,Fertilizer consumption (kilograms per hectare ...,,Fertilizer consumption measures the quantity o...,,Annual,,The world and regional aggregate series do not...,Weighted average,The FAO has revised the time series for fertil...,,,"Food and Agriculture Organization, electronic ...",Fertilizer consumption measures the quantity o...,"Factors such as the green revolution, has led ...",,,,CC BY-4.0,
ag_lnd_agri_k2,Environment: Land use,Agricultural land (sq. km),,Agricultural land refers to the share of land ...,,Annual,,Areas of former states are included in the suc...,Sum,The data are collected by the Food and Agricul...,,,"Food and Agriculture Organization, electronic ...",Agricultural land constitutes only a part of a...,Agricultural land covers more than one-third o...,,,,CC BY-4.0,
ag_lnd_agri_zs,Environment: Land use,Agricultural land (% of land area),,Agricultural land refers to the share of land ...,,Annual,,Areas of former states are included in the suc...,Weighted average,The data are collected by the Food and Agricul...,,,"Food and Agriculture Organization, electronic ...",Agriculture is still a major sector in many ec...,Agricultural land covers more than one-third o...,,,,CC BY-4.0,
ag_lnd_arbl_ha,Environment: Land use,Arable land (hectares),,Arable land (in hectares) includes land define...,,Annual,,,,The Food and Agriculture Organization (FAO) tr...,,,"Food and Agriculture Organization, electronic ...",Temporary fallow land refers to land left fall...,Agricultural land covers more than one-third o...,,,,CC BY-4.0,


## Replace years in YAML metadata

In [4]:
from etl.files import ruamel_dump, ruamel_load

yaml_path = "wdi.meta.yml"

with open(yaml_path, "r") as f:
    yml = ruamel_load(f)

In [5]:
KEEP = {'armed_forces_share_population'}

# Delete variables that are not in the dataset
missing_variables = set(yml['tables']['wdi']['variables'].keys()) - set(tb.columns)
missing_variables = {v for v in missing_variables if not v.startswith('omm_') and v not in KEEP}

print(f"Deleting {len(missing_variables)} variables")
for var in missing_variables:
    del yml['tables']['wdi']['variables'][var]

Deleting 0 variables


In [6]:
import re
from typing import Union


def replace_years(s: str, year: Union[int, str]) -> str:
    """replaces all years in string with {year}.

    Example:

        >>> replace_years("GDP (constant 2010 US$)", 2015)
        "GDP (constant 2015 US$)"
    """
    year_regex = re.compile(r"\b([1-2]\d{3})\b")
    s_new = year_regex.sub(str(year), s)
    return s_new


variables = yml["tables"]["wdi"]["variables"]

for indicator_code in df_vars.index:
    if indicator_code in variables:
        var = variables[indicator_code]
    else:
        var = {}
        variables[indicator_code] = var

    # update titles from metadata file
    try:
        var["title"] = df_vars.loc[indicator_code].indicator_name
    except KeyError:
        continue

    # if title contains year, try to update units too
    year_regex = re.compile(r"\b([1-2]\d{3})\b")
    regex_res = year_regex.search(df_vars.loc[indicator_code].indicator_name)
    if regex_res:
        assert len(regex_res.groups()) == 1
        year = regex_res.groups()[0]

        if "unit" in var:
            var["unit"] = replace_years(var["unit"], year)

        if "short_unit" in var:
            var["short_unit"] = replace_years(var["short_unit"], year)

        for k in ["name", "unit", "short_unit"]:
            if var.get("display", {}).get("unit"):
                var["display"]["unit"] = replace_years(var["display"]["unit"], year)

            if var.get("display", {}).get("short_unit"):
                var["display"]["short_unit"] = replace_years(var["display"]["short_unit"], year)

        if "presentation" in var:
            for k in ["title_public", "title_variant"]:
                if k in var["presentation"]:
                    var["presentation"][k] = replace_years(var["presentation"][k], year)

In [7]:
with open(yaml_path, "w") as f:
    f.write(ruamel_dump(yml))

## Replace years in chart configs

In [13]:
from etl.db import get_engine, read_sql

# get GDP variable
q = """
select id from variables
where name = 'GDP per capita, PPP (constant 2021 international $)'
    and catalogPath = 'grapher/worldbank_wdi/2025-01-24/wdi/wdi#ny_gdp_pcap_pp_kd'
"""
engine = get_engine()
var_id = read_sql(q, engine).id.iloc[0]
print(var_id)

# get all charts using that variable
q = f"""
select chartId from chart_dimensions where variableId = {var_id};
"""
chart_ids = list(read_sql(q, engine)['chartId'])
len(chart_ids)

1008339


157

In [38]:
from apps.chart_sync.admin_api import AdminAPI
from etl.config import OWID_ENV, ENV_GRAPHER_USER_ID

admin_api = AdminAPI(OWID_ENV, grapher_user_id=ENV_GRAPHER_USER_ID)

old_year = "2017"
new_year = "2021"

for chart_id in chart_ids:
    chart_config = admin_api.get_chart_config(chart_id)

    fields = ['subtitle', 'note']

    update = False
    for field in fields:
        if field in chart_config:
            if old_year in (chart_config.get(field, '') or ''):
                chart_config[field] = chart_config[field].replace(old_year, new_year)
                update = True

    if update:
        print(f"Updating chart {chart_id}")
        admin_api.update_chart(chart_id, chart_config)

Updating chart 150
Updating chart 179
Updating chart 185
Updating chart 212
Updating chart 225
Updating chart 251
Updating chart 340
Updating chart 374
Updating chart 449
Updating chart 450
Updating chart 509
Updating chart 517
Updating chart 558
Updating chart 587
Updating chart 590
Updating chart 629
Updating chart 663
Updating chart 686
Updating chart 705
Updating chart 711
Updating chart 736
Updating chart 773
Updating chart 781
Updating chart 782
Updating chart 784
Updating chart 826
Updating chart 836
Updating chart 838
Updating chart 841
Updating chart 842
Updating chart 853
Updating chart 855
Updating chart 856
Updating chart 936
Updating chart 937
Updating chart 971
Updating chart 979
Updating chart 980
Updating chart 1005
Updating chart 1037
Updating chart 1144
Updating chart 1235
Updating chart 1250
Updating chart 1254
Updating chart 1354
Updating chart 1355
Updating chart 1367
Updating chart 1450
Updating chart 1461
Updating chart 1463
Updating chart 1496
Updating chart 149

## Update Sources

In [7]:
import json

with open("wdi.sources.json", "r") as f:
    sources = json.load(f)

sources = [s for s in sources if not s["name"].startswith("TODO")]

missing_sources = list(set(df_vars["source"]) - {s["rawName"] for s in sources})
missing_sources

[]

In [8]:
import os
from openai import OpenAI
import random

SYSTEM_PROMPT = f"""
You are given list of examples in JSON format you should use for learning. Each example has
rawName and fields name and dataPublisherSource are derived from rawName.
I'll give you a list of rawNames and you should give me a JSON list of those
rawNames with name and dataPublisherSource fields filled in.

Examples:
{json.dumps(random.sample(sources, 20))}
"""

all_sources = "\n\n".join(missing_sources)

messages = [
    {
        "role": "system",
        "content": SYSTEM_PROMPT,
    },
    {
        "role": "user",
        "content": all_sources,
    },
]

client = OpenAI()

# 10 missing sources / 5 examples -> 2min
response = client.chat.completions.create(
    model="gpt-4o",
    temperature=0,
    messages=messages,
    response_format={"type": "json_object"},
)
print(f"Cost GPT4o: ${response.usage.total_tokens / 1e6 * 7.5:.2f}")
r = json.loads(response.choices[0].message.content)
print(json.dumps(r, ensure_ascii=False, indent=2))

Cost GPT4o: $0.01
{
  "rawName": "World Bank's World Development Indicators.",
  "name": "World Bank",
  "dataPublisherSource": "World Development Indicators - World Bank"
}
