In [123]:
import json
from pathlib import Path

import pandas as pd
from openai import OpenAI
from tqdm import tqdm

In [124]:
# Notebook-level OpenAI client, constructed with default arguments. No API key is
# passed here, so credentials presumably come from the environment (TODO confirm).
client = OpenAI()

In [125]:
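# Optional helper, disabled by default: uncomment to print the id of every model
# returned by `client.models.list()`, sorted alphabetically by id.
# models = client.models.list()
# models = sorted(models.data, key=lambda x: x.id)
# for model in models:
#     print(model.id)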

In [142]:
indicators = pd.read_csv('wdi.csv')

# Drop every indicator whose id ends with one of the excluded suffixes:
# '.PC', '.ZS', '.P6', '.P5' or '.P2'.
for suffix in ('.PC', '.ZS', '.P6', '.P5', '.P2'):
    indicators = indicators[~indicators['id'].str.endswith(suffix)]


indicators.sample(20)


Unnamed: 0,id,name,description
218,SI.DST.10TH.10,Income share held by highest 10%,Percentage share of income or consumption is t...
192,SH.DYN.MORT,"Mortality rate, under-5 (per 1,000 live births)",Under-five mortality rate is the probability p...
17,BN.CAB.XOKA.CD,"Current account balance (BoP, current US$)",Current account balance is the sum of net expo...
137,NY.GDP.PCAP.CD,GDP per capita (current US$),GDP per capita is gross domestic product divid...
53,EG.USE.PCAP.KG.OE,Energy use (kg of oil equivalent per capita),Energy use refers to use of primary energy bef...
225,SI.SPR.PC40,"Survey mean consumption or income per capita, ...",Mean consumption or income per capita (2017 PP...
143,NY.GNP.PCAP.CD,"GNI per capita, Atlas method (current US$)",GNI per capita (formerly GNP per capita) is th...
117,IS.SHP.GOOD.TU,Container port traffic (TEU: 20 foot equivalen...,Port container traffic measures the flow of co...
106,IE.PPN.ICTI.CD,Public private partnerships investment in ICT ...,Public Private Partnerships in ICT (current US...
256,SM.POP.TOTL,"International migrant stock, total",International migrant stock is the number of p...


In [143]:
# Number of indicators left after the suffix filter.
print(indicators.shape[0])

134


In [144]:
# Keep only the rows whose indicator id contains at most four dots,
# i.e. ids made of at most five dot-separated parts.
indicators = indicators.loc[indicators['id'].str.count(r'\.').le(4)]
indicators.sample(20)

Unnamed: 0,id,name,description
135,NY.GDP.MKTP.CD,GDP (current US$),GDP at purchaser's prices is the sum of gross ...
291,TX.VAL.MMTL.ZS.UN,Ores and metals exports (% of merchandise expo...,Ores and metals comprise the commodities in SI...
104,IE.PPI.TRAN.CD,Investment in transport with private participa...,Investment in transport projects with private...
44,EG.EGY.PRIM.PP.KD,Energy intensity level of primary energy (MJ/$...,Energy intensity level of primary energy is th...
122,LP.EXP.DURS.MD,"Lead time to export, median case (days)",Lead time to export is the median time (the va...
207,SH.STA.MMRT,"Maternal mortality ratio (modeled estimate, pe...",Maternal mortality ratio is the number of wome...
143,NY.GNP.PCAP.CD,"GNI per capita, Atlas method (current US$)",GNI per capita (formerly GNP per capita) is th...
36,DT.DOD.MWBG.CD,"IBRD loans and IDA credits (DOD, current US$)",IBRD loans and IDA credits are public and publ...
10,AG.LND.TOTL.K2,Land area (sq. km),"Land area is a country's total area, excluding..."
266,SP.DYN.TFRT.IN,"Fertility rate, total (births per woman)",Total fertility rate represents the number of ...


In [145]:
# Number of indicators left after the dot-count filter.
print(indicators.shape[0])

133


In [128]:
def paraphrase_indicator(
    name,
    source_note=None,
    model='gpt-4o-mini',
    client=None,
    temperature=0.2,
    max_tokens=90,
) -> str:
    """Ask an OpenAI chat model for five paraphrases of a World Bank indicator name.

    The system prompt instructs the model to return exactly five noun-phrase
    paraphrases as a semicolon-delimited list. The reply is returned unparsed;
    splitting it into individual paraphrases is left to the caller.

    Parameters
    ----------
    name : str
        The World Bank indicator name to paraphrase.
    source_note : str, optional
        Additional context about the indicator name. Anything that is not a
        non-empty string (``None``, NaN from a missing CSV cell, ``''``) is
        reported to the model as "not provided" instead of being interpolated
        literally.
    model : str, optional
        The OpenAI model to use for generating the paraphrases.
    client : optional
        An object exposing ``chat.completions.create(...)``. When omitted, a
        fresh ``OpenAI()`` client is created for this call. Pass a shared
        client to avoid rebuilding one per indicator.
    temperature : float, optional
        Sampling temperature forwarded to the chat completion request.
    max_tokens : int, optional
        Upper bound on reply length forwarded to the chat completion request.
        NOTE(review): a low value may cut the reply short before all five
        paraphrases are emitted — raise it if truncated replies show up.

    Returns
    -------
    str
        The model's reply (expected to be a semicolon-delimited list of five
        paraphrases), or an empty string when the reply carries no text.

    """
    system_prompt = """You are a helpful assistant that paraphrases World Bank indicator names using the context provided in the additional description.

    Return exactly five (5) clear, concise **noun phrases** that faithfully represent the meaning of the original indicator name. Output them as a semicolon-delimited list.

    These noun phrases will be used in questions like:

    - "Which country in Eastern Europe had the highest <paraphrased indicator name> in 2020?"
    - "Was the <paraphrased indicator name> in the United States higher or lower than the world average in 2020?"

    Write the paraphrases **as if a person were asking the question aloud**. Make them sound **natural and conversational**, like something someone would realistically say or hear, without compromising technical accuracy.

    Guidelines:
    - Use grammatically correct, human-sounding noun phrases
    - Preserve units if given but convert them into natural expressions
    - Convert technical shorthand into natural expressions
    - Incorporate bracketed units as plain phrases, e.g., "(% of GDP)" to "as a percentage of GDP", and "(current US$)" to "in current US dollars"
    - Do not include extra information from the additional description, but use it to clarify the original meaning
    - Avoid excessive formality, complexity, or repetition
    - Only capitalize proper nouns or acronyms

    It is crucial that all paraphrases preserve the precise meaning of the original.
"""

    # Avoid sending the literal text "None" / "nan" to the model when no usable
    # description is available.
    has_note = isinstance(source_note, str) and source_note.strip() != ''
    description = source_note if has_note else 'not provided'

    user_prompt = f"""Indicator name: {name}

    Additional description of indicator: {description}
    """

    if client is None:
        client = OpenAI()

    response = client.chat.completions.create(
        model=model,
        messages=[
            {
                'role': 'system',
                'content': system_prompt,
            },
            {
                'role': 'user',
                'content': user_prompt,
            },
        ],
        temperature=temperature,
        max_tokens=max_tokens,
    )

    # The message content can be absent; honour the `-> str` contract so callers
    # can always call string methods on the result.
    content = response.choices[0].message.content
    return content if content is not None else ''

In [129]:
# For a quick trial run, swap in a small random subset instead:
# sample = indicators.sample(10).to_dict(orient='records')
sample = indicators.to_dict(orient='records')

model = 'gpt-4.1-mini'

paraphrases = []
pbar = tqdm(sample, desc='Paraphrasing indicators', leave=True)

# One API call per indicator; the progress bar label shows the id being processed.
for record in pbar:
    indicator_id = record['id']
    pbar.set_description_str(indicator_id)
    paraphrases.append(
        {
            'id': indicator_id,
            'name': record['name'],
            'description': record['description'],
            'paraphrase': paraphrase_indicator(record['name'], record['description'], model=model),
        }
    )


TX.VAL.TECH.CD: 100%|██████████| 134/134 [04:39<00:00,  2.08s/it]     


In [None]:
# Index the paraphrases by indicator id. Each model reply is a semicolon-delimited
# string: split it, trim whitespace, and drop empty fragments (e.g. the one left
# behind by a trailing semicolon). A reply of None is treated as no paraphrases.
# The mapping gets its own name so `paraphrases` stays a list of records and this
# cell can be re-run without first repeating the paraphrasing loop.
paraphrases_by_id = {
    record['id']: [
        fragment.strip()
        for fragment in (record['paraphrase'] or '').split(';')
        if fragment.strip()
    ]
    for record in paraphrases
}

with Path('indicator_paraphrases.json').open('w') as f:
    json.dump(paraphrases_by_id, f, indent=2)