In [62]:
import json
from pathlib import Path

import pandas as pd
from openai import OpenAI
from tqdm import tqdm

In [63]:
# Read the indicator table (columns used below: 'id').
indicators = pd.read_csv('wdi.csv')

# Codes that disqualify an indicator. An id is split on '.' and the row is
# dropped when any resulting component is exactly one of these codes (a
# component such as 'PCAP' does not match 'PC').
suffixes = {
    'PC',  # Per Capita
    'CD',  # Current US$
    'XD',  # Constant base
    'ZS',  # Percentage of population
    'PP',  # PPP
    'P6',  # Per million
    'P5',
    'P2',
}
id_has_excluded_part = indicators['id'].apply(
    lambda indicator_id: not suffixes.isdisjoint(indicator_id.split('.'))
)
indicators = indicators[~id_has_excluded_part]

indicators.sample(20)

Unnamed: 0,id,name,description
228,SI.SPR.PCAP.ZG,Annualized average growth rate in per capita r...,The growth rate in the welfare aggregate of th...
112,IQ.CPA.PUBS.XQ,CPIA public sector management and institutions...,The public sector management and institutions ...
4,AG.LND.CREL.HA,Land under cereal production (hectares),Land under cereal production refers to harvest...
28,CM.MKT.TRNR,"Stocks traded, turnover ratio of domestic shar...",Turnover ratio is the value of domestic shares...
108,IP.PAT.NRES,"Patent applications, nonresidents",Patent applications are worldwide patent appli...
221,SI.POV.DDAY,Poverty headcount ratio at $2.15 a day (2017 P...,Poverty headcount ratio at $2.15 a day is the ...
23,CM.MKT.INDX.ZG,S&P Global Equity Indices (annual % change),S&P Global Equity Indices measure the U.S. dol...
262,SP.DYN.IMRT.IN,"Mortality rate, infant (per 1,000 live births)",Infant mortality rate is the number of infants...
57,EN.FSH.THRD.NO,"Fish species, threatened","Fish species are based on Froese, R. and Pauly..."
59,EN.MAM.THRD.NO,"Mammal species, threatened",Mammal species are mammals excluding whales an...


In [64]:
# Number of indicators left after filtering.
print(indicators.shape[0])

94


In [65]:
def paraphrase_indicator(
    name,
    source_note=None,
    model='gpt-4o-mini',
    client=None,
) -> str:
    """Paraphrase a World Bank indicator name into simpler and more understandable terms.

    The indicator name (and its description, when one is available) is sent
    to an OpenAI chat model together with a system prompt that asks for
    exactly three noun phrases as a semicolon-delimited list. The model's
    reply is returned as-is; splitting it into individual paraphrases is left
    to the caller.

    Parameters
    ----------
    name : str
        The World Bank indicator name to paraphrase.
    source_note : str, optional
        Additional context about the indicator name. ``None`` or a float NaN
        (e.g. a missing cell in a pandas frame) means "no description": the
        description line is then omitted from the prompt instead of sending
        the literal text "None" / "nan" to the model.
    model : str, optional
        The OpenAI model to use for generating the paraphrase.
    client : optional
        Object used to issue the request; it must provide
        ``chat.completions.create``. When omitted, a new ``OpenAI()`` client
        is created for this call. Pass a shared client when making many calls.

    Returns
    -------
    str
        The model's reply, requested as three semicolon-delimited paraphrases.
        The format is not validated here. An empty string is returned when the
        reply carries no text content.

    """
    system_prompt = """You are a helpful assistant that paraphrases World Bank indicator names using the context provided in the additional description.

    Return exactly three (3) clear, concise **noun phrases** that faithfully represent the meaning of the original indicator name. Output them as a semicolon-delimited list.

    These noun phrases will be inserted into questions like:

    - "Which country in Eastern Europe had the highest <paraphrased indicator name> in 2020?"
    - "Was the average <paraphrased indicator name> in Northern America higher or lower than the value for Ghana in 2020?"
    - "What was the <paraphrased indicator name> in 2020 for the country with the highest value in South Asia?"
    - "Did <country> have a higher <paraphrased indicator name> than <other_country> in 2020?"

    Write the paraphrases **as if a person were using them to ask a question like the ones above**. Make them sound **natural and conversational**, like something someone would realistically say or hear, without compromising technical accuracy.

    Follow these guidelines:
    - Make all outputs concise, grammatical, easy to understand and **suitable for inserting into questions** like these.
    - Compress the phrase into the **shortest possible form** while retaining the meaning.
    - Do not use the words **total** or **average** in the paraphrase as this will interfere with the grammar of the wider questions.
    - Include bracketed elements, e.g., "(% of GDP)" as natural language phrases, such as "as a percentage of GDP".
    - **Do not include units of measurement**, e.g., "in US dollars", or "in TEUs".
    - Avoid embellished and abstract language, or esoteric terms. If an indicator name is very simple (e.g., 'rural population', 'net migration', 'surface area'), use that as one of the three paraphrases.
    - **Only capitalize proper nouns or acronyms**. Even though these are noun phrases, they will be inserted into the middle of sentences.
    - Use the additional description only to **clarify meaning**, not to add new information.
    - To repeat, paraphrases should be **noun phrases**. Start the phrase with something like 'count of', 'number of', 'percentage of', 'area of', 'rate of' if you are not sure how to begin.

    Reminder: preserve the meaning of the original indicator name; shorten as much as possible; and do not use unusual phrasing.
    """

    # NaN is the only float that compares unequal to itself.
    note_missing = source_note is None or (
        isinstance(source_note, float) and source_note != source_note
    )
    if note_missing:
        user_prompt = f'Indicator name: {name}\n'
    else:
        # The text (including its indentation) is kept exactly as before so
        # that prompts for indicators with a description are unchanged.
        user_prompt = f"""Indicator name: {name}

    Additional description of indicator: {source_note}
    """

    if client is None:
        client = OpenAI()
    response = client.chat.completions.create(
        model=model,
        messages=[
            {
                'role': 'system',
                'content': system_prompt,
            },
            {
                'role': 'user',
                'content': user_prompt,
            },
        ],
        temperature=0,
        max_tokens=90,
    )
    # `content` may be None; normalise so the declared `str` return type holds.
    return response.choices[0].message.content or ''

In [66]:
# Paraphrase every filtered indicator. For a quick trial run, build `sample`
# from a subset instead, e.g. indicators.sample(20, random_state=10).
sample = indicators.to_dict(orient='records')

model = 'gpt-4.1'

all_paraphrases = []
pbar = tqdm(sample, desc='Paraphrasing indicators', leave=True)

for s in pbar:
    # Show the id currently being processed in the progress bar.
    pbar.set_description_str(s['id'])
    model_paraphrases = paraphrase_indicator(s['name'], s['description'], model=model)
    # The reply is a semicolon-delimited list. Guard against a missing reply
    # and drop empty fragments (e.g. from a trailing ';') so that no blank
    # paraphrases are stored.
    fragments = [p.strip() for p in (model_paraphrases or '').split(';')]
    out = {
        'id': s['id'],
        'name': s['name'],
        'description': s['description'],
        'paraphrase': [p for p in fragments if p],
    }
    all_paraphrases.append(out)


SP.UWT.TFRT: 100%|██████████| 94/94 [01:28<00:00,  1.07it/s]      


In [67]:
# The output file name embeds the name of the model that produced the paraphrases.
new_file_name = f'indicator_paraphrases_{model}.json'

# Write all collected records as indented JSON.
with open(new_file_name, 'w') as out_file:
    json.dump(all_paraphrases, out_file, indent=2)

In [68]:
def get_all_paraphrase_lengths(paraphrase_file):
    """Return the word count of every paraphrase stored in a paraphrase JSON file.

    Parameters
    ----------
    paraphrase_file : str or path-like
        Path to a JSON file containing a list of objects. Each object may
        carry a 'paraphrase' key holding a list of strings; objects without
        that key contribute nothing.

    Returns
    -------
    list of int
        One word count per paraphrase, in file order. Words are delimited by
        any run of whitespace, so an empty paraphrase counts as 0 words and
        repeated spaces do not inflate the count.

    """
    # JSON is read as UTF-8 rather than the platform's default encoding.
    with Path(paraphrase_file).open('r', encoding='utf-8') as f:
        data = json.load(f)
    return [
        len(paraphrase.split())
        for entry in data
        for paraphrase in entry.get('paraphrase', [])
    ]


# Print the mean paraphrase length (in words) for each paraphrase file found
# in the current working directory.
for file in Path('.').iterdir():
    is_paraphrase_file = file.name.startswith('indicator_paraphrases') and file.name.endswith('.json')
    if not is_paraphrase_file:
        continue
    lengths = get_all_paraphrase_lengths(file.name)
    if lengths:
        avg_len = sum(lengths) / len(lengths)
    else:
        # A file without paraphrases reports 0 instead of dividing by zero.
        avg_len = 0
    print(f'Average paraphrase length in {file.name}: {avg_len:.2f} words')

Average paraphrase length in indicator_paraphrases_gpt-4.1-mini.json: 4.99 words
Average paraphrase length in indicator_paraphrases_gpt-4.1.json: 5.25 words
Average paraphrase length in indicator_paraphrases.json: 4.99 words
