In [34]:
import json
from pathlib import Path

import pandas as pd
from openai import OpenAI
from tqdm import tqdm

In [35]:
# Module-level OpenAI client, constructed with default settings (no arguments).
# NOTE(review): paraphrase_indicator() below builds its own OpenAI() instance
# rather than using this one, so nothing visible in this notebook reads `client`.
client = OpenAI()

In [36]:
# The first column of wdi.csv has no header, so a plain read_csv surfaces it as
# a junk "Unnamed: 0" data column. Read it as the index instead, and keep the
# unfiltered frame under its own name so the filter below can be re-run safely.
indicators_raw = pd.read_csv('wdi.csv', index_col=0)

# Dot-separated id components that exclude an indicator from the working set.
suffixes = {
    'PC',  # Per Capita
    'CD',  # Current US$
    'XD',  # Constant base
    'ZS',  # Percentage of population
    'PP',  # PPP
    'P6',  # Per million
    'P5',
    'P2',
}


def _has_excluded_component(indicator_id: str) -> bool:
    """Return True when any dot-separated part of the id is listed in ``suffixes``.

    The match is on whole components at any position (e.g. ``'PP'`` matches
    ``'A.PP.B'`` but not ``'A.PPX.B'``), not only on the trailing one.
    """
    return not suffixes.isdisjoint(indicator_id.split('.'))


# Remove any rows where the indicator id contains one of the listed components.
indicators = indicators_raw[~indicators_raw['id'].apply(_has_excluded_component)]

indicators.sample(20)

Unnamed: 0,id,name,description
258,SP.ADO.TFRT,"Adolescent fertility rate (births per 1,000 wo...",Adolescent fertility rate is the number of bir...
196,SH.IMM.IDPT,"Immunization, DPT (% of children ages 12-23 mo...","Child immunization, DPT, measures the percenta..."
110,IQ.CPA.ECON.XQ,CPIA economic management cluster average (1=lo...,The economic management cluster includes macro...
238,SL.GDP.PCAP.EM.KD,GDP per person employed (constant 2021 PPP $),GDP per person employed is gross domestic prod...
112,IQ.CPA.PUBS.XQ,CPIA public sector management and institutions...,The public sector management and institutions ...
28,CM.MKT.TRNR,"Stocks traded, turnover ratio of domestic shar...",Turnover ratio is the value of domestic shares...
260,SP.DYN.CDRT.IN,"Death rate, crude (per 1,000 people)",Crude death rate indicates the number of death...
259,SP.DYN.CBRT.IN,"Birth rate, crude (per 1,000 people)",Crude birth rate indicates the number of live ...
255,SM.POP.NETM,Net migration,Net migration is the net total of migrants dur...
122,LP.EXP.DURS.MD,"Lead time to export, median case (days)",Lead time to export is the median time (the va...


In [37]:
# Number of rows left in `indicators` (sanity check on the size of the working set).
print(len(indicators))

94


In [38]:
def paraphrase_indicator(
    name,
    source_note=None,
    model='gpt-4o-mini',
    concise=True,
) -> str:
    """Ask an OpenAI chat model for five paraphrases of a World Bank indicator name.

    Parameters
    ----------
    name : str
        The World Bank indicator name to paraphrase.
    source_note : str, optional
        Additional context about the indicator. Anything that is not a
        non-blank string (``None``, a ``NaN`` coming from an empty CSV cell,
        ...) is sent to the model as "not provided" instead of being
        interpolated verbatim as "None"/"nan".
    model : str, optional
        The OpenAI model to use for generating the paraphrases.
    concise : bool, optional
        Selects the system prompt. ``True`` (default) uses the prompt asking
        for the shortest natural-sounding noun phrases; ``False`` uses the
        other, more literal paraphrasing prompt.

    Returns
    -------
    str
        The message content of the first completion choice, unparsed. Both
        prompts request a semicolon-delimited list of five paraphrases, so
        callers are expected to split on ';' themselves.

    """
    literal_system_prompt = """You are a helpful assistant that paraphrases technical terms. Rewrite the following World Bank indicator name using the context provided in the additional description.

    Return five (5) clear, concise paraphrases of the indicator name in a semicolon-delimited list.

    Focus on incorporating terms in brackets into the phrase e.g., "(% of fertilizer production)" to "as a percentage of fertilizer production".

    Transform the ungrammatical forms into grammatically correct phrases.

    Create noun phrases of the indicator name. These names will go on to be included as part of a question incorporating that name, such as:

    - Which country in Eastern Europe had the highest <paraphrased indicator name> in 2020?
    - Was the <paraphrased indicator name> in the United States higher or lower than the world average in 2020?

    Do not include additional information from the additional description, but use this context to clarify the meaning.

    It is imperative that paraphrases are faithful to the precise meaning of the original.

    Do not make the paraphrases overly formal, verbose, or cumbersome, but ensure that meaning is preserved.

    Do not capitalize the first letter of the paraphrases unless it is a proper noun or an acronym.
    """

    concise_system_prompt = """You are a helpful assistant that paraphrases World Bank indicator names using the context provided in the additional description.

    Return exactly five (5) clear, concise **noun phrases** that faithfully represent the meaning of the original indicator name. Output them as a semicolon-delimited list.

    These noun phrases will be inserted into questions like:

    - "Which country in Eastern Europe had the highest <paraphrased indicator name> in 2020?"
    - "Was the <paraphrased indicator name> in the United States higher or lower than the world average in 2020?"

    Write the paraphrases **as if a person were using them to ask a question like the ones above**. Make them sound **natural and conversational**, like something someone would realistically say or hear, without compromising technical accuracy.

    Follow these guidelines:
    - Make all outputs grammatical and readable
    - Compress the phrase into the **shortest possible form** while retaining the meaning
    - Convert technical shorthand into natural expressions
    - Include bracketed elements, e.g., "(% of GDP)" as natural language phrases, such as "as a percentage of GDP"
    - Avoid overly formal, fluffy, embellished, or abstract language.
    - **Only capitalize proper nouns or acronyms**. Even though these are noun phrases, they will be used in a sentence context.
    - Use the additional description only to **clarify meaning**, not to add new info

    It is crucial that all paraphrases preserve the precise meaning of the original.
    """

    system_prompt = concise_system_prompt if concise else literal_system_prompt

    # A missing description must not leak into the prompt as the text "None"
    # or "nan"; a valid description is passed through unchanged.
    has_description = isinstance(source_note, str) and bool(source_note.strip())
    description = source_note if has_description else 'not provided'

    user_prompt = f"""Indicator name: {name}

    Additional description of indicator: {description}
    """

    client = OpenAI()
    response = client.chat.completions.create(
        model=model,
        messages=[
            {
                'role': 'system',
                'content': system_prompt,
            },
            {
                'role': 'user',
                'content': user_prompt,
            },
        ],
        temperature=0.2,
        max_tokens=90,
    )
    return response.choices[0].message.content

In [None]:
# How many indicators to paraphrase; set to None to run over every indicator.
N_SAMPLES = 10
# Fixed seed so the same subset is drawn (and written to disk) on every run.
SAMPLE_SEED = 0

model = 'gpt-4.1-mini'

if N_SAMPLES is None:
    selected = indicators
else:
    # Clamp to the frame size so a small indicator table does not make sample() raise.
    selected = indicators.sample(min(N_SAMPLES, len(indicators)), random_state=SAMPLE_SEED)
sample = selected.to_dict(orient='records')

paraphrases = []
pbar = tqdm(sample, desc='Paraphrasing indicators', leave=True)

for s in pbar:
    # Show the id of the indicator currently being processed on the progress bar.
    pbar.set_description_str(s['id'])
    p = paraphrase_indicator(s['name'], s['description'], model=model)
    # One record per indicator; 'paraphrase' holds whatever paraphrase_indicator returned.
    out = {'id': s['id'], 'name': s['name'], 'description': s['description'], 'paraphrase': p}
    paraphrases.append(out)


SP.UWT.TFRT: 100%|██████████| 94/94 [02:15<00:00,  1.44s/it]      


In [None]:
# Write the collected paraphrase records to a JSON file named after the model.
# NOTE(review): an earlier variant of this cell targeted the un-suffixed
# 'indicator_paraphrases.json' — presumably how that file was produced; confirm.
output_path = Path(f'indicator_paraphrases_{model}.json')
with output_path.open('w') as handle:
    json.dump(paraphrases, handle, indent=2)

In [44]:
# Load the two paraphrase files that the next cell compares:
# `shorter` from the gpt-4.1-mini run, `original` from the un-suffixed file.
shorter = json.loads(Path('indicator_paraphrases_gpt-4.1-mini.json').read_text())
original = json.loads(Path('indicator_paraphrases.json').read_text())

In [46]:
# Print the average paraphrase length (in whitespace-separated words) for the
# indicators present in both files.
#
# The comparison needs {indicator id: [paraphrase, ...]}, but the cell that
# saves `paraphrases` writes a list of records whose 'paraphrase' value is a
# single semicolon-delimited string. Normalise both inputs so either layout
# works; without this, a freshly written file has no ids to match on.
def _paraphrases_by_id(data):
    """Return ``{indicator_id: [paraphrase, ...]}`` for either stored layout.

    Accepts a mapping of id -> list of paraphrases (or id -> semicolon-delimited
    string), or a list of records carrying 'id' and 'paraphrase' keys.
    """

    def _as_list(value):
        if isinstance(value, str):
            return [part.strip() for part in value.split(';') if part.strip()]
        return list(value or [])

    if isinstance(data, dict):
        return {key: _as_list(value) for key, value in data.items()}
    return {record['id']: _as_list(record['paraphrase']) for record in data}


def _word_counts(phrases):
    """Number of whitespace-separated words in each phrase."""
    return [len(phrase.split()) for phrase in phrases]


original_by_id = _paraphrases_by_id(original)
shorter_by_id = _paraphrases_by_id(shorter)

orig = []
short = []
for indicator_id, original_phrases in original_by_id.items():
    shorter_phrases = shorter_by_id.get(indicator_id)
    # Compare only indicators that have at least one paraphrase in both files.
    if original_phrases and shorter_phrases:
        orig.append(_word_counts(original_phrases))
        short.append(_word_counts(shorter_phrases))

if not orig:
    raise ValueError('No indicator ids with paraphrases are shared by the two files')

# Print average lengths (mean over indicators of each indicator's mean word count)
orig_avg = [sum(counts) / len(counts) for counts in orig]
short_avg = [sum(counts) / len(counts) for counts in short]
print(f'Original average length: {sum(orig_avg) / len(orig_avg)}')
print(f'Shorter average length: {sum(short_avg) / len(short_avg)}')


Original average length: 7.068085106382979
Shorter average length: 5.7042553191489365
