In [129]:
import json
from pathlib import Path

import pandas as pd
from openai import OpenAI
from tqdm import tqdm

In [130]:
# Notebook-level OpenAI client, constructed with no explicit arguments so it relies on
# the SDK's default configuration.
# NOTE(review): presumably the API key is read from the OPENAI_API_KEY environment
# variable — confirm before running on a fresh machine.
client = OpenAI()

In [131]:
# Load the indicator catalogue from `wdi.csv` in the working directory. The preview
# rendered below shows the columns `id`, `name` and `sourceNote`.
indicators = pd.read_csv('wdi.csv')
# Display the first rows as this cell's output (sanity check of the load).
indicators.head()

Unnamed: 0,id,name,sourceNote
0,AG.CON.FERT.ZS,Fertilizer consumption (kilograms per hectare ...,Fertilizer consumption measures the quantity o...
1,AG.LND.AGRI.ZS,Agricultural land (% of land area),Agricultural land refers to the share of land ...
2,AG.LND.ARBL.HA.PC,Arable land (hectares per person),Arable land (hectares per person) includes lan...
3,AG.LND.ARBL.ZS,Arable land (% of land area),Arable land includes land defined by the FAO a...
4,AG.LND.CREL.HA,Land under cereal production (hectares),Land under cereal production refers to harvest...


In [132]:
def _is_missing_note(source_note) -> bool:
    """Return True when `source_note` carries no usable text (None, NaN, or blank)."""
    if source_note is None:
        return True
    # NaN is the only float that differs from itself; pandas uses it for empty CSV cells.
    if isinstance(source_note, float) and source_note != source_note:
        return True
    return not str(source_note).strip()


def paraphrase_indicator(
    name,
    source_note=None,
    model='gpt-4o-mini',
) -> str:
    """Ask an OpenAI chat model for five paraphrases of a World Bank indicator name.

    The request is sent through the notebook-level `client`, so a single client
    is reused across calls instead of building a new one per indicator.

    Parameters
    ----------
    name : str
        The World Bank indicator name to paraphrase.
    source_note : str, optional
        Additional context about the indicator name. When it is None, NaN or
        blank, the description line is left out of the prompt rather than
        sending the literal text "None"/"nan" to the model.
    model : str, optional
        The OpenAI model to use for generating the paraphrases.

    Returns
    -------
    str
        The raw model reply: per the system prompt, a semicolon-delimited list
        of five paraphrases. An empty string is returned when the reply has no
        text content.

    """
    system_prompt = """You are a helpful assistant that paraphrases technical terms. Rewrite the following World Bank indicator name using the context provided in the additional description.

    Return five (5) clear, concise paraphrases of the indicator name in a semicolon-delimited list.

    Focus on incorporating terms in brackets into the phrase e.g., "(% of fertilizer production)" to "as a percentage of fertilizer production".

    Transform the ungrammatical forms into grammatically correct phrases.

    Do not include additional information from the additional description, but use this context to clarify the meaning.

    It is imperative that paraphrases are faithful to the precise meaning of the original.

    Do not make the paraphrases overly formal, verbose, or cumbersome, but ensure that meaning is preserved.

    Do not capitalize the first letter of the paraphrases unless it is a proper noun or an acronym.
    """

    if _is_missing_note(source_note):
        # No usable description: send the name alone so the model is not
        # misled by a placeholder such as "None" or "nan".
        user_prompt = f'Indicator name: {name}\n'
    else:
        user_prompt = f"""Indicator name: {name}

    Additional description of indicator: {source_note}
    """

    # `client` is the shared notebook-level OpenAI client created in an earlier cell.
    response = client.chat.completions.create(
        model=model,
        messages=[
            {
                'role': 'system',
                'content': system_prompt,
            },
            {
                'role': 'user',
                'content': user_prompt,
            },
        ],
        # Low temperature keeps the paraphrases close to the original wording.
        temperature=0.2,
        # NOTE(review): 90 tokens may truncate five long paraphrases — confirm
        # against the longest indicator names.
        max_tokens=90,
    )
    content = response.choices[0].message.content
    # The SDK types `content` as optional; normalise to the declared `str` return.
    return content if content is not None else ''

In [None]:
# Paraphrase every indicator, one API request per row. For a quick smoke test, swap the
# next line for `indicators.sample(3).to_dict(orient='records')`.
sample = indicators.to_dict(orient='records')

paraphrases = []
pbar = tqdm(sample, desc='Paraphrasing indicators', leave=True)

for record in pbar:
    # Show the indicator currently being processed in the progress bar.
    pbar.set_description_str(record['id'])
    # The descriptive text is in the `sourceNote` column of the loaded frame; the old
    # `description` key is kept as a fallback for frames that still use that name.
    note = record.get('sourceNote', record.get('description'))
    text = paraphrase_indicator(record['name'], note, model='gpt-4o-mini')
    # The output record keeps the original key names so downstream cells are unaffected.
    paraphrases.append(
        {
            'id': record['id'],
            'name': record['name'],
            'description': note,
            'paraphrase': text,
        }
    )


VC.IHR.PSRC.P5: 100%|██████████| 296/296 [07:48<00:00,  1.58s/it]      


In [134]:
# Index the paraphrases by indicator ID. The result is bound to a new name so the
# `paraphrases` list stays intact and this cell can be re-run safely.
paraphrases_by_id = {
    record['id']: [
        part.strip()
        # A missing reply is treated as "no paraphrases" instead of raising.
        for part in (record['paraphrase'] or '').split(';')
        # Drop empty fragments, e.g. from a trailing semicolon in the model reply.
        if part.strip()
    ]
    for record in paraphrases
}

with Path('indicator_paraphrases.json').open('w', encoding='utf-8') as f:
    json.dump(paraphrases_by_id, f, indent=2)