In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
from pathlib import Path
from tqdm.auto import tqdm

The input dataset is a long-format dataframe, with headings including type (type of test, like ‘uses’ for alternate uses task), src (a dataset id), question (a long version of the prompt that the participant responded to), prompt (a short version of the prompt), response (the participant’s input), id, and language (3-character ISO). For example:

In [3]:
root_dir = '/mnt/chromeos/GoogleDrive/MyDrive/Projects/ocsai-py/data/'
root_dir = Path(root_dir)
data = pd.read_csv(root_dir / 'ocsai-all.csv')
data.sample(5)

  data = pd.read_csv(root_dir / 'ocsai-all.csv')


Unnamed: 0,type,src,question,prompt,response,id,target,participant,response_num,language,rater_count,rating_std,dupe_control,participant_list,default_split,prompt_split,lang_split,type_split
17024,uses,multiaut_chinese1,领带的一个令人惊讶的用途是什么？,领带,束紧带,multiaut_chinese1_领带-eba2e8,2.6,multiaut_chinese11071,,chi,4,0.0,束紧带,['multiaut_chinese11071'],train,train,val,train
80346,uses,multiaut_german3,Was ist eine überraschende Verwendung für eine...,Schrank,mit versteckter hintertür als eingang zu einem...,multiaut_german3_Schrank-279d68,3.3,multiaut_german331,49.0,ger,3,0.57735,als einem eingang geheimort hintertur mit vers...,['multiaut_german331'],test,test,test,train
29472,consequences,h18/setal08,What would be a surprising consequence if PEOP...,no sleep,another meal time introduced other than lunch ...,4bb0376bfd01474e6374e4597e46a92c,2.7,COMBINED_4bb0376bfd01474e6374e4597e46a92c,,eng,6,1.527525,another dinner etc introduced lunch meal other...,"['setal08176', 'h18176']",train,val,,test
32198,instances,motesf,What is a surprising example of something SMELLY?,smelly,Dogs in mud.,motesf_smelly-52279b,3.0,motesf6f4922,5.0,eng,5,0.0,dogs in mud,['motesf6f4922'],train,train,,test
21643,uses,multiaut_dutch1,Wat is een verrassend gebruik voor een VORK?,vork,veters uit de knoop halen,multiaut_dutch1_vork-3c34bc,3.8,multiaut_dutch1718,,dut,2,0.707107,de halen knoop uit veters,['multiaut_dutch1718'],train,train,val,train


In [4]:
# Top Languages
data.language.value_counts().index.to_list()

['eng', 'chi', 'ger', 'ita', 'pol', 'dut', 'fre', 'rus', 'ara', 'heb', 'spa']

Step 1 is to translate all unique prompt/question pairs to the selected translation languages, and save that key for lookup later. One way of doing so is to prepare a JSON in the following format, where the values are null for all cases except the original phrase. e.g.

```
[
{’lang’: ‘eng’, ‘question’: null, ‘prompt’: null }
{’lang’: ‘ara’, ‘question’: null, ‘prompt’: null }
{’lang’: ‘pol’, ‘question’: ‘QUESTION TEXT TO TRANSLATE’, ‘prompt’: ‘PROMPT TEXT TO TRANSLATE’ }
]
```

Then a language model is asked to fill in all the *other* values.

That data is loaded as a DataFrame. This step is done so that the questions are consistent, by not being translated over and over again.

In [5]:
import json

In [7]:
languages = ['eng', 'chi', 'ger', 'ita', 'pol', 'dut', 'fre', 'rus', 'ara', 'heb', 'spa']

unique_pairs = data[['language', 'question', 'prompt']].drop_duplicates().reset_index(drop=True)

for index, row in unique_pairs.iterrows():
    # if select 1 random l
    other_langs = [l for l in languages if l != row['language']]
    selected_langs = [row['language']]
    if row['language'] == 'eng':
        selected_langs += list(np.random.choice(other_langs, 2))
    else:
        selected_langs += ['eng', np.random.choice(other_langs[1:])]

    translation_template = []
    for lang in selected_langs:
        if lang == row['language']:
            item = row.to_dict()
        else:
            item = {'language': lang,
                    'question': None,
                    'prompt': None
                    }
        translation_template.append(item)
        
    #print(json.translation_template)
    #translations[languages.index(original_lang)]['question'] = row['question']
    #translations[languages.index(original_lang)]['prompt'] = row['prompt']
translation_template

[{'language': 'spa',
  'question': '¿Cuál es un uso sorprendente para un LADRILLO?',
  'prompt': 'ladrillo'},
 {'language': 'eng', 'question': None, 'prompt': None},
 {'language': 'pol', 'question': None, 'prompt': None}]

In [8]:
import textwrap
from ocsai.prompt.utils import strip_backticks
from ocsai.utils import generic_llm
import anthropic
client = anthropic.Anthropic()

In [14]:
def translate_question(questions_dict,
                       model:str='claude-3-sonnet-20240229',
                       temperature:float=0.0,
                       max_tokens:int=4000,
                       ):
    qstr = json.dumps(questions_dict, indent=2, ensure_ascii=False)
    prompt = textwrap.dedent(f"""
    The following JSON of translated creativity test questions is missing 
    information. Translate the fields with `null` into the specified language,
    translating from the first item. Respond with the full JSON object, wrapped in triple backticks.

    Input data:
    ```
    {qstr}
    ```

    """).strip()
    
    content = generic_llm(
        prompt,
        sysmsg="You translate the missing fields in a JSON object into various languages.",
        model=model,
        client=client,
        temperature=temperature,
        max_tokens=max_tokens
    )
    return content


#response = translate_question(translation_template)
#translated = json.loads(strip_backticks(response))
translated


[{'language': 'spa',
  'question': '¿Cuál es un uso sorprendente para un LADRILLO?',
  'prompt': 'ladrillo'},
 {'language': 'eng',
  'question': 'What is a surprising use for a BRICK?',
  'prompt': 'brick'},
 {'language': 'pol',
  'question': 'Jakie jest zaskakujące zastosowanie CEGŁY?',
  'prompt': 'cegła'}]

In [9]:
translation_template = []
for index, row in unique_pairs.iterrows():
    translations = [{'lang': lang, 'question': None, 'prompt': None} for lang in languages]
    original_lang = row['question']
    translations[languages.index(original_lang)]['question'] = row['question']
    translations[languages.index(original_lang)]['prompt'] = row['prompt']
    translation_template.append(translations)

# Convert to DataFrame for further processing
translation_df = pd.DataFrame(translation_template)

import json
# Save to JSON file
#with open('translation_template.json', 'w') as file:
#    json.dump(translation_template, file, ensure_ascii=False, indent=4)

# Display the prepared DataFrame
#import ace_tools as tools; tools.display_dataframe_to_user(name="Unique Prompt/Question Pairs for Translation", dataframe=unique_pairs)

ValueError: 'ما هو الاستخدام المفاجئ لـ علب الصفيح؟' is not in list

In [None]:
other_langs = [l for l in languages if l != row['language']]
    selected_langs = [row['language']]
    if row['language'] == 'eng':
        selected_langs += list(np.random.choice(other_langs, 2))
    else:
        selected_langs += ['eng', np.random.choice(other_langs[1:])]