In [1]:
import requests
from openai import OpenAI
import os
import json
import re
from tqdm import tqdm
from datasets import load_dataset

client = OpenAI(api_key = '')

In [2]:
dataset = load_dataset('huseinzol05/malaysian-dialect-qa', split = 'test')

In [3]:
dataset_lang = load_dataset('huseinzol05/malaysian-dialect-qa-lang', split = 'test')

In [5]:
questions = []
for i in range(len(dataset)):
    q = dataset[i]['question'] + '\n\nAfter that, put your final answer within $\\boxed{}$.'
    questions.append((i, q))
    
len(questions)

140

In [6]:
questions[0][1]

'Lepas ujan jangan maen lari-lari, kan biyak.\n\nterjemah ke melayu baku\n\nAfter that, put your final answer within $\\boxed{}$.'

In [7]:
folder = 'openai-o3'
# !rm -rf {folder}
!mkdir {folder}

mkdir: cannot create directory ‘openai-o3’: File exists


In [11]:
def generate_answer(row, repeat = 5):
    no, q = row
    for k in range(repeat):
        filename = os.path.join(folder, f'{no}-{k}.json')
        try:
            with open(filename) as fopen:
                json.load(fopen)
            continue
        except:
            pass
        
        for _ in range(5):

            response = client.responses.create(
              model="o3-2025-04-16",
              input=[
                {
                  "role": "user",
                  "content": [
                    {
                      "type": "input_text",
                      "text": q
                    }
                  ]
                },
              ],
              text={
                "format": {
                  "type": "text"
                }
              },
              reasoning={
                "effort": "medium"
              },
            )
            
            text = response.output[1].content[0].text
            try:
                if 'boxed{' in text:
                    text = text.split('boxed{')
                    text = text[-1].split('}')[0]
                    if 'text{' in text:
                        text = text.split('text{')[1]
                with open(filename, 'w') as fopen:
                    json.dump(text, fopen)
                break
            except Exception as e:
                print(e)

In [12]:
for i in tqdm(range(len(dataset))):
    generate_answer(questions[i])

100%|███████████████████████████████████████| 140/140 [2:26:57<00:00, 62.98s/it]


In [16]:
from sacrebleu.metrics import CHRF
from glob import glob
from collections import defaultdict
import numpy as np

In [17]:
chrf = CHRF()
pairs = defaultdict(list)

for i in tqdm(range(len(dataset_lang))):
    from_lang = dataset_lang[i]['from_lang']
    to_lang = dataset_lang[i]['to_lang']
    gt = dataset_lang[i]['answer']
    pair = f'{from_lang}<>{to_lang}'
    files = glob(f'openai-o3/{i}-*.json')
    scores = []
    for f in files:
        with open(f) as fopen:
            d = json.load(fopen)
        score = chrf.corpus_score([d], [[gt]]).score
        scores.append(score)

    max_score = max(scores)
    pairs[pair].append(max_score)

100%|███████████████████████████████████████| 140/140 [00:00<00:00, 1163.45it/s]


In [22]:
for k, v in pairs.items():
    l, r = k.split('<>')
    print(f'From: {l} To: {r}, score:', np.mean(v))

From: johor To: malay, score: 60.57753659569944
From: kedah To: malay, score: 60.625255717048255
From: pahang To: malay, score: 57.43302925503334
From: negeri sembilan To: malay, score: 65.65416081624213
From: kelantan To: malay, score: 51.70250963642012
From: penang To: malay, score: 63.34775715046866
From: melaka To: malay, score: 48.90926542921075
From: malay To: johor, score: 50.22451676633022
From: malay To: kedah, score: 51.021160025382784
From: malay To: pahang, score: 46.9526364405062
From: malay To: negeri sembilan, score: 46.1334324624407
From: malay To: kelantan, score: 42.73765886368901
From: malay To: penang, score: 59.24333917229933
From: malay To: melaka, score: 55.01495010635166


In [23]:
x = """
From: malay To: johor, score: 50.22451676633022
From: malay To: kedah, score: 51.021160025382784
From: malay To: pahang, score: 46.9526364405062
From: malay To: negeri sembilan, score: 46.1334324624407
From: malay To: kelantan, score: 42.73765886368901
From: malay To: penang, score: 59.24333917229933
From: malay To: melaka, score: 55.01495010635166
"""

scores = []
for l in x.split('\n'):
    if 'score:' not in l:
        continue
    
    scores.append(float(l.split('score: ')[1]))
    
np.mean(scores)

50.18967054814284