In [28]:
import requests
import anthropic
import os
import json
import re
from tqdm import tqdm
from datasets import load_dataset

# Read the Anthropic API key from the environment so the secret is never
# stored in (or committed with) the notebook. Falls back to '' — the previous
# placeholder value — when ANTHROPIC_API_KEY is not set.
api_key = os.environ.get('ANTHROPIC_API_KEY', '')

In [2]:
# Prompts to send to the model: `test` split of the dialect QA dataset.
# Each row's 'question' field is used as the prompt text further down.
dataset = load_dataset(
    'huseinzol05/malaysian-dialect-qa',
    split='test',
)

In [30]:
# Scoring metadata: the evaluation code reads 'from_lang', 'to_lang' and
# 'answer' from each row of this split.
# NOTE(review): rows are assumed to be index-aligned with `dataset` — confirm.
dataset_lang = load_dataset(
    'huseinzol05/malaysian-dialect-qa-lang',
    split='test',
)

In [3]:
# Build (row index, prompt) pairs. Every prompt is the dataset question followed
# by an instruction asking the model to wrap its final answer in \boxed{}.
questions = [
    (idx, dataset[idx]['question'] + '\n\nAfter that, put your final answer within $\\boxed{}$.')
    for idx in range(len(dataset))
]

len(questions)

140

In [4]:
# Sanity check: display the first (index, prompt) pair.
questions[0]

(0,
 'Lepas ujan jangan maen lari-lari, kan biyak.\n\nterjemah ke melayu baku\n\nAfter that, put your final answer within $\\boxed{}$.')

In [9]:
folder = 'antrophic-sonnet4-reasoning'
# Create the output directory WITHOUT wiping it first. generate_answer() skips
# any prompt whose JSON file already exists, so keeping earlier files lets an
# interrupted (or repeated) run resume instead of paying for every API call
# again. Delete the directory by hand when a completely fresh run is wanted.
os.makedirs(folder, exist_ok = True)

In [24]:
def _balanced_prefix(s):
    """Return the part of ``s`` that precedes the ``}`` closing an already-open ``{``.

    ``s`` is the text that follows an opening brace. Nested ``{...}`` pairs are
    stepped over. When no matching ``}`` exists, fall back to cutting at the
    first ``}`` (or return ``s`` unchanged if it contains none).
    """
    depth = 1
    for pos, ch in enumerate(s):
        if ch == '{':
            depth += 1
        elif ch == '}':
            depth -= 1
            if depth == 0:
                return s[:pos]
    return s.split('}')[0]


def _extract_boxed_answer(text):
    """Return the contents of the last ``boxed{...}`` in ``text``.

    A ``text{...}`` wrapper inside the box is unwrapped as well. When ``text``
    contains no ``boxed{`` marker it is returned unchanged.
    """
    marker = 'boxed{'
    if marker not in text:
        return text
    answer = _balanced_prefix(text.split(marker)[-1])
    if 'text{' in answer:
        answer = _balanced_prefix(answer.split('text{', 1)[1])
    return answer


def _response_text(message):
    """Join the ``text`` content blocks of an API response.

    Blocks are selected by their ``type`` rather than by position, so responses
    that carry extra non-text blocks (e.g. thinking / redacted thinking) are
    handled. Raises ``ValueError`` when the response has no text block.
    """
    parts = [
        block.text
        for block in message.content
        if getattr(block, 'type', None) == 'text'
    ]
    if not parts:
        raise ValueError('response contains no text block')
    return ''.join(parts)


def generate_answer(row, repeat = 5, thinking = False):
    """Sample ``repeat`` answers for one prompt and cache each one as JSON.

    Parameters
    ----------
    row : tuple
        ``(no, q)`` — the question index and the prompt text.
    repeat : int
        Number of independent samples. Sample ``k`` is written to
        ``{folder}/{no}-{k}.json``; samples whose file already exists are
        skipped, which makes the function resumable.
    thinking : bool
        When true, the request enables thinking with a 6000-token budget;
        otherwise thinking is disabled.

    Each sample is attempted up to 5 times. Any exception raised while calling
    the API, reading the response or writing the file is printed and the
    attempt is retried; if all attempts fail a message is printed and the
    function moves on to the next sample. Nothing is returned.
    """
    no, q = row
    if thinking:
        thinking_mode = {
            "type": "enabled",
            "budget_tokens": 6000
        }
    else:
        thinking_mode = {
            "type": "disabled",
        }

    client = None
    for k in range(repeat):
        filename = os.path.join(folder, f'{no}-{k}.json')
        if os.path.exists(filename):
            continue

        # Build the client once, and only if there is actually work to do.
        if client is None:
            client = anthropic.Anthropic(
                api_key=api_key,
            )

        for _ in range(5):
            # The request itself sits inside the try block so that API
            # failures are retried instead of aborting the whole run.
            try:
                message = client.messages.create(
                    model="claude-sonnet-4-20250514",
                    max_tokens=8192,
                    temperature=1,
                    messages=[
                        {
                            "role": "user",
                            "content": [
                                {
                                    "type": "text",
                                    "text": q
                                }
                            ]
                        }
                    ],
                    thinking=thinking_mode
                )
                text = _extract_boxed_answer(_response_text(message))

                # Write to a temp name and rename afterwards, so an interrupted
                # write never leaves a truncated file that would later be
                # mistaken for a finished sample.
                tmp_filename = filename + '.tmp'
                with open(tmp_filename, 'w') as fopen:
                    json.dump(text, fopen)
                os.replace(tmp_filename, filename)
                break
            except Exception as e:
                print(e)
        else:
            print(f'giving up on {filename} after 5 failed attempts')

In [29]:
# Generate answers for every (index, prompt) pair with thinking enabled.
# generate_answer() skips samples already saved on disk.
for row in tqdm(questions):
    generate_answer(row, thinking = True)

100%|███████████████████████████████████████| 140/140 [2:03:49<00:00, 53.07s/it]


In [31]:
from sacrebleu.metrics import CHRF
from glob import glob
from collections import defaultdict
import numpy as np

In [32]:
# Best-of-N chrF per question, grouped by translation direction.
# `pairs` maps '<from_lang><><to_lang>' to a list holding, for each question,
# the highest chrF score among that question's saved generations.
chrf = CHRF()
pairs = defaultdict(list)

for i in tqdm(range(len(dataset_lang))):
    row = dataset_lang[i]  # fetch the row once instead of once per field
    gt = row['answer']
    pair = f"{row['from_lang']}<>{row['to_lang']}"
    files = glob(f'antrophic-sonnet4-reasoning/{i}-*.json')
    scores = []
    for f in files:
        with open(f) as fopen:
            d = json.load(fopen)
        scores.append(chrf.corpus_score([d], [[gt]]).score)

    # max() raises ValueError on an empty list; a question without any saved
    # generation is reported and left out rather than crashing the evaluation.
    if not scores:
        tqdm.write(f'no generations found for question {i} ({pair}); skipping')
        continue
    pairs[pair].append(max(scores))

100%|███████████████████████████████████████| 140/140 [00:00<00:00, 1075.11it/s]


In [33]:
# Print the mean best-of-N score for each translation direction.
for pair_key, pair_scores in pairs.items():
    src, dst = pair_key.split('<>')
    print(f'From: {src} To: {dst}, score:', np.mean(pair_scores))

From: johor To: malay, score: 54.255257504569634
From: kedah To: malay, score: 57.449596027901556
From: pahang To: malay, score: 56.050299072477785
From: negeri sembilan To: malay, score: 54.09057071374
From: kelantan To: malay, score: 45.3943144521222
From: penang To: malay, score: 64.80883099915431
From: melaka To: malay, score: 45.44978279611746
From: malay To: johor, score: 47.94515660238058
From: malay To: kedah, score: 45.27348894658902
From: malay To: pahang, score: 47.98090699950979
From: malay To: negeri sembilan, score: 48.028256484596945
From: malay To: kelantan, score: 32.13859674498915
From: malay To: penang, score: 41.04876819683223
From: malay To: melaka, score: 52.83520776382694


In [34]:
# Macro-average over the dialect -> Malay directions, parsed from the
# per-direction lines pasted from the report printed earlier.
x = """
From: johor To: malay, score: 54.255257504569634
From: kedah To: malay, score: 57.449596027901556
From: pahang To: malay, score: 56.050299072477785
From: negeri sembilan To: malay, score: 54.09057071374
From: kelantan To: malay, score: 45.3943144521222
From: penang To: malay, score: 64.80883099915431
From: melaka To: malay, score: 45.44978279611746
"""

# Keep only lines that carry a score and parse the number after 'score: '.
scores = [
    float(line.split('score: ')[1])
    for line in x.split('\n')
    if 'score:' in line
]

np.mean(scores)

53.9283787951547

In [35]:
# Macro-average over the Malay -> dialect directions, parsed from the
# per-direction lines pasted from the report printed earlier.
x = """
From: malay To: johor, score: 47.94515660238058
From: malay To: kedah, score: 45.27348894658902
From: malay To: pahang, score: 47.98090699950979
From: malay To: negeri sembilan, score: 48.028256484596945
From: malay To: kelantan, score: 32.13859674498915
From: malay To: penang, score: 41.04876819683223
From: malay To: melaka, score: 52.83520776382694
"""

# Keep only lines that carry a score and parse the number after 'score: '.
scores = [
    float(line.split('score: ')[1])
    for line in x.split('\n')
    if 'score:' in line
]

np.mean(scores)

45.035768819817804