In [1]:
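# Start the vLLM server in a separate shell before running the cells below (they POST to localhost:8006):
# CUDA_VISIBLE_DEVICES="2,3" vllm/bin/vllm serve "mesolitica/Malaysian-Qwen2.5-7B-Dialect-Reasoning-GRPO" --port 8006 --tensor_parallel_size=2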

In [2]:
from datasets import load_dataset

In [3]:
# Test split of the dialect QA set; each row's 'question' field is used as the prompt further down.
dataset = load_dataset('huseinzol05/malaysian-dialect-qa', split = 'test')



In [4]:
# Test split of the "-lang" variant; the scoring cell reads 'from_lang', 'to_lang' and 'answer' from it.
# NOTE(review): rows are looked up by the same index used for `dataset` -- presumably the two
# sets are row-aligned; confirm.
dataset_lang = load_dataset('huseinzol05/malaysian-dialect-qa-lang', split = 'test')

In [5]:
# Pair every question with its row index; the index is what names the cached answer files.
questions = [(idx, dataset[idx]['question']) for idx in range(len(dataset))]

len(questions)

140

In [6]:
folder = 'Malaysian-Qwen2.5-7B-Reasoning-GRPO-fp16'
# !rm -rf {folder}
!mkdir {folder}

In [8]:
import requests
import os
import json
import re

def generate_answer(row, repeat = 5, max_attempts = 10, timeout = 1800):
    """
    Sample up to `repeat` boxed answers for one question and cache each on disk.

    Sample k is stored as `<folder>/<no>-<k>.json`. A sample whose file already
    exists and parses as JSON is skipped, so the function can be re-run to fill
    in only what is missing.

    Parameters
    ----------
    row : tuple
        `(no, q)`: the question index and the question text.
    repeat : int
        Number of samples to generate for the question.
    max_attempts : int or None
        Maximum number of requests per sample before giving up on it.
        `None` retries without limit.
    timeout : float
        Timeout in seconds handed to `requests.post`.

    Returns
    -------
    None
        Results are only written to disk. A sample that exhausts its attempts
        is reported with `print` and left without a file.
    """
    no, q = row
    # The backslash is optional: the system prompt asks for `$\boxed{...}$`, while the
    # pattern used previously only accepted the backslash-less `$boxed{...}$` form.
    boxed_pattern = re.compile(r"\$\\?boxed\{(.*?)\}\$")

    for k in range(repeat):
        filename = os.path.join(folder, f'{no}-{k}.json')
        try:
            with open(filename) as fopen:
                json.load(fopen)
            continue
        except (OSError, ValueError):
            # Missing or unreadable cache file -> (re)generate this sample.
            pass

        json_data = {
            'model': "mesolitica/Malaysian-Qwen2.5-7B-Dialect-Reasoning-GRPO",
            'messages': [
                {'role': 'system', 'content': 'You are going to enter reasoning mode. First, you try to think step-by-step in Malay. After that, put your final answer within $\\boxed{}$.'},
                {'role': 'user', 'content': q},
            ],
            'max_tokens': 24000,
        }

        attempt = 0
        while max_attempts is None or attempt < max_attempts:
            attempt += 1
            try:
                response = requests.post(
                    'http://localhost:8006/v1/chat/completions',
                    json=json_data,
                    timeout=timeout,
                )
                response.raise_for_status()
                r = response.json()['choices'][0]['message']['content'].strip()
            except (requests.RequestException, KeyError, IndexError, TypeError, AttributeError, ValueError) as e:
                # Transport failure or an unexpected payload: report it and retry
                # instead of letting the exception kill the calling worker thread.
                print(f'{filename}: attempt {attempt} failed: {e!r}')
                continue

            answers = boxed_pattern.findall(r)
            # Only accept a completion with exactly one boxed answer.
            if len(answers) == 1:
                with open(filename, 'w') as fopen:
                    json.dump(answers[0], fopen)
                break
        else:
            # Reached only when the attempts ran out without a `break`.
            print(f'{filename}: no single boxed answer after {max_attempts} attempts, skipping')

In [9]:
# Smoke-test on the first question before fanning out to worker threads;
# the files it writes are picked up by the cache check, so the workers skip them.
generate_answer(questions[0])

In [10]:
def consumer(queue, name):
    """
    Worker loop: take rows off `queue` and run `generate_answer` on each one
    until the queue is empty, then print a completion message.

    Parameters
    ----------
    queue : queue.Queue
        Shared work queue holding the rows to process.
    name : object
        Label used in the printed messages.
    """
    # Local import: the `queue` parameter shadows the module name inside this function.
    from queue import Empty

    while True:
        try:
            # Non-blocking get. Checking qsize() and then calling a blocking get()
            # hangs forever if another worker takes the last item in between.
            item = queue.get_nowait()
        except Empty:
            break
        try:
            generate_answer(item)
        except Exception as e:
            # One failing row must not take the whole worker down.
            print(f'consumer {name} failed on {item!r}: {e!r}')
    print(f'consumer {name} done')

In [11]:
from threading import Thread
from queue import Queue

# Shared work queue: one entry per (index, question) row.
queue = Queue()
for question_row in questions:
    queue.put(question_row)

# Remember the starting size so progress can be derived from what is left.
ori_size = queue.qsize()

In [12]:
from tqdm import tqdm

max_worker = 50
consumers = [Thread(target=consumer, args=(queue, i)) for i in range(max_worker)]
for worker in consumers:
    worker.start()

# The bar counts rows taken off the queue (not rows finished). The loop keeps going
# until every worker thread has exited, so later cells never run while generations
# are still in flight.
pbar = tqdm(total=ori_size)
last_size = 0
while True:
    alive = [worker for worker in consumers if worker.is_alive()]
    if alive:
        # Wait on a live worker with a timeout instead of busy-spinning on qsize().
        alive[0].join(timeout=0.5)

    taken = ori_size - queue.qsize()
    if taken > last_size:
        pbar.update(taken - last_size)
        last_size = taken

    # Exit only after the update above, so the final rows are reflected in the bar.
    if not alive:
        break

pbar.close()

 99%|████████████████████████████████████████████████████████████████████████████████████████████████████████▎| 139/140 [02:32<00:01,  1.10s/it]


consumer 29 done
consumer 48 done
consumer 35 done
consumer 0 done
consumer 6 done
consumer 36 done
consumer 9 done
consumer 15 done
consumer 33 done
consumer 34 done
consumer 22 done
consumer 32 done
consumer 37 done
consumer 26 done
consumer 1 done
consumer 42 done
consumer 47 done
consumer 31 done
consumer 19 done
consumer 21 done
consumer 3 done
consumer 41 done
consumer 23 done
consumer 44 done
consumer 49 done
consumer 17 done
consumer 20 done
consumer 30 done
consumer 13 done
consumer 25 done
consumer 16 done
consumer 46 done
consumer 45 done
consumer 24 done
consumer 40 done
consumer 7 done
consumer 10 done
consumer 2 done
consumer 11 done
consumer 12 done
consumer 5 done
consumer 39 done
consumer 4 done
consumer 28 done
consumer 43 done
consumer 38 done
consumer 18 done
consumer 14 done
consumer 27 done
consumer 8 done


In [13]:
from sacrebleu.metrics import CHRF
from glob import glob
from collections import defaultdict
import numpy as np

In [14]:
chrf = CHRF()
# Maps '<from_lang><><to_lang>' to the list of per-row best chrF scores.
pairs = defaultdict(list)

for i in tqdm(range(len(dataset_lang))):
    row = dataset_lang[i]  # fetch the row once instead of re-indexing per field
    from_lang = row['from_lang']
    to_lang = row['to_lang']
    gt = row['answer']
    pair = f'{from_lang}<>{to_lang}'
    # Use the same `folder` the generation step wrote to, rather than a second copy of its name.
    files = glob(os.path.join(folder, f'{i}-*.json'))
    if len(files) < 5:
        # Report rows with fewer than the expected 5 samples.
        print(i)
    if not files:
        # Nothing was generated for this row; max() of an empty list would raise.
        continue

    scores = []
    for f in files:
        with open(f) as fopen:
            d = json.load(fopen)
        score = chrf.corpus_score([d], [[gt]]).score
        scores.append(score)

    # Best-of-N: keep the highest-scoring sample for the row.
    max_score = max(scores)
    pairs[pair].append(max_score)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 140/140 [00:00<00:00, 721.17it/s]


In [15]:
# One line per translation direction: the mean of that direction's per-row best scores.
for pair_key, best_scores in pairs.items():
    source, target = pair_key.split('<>')
    print(f'From: {source} To: {target}, score:', np.mean(best_scores))

From: johor To: malay, score: 57.42949426937456
From: kedah To: malay, score: 58.12580212528728
From: pahang To: malay, score: 55.60484906845884
From: negeri sembilan To: malay, score: 56.4509629484568
From: kelantan To: malay, score: 53.944979416369996
From: penang To: malay, score: 62.20935643642939
From: melaka To: malay, score: 57.14492955494046
From: malay To: johor, score: 55.68356840259747
From: malay To: kedah, score: 56.264707994950186
From: malay To: pahang, score: 60.15982036912563
From: malay To: negeri sembilan, score: 48.71725827604103
From: malay To: kelantan, score: 43.948995049469474
From: malay To: penang, score: 63.15864675162173
From: malay To: melaka, score: 74.12398375006538


In [16]:
# Macro-average over the dialect -> malay directions, taken straight from `pairs`
# instead of a pasted copy of the printed output, which goes stale on re-run.
scores = [np.mean(v) for k, v in pairs.items() if k.split('<>')[1] == 'malay']

np.mean(scores)

57.27291054561676

In [18]:
# Macro-average over the malay -> dialect directions, taken straight from `pairs`
# instead of a pasted copy of the printed output, which goes stale on re-run.
scores = [np.mean(v) for k, v in pairs.items() if k.split('<>')[0] == 'malay']

np.mean(scores)

57.436711513410124