In [None]:
# CUDA_VISIBLE_DEVICES="4,5,6" vllm/bin/vllm serve "mesolitica/Malaysian-Qwen2.5-7B-Dialect-Reasoning-GRPO" --dtype float32 --port 8007 --tensor_parallel_size=3

In [1]:
from datasets import load_dataset

In [4]:
dataset = load_dataset('huseinzol05/malaysian-dialect-qa', split = 'test')

In [2]:
dataset_lang = load_dataset('huseinzol05/malaysian-dialect-qa-lang', split = 'test')



In [5]:
questions = []
for i in range(len(dataset)):
    q = dataset[i]['question']
    questions.append((i, q))
    
len(questions)

140

In [9]:
folder = 'Malaysian-Qwen2.5-7B-Reasoning-GRPO-fp32'
# !rm -rf {folder}
!mkdir {folder}

mkdir: cannot create directory ‘Malaysian-Qwen2.5-7B-Reasoning-GRPO-fp32’: File exists


In [10]:
import requests
import os
import json
import re

def generate_answer(row, repeat = 5):
    no, q = row
    for k in range(repeat):
        filename = os.path.join(folder, f'{no}-{k}.json')
        try:
            with open(filename) as fopen:
                json.load(fopen)
            continue
        except:
            pass

        json_data = {
            'model': "mesolitica/Malaysian-Qwen2.5-7B-Dialect-Reasoning-GRPO",
            'messages': [
                {'role': 'system', 'content': 'You are going to enter reasoning mode. First, you try to think step-by-step in Malay. After that, put your final answer within $\\boxed{}$.'},
                {'role': 'user', 'content': q},
            ],
            'max_tokens': 24000,
        }
        
        while True:
            response = requests.post('http://localhost:8007/v1/chat/completions', json=json_data)
            r = response.json()['choices'][0]['message']['content'].strip()
            answers = re.findall(r"\$boxed\{(.*?)\}\$", r)
            if len(answers) == 1:
                a = answers[0]
                with open(filename, 'w') as fopen:
                    json.dump(a, fopen)
                    break

In [8]:
generate_answer(questions[0])

In [11]:
def consumer(queue, name):
    while True:
        if queue.qsize() == 0:
            break
        item = queue.get()
        generate_answer(item)
    print(f'consumer {name} done')

In [12]:
from threading import Thread
from queue import Queue

queue = Queue()
for u in questions:
    queue.put(u)
    
ori_size = queue.qsize()

In [13]:
from tqdm import tqdm

max_worker = 50
consumers = [Thread(target=consumer, args=(queue,i)) for i in range(max_worker)]
for i in range(len(consumers)):
    consumers[i].start()
    
pbar = tqdm(total=ori_size)
last_size = 0
while True:
    size = queue.qsize()
    if size == 0:
        break
    left = ori_size - size
    minus = left - last_size
    if minus > 0:
        pbar.update(minus)
        last_size += minus

pbar.close()

 99%|████████████████████████████████████████████████████████████████████████████████████████████████████████▎| 139/140 [06:36<00:02,  2.86s/it]


consumer 37 done
consumer 45 done


In [15]:
from sacrebleu.metrics import CHRF
from glob import glob
from collections import defaultdict
import numpy as np

consumer 15 done
consumer 42 done


In [48]:
chrf = CHRF()
pairs = defaultdict(list)

for i in tqdm(range(len(dataset_lang))):
    from_lang = dataset_lang[i]['from_lang']
    to_lang = dataset_lang[i]['to_lang']
    gt = dataset_lang[i]['answer']
    pair = f'{from_lang}<>{to_lang}'
    files = glob(f'Malaysian-Qwen2.5-7B-Reasoning-GRPO-fp32/{i}-*.json')
    if len(files) < 5:
        print(i, len(files))
    scores = []
    for f in files:
        with open(f) as fopen:
            d = json.load(fopen)
        score = chrf.corpus_score([d], [[gt]]).score
        scores.append(score)

    max_score = max(scores)
    pairs[pair].append(max_score)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 140/140 [00:00<00:00, 782.52it/s]


In [49]:
for k, v in pairs.items():
    l, r = k.split('<>')
    print(f'From: {l} To: {r}, score:', np.mean(v))

From: johor To: malay, score: 58.2189619529139
From: kedah To: malay, score: 59.21260384746205
From: pahang To: malay, score: 53.506270589822165
From: negeri sembilan To: malay, score: 56.94870448682657
From: kelantan To: malay, score: 50.64768195652429
From: penang To: malay, score: 62.964413639258034
From: melaka To: malay, score: 56.24541676643081
From: malay To: johor, score: 54.83246740931249
From: malay To: kedah, score: 59.069394967356274
From: malay To: pahang, score: 59.695207458023745
From: malay To: negeri sembilan, score: 50.69885056697714
From: malay To: kelantan, score: 44.66310165425512
From: malay To: penang, score: 65.39795752468879
From: malay To: melaka, score: 72.39183991789344


In [52]:
x = """
From: johor To: malay, score: 58.2189619529139
From: kedah To: malay, score: 59.21260384746205
From: pahang To: malay, score: 53.506270589822165
From: negeri sembilan To: malay, score: 56.94870448682657
From: kelantan To: malay, score: 50.64768195652429
From: penang To: malay, score: 62.964413639258034
From: melaka To: malay, score: 56.24541676643081
"""

scores = []
for l in x.split('\n'):
    if 'score:' not in l:
        continue
    
    scores.append(float(l.split('score: ')[1]))
    
np.mean(scores)

56.82057903417684

In [53]:
x = """
From: malay To: johor, score: 54.83246740931249
From: malay To: kedah, score: 59.069394967356274
From: malay To: pahang, score: 59.695207458023745
From: malay To: negeri sembilan, score: 50.69885056697714
From: malay To: kelantan, score: 44.66310165425512
From: malay To: penang, score: 65.39795752468879
From: malay To: melaka, score: 72.39183991789344
"""

scores = []
for l in x.split('\n'):
    if 'score:' not in l:
        continue
    
    scores.append(float(l.split('score: ')[1]))
    
np.mean(scores)

58.10697421407243