In [1]:
import json
import os
import time
from queue import Empty

In [2]:
# Dialect name -> Python source file that holds that dialect's word list.
# Insertion order matters: prompts and output files are numbered in this order.
files = dict([
    ('sabah', 'sabah.py'),
    ('perak', 'perak.py'),
    ('pahang', 'pahang.py'),
    ('negeri sembilan', 'negeri_sembilan.py'),
    ('melaka', 'melaka.py'),
    ('kelantan', 'kelantan_v2.py'),
    ('kedah', 'kedah.py'),
    ('johor', 'johor.py'),
])

In [3]:
# Peek at one of the dialect source files to see the `dictionary` mapping it defines.
!cat kedah.py

# http://siparangkarat.blogspot.com/2014/10/loghat-kedahpenang-dialect-dictionary.html
# normalized by husein

dictionary = {
    'aci ligan': ['main kejar'],
    'air acaq': ['air lumpur', 'becak', 'air longkang'],
    'awang': ['budak'],
    'awat': ['kenapa'],
    'pasepa': ['kenapa'],
    'bakiaq': ['susu basi'],
    'baloq liat': ['pemalas'],
    'balun': ['pukul'],
    'bambu': ['pukul'],
    'ban': ['tali air'],
    'bang': ['azan'],
    'bangkit tidoq': ['bangun tidur'],
    'basah lokoih': ['basah lencun'],
    'batas': ['jalan'],
    'beghemba': ['bersaing'],
    'belah': ['bahagian', 'dekat'],
    'belahak': ['sendawa'],
    'belemoih': ['kusut masai', 'kotor'],
    'kotoq': ['kotor'],
    'belen': ['bohsia', 'sundal', 'pelacur'],
    'seron': ['bohsia', 'sundal', 'pelacur'],
    'belengaih': ['kotor'],
    'belutin': ['kotor'],
    'berderemen': ['muka calar', 'calar balar'],
    'beretuh': ['tersentuh', 'terlanggar'],
    'beghetuh': ['tersentu

In [28]:
# Build the generation prompts: each dialect's whole dictionary source file is
# embedded in a fenced block, followed by an instruction asking for long Malay
# questions returned as JSON.
prompts = []

for k, v in files.items():
    # Read as UTF-8 explicitly; the default encoding depends on the machine's
    # locale and the dictionaries are natural-language text.
    with open(v, encoding='utf-8') as fopen:
        code = fopen.read()

    s = f"""
```
{code}
```

above is {k} language dictionary, generate long questions related to the dictionary in malay, return in JSON [{{'question'}}]
    """.strip()
    # The same prompt is queued 50 times per dialect; every copy becomes its
    # own request. Keep this count in step with the cell that builds `data`,
    # which relies on the same index layout.
    prompts.extend([s] * 50)

In [29]:
# Sanity check: 8 dialect files x 50 copies each should give 400 prompts.
len(prompts)

400

In [30]:
# Start from an empty output directory. answer() skips any index whose JSON
# file already exists, so leftovers from an earlier run would be reused as-is.
!rm -rf language-qa
!mkdir language-qa

In [31]:
from openai import OpenAI
import requests

# HTTP headers sent with every chat-completion request: JSON in, JSON out.
headers = dict.fromkeys(('accept', 'Content-Type'), 'application/json')

def answer(q, i, max_retries=10, backoff=2.0):
    """
    Request one completion for prompt `q` and cache it in `language-qa/{i}.json`.

    The output file holds a one-element JSON list with the generated text.
    If the file already exists nothing is requested, so an interrupted run
    can be resumed without repeating finished prompts.

    Parameters
    ----------
    q : str
        Prompt text, sent as the single user message.
    i : int
        Index of the prompt; only used to name the output file.
    max_retries : int, optional
        Maximum number of request attempts before giving up on this prompt.
    backoff : float, optional
        Base delay in seconds between attempts. The delay doubles after each
        failure and is capped at 60 seconds.

    Returns
    -------
    None
        When every attempt fails, no file is written, so a later run will
        try this prompt again.
    """
    filename = f'language-qa/{i}.json'
    if os.path.exists(filename):
        return

    # The payload does not change between attempts, so build it once.
    json_data = {
        'messages': [
            {
                'role': 'user',
                'content': q,
            },
        ],
        'model': 'model',
        'stop': [
            '<|eot_id|>',
        ],
        'temperature': 0.9,
        'max_tokens': 2048,
    }

    results = []
    last_error = None
    for attempt in range(max_retries):
        try:
            response = requests.post(
                'https://llama-3.us.mesolitica.com/v1/chat/completions',
                headers=headers, json=json_data, timeout=60 * 10)
            # Treat HTTP error statuses as failures instead of trying to parse them.
            response.raise_for_status()
            r = response.json()['choices'][0]['message']['content']
            results.append(r)
            break
        except (requests.RequestException, KeyError, IndexError, TypeError, ValueError) as e:
            # Network errors, bad statuses, and malformed or unexpected JSON bodies.
            last_error = e
            if attempt + 1 < max_retries:
                # Back off so a struggling server is not hammered by every worker.
                time.sleep(min(backoff * 2 ** attempt, 60))

    if not results:
        print(f'prompt {i}: giving up after {max_retries} attempts, last error: {last_error!r}')
        return

    # Write to a temporary name and rename, so a crash mid-write cannot leave
    # a truncated file that the exists-check above would treat as finished.
    tmp_filename = f'{filename}.tmp'
    with open(tmp_filename, 'w') as fopen:
        json.dump(results, fopen)
    os.replace(tmp_filename, filename)

In [32]:
def consumer(queue, name):
    """
    Worker loop: take jobs off `queue` and pass each one to `answer`.

    Parameters
    ----------
    queue : queue.Queue
        Queue of argument tuples; each item is unpacked into `answer(*item)`.
    name : object
        Label for this worker, used only in the final "done" message.

    Returns
    -------
    None
        Returns once the queue has no more items to hand out.
    """
    while True:
        try:
            # Take without blocking. Checking qsize() first and then calling a
            # blocking get() lets another thread grab the last item in
            # between, which would leave this worker waiting forever.
            item = queue.get_nowait()
        except Empty:
            break
        answer(*item)
    print(f'consumer {name} done')

In [33]:
# Pair every prompt with its position; the position names the output file.
urls = list(zip(prompts, range(len(prompts))))

In [34]:
# Smoke test: run a single job end to end before starting the worker threads.
first_prompt, first_index = urls[0]
answer(first_prompt, first_index)

In [27]:
# Eyeball the first rendered prompt ('sabah' is the first entry in `files`).
prompts[0]

"```\n# http://blogserius.blogspot.com/2011/11/serius-fakta-mari-belajar-bahasa-sabah.html\n# normalized by husein\n\ndictionary = {\n    'iya bah': ['iya lah'],\n    'bah, pigi la kamu dulu': ['pergilah dulu'],\n    'bah': ['baiklah'],\n    'aisbuk': ['peti sejuk'],\n    'akun': ['menyerah', 'setuju', 'mengaku'],\n    'ampai': ['letak'],\n    'taruk': ['letak'],\n    'ampus': ['lelah', 'asma'],\n    'amput': ['bersetubuh', 'berzina'],\n    'antam': ['pukul'],\n    'bantut': ['pondan', 'mak nyah', 'bapok'],\n    'pundan': ['pondan', 'mak nyah', 'bapok'],\n    'bangas': ['basi'],\n    'bengali': ['orang bangladesh'],\n    'berusil': ['berlari'],\n    'burusil': ['berlari'],\n    'bertagar': ['berkarat'],\n    'bida': ['buruk'],\n    'biut': ['senget', 'sesuatu yang tidak lurus'],\n    'bubut': ['kejar'],\n    'buyung': ['zakar', 'penis'],\n    'borot': ['zakar', 'penis'],\n    'tontolou': ['zakar', 'penis'],\n    'budu': ['bodoh'],\n    'busung': ['tulah', 'ketulahan'],\n    'celana': [

In [35]:
from threading import Thread
from queue import Queue
from tqdm import tqdm

# Shared work queue: one (prompt, output index) job per entry of `urls`.
queue = Queue()
for job in urls:
    queue.put(job)

# Remember the starting depth so progress can be derived from what is left.
ori_size = queue.qsize()

In [36]:
# Start the worker threads and show progress until every one has finished.
max_worker = 50
consumers = [Thread(target=consumer, args=(queue, i)) for i in range(max_worker)]
for worker in consumers:
    worker.start()

pbar = tqdm(total=ori_size)
while True:
    # Sample the live workers first, then the queue. Each live worker holds
    # at most one job, so (jobs taken - live workers) never overstates how
    # many jobs are finished.
    alive = sum(worker.is_alive() for worker in consumers)
    taken = ori_size - queue.qsize()
    finished = taken - alive
    if finished > pbar.n:
        pbar.update(finished - pbar.n)
    if alive == 0:
        break
    # Poll at a modest rate instead of spinning a CPU core while waiting.
    time.sleep(0.5)

# An empty queue only means the last jobs were handed out; join so the
# in-flight requests are done before any later cell reads the output files.
for worker in consumers:
    worker.join()

pbar.update(ori_size - pbar.n)
pbar.close()

100%|██████████████████████████████████████████████████████████████████████████████▊| 399/400 [31:48<00:04,  4.78s/it]


consumer 12 done
consumer 48 done
consumer 35 done
consumer 1 done
consumer 40 done
consumer 38 done
consumer 37 done
consumer 39 done
consumer 14 done
consumer 44 done
consumer 31 done
consumer 10 done
consumer 30 done
consumer 16 done
consumer 45 done
consumer 29 done
consumer 0 done
consumer 22 done
consumer 43 done
consumer 23 done
consumer 41 done
consumer 11 done
consumer 4 done
consumer 6 done
consumer 17 done
consumer 27 done
consumer 28 done
consumer 46 done
consumer 3 done
consumer 32 done
consumer 19 done
consumer 20 done
consumer 2 done
consumer 47 done
consumer 5 done
consumer 7 done
consumer 42 done
consumer 24 done
consumer 34 done
consumer 8 done
consumer 9 done
consumer 33 done
consumer 26 done
consumer 49 done
consumer 21 done
consumer 18 done
consumer 25 done
consumer 13 done
consumer 15 done
consumer 36 done


In [38]:
# Rebuild, per output slot, the dictionary source and the prompt that was
# sent, so that data[i] lines up with language-qa/{i}.json.
data = []

for k, v in files.items():
    # Read as UTF-8 explicitly; the default encoding depends on the machine's
    # locale and the dictionaries are natural-language text.
    with open(v, encoding='utf-8') as fopen:
        code = fopen.read()

    s = f"""
```
{code}
```

above is {k} language dictionary, generate long questions related to the dictionary in malay, return in JSON [{{'question'}}]
    """.strip()
    # Create 50 separate dicts. `[record] * 50` would place the same dict
    # object in all 50 slots, so an in-place update such as
    # `data[i]['generated'] = ...` would show up on every copy.
    data.extend({'code': code, 'prompt': s} for _ in range(50))

len(data)

400

In [45]:
# Merge each cached generation into its record and emit one JSON object per
# line; slots whose output file is missing are left out of the JSONL.
with open('generated-dialect-question.jsonl', 'w') as fopen_l:
    for i, record in enumerate(data):
        filename = f'language-qa/{i}.json'
        if os.path.exists(filename):
            with open(filename) as fopen:
                record['generated'] = json.load(fopen)
            fopen_l.write(json.dumps(record) + '\n')

In [46]:
from huggingface_hub import HfApi
# Upload the merged JSONL to the dataset repo, keeping the same file name.
jsonl_name = 'generated-dialect-question.jsonl'
api = HfApi()
api.upload_file(
    repo_type='dataset',
    repo_id='mesolitica/llama3-70b-language-qa',
    path_in_repo=jsonl_name,
    path_or_fileobj=jsonl_name,
)

CommitInfo(commit_url='https://huggingface.co/datasets/mesolitica/llama3-70b-language-qa/commit/14dbd8f7172823d70637377fecb08646e5f36674', commit_message='Upload generated-dialect-question.jsonl with huggingface_hub', commit_description='', oid='14dbd8f7172823d70637377fecb08646e5f36674', pr_url=None, pr_revision=None, pr_num=None)