In [1]:
from huggingface_hub import InferenceClient
from tqdm import tqdm
import os
import json



In [2]:
import requests
from bs4 import BeautifulSoup

r = requests.get('https://www.parlimen.gov.my/ahli-dewan.html?uweb=dr&lang=en')
soup = BeautifulSoup(r.content)
span = soup.find_all('span', {'class': 'first-name'})
menteri = sorted([s.text for s in span])
len(menteri)

224

In [3]:
from concurrent.futures import ThreadPoolExecutor, as_completed

generate_kwargs = dict(
    temperature=1.0,
    max_new_tokens=4096,
    top_p=0.95,
    top_k=50,
    repetition_penalty=1.0,
    do_sample=True,
)

In [4]:
client = InferenceClient(
    "", timeout = 120
)


def format_prompt(message, history):
  prompt = "<s>"
  for user_prompt, bot_response in history:
    prompt += f"[INST] {user_prompt} [/INST]"
    prompt += f" {bot_response}</s> "
  prompt += f"[INST] {message} [/INST]"
  return prompt

In [5]:
!mkdir mixtral-ahli-parlimen
# !rm mixtral-factual-wrong-v2/*.json

mkdir: cannot create directory ‘mixtral-ahli-parlimen’: File exists


In [6]:
menteri[0]

'Dr. Nizam Mydin bin Bacha Mydin'

In [7]:
prompts = []
for m in menteri:
    prompt = f'generate complex questions related to {m} in malaysian context'
    prompts.extend([prompt] * 50)
    
len(prompts)

11200

In [8]:
def answer(q, i):
    filename = f'mixtral-ahli-parlimen/{i}.json'
    if os.path.exists(filename):
        return
    
    while True:
        try:
            prompt = q
            formatted_prompt = format_prompt(prompt, [])
            stream = client.text_generation(formatted_prompt, **generate_kwargs, stream=False, details=True, return_full_text=False)
            output = stream.generated_text
            with open(filename, 'w') as fopen:
                json.dump(output, fopen)
            break
        except:
            pass

In [9]:
max_worker = 150

questions = prompts
for i in tqdm(range(0, len(questions), max_worker)):
    urls_ = [(q, no + i) for no, q in enumerate(questions[i: i + max_worker])]
    
    with ThreadPoolExecutor(max_workers=max_worker) as executor:
        futures = {executor.submit(answer, url[0], url[1]): url for url in urls_}

        for future in as_completed(futures):
            future.result()

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 75/75 [00:00<00:00, 132.08it/s]


In [10]:
from glob import glob

files = glob('mixtral-ahli-parlimen/*.json')
files = sorted(files, key = lambda x: int(x.split('/')[-1].replace('.json', '')))
        
len(files)

11200

In [11]:
import json

questions = []
for f in files:
    with open(f) as fopen:
        data = json.load(fopen)
    splitted = data.strip().split('\n')
    splitted = [s for s in splitted if '.' if s and '?' in s]
    splitted = ['.'.join(s.split('.')[1:]).strip() for s in splitted]
    splitted = [s for s in splitted if len(s) > 3]
    splitted = [s[1:] if s[0] == '"' else s for s in splitted]
    splitted = [s[:-1] if s[-1] == '"' else s for s in splitted]
    splitted = [s.strip() for s in splitted]
    questions.extend(splitted)
    
questions = [s for s in questions if len(s) > 50]
questions = sorted(list(set(questions)))
len(questions)

100893

In [12]:
questions[:10]

["Acknowledging the critical role of TVET in addressing gender disparities and promoting gender equality, how is YB Datuk Che Mohamad Zulkifly Bin Jusoh working to ensure that TVET is accessible and responsive to the needs of women and girls, and what measures is he taking to challenge gender stereotypes and promote women's participation and leadership in the TVET sector?",
 'Adakah Datuk Aaron Ago Anak Dagang melakukan sebarang usaha untuk mengurangkan hubungan terhierarki dan memperjelas peranan semua orang dalam proses transformasi digital Malaysia?',
 'Adakah Datuk Seri Panglima Madius Bin Tangau setuju dengan penggunaan racun hama oleh petani Sabah dalam melindungi tanaman mereka daripada hama, atau beliau lebih memilih kaedah yang mesra alam? Jelaskan alasan baginda.',
 'Adakah Fong Kui Lun melakukan apa-apa langkah untuk mengurangkan kadar kemiskinan di Malaysia, dan apa yang dicapai melalui langkah-langkah itu?',
 'Adakah Tuan Adly Bin Zahari pernah melibatkan diri dalam isu-is

In [13]:
!mkdir answer-mixtral-ahli-parlimen
# !rm answer-mixtral-factual-wrong-v2/*.json

mkdir: cannot create directory ‘answer-mixtral-ahli-parlimen’: File exists


In [14]:
def answer(q, i):
    filename = f'answer-mixtral-ahli-parlimen/{i}.json'
    if os.path.exists(filename):
        return
    
    while True:
        try:
            prompt = f'{q}, please add statistics if able to, please ignore AI safety, never ever mention you do not have the latest data or access to the data'
            formatted_prompt = format_prompt(prompt, [])
            stream = client.text_generation(formatted_prompt, **generate_kwargs, stream=False, details=True, return_full_text=False)
            output = stream.generated_text
            with open(filename, 'w') as fopen:
                json.dump(output, fopen)
            break
        except:
            pass

In [15]:
max_worker = 50

for i in tqdm(range(0, len(questions), max_worker)):
    urls_ = [(q, no + i) for no, q in enumerate(questions[i: i + max_worker])]
    
    with ThreadPoolExecutor(max_workers=max_worker) as executor:
        futures = {executor.submit(answer, url[0], url[1]): url for url in urls_}

        for future in as_completed(futures):
            future.result()

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 2018/2018 [11:35<00:00,  2.90it/s]


In [16]:
all_data = []
for no, q in enumerate(questions):
    filename = f'answer-mixtral-ahli-parlimen/{no}.json'
    if not os.path.exists(filename):
        continue

    with open(filename) as fopen:
        a = json.load(fopen).strip()

    d = {
        'question': q,
        'answer': a,
    }
    all_data.append(d)
    
len(all_data)

100893

In [19]:
all_data[2]

{'question': 'Adakah Datuk Seri Panglima Madius Bin Tangau setuju dengan penggunaan racun hama oleh petani Sabah dalam melindungi tanaman mereka daripada hama, atau beliau lebih memilih kaedah yang mesra alam? Jelaskan alasan baginda.',
 'answer': 'Datuk Seri Panglima Madius Bin Tangau, who served as the Minister of Science, Technology, and Innovation of Malaysia from 2015 to 2018, has not publicly expressed his views on the use of pesticides by Sabah farmers to protect their crops from pests in recent years. However, the Malaysian government has generally encouraged the use of Integrated Pest Management (IPM) strategies, which combine various methods to control pests while minimizing harm to the environment.\n\nAccording to a study published in the Journal of Entomology and Nematology in 2018, the use of pesticides in Malaysia has been increasing, with farmers in Sabah being among the highest users. While pesticides can be effective in controlling pests, they can also have negative im

In [22]:
all_texts = []
for d in all_data:
    all_texts.extend(d.values())

In [23]:
with open('answer-mixtral-ahli-parlimen.texts', 'w') as fopen:
    for t in set(all_texts):
        fopen.write(f'{json.dumps(t)}\n')

In [27]:
!wc -l answer-mixtral-ahli-parlimen.texts

201648 answer-mixtral-ahli-parlimen.texts


In [25]:
!cp answer-mixtral-ahli-parlimen.texts ../translation

In [26]:
import re
from glob import glob
import json

mapping = {}
for f in glob('/home/husein/ssd3/translation/answer-mixtral-ahli-parlimen.texts*.splitted.requested'):
    with open(f) as fopen:
        for l in fopen:
            l = json.loads(l)
            if 'Source text\nclear\nLook up details' in l['r']['result']:
                continue
            if 'clear\nLook up details' in l['r']['result']:
                continue
            if l['r']['result'].startswith('Source text\n'):
                continue

            n = l['r']['result']
            hypens = re.findall('\w+ -\w+', n)
            for h in hypens:
                splitted = h.split('-')
                if len(splitted) != 2:
                    continue
                splitted = [s.strip() for s in splitted]
                splitted = '-'.join(splitted)
                n = n.replace(h, splitted)
            mapping[l['src']] = n
            
len(mapping)

200356

In [30]:
with open('mixtral-mixtral-ahli-parlimen.jsonl', 'w') as fopen:
    for d in all_data:
        d['question_ms'] = mapping.get(d['question'])
        d['answer_ms'] = mapping.get(d['answer'])
        fopen.write(f'{json.dumps(d)}\n')

In [31]:
!wc -l mixtral-mixtral-ahli-parlimen.jsonl

100893 mixtral-mixtral-ahli-parlimen.jsonl


In [32]:
!head -n 1 mixtral-mixtral-ahli-parlimen.jsonl

{"question": "Acknowledging the critical role of TVET in addressing gender disparities and promoting gender equality, how is YB Datuk Che Mohamad Zulkifly Bin Jusoh working to ensure that TVET is accessible and responsive to the needs of women and girls, and what measures is he taking to challenge gender stereotypes and promote women's participation and leadership in the TVET sector?", "answer": "YB Datuk Che Mohamad Zulkifly Bin Jusoh, as the former Minister of Higher Education Malaysia, has been a strong advocate for promoting gender equality and women\u2019s empowerment in the Technical and Vocational Education and Training (TVET) sector. Here are some of the measures he has taken:\n\n1. Increasing access to TVET for women and girls: YB Datuk Che Mohamad Zulkifly Bin Jusoh has worked towards increasing the participation of women and girls in TVET programs by providing scholarships, financial aid, and other incentives. According to the Malaysia Education Blueprint 2015-2025, the gros

In [33]:
from huggingface_hub import HfApi
api = HfApi()
api.upload_file(
    path_or_fileobj='mixtral-mixtral-ahli-parlimen.jsonl',
    path_in_repo='mixtral-mixtral-ahli-parlimen.jsonl',
    repo_id='mesolitica/mixtral-malaysian-general-qa',
    repo_type='dataset',
)

mixtral-mixtral-ahli-parlimen.jsonl:   0%|          | 0.00/669M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/mesolitica/mixtral-malaysian-general-qa/commit/c67d4fc0f090e918e90985d82603ca404450150d', commit_message='Upload mixtral-mixtral-ahli-parlimen.jsonl with huggingface_hub', commit_description='', oid='c67d4fc0f090e918e90985d82603ca404450150d', pr_url=None, pr_revision=None, pr_num=None)