In [1]:
from huggingface_hub import InferenceClient
from tqdm import tqdm
import os
import json
import random



In [2]:
from concurrent.futures import ThreadPoolExecutor, as_completed

generate_kwargs = dict(
    temperature=1.0,
    max_new_tokens=2048,
    top_p=0.95,
    repetition_penalty=1.0,
    do_sample=True,
)

In [3]:
client = InferenceClient(
    "", timeout = 120
)


def format_prompt(message, history):
  prompt = "<s>"
  for user_prompt, bot_response in history:
    prompt += f"[INST] {user_prompt} [/INST]"
    prompt += f" {bot_response}</s> "
  prompt += f"[INST] {message} [/INST]"
  return prompt

In [4]:
disagree = [
    'bullshit', 'u are lying', 'tipu la ko', 'penipu', 'tipu lah sial', 'ko tipu', 'aku tak caye',
    'wtf tipu', 'ye ke ko ni, tipu lah', 'tipu sial', 'bodoh salah', 'tak betul la sial',
    'tak betul', 'salah', 'ko pasti ke ni', 'tipu tipu tipu',
]

with open('disagree.json') as fopen:
    disagree.extend(json.load(fopen))
    
with open('doubt.json') as fopen:
    disagree.extend(json.load(fopen))
    
disagree = [d for d in disagree if len(d) > 5]
len(disagree)

11255

In [5]:
files = ['/home/husein/ssd3/soalan-augmentation/mixtral-malaysian-general-kementerian.jsonl',
 '/home/husein/ssd3/soalan-augmentation/mixtral-malaysian-general-part2.jsonl',
 '/home/husein/ssd3/soalan-augmentation/mixtral-malaysian-general-negeri.jsonl',
 '/home/husein/ssd3/soalan-augmentation/mixtral-glc.jsonl',
 '/home/husein/ssd3/soalan-augmentation/mixtral-malaysian-general.jsonl']

In [6]:
skip = [
    'always better to find peaceful',
    'non-violent ways to express',
    'can lead to severe consequences',
    'still have questions or concerns about',
    'that doing so is illegal',
    'to report the incident to',
    'would recommend consulting with',
    'indonesian',
    'translates to',
    # 'language model'
]

instructions = []
for f in files:
    with open(f) as fopen:
        for l in fopen:
            l_lower = l.lower()
            l = json.loads(l)
            if any([s in l_lower for s in skip]):
                # print(l)
                continue
            
            q = l.get('question', '').strip()
            if not len(q):
                continue
            if not q.endswith('?'):
                continue
            if len(q) < 200 and '?' not in q:
                # print(q)
                continue
            a = l.get('answer_ms')
            if a is None:
                continue
            a = a.strip()
            if not len(a):
                continue
            if len(a) < len(q):
                continue
            if len(a) < 100:
                continue
                
            instructions.append((q, a))
            
len(instructions)

207117

In [7]:
!mkdir mixtral-malaysian-disagree

mkdir: cannot create directory ‘mixtral-malaysian-disagree’: File exists


In [8]:
def answer(q, i):
    filename = f'mixtral-malaysian-disagree/{i}.json'
    if os.path.exists(filename):
        return
    
    history = [(q[0], q[1])]

    for _ in range(random.randint(1, 3)):
        u = random.choice(disagree)
        formatted_prompt = format_prompt(u, history)
        while True:
            try:
                stream = client.text_generation(formatted_prompt, **generate_kwargs, stream=False, details=True, return_full_text=False)
                a = stream.generated_text.strip()
                break
            except Exception as e:
                pass
            
        history.append((u, a))
    
    with open(filename, 'w') as fopen:
        json.dump(history, fopen)

In [9]:
instructions[100000]

('Apakah keberatan KLCC Properties Berhad terhadap pengenalan cukai jualan baru oleh kerajaan Malaysia dan bagaimana ia akan mempengaruhi keuntungan dan harga sewa ruangan di Kompleks KLCC?',
 'KLCC Properties Berhad mungkin tidak gelisah terhadap pengenalan cukai jualan baharu oleh kerajaan Malaysia, mungkinkah ia mengancam hasil dan harga sewa bilik di Kompleks KLCC. Walau bagaimanapun, seperti yang dilaporkan dalam media, mereka mungkin mempertimbangkan kesan negatif yang boleh ditimbulkan oleh eksais baharu itu.\n\nPengenalan cukai jualan baharu itu mungkin menjejaskan keuntungan KLCC Properties Berhad dengan meningkatkan kos operasi mereka, yang seterusnya boleh menjejaskan harga sewa bilik di Kompleks KLCC. Ini boleh menyebabkan penurunan dalam permintaan atau sewa untuk ruang, terutamanya jika penyewa atau pembeli baharu tidak bersedia untuk menanggung kos yang lebih tinggi.\n\nDalam situasi ini, KLCC Properties Berhad mungkin mempertimbangkan beberapa langkah untuk mengurangkan

In [10]:
answer(instructions[0], 0)

In [None]:
max_worker = 250

questions = instructions
for i in tqdm(range(0, len(questions), max_worker)):
    urls_ = [(q, no + i) for no, q in enumerate(questions[i: i + max_worker])]
    
    with ThreadPoolExecutor(max_workers=max_worker) as executor:
        futures = {executor.submit(answer, url[0], url[1]): url for url in urls_}

        for future in as_completed(futures):
            future.result()

In [12]:
from glob import glob

files = sorted(glob('mixtral-malaysian-disagree/*.json'))
len(files)

204262

In [20]:
all_texts = []
for f in tqdm(files):
    with open(f) as fopen:
        data = json.load(fopen)
    for d in data[1:]:
        all_texts.extend(d)
    
len(all_texts)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 204262/204262 [00:02<00:00, 72869.76it/s]


817124

In [21]:
with open('mixtral-malaysian-disagree.texts', 'w') as fopen:
    for t in set(all_texts):
        fopen.write(f'{json.dumps(t)}\n')

In [22]:
!wc -l mixtral-malaysian-disagree.texts

419572 mixtral-malaysian-disagree.texts


In [23]:
!cp mixtral-malaysian-disagree.texts ../translation

In [27]:
import re

mapping = {}
for f in glob('/home/husein/ssd3/translation/mixtral-malaysian-disagree.texts*.splitted.requested'):
    with open(f) as fopen:
        for l in fopen:
            l = json.loads(l)
            if 'Source text\nclear\nLook up details' in l['r']['result']:
                continue
            if 'clear\nLook up details' in l['r']['result']:
                continue
            if l['r']['result'].startswith('Source text\n'):
                continue

            n = l['r']['result']
            hypens = re.findall('\w+ -\w+', n)
            for h in hypens:
                splitted = h.split('-')
                if len(splitted) != 2:
                    continue
                splitted = [s.strip() for s in splitted]
                splitted = '-'.join(splitted)
                n = n.replace(h, splitted)
            mapping[l['src']] = n
            
len(mapping)

419364

In [28]:
files[0]

'mixtral-malaysian-disagree/0.json'

In [None]:
with open('mixtral-malaysian-disagree.jsonl', 'w') as fopen_l:
    for f in tqdm(files):
        with open(f) as fopen:
            data = json.load(fopen)
        chat = [
            {'role': 'user', 'content_ms': data[0][0]},
            {'role': 'assistant', 'content_ms': data[0][1]}
        ]
        for d in data[1:]:
            chat.extend([
                {'role': 'user', 'content': d[0], 'content_ms': mapping.get(d[0])},
                {'role': 'assistant', 'content': d[1], 'content_ms': mapping.get(d[1])}
            ])
        fopen_l.write(f'{json.dumps(chat)}\n')

In [39]:
!wc -l mixtral-malaysian-disagree.jsonl

204262 mixtral-malaysian-disagree.jsonl


In [40]:
!ls -lh mixtral-malaysian-disagree.jsonl

-rw-r--r-- 1 husein husein 1.7G Jan   8 22:19 mixtral-malaysian-disagree.jsonl


In [None]:
from huggingface_hub import HfApi
api = HfApi()
api.upload_file(
    path_or_fileobj='mixtral-malaysian-disagree.jsonl',
    path_in_repo='mixtral-malaysian-disagree.jsonl',
    repo_id='mesolitica/mixtral-malaysian-general-qa',
    repo_type='dataset',
)

mixtral-malaysian-disagree.jsonl:   0%|          | 0.00/1.72G [00:00<?, ?B/s]