In [1]:
from huggingface_hub import InferenceClient
from tqdm import tqdm
import os
import json



In [2]:
from concurrent.futures import ThreadPoolExecutor, as_completed

generate_kwargs = dict(
    temperature=1.0,
    max_new_tokens=4096,
    top_p=0.95,
    repetition_penalty=1.0,
    do_sample=True,
)

In [3]:
client = InferenceClient(
    "https://mixtral.us-west-2.mesolitica.com", timeout = 120
)


def format_prompt(message, history):
  prompt = "<s>"
  for user_prompt, bot_response in history:
    prompt += f"[INST] {user_prompt} [/INST]"
    prompt += f" {bot_response}</s> "
  prompt += f"[INST] {message} [/INST]"
  return prompt

In [5]:
files = [
    'maktabahalbakri.com.jsonl',
    'muftiwp.gov.my.jsonl',
    'asklegal.jsonl',
    'dewanbahasa-jdbp.jsonl',
    'gov.my.jsonl',
    'patriots.jsonl',
    'rootofscience.jsonl',
    'majalahsains.jsonl',
    'nasilemaktech.com.jsonl',
    'alhijrahnews-articles.jsonl',
]

In [6]:
def partition(text, size = 2000):
    splitted = text.split()
    return [' '.join(splitted[i: i + size]) for i in range(0, len(splitted), size) if len(splitted[i: i + size]) > 500]

In [7]:
import math

all_texts = []
for f in files:
    texts = []
    with open(f) as fopen:
        for l in fopen:
            l = json.loads(l)
            if isinstance(l, dict):
                l = l['body']
            if len(l.split()) < 150:
                continue
            texts.append(l)
    if len(texts) > 10000:
        factor = 1
    else:
        factor = 3
    print(f, len(texts), factor)
    texts = texts * factor
    all_texts.extend(texts)
len(all_texts)

maktabahalbakri.com.jsonl 3365 3
muftiwp.gov.my.jsonl 3669 3
asklegal.jsonl 1231 3
dewanbahasa-jdbp.jsonl 4154 3
gov.my.jsonl 21097 1
patriots.jsonl 10420 1
rootofscience.jsonl 1138 3
majalahsains.jsonl 1471 3
nasilemaktech.com.jsonl 4376 3
alhijrahnews-articles.jsonl 7838 3


113243

In [8]:
with open('wikipedia-2023-10-01.jsonl') as fopen:
    for l in fopen:
        l = json.loads(l)
        for p in partition(l):
            all_texts.append(p)
len(all_texts)

131001

In [9]:
prompts = []
for t in all_texts:
    prompt = f'{t}\n\ngenerate factual wrong and confusing questions ONLY to trick people based on context above'
    prompts.extend([(t, prompt)] * 1)
    
len(prompts)

131001

In [10]:
prompts[0]

('-33%\n\n\n\n\n \n\nSiri Tasawuf\nAhasin Al Kalim Syarah Hikam Jilid 2\n\n\nRM\n30.00\n \nRM\n20.00\nTambah Troli\n\n\n\n\n \n\neBook Siri umum\n[eBook] Inilah Doaku\n\n\nRM\n5.00\nTambah Troli\n\n\n\n\n \n\nSiri Umum\n[eBook] Kalimah Allah Edisi Kemaskini\n\n\nRM\n10.00\nTambah Troli\n\n\n\n\n \n\nBuku Fizikal\nKeajaiban Sedekah\n\n\nRM\n25.00\nTambah Troli\n\n\n\n\n \n\nEbook\neBook Mereka yang Pemurah\nRated \n5.00\n out of 5\n\n\nRM\n10.00\nTambah Troli\n-33%\n\n\n\n\n \n\neBook Sirah\n[eBook] – Garis Masa Hayat Al Rasul SAW Edisi Pertama\n\n\nRM\n30.00\n \nRM\n20.00\nTambah Troli\n-20%\n\n\n\n\n \n\nBuku Fizikal\nPanduan Lengkap Ke Arah Kesempurnaan Ibadah Haji dan Umrah\n\n\nRM\n100.00\n \nRM\n80.00\nTambah Troli\n\n\n\n\n \n\nKajian Ilmiah\nSahabat Nabi SAW yang bernama Tha’labah\n\n\nRM\n5.00\nTambah Troli\nSoalan\nApakah dalil daripada sunnah yang menunjukkan Rasulullah SAW selalu berbekam?\nJawapan\nAlhamdulillah, puji dan syukur kepada Ilahi atas pelbagai kurniaan dan anuge

In [11]:
!mkdir mixtral-rag-question-factually-wrong
# !rm mixtral-rag-question-factually-wrong/*.json

mkdir: cannot create directory ‘mixtral-rag-question-factually-wrong’: File exists


In [12]:
def answer(q, i):
    filename = f'mixtral-rag-question-factually-wrong/{i}.json'
    if os.path.exists(filename):
        return
    
    
    for _ in range(3):
        try:
            prompt = q[1]
            formatted_prompt = format_prompt(prompt, [])
            stream = client.text_generation(formatted_prompt, **generate_kwargs, stream=False, details=True, return_full_text=False)
            output = stream.generated_text
            splitted = output.split('\n')
            splitted = [s for s in splitted if len(s) > 3]
            
            if len(splitted) < 4:
                continue
                
            with open(filename, 'w') as fopen:
                json.dump((q[0], output), fopen)
            break
        except Exception as e:
            # print(e)
            pass

In [13]:
answer(prompts[1], 1)

In [14]:
def consumer(queue, name):
    while True:
        if queue.qsize() == 0:
            break
        item = queue.get()
        answer(*item)
    print(f'consumer {name} done')

In [15]:
urls = [(q, no) for no, q in enumerate(prompts)]

In [16]:
from threading import Thread
from queue import Queue

queue = Queue()
for u in urls:
    queue.put(u)
    
ori_size = queue.qsize()

In [17]:
max_worker = 256
consumers = [Thread(target=consumer, args=(queue,i)) for i in range(max_worker)]
for i in range(len(consumers)):
    consumers[i].start()
    
pbar = tqdm(total=ori_size)
last_size = 0
while True:
    size = queue.qsize()
    if size == 0:
        break
    left = ori_size - size
    minus = left - last_size
    if minus > 0:
        pbar.update(minus)
        last_size += minus

pbar.close()

 88%|███████████████████████████████████████████████████████████████████████████████████████████▌            | 115385/131001 [7:34:44<4:07:09,  1.05it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [18]:
for i in range(len(consumers)):
    consumers[i].join()

consumer 163 done
consumer 78 done
consumer 19 done
consumer 207 done
consumer 34 done
consumer 32 done
consumer 233 done
consumer 61 done
consumer 121 done
consumer 75 done
consumer 58 done
consumer 45 done
consumer 196 done
consumer 114 done
consumer 92 done
consumer 99 done
consumer 200 done
consumer 40 done
consumer 70 done
consumer 89 done
consumer 51 done
consumer 172 done
consumer 102 done
consumer 128 done
consumer 210 done
consumer 138 done
consumer 140 done
consumer 28 done
consumer 93 done
consumer 57 done
consumer 243 done
consumer 95 done
consumer 59 done
consumer 101 done
consumer 22 done
consumer 4 done
consumer 156 done
consumer 68 done
consumer 171 done
consumer 247 done
consumer 86 done
consumer 129 done
consumer 124 done
consumer 204 done
consumer 154 done
consumer 130 done
consumer 97 done
consumer 238 done
consumer 227 done
consumer 219 done
consumer 142 done
consumer 36 done
consumer 85 done
consumer 174 done
consumer 74 done
consumer 165 done
consumer 6 done
cons