In [1]:
from huggingface_hub import InferenceClient
from tqdm import tqdm
import os
import json



In [2]:
from concurrent.futures import ThreadPoolExecutor, as_completed

generate_kwargs = dict(
    temperature=1.0,
    max_new_tokens=4096,
    top_p=0.95,
    repetition_penalty=1.0,
    do_sample=True,
)

In [3]:
client = InferenceClient(
    "", timeout = 120
)


def format_prompt(message, history):
  prompt = "<s>"
  for user_prompt, bot_response in history:
    prompt += f"[INST] {user_prompt} [/INST]"
    prompt += f" {bot_response}</s> "
  prompt += f"[INST] {message} [/INST]"
  return prompt

In [4]:
!mkdir mixtral-conversation-disagree
# !rm mixtral-factual-wrong-v2/*.json

mkdir: cannot create directory ‘mixtral-conversation-disagree’: File exists


In [5]:
# !wget https://huggingface.co/datasets/malaysia-ai/dedup-text-dataset/resolve/main/amanz.jsonl
# !wget https://huggingface.co/datasets/malaysia-ai/dedup-text-dataset/resolve/main/asklegal.jsonl
# !wget https://huggingface.co/datasets/malaysia-ai/dedup-text-dataset/resolve/main/autobuzz.jsonl
# !wget https://huggingface.co/datasets/malaysia-ai/dedup-text-dataset/resolve/main/bidadari.jsonl
# !wget https://huggingface.co/datasets/malaysia-ai/dedup-text-dataset/resolve/main/blog.limkitsiang.com.jsonl
# !wget https://huggingface.co/datasets/malaysia-ai/dedup-text-dataset/resolve/main/blogtipskerjaya.net.jsonl
# !wget https://huggingface.co/datasets/malaysia-ai/dedup-text-dataset/resolve/main/buro247.jsonl
# !wget https://huggingface.co/datasets/malaysia-ai/dedup-text-dataset/resolve/main/doctoroncall.jsonl
# !wget https://huggingface.co/datasets/malaysia-ai/dedup-text-dataset/resolve/main/ecentral.my.jsonl
# !wget https://huggingface.co/datasets/malaysia-ai/dedup-text-dataset/resolve/main/gov.my.jsonl
# !wget https://huggingface.co/datasets/malaysia-ai/dedup-text-dataset/resolve/main/majalahsains.jsonl
# !wget https://huggingface.co/datasets/malaysia-ai/dedup-text-dataset/resolve/main/majalahpama.my.jsonl
# !wget https://huggingface.co/datasets/malaysia-ai/dedup-text-dataset/resolve/main/mygameon.my.jsonl
# !wget https://huggingface.co/datasets/malaysia-ai/dedup-text-dataset/resolve/main/nasilemaktech.com.jsonl
# !wget https://huggingface.co/datasets/malaysia-ai/dedup-text-dataset/resolve/main/pokde.net.jsonl
# !wget https://huggingface.co/datasets/malaysia-ai/dedup-text-dataset/resolve/main/patriots.jsonl
# !wget https://huggingface.co/datasets/malaysia-ai/dedup-text-dataset/resolve/main/ringgitohringgit.com.jsonl
# !wget https://huggingface.co/datasets/malaysia-ai/dedup-text-dataset/resolve/main/rootofscience.jsonl
# !wget https://huggingface.co/datasets/malaysia-ai/dedup-text-dataset/resolve/main/sukanz.jsonl
# !wget https://huggingface.co/datasets/malaysia-ai/dedup-text-dataset/resolve/main/vocket.jsonl
# !wget https://huggingface.co/datasets/malaysia-ai/dedup-text-dataset/resolve/main/alhijrahnews-articles.jsonl
# !wget https://huggingface.co/datasets/malaysia-ai/dedup-text-dataset/resolve/main/dewanbahasa-jdbp.jsonl
# !wget https://huggingface.co/datasets/malaysia-ai/dedup-text-dataset/resolve/main/e-penerbitan.jsonl
# !wget https://huggingface.co/datasets/malaysia-ai/dedup-text-dataset/resolve/main/fiksyenshasha.jsonl
# !wget https://huggingface.co/datasets/malaysia-ai/dedup-text-dataset/resolve/main/edu.my.jsonl
# !wget https://huggingface.co/datasets/malaysia-ai/dedup-text-dataset/resolve/main/hansard.jsonl

In [6]:
files = [
    'amanz.jsonl',
    'asklegal.jsonl',
    'autobuzz.jsonl',
    'bidadari.jsonl',
    'blog.limkitsiang.com.jsonl',
    'blogtipskerjaya.net.jsonl',
    'buro247.jsonl',
    'doctoroncall.jsonl',
    'ecentral.my.jsonl',
    'gov.my.jsonl',
    'majalahsains.jsonl',
    'majalahpama.my.jsonl',
    'mygameon.my.jsonl',
    'nasilemaktech.com.jsonl',
    'pokde.net.jsonl',
    'patriots.jsonl',
    'ringgitohringgit.com.jsonl',
    'rootofscience.jsonl',
    'sukanz.jsonl',
    'vocket.jsonl',
    'alhijrahnews-articles.jsonl',
    'dewanbahasa-jdbp.jsonl',
    'e-penerbitan.jsonl',
    'fiksyenshasha.jsonl',
    'edu.my.jsonl',
    'hansard.jsonl'
]

In [7]:
texts = []
for f in files:
    with open(f) as fopen:
        for l in fopen:
            l = json.loads(l)
            if len(l.split()) < 150:
                continue
            texts.append(l)
len(texts)

379790

In [8]:
prompts = []
for t in texts:
    prompt = f'{t}\n\ngenerate questions based on context above'
    prompts.extend([(t, prompt)] * 1)
    
len(prompts)

379790

In [9]:
!mkdir mixtral-disagree-conversation

mkdir: cannot create directory ‘mixtral-disagree-conversation’: File exists


In [10]:
def answer(q, i):
    filename = f'mixtral-disagree-conversation/{i}.json'
    if os.path.exists(filename):
        return
    
    while True:
        try:
            prompt = q[1]
            formatted_prompt = format_prompt(prompt, [])
            stream = client.text_generation(formatted_prompt, **generate_kwargs, stream=False, details=True, return_full_text=False)
            output = stream.generated_text
            with open(filename, 'w') as fopen:
                json.dump((q[0], output), fopen)
            break
        except Exception as e:
            print(e)
            pass

In [None]:
max_worker = 50

questions = prompts
for i in tqdm(range(0, len(questions), max_worker)):
    urls_ = [(q, no + i) for no, q in enumerate(questions[i: i + max_worker])]
    
    with ThreadPoolExecutor(max_workers=max_worker) as executor:
        futures = {executor.submit(answer, url[0], url[1]): url for url in urls_}

        for future in as_completed(futures):
            future.result()

 12%|████████████▎                                                                                           | 897/7596 [17:55<75:07:18, 40.37s/it]

504 Server Error: Gateway Time-out for url: https://mixtral.us-west-2.mesolitica.com/
504 Server Error: Gateway Time-out for url: https://mixtral.us-west-2.mesolitica.com/


 13%|█████████████▏                                                                                          | 963/7596 [45:31<44:14:54, 24.02s/it]

504 Server Error: Gateway Time-out for url: https://mixtral.us-west-2.mesolitica.com/


 13%|█████████████▏                                                                                          | 964/7596 [46:51<75:24:52, 40.94s/it]

(ProtocolError('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')), '(Request ID: 7008219e-8fa2-45e6-a4bc-1915e0695bf6)')


 13%|█████████████▏                                                                                          | 967/7596 [48:14<65:17:32, 35.46s/it]

504 Server Error: Gateway Time-out for url: https://mixtral.us-west-2.mesolitica.com/


 14%|█████████████▉                                                                                       | 1045/7596 [1:17:35<50:16:50, 27.63s/it]

504 Server Error: Gateway Time-out for url: https://mixtral.us-west-2.mesolitica.com/


 14%|█████████████▉                                                                                       | 1047/7596 [1:19:16<64:52:40, 35.66s/it]

504 Server Error: Gateway Time-out for url: https://mixtral.us-west-2.mesolitica.com/


 14%|██████████████▏                                                                                      | 1071/7596 [1:29:01<38:03:31, 21.00s/it]

504 Server Error: Gateway Time-out for url: https://mixtral.us-west-2.mesolitica.com/


 14%|██████████████▎                                                                                      | 1072/7596 [1:30:27<73:31:25, 40.57s/it]

(ProtocolError('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')), '(Request ID: 00931e72-74db-4e49-88a6-c882012c579b)')
(ProtocolError('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')), '(Request ID: fa199cd1-7b31-44ec-8d95-9cecc6847d50)')


 19%|███████████████████▎                                                                                 | 1457/7596 [3:39:20<27:26:28, 16.09s/it]

504 Server Error: Gateway Time-out for url: https://mixtral.us-west-2.mesolitica.com/


 25%|█████████████████████████▏                                                                           | 1896/7596 [6:05:06<31:01:03, 19.59s/it]

504 Server Error: Gateway Time-out for url: https://mixtral.us-west-2.mesolitica.com/


 26%|█████████████████████████▊                                                                           | 1942/7596 [6:21:57<26:43:58, 17.02s/it]