In [2]:
from huggingface_hub import InferenceClient
from tqdm import tqdm
import os
import json



In [3]:
from concurrent.futures import ThreadPoolExecutor, as_completed

generate_kwargs = dict(
    temperature=1.0,
    max_new_tokens=4096,
    top_p=0.95,
    repetition_penalty=1.0,
    do_sample=True,
)

In [4]:
client = InferenceClient(
    "", timeout = 120
)


def format_prompt(message, history):
  prompt = "<s>"
  for user_prompt, bot_response in history:
    prompt += f"[INST] {user_prompt} [/INST]"
    prompt += f" {bot_response}</s> "
  prompt += f"[INST] {message} [/INST]"
  return prompt

In [5]:
!mkdir mixtral-conversation-v3
# !rm mixtral-factual-wrong-v2/*.json

mkdir: cannot create directory ‘mixtral-conversation-v3’: File exists


In [7]:
prompt = 'generate a conversation with a chatbot, but the user like to ask stupid questions to the chatbot'
formatted_prompt = format_prompt(prompt, [])
stream = client.text_generation(formatted_prompt, **generate_kwargs, stream=False, details=True, return_full_text=False)
output = stream.generated_text
output

In [6]:
def answer(q, i):
    filename = f'mixtral-conversation-v3/{i}.json'
    if os.path.exists(filename):
        return
    
    while True:
        try:
            prompt = q
            formatted_prompt = format_prompt(prompt, [])
            stream = client.text_generation(formatted_prompt, **generate_kwargs, stream=False, details=True, return_full_text=False)
            output = stream.generated_text
            with open(filename, 'w') as fopen:
                json.dump(output, fopen)
            break
        except:
            pass

In [15]:
prompts = ['generate a conversation with a chatbot, but the user like to ask stupid questions to the chatbot'] * 100000

In [16]:
len(prompts)

100000

In [1]:
max_worker = 150

questions = prompts
for i in tqdm(range(0, len(questions), max_worker)):
    urls_ = [(q, no + i) for no, q in enumerate(questions[i: i + max_worker])]
    
    with ThreadPoolExecutor(max_workers=max_worker) as executor:
        futures = {executor.submit(answer, url[0], url[1]): url for url in urls_}

        for future in as_completed(futures):
            future.result()

In [8]:
from glob import glob

files = glob('mixtral-conversation-v3/*.json')
files = sorted(files, key = lambda x: int(x.split('/')[-1].replace('.json', '')))
        
len(files)

60450

In [9]:
import json

questions = []
for f in files:
    with open(f) as fopen:
        data = json.load(fopen)
    questions.append(data)
    
questions = [s for s in questions if len(s) > 40]
questions = sorted(list(set(questions)))
len(questions)

60387

In [10]:
conversations, rejected = [], []
for no in tqdm(range(len(questions))):
    q = questions[no]
    conversation = []
    try:
        splitted = q.strip().split('Human:')
        if 'Chatbot:' not in splitted[-1]:
            pass
#             print(data)
#             break
        
        if 'User:' in q:
            u = 'User:'
        else:
            u = 'Human:'

        splitted = q.strip().split(u)
        for i in range(1, len(splitted), 1):
            split = splitted[i].split('\nChatbot: ')
            if len(split) == 1:
                break
            
            if len(split) > 2:
                a = '\n'.join(split[1:]).strip()
                user = split[0].strip()
                # user = user.replace('Chatbot:', 'Chatbot,')
                # print(no, splitted[i], user, a)
            else:
                user = split[0].strip()
                a = split[1].strip()
                
            conversation.extend([
                {'role': 'user', 'content': user},
                {'role': 'assistant', 'content': a}
            ])
            
    except Exception as e:
        print(e, q)
        
        pass
    
    if len(conversation):
        conversations.append(conversation)
    else:
        rejected.append(q)

100%|█████████████████████████████████| 60387/60387 [00:00<00:00, 124871.78it/s]


In [11]:
len(conversations)

60384

In [12]:
conversations[0]

[{'role': 'user',
  'content': "Hey Chatbot, what's the color of a banana in the dark?"},
 {'role': 'assistant',
  'content': "Interesting question! In the absence of light, we cannot see colors, so it's not accurate to assign a color to a banana in the dark. However, we know a banana is typically yellow when it's ripe."},
 {'role': 'user',
  'content': 'If a tree falls in a forest and no one is around to hear it, does it make a sound?'},
 {'role': 'assistant',
  'content': "This is a classic philosophical question! Sound is a form of energy that is produced when an object vibrates. So, yes, a tree falling in a forest would produce sound waves, but whether we perceive it as sound depends on whether there's someone or something there to hear it."},
 {'role': 'user', 'content': 'Can fish drown?'},
 {'role': 'assistant',
  'content': 'Fish live underwater and extract oxygen from water through their gills. They can certainly die from lack of oxygen, but we don\'t typically use the term "dr

In [11]:
all_texts = []
for c in conversations:
    for d in c:
        all_texts.append(d['content'])
        
len(all_texts)

698166

In [194]:
with open('mixtral-conversation-stupid.texts', 'w') as fopen:
    for t in set(all_texts):
        if len(t) > 3:
            fopen.write(f'{json.dumps(t)}\n')

In [14]:
len(set(all_texts))

372471

In [1]:
import re
from glob import glob
import json

mapping = {}
for f in glob('/home/husein/ssd3/translation/mixtral-conversation-stupid.texts*.splitted.requested'):
    with open(f) as fopen:
        for l in fopen:
            l = json.loads(l)
            if 'Source text\nclear\nLook up details' in l['r']['result']:
                continue
            if 'clear\nLook up details' in l['r']['result']:
                continue
            if l['r']['result'].startswith('Source text\n'):
                continue

            n = l['r']['result']
            hypens = re.findall('\w+ -\w+', n)
            for h in hypens:
                splitted = h.split('-')
                if len(splitted) != 2:
                    continue
                splitted = [s.strip() for s in splitted]
                splitted = '-'.join(splitted)
                n = n.replace(h, splitted)
            mapping[l['src']] = n
            
len(mapping)

369486

In [196]:
!cp mixtral-conversation-stupid.texts ../translation

In [21]:
with open('mixtral-conversation-stupid.jsonl', 'w') as fopen:
    for c in conversations:
        for i in range(len(c)):
            c[i]['content_ms'] = mapping.get(c[i]['content'])
        fopen.write(f'{json.dumps(c)}\n')

In [22]:
!head -n 1 mixtral-conversation-stupid.jsonl

[{"role": "user", "content": "Hey Chatbot, what's the color of a banana in the dark?", "content_ms": "Hai Chatbot, apakah warna pisang dalam gelap?"}, {"role": "assistant", "content": "Interesting question! In the absence of light, we cannot see colors, so it's not accurate to assign a color to a banana in the dark. However, we know a banana is typically yellow when it's ripe.", "content_ms": "Soalan yang menarik! Jika tiada cahaya, kita tidak dapat melihat warna, jadi adalah tidak tepat untuk memberikan warna kepada pisang dalam gelap. Walau bagaimanapun, kita tahu pisang biasanya berwarna kuning apabila ia masak."}, {"role": "user", "content": "If a tree falls in a forest and no one is around to hear it, does it make a sound?", "content_ms": "Jika sebatang pokok tumbang di dalam hutan dan tiada sesiapa yang mendengarnya, adakah ia mengeluarkan bunyi?"}, {"role": "assistant", "content": "This is a classic philosophical question! Sound is a form of energy that is produced when an objec

In [23]:
!wc -l mixtral-conversation-stupid.jsonl

60384 mixtral-conversation-stupid.jsonl


In [24]:
!ls -lh mixtral-conversation-stupid.jsonl

-rw-r--r-- 1 husein husein 188M Dis  27 23:14 mixtral-conversation-stupid.jsonl


In [25]:
from huggingface_hub import HfApi
api = HfApi()
api.upload_file(
    path_or_fileobj='mixtral-conversation-stupid.jsonl',
    path_in_repo='mixtral-conversation-stupid.jsonl',
    repo_id='mesolitica/mixtral-malaysian-general-qa',
    repo_type='dataset',
)

mixtral-conversation-stupid.jsonl:   0%|          | 0.00/197M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/mesolitica/mixtral-malaysian-general-qa/commit/a71f9407ed56456030db5f42a3ec279b446b6632', commit_message='Upload mixtral-conversation-stupid.jsonl with huggingface_hub', commit_description='', oid='a71f9407ed56456030db5f42a3ec279b446b6632', pr_url=None, pr_revision=None, pr_num=None)