In [1]:
from huggingface_hub import InferenceClient
from tqdm import tqdm
import os
import json



In [2]:
from concurrent.futures import ThreadPoolExecutor, as_completed

generate_kwargs = dict(
    temperature=1.0,
    max_new_tokens=4096,
    top_p=0.95,
    repetition_penalty=1.0,
    do_sample=True,
)

In [3]:
client = InferenceClient(
    "", timeout = 120
)


def format_prompt(message, history):
  prompt = "<s>"
  for user_prompt, bot_response in history:
    prompt += f"[INST] {user_prompt} [/INST]"
    prompt += f" {bot_response}</s> "
  prompt += f"[INST] {message} [/INST]"
  return prompt

In [4]:
!mkdir mixtral-math-conversation
# !rm mixtral-factual-wrong-v2/*.json

mkdir: cannot create directory ‘mixtral-math-conversation’: File exists


In [None]:
prompt = 'generate a math conversation with a chatbot, but the user like to ask stupid math questions to the chatbot'
formatted_prompt = format_prompt(prompt, [])
stream = client.text_generation(formatted_prompt, **generate_kwargs, stream=False, details=True, return_full_text=False)
output = stream.generated_text
output

In [None]:
def answer(q, i):
    filename = f'mixtral-math-conversation/{i}.json'
    if os.path.exists(filename):
        return
    
    while True:
        try:
            prompt = q
            formatted_prompt = format_prompt(prompt, [])
            stream = client.text_generation(formatted_prompt, **generate_kwargs, stream=False, details=True, return_full_text=False)
            output = stream.generated_text
            with open(filename, 'w') as fopen:
                json.dump(output, fopen)
            break
        except:
            pass

In [None]:
prompts = ['generate a math conversation with a chatbot, but the user like to ask stupid math questions to the chatbot'] * 50000

In [None]:
len(prompts)

In [None]:
max_worker = 240

questions = prompts
for i in tqdm(range(0, len(questions), max_worker)):
    urls_ = [(q, no + i) for no, q in enumerate(questions[i: i + max_worker])]
    
    with ThreadPoolExecutor(max_workers=max_worker) as executor:
        futures = {executor.submit(answer, url[0], url[1]): url for url in urls_}

        for future in as_completed(futures):
            future.result()

In [5]:
from glob import glob

files = glob('mixtral-math-conversation/*.json')
files = sorted(files, key = lambda x: int(x.split('/')[-1].replace('.json', '')))
        
len(files)

50000

In [6]:
import json

questions = []
for f in files:
    with open(f) as fopen:
        data = json.load(fopen)
    questions.append(data)
    
questions = [s for s in questions if len(s) > 40]
questions = sorted(list(set(questions)))
len(questions)

48186

In [7]:
conversations, rejected = [], []
for no in tqdm(range(len(questions))):
    q = questions[no]
    conversation = []
    try:
        splitted = q.strip().split('Human:')
        if 'Chatbot:' not in splitted[-1]:
            pass
#             print(data)
#             break
        
        if 'User:' in q:
            u = 'User:'
        else:
            u = 'Human:'

        splitted = q.strip().split(u)
        for i in range(1, len(splitted), 1):
            split = splitted[i].split('\nChatbot: ')
            if len(split) == 1:
                break
            
            if len(split) > 2:
                a = '\n'.join(split[1:]).strip()
                user = split[0].strip()
                # user = user.replace('Chatbot:', 'Chatbot,')
                # print(no, splitted[i], user, a)
            else:
                user = split[0].strip()
                a = split[1].strip()
                
            conversation.extend([
                {'role': 'user', 'content': user},
                {'role': 'assistant', 'content': a}
            ])
            
    except Exception as e:
        print(e, q)
        
        pass
    
    if len(conversation):
        conversations.append(conversation)
    else:
        rejected.append(q)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 48186/48186 [00:00<00:00, 131948.60it/s]


In [8]:
len(conversations)

48163

In [9]:
conversations[5]

[{'role': 'user', 'content': 'Hello chatbot, what does two plus two equal?'},
 {'role': 'assistant',
  'content': 'Hello! The answer to your question is four. Two plus two is a basic arithmetic operation that always equals four. Is there anything else you would like to know about mathematics?'},
 {'role': 'user',
  'content': "Oh, I don't know. What's the square root of 16?"},
 {'role': 'assistant',
  'content': 'No problem at all! The square root of 16 is 4. This is because 4 multiplied by 4 equals 16, which is the number we are taking the square root of.'},
 {'role': 'user', 'content': 'Alright, how about the square root of -16?'},
 {'role': 'assistant',
  'content': "That's an interesting question! The square root of -16 is an imaginary number, and it's equal to 4i. This is because when you multiply -4i by -4i, you get 16, which is the number we are taking the square root of. Imaginary numbers are a fundamental part of complex numbers, which extend the real numbers by adding an imag

In [None]:
all_texts = []
for c in conversations:
    for d in c:
        all_texts.append(d['content'])
        
len(all_texts)

In [None]:
with open('mixtral-conversation-math-stupid.texts', 'w') as fopen:
    for t in set(all_texts):
        if len(t) > 3:
            fopen.write(f'{json.dumps(t)}\n')

In [None]:
len(set(all_texts))

In [None]:
!cp mixtral-conversation-math-stupid.texts ../translation

In [10]:
import re
from glob import glob
import json

mapping = {}
for f in glob('/home/husein/ssd3/translation/mixtral-conversation-math-stupid.texts*.splitted.requested'):
    with open(f) as fopen:
        for l in fopen:
            l = json.loads(l)
            if 'Source text\nclear\nLook up details' in l['r']['result']:
                continue
            if 'clear\nLook up details' in l['r']['result']:
                continue
            if l['r']['result'].startswith('Source text\n'):
                continue

            n = l['r']['result']
            hypens = re.findall('\w+ -\w+', n)
            for h in hypens:
                splitted = h.split('-')
                if len(splitted) != 2:
                    continue
                splitted = [s.strip() for s in splitted]
                splitted = '-'.join(splitted)
                n = n.replace(h, splitted)
            mapping[l['src']] = n
            
len(mapping)

256852

In [11]:
with open('mixtral-conversation-math-stupid.jsonl', 'w') as fopen:
    for c in conversations:
        for i in range(len(c)):
            c[i]['content_ms'] = mapping.get(c[i]['content'])
        fopen.write(f'{json.dumps(c)}\n')

In [12]:
!head -n 1 mixtral-conversation-math-stupid.jsonl

[{"role": "user", "content": "Hello chatbot, what does 2 + 2 equal?", "content_ms": "Hello chatbot, apakah 2 + 2 sama?"}, {"role": "assistant", "content": "Hello! The value of 2 + 2 is 4.", "content_ms": "hello! Nilai 2 + 2 ialah 4."}, {"role": "user", "content": "What about 5 - 3?", "content_ms": "Bagaimana dengan 5 - 3?"}, {"role": "assistant", "content": "The value of 5 - 3 is 2.", "content_ms": "Saya sudah tahu! Saya hanya menguji awak. Bagaimana pula dengan punca kuasa dua bagi-1?"}, {"role": "user", "content": "Let's try something ridiculous now, like 7 + bacon?", "content_ms": "Mari cuba sesuatu yang tidak masuk akal sekarang, seperti 7 + daging?"}, {"role": "assistant", "content": "I'm sorry for any confusion, but \"bacon\" is not a number. Therefore, I can't perform mathematical operations with it.", "content_ms": "Saya memohon maaf atas sebarang kekeliruan, tetapi \"bacon\" bukan nombor. Oleh itu, saya tidak boleh melakukan operasi matematik dengannya."}, {"role": "user", "co

In [13]:
!wc -l mixtral-conversation-stupid.jsonl

60384 mixtral-conversation-stupid.jsonl


In [14]:
!ls -lh mixtral-conversation-stupid.jsonl

-rw-r--r-- 1 husein husein 188M Dis  27 23:14 mixtral-conversation-stupid.jsonl


In [15]:
from huggingface_hub import HfApi
api = HfApi()
api.upload_file(
    path_or_fileobj='mixtral-conversation-math-stupid.jsonl',
    path_in_repo='mixtral-conversation-math-stupid.jsonl',
    repo_id='mesolitica/mixtral-malaysian-general-qa',
    repo_type='dataset',
)

mixtral-conversation-math-stupid.jsonl:   0%|          | 0.00/117M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/mesolitica/mixtral-malaysian-general-qa/commit/c1f57ae4da565c85bc9aa4d3468ec5bd73723e7a', commit_message='Upload mixtral-conversation-math-stupid.jsonl with huggingface_hub', commit_description='', oid='c1f57ae4da565c85bc9aa4d3468ec5bd73723e7a', pr_url=None, pr_revision=None, pr_num=None)