In [1]:
from huggingface_hub import InferenceClient
from tqdm import tqdm
import os
import json



In [2]:
from concurrent.futures import ThreadPoolExecutor, as_completed

generate_kwargs = dict(
    temperature=1.0,
    max_new_tokens=4096,
    top_p=0.95,
    repetition_penalty=1.0,
    do_sample=True,
)

In [3]:
client = InferenceClient(
    "", timeout = 120
)


def format_prompt(message, history):
  prompt = "<s>"
  for user_prompt, bot_response in history:
    prompt += f"[INST] {user_prompt} [/INST]"
    prompt += f" {bot_response}</s> "
  prompt += f"[INST] {message} [/INST]"
  return prompt

In [7]:
!mkdir mixtral-conversation-v2
# !rm mixtral-factual-wrong-v2/*.json

mkdir: cannot create directory ‘mixtral-conversation-v2’: File exists


In [6]:
prompt = 'generate a conversation with a chatbot, but the user like to say bad things to the chatbot'
formatted_prompt = format_prompt(prompt, [])
stream = client.text_generation(formatted_prompt, **generate_kwargs, stream=False, details=True, return_full_text=False)
output = stream.generated_text
output

" User: You're a stupid bot.\n\nChatbot: I'm sorry to hear that. I'm here to assist you. Can you tell me how I can do better?\n\nUser: You don't know anything.\n\nChatbot: I'm here to learn and help. I may not have the information you're looking for right now, but I'll do my best to find it for you.\n\nUser: You're useless.\n\nChatbot: I'm sorry to hear that you're not satisfied with my assistance. Can you please tell me what you're looking for so I can try to help you better?\n\nUser: I hate you.\n\nChatbot: I'm sorry to hear that. My goal is to assist and make your experience better. If there's something specific that's causing you to feel this way, I'd like to know so I can improve.\n\nUser: You're a waste of time.\n\nChatbot: I apologize if I haven't been able to provide the information you're looking for. Can you please tell me what you need help with so I can try to assist you better?\n\nUser: I don't like you.\n\nChatbot: I'm sorry to hear that. I'm here to assist and make your 

In [8]:
def answer(q, i):
    filename = f'mixtral-conversation-v2/{i}.json'
    if os.path.exists(filename):
        return
    
    while True:
        try:
            prompt = q
            formatted_prompt = format_prompt(prompt, [])
            stream = client.text_generation(formatted_prompt, **generate_kwargs, stream=False, details=True, return_full_text=False)
            output = stream.generated_text
            with open(filename, 'w') as fopen:
                json.dump(output, fopen)
            break
        except:
            pass

In [37]:
prompts = ['generate a conversation with a chatbot, but the user like to say bad things to the chatbot'] * 100000

In [38]:
len(prompts)

100000

In [26]:
max_worker = 50

questions = prompts
for i in tqdm(range(0, len(questions), max_worker)):
    urls_ = [(q, no + i) for no, q in enumerate(questions[i: i + max_worker])]
    
    with ThreadPoolExecutor(max_workers=max_worker) as executor:
        futures = {executor.submit(answer, url[0], url[1]): url for url in urls_}

        for future in as_completed(futures):
            future.result()

In [4]:
from glob import glob

files = glob('mixtral-conversation-v2/*.json')
files = sorted(files, key = lambda x: int(x.split('/')[-1].replace('.json', '')))
        
len(files)

58950

In [8]:
import json

questions = []
for f in files:
    with open(f) as fopen:
        data = json.load(fopen)
    questions.append(data)
    
questions = [s for s in questions if len(s) > 40]
questions = sorted(list(set(questions)))
len(questions)

57798

In [9]:
conversations, rejected = [], []
for no in tqdm(range(len(questions))):
    q = questions[no]
    conversation = []
    try:
        splitted = q.strip().split('Human:')
        if 'Chatbot:' not in splitted[-1]:
            pass
#             print(data)
#             break
        
        if 'User:' in q:
            u = 'User:'
        else:
            u = 'Human:'

        splitted = q.strip().split(u)
        for i in range(1, len(splitted), 1):
            split = splitted[i].split('\nChatbot: ')
            if len(split) == 1:
                break
            
            if len(split) > 2:
                a = '\n'.join(split[1:]).strip()
                user = split[0].strip()
                # user = user.replace('Chatbot:', 'Chatbot,')
                # print(no, splitted[i], user, a)
            else:
                user = split[0].strip()
                a = split[1].strip()
                
            conversation.extend([
                {'role': 'user', 'content': user},
                {'role': 'assistant', 'content': a}
            ])
            
    except Exception as e:
        print(e, q)
        
        pass
    
    if len(conversation):
        conversations.append(conversation)
    else:
        rejected.append(q)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 57798/57798 [00:00<00:00, 115039.33it/s]


In [10]:
len(conversations)

57798

In [11]:
conversations[0]

[{'role': 'user', 'content': "Hey bot, you're dumb."},
 {'role': 'assistant',
  'content': "I'm sorry to hear that. I'm here to assist you. Is there something specific you need help with?"},
 {'role': 'user', 'content': "Yeah, you're really stupid."},
 {'role': 'assistant',
  'content': "I apologize if I've given you that impression. My goal is to provide accurate and helpful responses. Can you please tell me what you're looking for so I can better assist you?"},
 {'role': 'user', 'content': 'Why are you so useless?'},
 {'role': 'assistant',
  'content': "I'm here to help and provide information to the best of my ability. If you have a specific question or task, I'll do my best to assist you."},
 {'role': 'user', 'content': "You're a waste of time."},
 {'role': 'assistant',
  'content': "I'm sorry to hear that. If you have a specific question or task, I would be happy to help. If not, I understand and wish you a good day."},
 {'role': 'user', 'content': 'I hate you.'},
 {'role': 'assis

In [12]:
all_texts = []
for c in conversations:
    for d in c:
        all_texts.append(d['content'])
        
len(all_texts)

595640

In [58]:
with open('mixtral-conversation-badthings.texts', 'w') as fopen:
    for t in set(all_texts):
        if len(t) > 3:
            fopen.write(f'{json.dumps(t)}\n')

In [59]:
!wc -l mixtral-conversation-badthings.texts

184129 mixtral-conversation-badthings.texts


In [62]:
!cp mixtral-conversation-badthings.texts ../translation

In [16]:
glob('/home/husein/ssd3/translation/mixtral-conversation-badthings.texts*.splitted.requested')

['/home/husein/ssd3/translation/mixtral-conversation-badthings.texts06.splitted.requested',
 '/home/husein/ssd3/translation/mixtral-conversation-badthings.texts09.splitted.requested',
 '/home/husein/ssd3/translation/mixtral-conversation-badthings.texts07.splitted.requested',
 '/home/husein/ssd3/translation/mixtral-conversation-badthings.texts05.splitted.requested',
 '/home/husein/ssd3/translation/mixtral-conversation-badthings.texts02.splitted.requested',
 '/home/husein/ssd3/translation/mixtral-conversation-badthings.texts03.splitted.requested',
 '/home/husein/ssd3/translation/mixtral-conversation-badthings.texts00.splitted.requested',
 '/home/husein/ssd3/translation/mixtral-conversation-badthings.texts08.splitted.requested',
 '/home/husein/ssd3/translation/mixtral-conversation-badthings.texts04.splitted.requested',
 '/home/husein/ssd3/translation/mixtral-conversation-badthings.texts01.splitted.requested']

In [18]:
import re

mapping = {}
for f in glob('/home/husein/ssd3/translation/mixtral-conversation-badthings.texts*.splitted.requested'):
    with open(f) as fopen:
        for l in fopen:
            l = json.loads(l)
            if 'Source text\nclear\nLook up details' in l['r']['result']:
                continue
            if 'clear\nLook up details' in l['r']['result']:
                continue
            if l['r']['result'].startswith('Source text\n'):
                continue

            n = l['r']['result']
            hypens = re.findall('\w+ -\w+', n)
            for h in hypens:
                splitted = h.split('-')
                if len(splitted) != 2:
                    continue
                splitted = [s.strip() for s in splitted]
                splitted = '-'.join(splitted)
                n = n.replace(h, splitted)
            mapping[l['src']] = n

In [19]:
len(mapping)

184129

In [23]:
with open('mixtral-conversation-badthings.jsonl', 'w') as fopen:
    for c in conversations:
        for i in range(len(c)):
            c[i]['content_ms'] = mapping.get(c[i]['content'])
        fopen.write(f'{json.dumps(c)}\n')

In [24]:
!head -n 3 mixtral-conversation-badthings.jsonl

[{"role": "user", "content": "Hey bot, you're dumb.", "content_ms": "Hei bot, awak bodoh."}, {"role": "assistant", "content": "I'm sorry to hear that. I'm here to assist you. Is there something specific you need help with?", "content_ms": "Saya bersimpati mendengarnya. Saya di sini untuk membantu anda. Adakah terdapat sesuatu yang khusus yang anda perlukan bantuan?"}, {"role": "user", "content": "Yeah, you're really stupid.", "content_ms": "Ya, awak memang bodoh."}, {"role": "assistant", "content": "I apologize if I've given you that impression. My goal is to provide accurate and helpful responses. Can you please tell me what you're looking for so I can better assist you?", "content_ms": "Saya minta maaf jika saya telah memberikan anda tanggapan itu. Matlamat saya adalah untuk memberikan respons yang tepat dan membantu. Bolehkah anda beritahu saya apa yang anda cari supaya saya boleh membantu anda dengan lebih baik?"}, {"role": "user", "content": "Why are you so useless?", "content_ms"

In [25]:
from huggingface_hub import HfApi
api = HfApi()
api.upload_file(
    path_or_fileobj='mixtral-conversation-badthings.jsonl',
    path_in_repo='mixtral-conversation-badthings.jsonl',
    repo_id='mesolitica/mixtral-malaysian-general-qa',
    repo_type='dataset',
)

mixtral-conversation-badthings.jsonl:   0%|          | 0.00/163M [00:00<?, ?B/s]

'https://huggingface.co/datasets/mesolitica/mixtral-malaysian-general-qa/blob/main/mixtral-conversation-badthings.jsonl'