In [1]:
import pandas as pd
from tqdm import tqdm
import re
import json
import string

# Set of the ASCII digit characters; reject_q intersects question text with it.
digits = {*string.digits}

# Substrings that disqualify a question when present anywhere in its text
# (quotes, links, newlines, brackets, slashes, backticks).
rejected = [
    '\'',
    '"',
    'http',
    '\n',
    '[',
    ']',
    '/',
    '`',
]

def contains_non_ascii(text):
    """Return True if any character of `text` has a code point above 127."""
    for char in text:
        if ord(char) > 127:
            return True
    return False

def reject_q(q):
    """Decide whether question `q` should be dropped.

    Returns True (reject) when `q` is None, contains any substring listed in
    the module-level `rejected`, contains a non-ASCII character, contains any
    character from the module-level `digits` set, or has a length outside the
    inclusive range 20..200. Returns False otherwise.
    """
    if q is None:
        return True
    has_banned_fragment = any(fragment in q for fragment in rejected)
    if has_banned_fragment or contains_non_ascii(q):
        return True
    # Any overlap between the question's characters and the digit set rejects it.
    if not digits.isdisjoint(q):
        return True
    return not (20 <= len(q) <= 200)

def reject_a(a):
    """Return True when answer `a` is None or contains a run of six hyphens."""
    return a is None or '------' in a

def contains_tamil(text):
    """Return True if `text` has at least one character in U+0B80..U+0BFF."""
    return re.search(r'[\u0B80-\u0BFF]', text) is not None

def contains_mandarin(text):
    """Return True if any character of `text` falls in one of three Han ranges.

    The ranges checked are U+4E00..U+9FFF, U+3400..U+4DBF and
    U+20000..U+2EBEF (all inclusive). The test is purely by code point, so any
    text using these ideographs matches, whatever its language.
    """
    han_ranges = (
        ('\u4E00', '\u9FFF'),
        ('\u3400', '\u4DBF'),
        ('\U00020000', '\U0002EBEF'),
    )
    for char in text:
        for low, high in han_ranges:
            if low <= char <= high:
                return True
    return False

In [2]:
def generate_and_tokenize_prompt(row):
    """Turn one instruction row into a list of chat messages.

    Despite the name, no tokenization happens here; the function only builds
    the message list.

    Args:
        row: mapping with keys 'input', 'output' and 'prompt_input'.
            When 'output' is None and 'input' contains '<bot>:', 'input' is
            parsed as a multi-turn transcript whose turns are delimited by
            '<manusia>:' (user) and '<bot>:' (assistant). Otherwise 'input'
            and 'output' are used as a single user/assistant exchange.

    Returns:
        A list of {'role': ..., 'content': ...} dicts: an optional leading
        'system' message (when 'prompt_input' is non-empty) followed by
        alternating 'user' / 'assistant' messages with stripped content.

    Note:
        In the single-exchange branch a None 'output' is not handled and
        fails on `.strip()`, exactly as before.
    """
    if '<bot>:' in row['input'] and row['output'] is None:
        inputs, outputs = [], []
        segments = row['input'].split('<bot>:')
        for idx in range(len(segments) - 1):
            if idx == 0:
                human = segments[idx].replace('<manusia>:', '')
            else:
                parts = segments[idx].split('<manusia>:')
                if len(parts) < 2:
                    # Two bot turns with no user turn in between: skip this
                    # pair. An explicit check replaces the former
                    # `except BaseException`, which would also have swallowed
                    # KeyboardInterrupt / SystemExit.
                    continue
                human = parts[1]
            # The assistant reply is whatever precedes the next user marker.
            bot = segments[idx + 1].split('<manusia>:')[0]
            inputs.append(human)
            outputs.append(bot)
    else:
        inputs = [row['input']]
        outputs = [row['output']]

    chat = []
    if row['prompt_input'] is not None and len(row['prompt_input']):
        chat.append({'role': 'system', 'content': row['prompt_input'].strip()})
    # Local names avoid shadowing the builtin `input`.
    for user_text, assistant_text in zip(inputs, outputs):
        chat.extend([
            {'role': 'user', 'content': user_text.strip()},
            {'role': 'assistant', 'content': assistant_text.strip()},
        ])
    return chat

In [11]:
def collect_pairs(path, limit, conversation, seen, show_progress=True):
    """Append accepted (question, answer) pairs from one JSONL file.

    Each line of `path` is parsed as a JSON list of message dicts that carry a
    'content_ms' field. Messages are consumed two at a time as (question,
    answer); a trailing unpaired message is dropped. A pair is kept when the
    question passes `reject_q`, the answer passes `reject_a`, and the question
    text has not been seen before.

    Args:
        path: JSONL file to read.
        limit: stop reading as soon as `len(conversation)` reaches this value.
            The cap applies to the total size of `conversation`, not to the
            number of pairs taken from this file.
        conversation: list extended in place with `(pair, full_turns)` tuples,
            where `pair` is the two-message slice and `full_turns` is the whole
            (even-length) conversation the pair came from.
        seen: set of question strings already collected; updated in place.
        show_progress: wrap the file iterator in `tqdm` when True.
    """
    with open(path) as fopen:
        lines = tqdm(fopen) if show_progress else fopen
        for line in lines:
            turns = json.loads(line)
            if len(turns) % 2 != 0:
                # Drop the dangling last message so every question has an answer.
                turns = turns[:-1]
            for i in range(0, len(turns), 2):
                q = turns[i]['content_ms']
                a = turns[i + 1]['content_ms']
                if reject_q(q) or reject_a(a) or q in seen:
                    continue
                conversation.append((turns[i: i + 2], turns))
                seen.add(q)
                if len(conversation) >= limit:
                    # Cap reached: stop reading this file altogether.
                    return


mixtral_conversation = []
already = set()

# (file, cumulative cap on len(mixtral_conversation), show tqdm progress)
sources = [
    ('mixtral-conversation-stupid.jsonl', 50000, False),
    ('mixtral-critics-malaysia-multiturn.jsonl', 150000, True),
    ('mixtral-critics-politician-multiturn.jsonl', 250000, True),
]
for path, limit, show_progress in sources:
    collect_pairs(path, limit, mixtral_conversation, already, show_progress)

len(mixtral_conversation)

103242it [00:08, 11716.87it/s]
135770it [00:13, 9864.33it/s] 


241115

In [27]:
import copy

# Map each selected question to the JSON-encoded conversation it came from.
# Every message whose text equals the question gets an audio placeholder as
# its content; all other messages keep their text.
multiturn = {}
for pair, conversation in tqdm(mixtral_conversation):
    ori_q = pair[0]['content_ms']

    rendered = []
    # Work on a copy so the messages held in mixtral_conversation stay untouched.
    for turn in copy.deepcopy(conversation):
        text = turn.pop('content_ms')
        turn['content'] = (
            [{"type": "audio", "audio_url": "audio.wav"}] if text == ori_q else text
        )
        rendered.append(turn)
    multiturn[ori_q] = json.dumps(rendered)
len(multiturn)

100%|██████████████████████████████████████████████████████████████████████████████| 241115/241115 [00:12<00:00, 19737.10it/s]


241115

In [28]:
# Show the question left in `ori_q` by the last iteration of the loop that built `multiturn`.
ori_q

'Mengapa terdapat protes yang meluas dan rasa tidak puas hati orang ramai semasa Najib Razak memegang jawatan Perdana Menteri, dan bagaimana pentadbirannya bertindak balas terhadap protes ini?'

In [29]:
# Display the JSON string stored in `multiturn` for the question currently held in `ori_q`.
multiturn[ori_q]

'[{"role": "user", "content": [{"type": "audio", "audio_url": "audio.wav"}]}, {"role": "assistant", "content": "Semasa Najib Razak memegang jawatan Perdana Menteri Malaysia dari 2009 hingga 2018, terdapat beberapa contoh protes yang meluas dan rasa tidak puas hati orang ramai. Berikut adalah beberapa sebab utama dan tindak balas kerajaan:\\n\\n1. Bersih Rallies: Pergerakan Bersih (Pemprosesan untuk Pilihan Raya Bersih dan Adil) menganjurkan beberapa perhimpunan yang menuntut pembaharuan pilihan raya, termasuk pilihan raya bebas dan adil, daftar pemilih bersih dan hak untuk mengundi semua rakyat. Perhimpunan Bersih pertama pada tahun 2007 menyaksikan peratusan keluar mengundi besar-besaran lebih 40,000 orang, walaupun kerajaan mengharamkan. Perhimpunan Bersih seterusnya pada 2011, 2012, dan 2015 menyaksikan lebih ramai orang, dengan puluhan ribu orang membantah keengganan kerajaan untuk memperbaharui sistem pilihan raya. Sambutan kerajaan terhadap perhimpunan ini selalunya berat, dengan

In [30]:
# Write the question -> JSON-encoded conversation mapping out as one JSON object.
output_path = 'fix-instructions-mixtral-multiturn.json'
with open(output_path, mode='w') as out_file:
    json.dump(multiturn, out_file)

In [31]:
# List the exported file to check its size on disk.
!ls -lh fix-instructions-mixtral-multiturn.json

-rw-rw-r-- 1 husein husein 1.6G Feb  15 22:35 fix-instructions-mixtral-multiturn.json
