In [1]:
# !wget https://huggingface.co/datasets/mesolitica/Malaysian-SFT/resolve/main/data/chatgpt4_malaysian_general_qa-00000-of-00001.parquet
# !wget https://huggingface.co/datasets/mesolitica/Malaysian-SFT/resolve/main/data/malaysian_ultrachat-00000-of-00001.parquet

In [2]:
# !wget https://huggingface.co/datasets/mesolitica/chatgpt-malay-instructions/resolve/main/synthetic-alpaca_data_cleaned.jsonl

In [3]:
# !wget https://huggingface.co/datasets/mesolitica/chatgpt-malay-instructions/resolve/main/synthetic-code-instructions.jsonl

In [4]:
# !wget https://huggingface.co/datasets/mesolitica/mixtral-malaysian-general-qa/resolve/main/mixtral-conversation-stupid.jsonl
# !wget https://huggingface.co/datasets/mesolitica/mixtral-malaysian-general-qa/resolve/main/mixtral-critics-malaysia-multiturn.jsonl
# !wget https://huggingface.co/datasets/mesolitica/mixtral-malaysian-general-qa/resolve/main/mixtral-critics-politician-multiturn.jsonl

In [5]:
# !wget https://huggingface.co/datasets/mesolitica/chatgpt-malay-instructions/resolve/main/synthetic-oss_instruct-decontaminated.jsonl

In [6]:
# !wget https://huggingface.co/datasets/mesolitica/mixtral-malaysian-general-qa/resolve/main/mixtral-factual-wrong-v2.jsonl

In [7]:
# !head -n 1 mixtral-factual-wrong-v2.jsonl

In [8]:
# !wget https://huggingface.co/datasets/mesolitica/Malaysian-SFT/resolve/main/data/force_tamil-00000-of-00001.parquet

In [9]:
# !wget https://huggingface.co/datasets/mesolitica/Malaysian-SFT/resolve/main/data/force_mandarin-00000-of-00001.parquet

In [10]:
# !wget https://huggingface.co/datasets/mesolitica/Malaysian-SFT/resolve/main/data/force_jawi-00000-of-00001.parquet

In [11]:
import pandas as pd
from tqdm import tqdm
import re
import json
import string

digits = set(string.digits)
rejected = ['\'', '"', 'http', '\n', '[', ']', '/', '`']

def contains_non_ascii(text):
    return any(ord(char) > 127 for char in text)

def reject_q(q):
    if q is None:
        return True
    if any([c in q for c in rejected]):
        return True
    if contains_non_ascii(q):
        return True
    if len(set(q) & digits):
        return True
    if len(q) < 20:
        return True
    if len(q) > 200:
        return True
    return False

def reject_a(a):
    if a is None:
        return True
    if '------' in a:
        return True
    return False

def contains_tamil(text):
    tamil_pattern = re.compile(r'[\u0B80-\u0BFF]')
    return bool(tamil_pattern.search(text))

def contains_mandarin(text):
    return any(
        '\u4E00' <= char <= '\u9FFF' or
        '\u3400' <= char <= '\u4DBF' or
        '\U00020000' <= char <= '\U0002EBEF'
        for char in text
    )

In [12]:
def generate_and_tokenize_prompt(row):
    if '<bot>:' in row['input'] and row['output'] is None:
        inputs, outputs = [], []
        splitted = row['input'].split('<bot>:')
        for i in range(len(splitted) - 1):
            if i == 0:
                human = splitted[i].replace('<manusia>:', '')
            else:
                try:
                    human = splitted[i].split('<manusia>:')[1]
                except BaseException:
                    continue
            bot = splitted[i + 1].split('<manusia>:')[0]
            inputs.append(human)
            outputs.append(bot)
    else:
        inputs = [row['input']]
        outputs = [row['output']]

    chat = []
    if row['prompt_input'] is not None and len(row['prompt_input']):
        chat.append({'role': 'system', 'content': row['prompt_input'].strip()})
    for input, output in zip(inputs, outputs):
        chat.extend([
            {'role': 'user', 'content': input.strip()},
            {'role': 'assistant', 'content': output.strip()},
        ])
    return chat

In [13]:
df = pd.read_parquet('chatgpt4_malaysian_general_qa-00000-of-00001.parquet')

filtered = []
for i in range(len(df)):
    q = df['input'].iloc[i]
    
    if reject_q(q):
        continue
    
    filtered.append(('chatgpt4_malaysian_general_qa', df.iloc[i].to_dict()))
    
len(filtered)

29122

In [14]:
df = pd.read_parquet('malaysian_ultrachat-00000-of-00001.parquet')

filter_ultrachat = []
for i in tqdm(range(len(df))):
    l = generate_and_tokenize_prompt(df.iloc[i].to_dict())
    valid = True
    for no, l_ in enumerate(l):
        left = re.sub('[^a-z ]+', ' ', l_['content'][:20].lower())
        left = re.sub(r'[ ]+', ' ', left).strip()
        right = re.sub('[^a-z ]+', ' ', l[no - 1]['content'][:20].lower())
        right = re.sub(r'[ ]+', ' ', right).strip()
        if left[:10] == right[:10]:
            valid = False
            break
    if not valid:
        continue
    l_ = json.dumps(l).lower()
    if 'dalam konteks di' in l_:
        continue
    if 'terjemah' in l_:
        continue
    if 'artikel itu' in l_:
        continue
        
    q = l[0]['content'].split('\n')[-1]
    if reject_q(q):
        continue
    
    filter_ultrachat.append(('malaysian_ultrachat', l))
    
len(filter_ultrachat)

100%|███████████████████████████████████████████████████████████████████████████████| 149054/149054 [00:17<00:00, 8461.48it/s]


105238

In [15]:
alpaca = []
with open('synthetic-alpaca_data_cleaned.jsonl') as fopen:
    for l in tqdm(fopen):
        l = json.loads(l)
        if l['indon_output']:
            continue
        if l['rejected_output']:
            continue
        if l['output_ms'] is None:
            continue
        q = l['instruction']
        
        if reject_q(q):
            continue
            
        if reject_a(l['output_ms']):
            continue
    
        alpaca.append(('malaysian_alpaca', l))
            
len(alpaca)

51738it [00:01, 37013.95it/s]


19000

In [16]:
synthetic_code = []
with open('synthetic-code-instructions.jsonl') as fopen:
    for l in tqdm(fopen):
        l = json.loads(l)
        if l['indon_output']:
            continue
        if l['rejected_output']:
            continue
        if l['output_ms'] is None:
            continue
        if '```' not in l['output_ms']:
            continue
        q = l['instruction']
        if reject_q(q):
            continue

        synthetic_code.append(('synthetic_coding', l))
        
len(synthetic_code)

111920it [00:03, 36717.13it/s]


3550

In [17]:
mixtral_conversation = []
already = set()

global_break = False
with open('mixtral-conversation-stupid.jsonl') as fopen:
    for l in fopen:
        if global_break:
            break
        l = json.loads(l)
        if len(l) % 2 != 0:
            l = l[:-1]
        for i in range(0, len(l), 2):
            q = l[i]['content_ms']
            a = l[i + 1]['content_ms']
            if reject_q(q):
                continue
            if reject_a(a):
                continue
            if q in already:
                continue
            
            mixtral_conversation.append(('mixtral_conversation_stupid', l[i: i + 2]))
            already.add(q)
            if len(mixtral_conversation) >= 50000:
                global_break = True
                break
        
global_break = False           
with open('mixtral-critics-malaysia-multiturn.jsonl') as fopen:
    for l in tqdm(fopen):
        if global_break:
            break
        l = json.loads(l)
        if len(l) % 2 != 0:
            l = l[:-1]
        for i in range(0, len(l), 2):
            q = l[i]['content_ms']
            a = l[i + 1]['content_ms']
            if reject_q(q):
                continue
            if reject_a(a):
                continue
            if q in already:
                continue
            mixtral_conversation.append(('mixtral_critis_malaysia', l[i: i + 2]))
            already.add(q)
            if len(mixtral_conversation) >= 150000:
                global_break = True
                break
                
global_break = False           
with open('mixtral-critics-politician-multiturn.jsonl') as fopen:
    for l in tqdm(fopen):
        if global_break:
            break
        l = json.loads(l)
        if len(l) % 2 != 0:
            l = l[:-1]
        for i in range(0, len(l), 2):
            q = l[i]['content_ms']
            a = l[i + 1]['content_ms']
            if reject_q(q):
                continue
            if reject_a(a):
                continue
            if q in already:
                continue
            mixtral_conversation.append(('mixtral_critis_politician', l[i: i + 2]))
            already.add(q)
            if len(mixtral_conversation) >= 250000:
                global_break = True
                break
                
len(mixtral_conversation)

103242it [00:08, 11873.85it/s]
135770it [00:12, 10509.67it/s]


241115

In [18]:
factual_wrong = []
with open('mixtral-factual-wrong-v2.jsonl') as fopen:
    for l in fopen:
        l = json.loads(l)
        q = l['question_ms']
        a = l['answer_ms']
        if reject_q(q):
            continue
        if reject_a(a):
            continue
        factual_wrong.append(('mixtral_factually_wrong', l))
        
len(factual_wrong)

38937

In [19]:
df = pd.read_parquet('force_tamil-00000-of-00001.parquet')
forces = []
for i in tqdm(range(len(df))):
    row = generate_and_tokenize_prompt(df.iloc[i].to_dict())
    q = row[0]['content']
    if contains_tamil(q):
        continue
    if reject_q(q.replace('\n', '')):
        continue
    if len(row) == 2 and len(row[-1]['content']) < 100:
        continue
    forces.append(('force_tamil', row))
len(forces)

100%|████████████████████████████████████████████████████████████████████████████████| 52144/52144 [00:01<00:00, 40360.09it/s]


57

In [20]:
df = pd.read_parquet('force_mandarin-00000-of-00001.parquet')
for i in tqdm(range(len(df))):
    row = generate_and_tokenize_prompt(df.iloc[i].to_dict())
    q = row[0]['content']
    if contains_mandarin(q):
        continue
    if reject_q(q.replace('\n', '')):
        continue
    if len(row) == 2 and len(row[-1]['content']) < 100:
        continue
    forces.append(('force_mandarin', row))
len(forces)

100%|██████████████████████████████████████████████████████████████████████████████| 146811/146811 [00:06<00:00, 24124.16it/s]


1464

In [21]:
df = pd.read_parquet('force_jawi-00000-of-00001.parquet')
for i in tqdm(range(len(df))):
    row = generate_and_tokenize_prompt(df.iloc[i].to_dict())
    q = row[0]['content']
    if contains_mandarin(q):
        continue
    if reject_q(q.replace('\n', '')):
        continue
    if len(row) == 2 and len(row[-1]['content']) < 100:
        continue
    forces.append(('force_jawi', row))
len(forces)

100%|██████████████████████████████████████████████████████████████████████████████| 371885/371885 [00:20<00:00, 18217.03it/s]


108270

In [22]:
instructions = {}
for f in filtered:
    type, f = f
    prompt = [
        {'role': 'user', 'content': [
            {"type": "audio", "audio_url": "audio.wav"},
        ]},
        {'role': 'assistant', 'content': f['output']},
    ]
    instructions[json.dumps(prompt)] = type

In [23]:
for f in filter_ultrachat:
    type, f = f
    q = f[0]['content'].split('\n')[-1]
    prompt = [
        {"role": "user", "content": [
            {"type": "audio", "audio_url": "audio.wav"},
            {"type": "text", "text": '\n'.join(f[0]['content'].split('\n')[:-1])},
        ]}
    ]
    prompt.extend(f[1:])
    instructions[json.dumps(prompt)] = type

In [24]:
for f in alpaca:
    type, f = f
    prompt = [
        {'role': 'user', 'content': [
            {"type": "audio", "audio_url": "audio.wav"},
        ]},
        {'role': 'assistant', 'content': f['output_ms']},
    ]
    instructions[json.dumps(prompt)] = type

In [25]:
for f in synthetic_code:
    type, f = f
    prompt = [
        {'role': 'user', 'content': [
            {"type": "audio", "audio_url": "audio.wav"},
        ]},
        {'role': 'assistant', 'content': f['output_ms']},
    ]
    instructions[json.dumps(prompt)] = type

In [26]:
for f in mixtral_conversation:
    type, f = f
    accept = True
    for f_ in f:
        if f_['content_ms'] is None:
            accept = False
    if not accept:
        continue
        
    prompt = [
        {'role': 'user', 'content': [
            {"type": "audio", "audio_url": "audio.wav"},
        ]},
        {'role': 'assistant', 'content': f[1]['content_ms']},
    ]
    
    instructions[json.dumps(prompt)] = type

In [27]:
for f in factual_wrong:
    type, f = f
    prompt = [
        {'role': 'user', 'content': [
            {"type": "audio", "audio_url": "audio.wav"},
        ]},
        {'role': 'assistant', 'content': f['answer_ms']},
    ]
    instructions[json.dumps(prompt)] = type

In [28]:
for f in forces:
    type, f = f
    q = f[0]['content'].replace('\n\n', ', ')
    prompt = [
        {"role": "user", "content": [
            {"type": "audio", "audio_url": "audio.wav"},
        ]}
    ]
    prompt.extend(f[1:])
    instructions[json.dumps(prompt)] = type

In [29]:
len(instructions)

539097

In [30]:
with open('instructions-keys.json', 'w') as fopen:
    json.dump(instructions, fopen)