In [1]:
from malaysian_sft import accept, post_accept
from collections import defaultdict
import pandas as pd
import os
import json
import random
import re
import malaysian_sft
import string
from tqdm import tqdm

def reject_translate(text):
    """Tell whether `text` mentions translation, in Malay or English.

    The check is a case-insensitive substring match for 'terjemah' or
    'translate'.

    Args:
        text: String to inspect.

    Returns:
        bool: True if either keyword occurs in `text`, otherwise False.
    """
    lowered = text.lower()
    # Return a real bool on both paths; the previous `if ...: return True`
    # form fell through to an implicit None when nothing matched.
    return 'terjemah' in lowered or 'translate' in lowered

def generate_and_tokenize_prompt(row, validate = True):
    """Turn one dataset row into a list of chat messages.

    Two row layouts are handled:

    * multi-turn: ``row['input']`` contains ``'<bot>:'`` and ``row['output']``
      is None. The text is split on ``'<bot>:'`` / ``'<manusia>:'`` into
      alternating user / assistant turns.
    * single-turn: otherwise ``row['input']`` is the user turn and
      ``row['output']`` the assistant reply.

    A non-empty ``row['prompt_input']`` becomes a leading system message.
    Every message content is stripped of surrounding whitespace.

    Args:
        row: Mapping with the keys 'input', 'output' and 'prompt_input'.
        validate: When True, each message content is passed to `accept` and
            the whole row is rejected as soon as one content is refused.

    Returns:
        A list of ``{'role': ..., 'content': ...}`` dicts, or None when
        `validate` is True and `accept` refuses any message.
    """
    if '<bot>:' in row['input'] and row['output'] is None:
        inputs, outputs = [], []
        splitted = row['input'].split('<bot>:')
        for i in range(len(splitted) - 1):
            if i == 0:
                human = splitted[i].replace('<manusia>:', '')
            else:
                # A later segment holds "<previous bot reply><manusia>:<next
                # user turn>". Without the marker there is no user turn to
                # pair with the next reply, so the segment is skipped. This
                # is an explicit check instead of `except BaseException`,
                # which would also swallow KeyboardInterrupt / SystemExit.
                parts = splitted[i].split('<manusia>:')
                if len(parts) < 2:
                    continue
                human = parts[1]
            bot = splitted[i + 1].split('<manusia>:')[0]
            inputs.append(human)
            outputs.append(bot)
    else:
        inputs = [row['input']]
        outputs = [row['output']]

    chat = []
    if row['prompt_input'] is not None and len(row['prompt_input']):
        chat.append({'role': 'system', 'content': row['prompt_input'].strip()})
    # `user_text` / `bot_text` rather than `input` / `output`, so the builtin
    # `input` is not shadowed.
    for user_text, bot_text in zip(inputs, outputs):
        chat.extend([
            {'role': 'user', 'content': user_text.strip()},
            {'role': 'assistant', 'content': bot_text.strip()},
        ])
    if validate:
        for message in chat:
            if not accept(message['content']):
                return None
    return chat

In [2]:
# Load the Malaysian UltraChat dump. Later cells read the `prompt_input`,
# `input` and `output` fields of each row.
# NOTE(review): absolute, machine-specific path - this cell only runs where
# that file exists at exactly this location.
df = pd.read_parquet('/home/husein/ssd4/llama2/malaysian_ultrachat-00000-of-00001.parquet')
df.head()

Unnamed: 0,prompt_input,input,output
0,,<manusia>: English \n\n\n\nconcepts of patria...,
1,,<manusia>: © 2022 Sains Insani \n\n\n\n\n\n\n...,
2,,<manusia>: a ini. Sumbang saran dilakukan supa...,
3,,<manusia>: 10: 3-4 (2018) 27–33 | www.sainshum...,
4,,<manusia>: they lost their freedom and rights...,


In [3]:
# Keep only conversations that
#   * have no two neighbouring messages opening with the same normalised
#     10-character prefix,
#   * pass `post_accept` on every assistant message, and
#   * do not contain any of the blocked phrases once serialised and lowercased.
# NOTE(review): for the first message `pos - 1` is -1, so it is compared with
# the LAST message of the conversation. This is kept as-is so the filtered
# set stays the same.
filter_ultrachat = []
blocked_phrases = ('dalam konteks di', 'terjemah', 'translate', 'artikel itu')
for row_no in tqdm(range(len(df))):
    chat = generate_and_tokenize_prompt(df.iloc[row_no].to_dict(), validate = False)

    # Lowercased first 20 characters reduced to letters and single spaces,
    # then cut to 10 characters.
    prefixes = []
    for message in chat:
        head = re.sub('[^a-z ]+', ' ', message['content'][:20].lower())
        prefixes.append(re.sub(r'[ ]+', ' ', head).strip()[:10])

    drop_chat = any(
        prefixes[pos] == prefixes[pos - 1]
        or (message['role'] == 'assistant' and not post_accept(message['content']))
        for pos, message in enumerate(chat)
    )
    if drop_chat:
        continue

    serialized = json.dumps(chat).lower()
    if any(phrase in serialized for phrase in blocked_phrases):
        continue
    filter_ultrachat.append(chat)

len(filter_ultrachat)

100%|█████████████████████████████████| 149054/149054 [01:32<00:00, 1618.87it/s]


140308

In [4]:
# Build the pool of reference voices: every podcast record plus a subsample
# of the parliament records.
# The subsample is drawn from a dedicated, seeded RNG so that a fresh run
# picks the same parliament records every time.
PARLIAMENT_SAMPLE_SIZE = 100000
PARLIAMENT_SAMPLE_SEED = 42

speakers = pd.read_parquet('dedup-malaysian-podcasts.parquet').to_dict(orient = 'records')
parliaments = pd.read_parquet('dedup-malaysia-parliament.parquet').to_dict(orient = 'records')
# min(...) keeps the cell working if the parquet holds fewer rows than the
# requested sample size (random.sample would raise ValueError otherwise).
parliaments = random.Random(PARLIAMENT_SAMPLE_SEED).sample(
    parliaments, min(PARLIAMENT_SAMPLE_SIZE, len(parliaments))
)
speakers.extend(parliaments)
len(speakers)

173384

In [5]:
# Shuffle with a seeded RNG so the speaker order - and therefore which voice
# gets paired with which conversation below - is identical on every fresh
# top-to-bottom run. Re-running only this cell permutes the list again.
random.Random(42).shuffle(speakers)

In [6]:
import copy

# Pair each retained conversation with one reference voice.
#
# A conversation is kept only when
#   * the last line of its first message has at least MIN_LAST_LINE_WORDS words,
#   * every user question is non-empty,
#   * no user question has more than MAX_UPPERCASE_RATIO ASCII-uppercase chars,
#   * no user question is longer than MAX_QUESTION_CHARS characters.
# When the first message spans several lines it is split into two text parts:
# everything before the last line, and the last line (treated as the question).
# NOTE(review): this assumes chat[0] is the user turn; a conversation that
# starts with a system message would have that message replaced - confirm
# that `prompt_input` is always empty for this dataset.
MIN_LAST_LINE_WORDS = 5
MAX_UPPERCASE_RATIO = 0.07
MAX_QUESTION_CHARS = 350

ultrachat = []
index = 0  # next unused entry of `speakers`; raises IndexError if the pool runs out
for chat in tqdm(filter_ultrachat):
    # Deep copy so `filter_ultrachat` is left untouched and the cell can be re-run.
    chat = copy.deepcopy(chat)
    first = chat[0]['content'].split('\n')
    if len(first[-1].split()) < MIN_LAST_LINE_WORDS:
        continue
    if len(first) > 1:
        chat[0] = {
            'role': 'user',
            'content': [
                {
                    'type': 'text',
                    'text': '\n'.join(first[:-1])
                },
                {
                    'type': 'text',
                    'text': first[-1]
                }
            ]
        }

    # Text of every user turn; for the split first message that is part [1].
    questions = []
    for message in chat:
        if message['role'] != 'user':
            continue
        content = message['content']
        questions.append(content[1]['text'] if isinstance(content, list) else content)

    # An empty user turn has nothing to speak and would divide by zero in the
    # uppercase-ratio check below, so such conversations are dropped.
    if any(len(q) == 0 for q in questions):
        continue

    if any(
        sum(c in string.ascii_uppercase for c in q) / len(q) > MAX_UPPERCASE_RATIO
        for q in questions
    ):
        continue

    if any(len(q) > MAX_QUESTION_CHARS for q in questions):
        continue

    ultrachat.append({
        'voice': speakers[index],
        'chat': chat
    })
    index += 1

100%|████████████████████████████████| 140308/140308 [00:04<00:00, 34754.32it/s]


In [7]:
# Number of conversations that survived the word-count, uppercase-ratio and
# length filters and were paired with a voice.
len(ultrachat)

101937

In [13]:
# Flatten the conversations into one record per user question, each carrying
# the voice that was assigned to its conversation. For a first message that
# was split into two text parts, the question is the second part.
voices = []
for entry in ultrachat:
    for message in entry['chat']:
        if message['role'] != 'user':
            continue
        content = message['content']
        question = content[1]['text'] if isinstance(content, list) else content
        voices.append({
            'voice': entry['voice'],
            'question': question
        })

len(voices)

369386

In [17]:
# Read the saved (voice, question) records back and count them.
# NOTE(review): the execution counts show this cell (In [17]) was run after
# the cell further down that writes this file (In [16]). On a fresh
# top-to-bottom run it would read a stale or missing file.
with open('prepared-malay-ultrachat-voices.json') as fopen:
    voices_ = json.load(fopen)
len(voices_)

369386

In [10]:
# Persist the voice-paired conversations (list of {'voice', 'chat'} dicts) as JSON.
with open('prepared-malay-ultrachat.json', 'w') as fopen:
    json.dump(ultrachat, fopen)

In [16]:
# Persist the flattened (voice, question) records as JSON; this is the file
# that the reload cell above reads back.
with open('prepared-malay-ultrachat-voices.json', 'w') as fopen:
    json.dump(voices, fopen)

In [None]:
from dia.model import Dia

In [None]:
# Load the `mesolitica/Malaysian-Dia-1.6B` checkpoint, requesting float16 as
# the compute dtype.
model = Dia.from_pretrained("mesolitica/Malaysian-Dia-1.6B", compute_dtype="float16")

In [None]:
# Spot-check one (voice, question) record.
voices[1000]

In [None]:
# Assemble a single generation request for record `i`: the reference
# transcription (tagged [S1]) followed directly by the question to speak
# (also tagged [S1]), with the reference audio supplied once per text.
i = 100
reference = voices[i]['voice']
clone_from_text = f"[S1] {reference['transcription']}"
clone_from_audio = reference['audio']
t_ = voices[i]['question']
text = clone_from_text + '[S1] ' + t_.strip()
texts = [text]
clone_from_audios = [clone_from_audio for _ in texts]
t_

In [None]:
# Character length of the selected question.
len(t_)

In [None]:
# Generate audio for `texts`, passing the matching reference clips as
# `audio_prompt`.
# NOTE(review): presumably the reference clip plus its transcription at the
# start of each text is what drives voice cloning - confirm against the Dia
# documentation, along with the meaning of max_tokens / cfg_scale.
output = model.generate(texts, audio_prompt=clone_from_audios, use_torch_compile=True, verbose=True, 
                        max_tokens=3000, temperature = 1.0, cfg_scale = 1.0)

In [None]:
# Play the generated audio inline.
# NOTE(review): assumes the model output is sampled at 44.1 kHz - TODO confirm.
import IPython.display as ipd
ipd.Audio(output, rate = 44100)

In [None]:
# Collect the per-clip JSON files found under `ultrachat-speech/`, sorted by
# path, and count them.
from glob import glob
import json
import os

files = sorted(glob('ultrachat-speech/*.json'))
len(files)

In [None]:
# Inspect the metadata of one arbitrarily chosen clip; the next cell reads
# its 'filename_audio' entry.
with open(files[351]) as fopen:
    d = json.load(fopen)
d

In [None]:
# Listen to the audio file referenced by the metadata loaded above.
import IPython.display as ipd
ipd.Audio(d['filename_audio'], rate = 44100)

In [None]:
# Size in bytes of the MP3 that sits next to the first JSON file.
os.path.getsize(files[0].replace('.json', '.mp3'))

In [None]:
# Collect the JSON files whose sibling MP3 is at most MIN_MP3_BYTES bytes.
MIN_MP3_BYTES = 10000

rejected = []
for f in files:
    try:
        mp3_size = os.path.getsize(f.replace('.json', '.mp3'))
    except OSError:
        # The MP3 is missing or unreadable, so the entry is skipped as
        # before. Only OSError is caught, so Ctrl-C and genuine programming
        # errors are no longer silently swallowed.
        continue
    if mp3_size <= MIN_MP3_BYTES:
        rejected.append(f)

In [None]:
# How many clips fell at or below the size threshold.
len(rejected)

In [None]:
# Record (mp3_path, size_in_bytes) for every JSON file that has a readable
# sibling MP3.
sizes = []
for f in files:
    new_f = f.replace('.json', '.mp3')
    try:
        sizes.append((new_f, os.path.getsize(new_f)))
    except OSError:
        # The MP3 is missing or unreadable, so the entry is left out. Only
        # OSError is caught, rather than every exception including
        # KeyboardInterrupt.
        continue

len(sizes)

In [None]:
# List all clips ordered from smallest to largest MP3.
# NOTE(review): this displays the entire list; consider slicing it if only
# the extremes are of interest.
sorted(sizes, key = lambda x: x[1])

In [None]:
# Listen to one specific clip by its hard-coded path.
import IPython.display as ipd
ipd.Audio('ultrachat-speech/96731.mp3')