In [1]:
# !wget https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/guess_age_gender.wav
# !wget https://huggingface.co/datasets/mesolitica/malaysian-youtube-audio-instructions/resolve/main/mixtral-audio-instruction.jsonl
# !wget https://huggingface.co/datasets/mesolitica/Malaysian-SFT/resolve/main/combine/combined-malaysian-sft-20k-sample.jsonl

In [2]:
import os

os.environ['HF_HOME'] = '/home/husein/ssd3'
os.environ['CUDA_VISIBLE_DEVICES'] = '2'

In [3]:
# !wget https://raw.githubusercontent.com/mesolitica/malaysian-dataset/refs/heads/master/llm-instruction/malaysian-sft/malaysian_sft.py

In [4]:
import librosa
import torch
import torch.nn as nn
import pandas as pd
from datasets import Audio
from peft import LoraConfig, get_peft_model
from transformers import AutoProcessor, Qwen2AudioForConditionalGeneration, AutoConfig, AutoModelForCausalLM
from streaming import MDSWriter
from streaming.base.format.mds.encodings import Encoding, _encodings
from streaming import LocalDataset
from malaysian_sft import accept
import streaming
import numpy as np
from tqdm import tqdm
from glob import glob
import os
import json

class UInt32(Encoding):
    def encode(self, obj) -> bytes:
        return obj.tobytes()

    def decode(self, data: bytes):
        return np.frombuffer(data, np.uint32)

_encodings['uint32'] = UInt32

columns = {
    'input_ids': 'uint32',
    'position_ids': 'uint32',
    'attention_mask': 'uint32',
    'audio': 'str',
    'text': 'str'
}
hashes = 'sha1', 'xxh64'

In [5]:
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-Audio-7B-Instruct")
audio_token = "<|AUDIO|>"
audio_bos_token = "<|audio_bos|>"
audio_eos_token = "<|audio_eos|>"
audio_token_id = processor.tokenizer._convert_token_to_id_with_added_voc('<|AUDIO|>')
pad_token_id = processor.tokenizer.pad_token_id
tokenizer = processor.tokenizer
torch_dtype = torch.bfloat16
min_dtype = torch.finfo(torch_dtype).min
sequence_length = 4096

In [6]:
df = pd.read_parquet('sample_filtered_gpt_omni-00000-of-00001.parquet')

In [7]:
data = []
for i in tqdm(range(len(df))):
    conversation = [
        {"role": "system", "content": df.iloc[i]['system']},
        {"role": "user", "content": [
            {"type": "audio", "audio_url": df.iloc[i]['sliced_audio_filename']},
        ]},
        {"role": "assistant", "content": df.iloc[i]['answer']},
    ]
    text = processor.apply_chat_template(conversation, tokenize=False)
    data.append({
        'text': text,
        'audio': df.iloc[i]['sliced_audio_filename'],
        'input_ids': np.array([]).astype(np.uint32),
        'position_ids': np.array([]).astype(np.uint32),
        'attention_mask': np.array([]).astype(np.uint32),
    })

100%|█████████████████████████████████████████████████████████████████████████████████| 99997/99997 [00:16<00:00, 6018.34it/s]


In [8]:
df = pd.read_parquet('random_question_chunks-00000-of-00001.parquet')

In [9]:
for i in tqdm(range(len(df))):
    conversation = [
        {"role": "system", "content": df.iloc[i]['system']},
        {"role": "user", "content": [
            {"type": "audio", "audio_url": df.iloc[i]['sliced_audio_filename']},
        ]},
        {"role": "assistant", "content": df.iloc[i]['answer']},
    ]
    text = processor.apply_chat_template(conversation, tokenize=False)
    data.append({
        'text': text,
        'audio': df.iloc[i]['sliced_audio_filename'],
        'input_ids': np.array([]).astype(np.uint32),
        'position_ids': np.array([]).astype(np.uint32),
        'attention_mask': np.array([]).astype(np.uint32),
    })

100%|█████████████████████████████████████████████████████████████████████████████████| 76524/76524 [00:15<00:00, 5046.63it/s]


In [10]:
import json
import random

system_ms = [
    'You are a polite and helpful assistant that use voice to communicate with the user, always reply in Malay',
    'always reply in malay, awak adalah voice chatbot yg sangat membantu dan communicate dengan orang menggunakan suara',
    'u are a voice assistant that u use voice to communicate, always reply in malay',
    'sentiasa balas dalam bahasa Melayu, awak adalah assistant suara yang menggunakan teknologi suara sintesis untuk berkomunikasi',
]

count = 0
with open('mixtral-audio-instruction.jsonl') as fopen:
    for l in tqdm(fopen):
        l = json.loads(l)
        q = l['chat'][0]['content_ms']
        
        a = l['chat'][1]['content_ms']
            
        if q is None:
            continue
            
        if a is None:
            continue
        
        if not accept(q):
            continue
    
        if not accept(a):
            continue
            
        conversation = [
            {"role": "system", "content": random.choice(system_ms)},
            {"role": "user", "content": [
                {"type": "audio", "audio_url": l['filename']},
                {"type": "text", "text": q},
            ]},
            {"role": "assistant", "content": a},
        ]
        text = processor.apply_chat_template(conversation, tokenize=False)
        data.append({
            'text': text,
            'audio': l['filename'],
            'input_ids': np.array([]).astype(np.uint32),
            'position_ids': np.array([]).astype(np.uint32),
            'attention_mask': np.array([]).astype(np.uint32),
        })
        count += 1
        if count > 50000:
            break

83103it [00:42, 1947.16it/s]


In [11]:
folders = sorted(glob('tokenized-4k/tokenized-*'), key = lambda x: int(x.split('-')[-1]))
folders

['tokenized-4k/tokenized-0',
 'tokenized-4k/tokenized-1',
 'tokenized-4k/tokenized-2',
 'tokenized-4k/tokenized-3',
 'tokenized-4k/tokenized-4',
 'tokenized-4k/tokenized-5',
 'tokenized-4k/tokenized-6',
 'tokenized-4k/tokenized-7',
 'tokenized-4k/tokenized-8',
 'tokenized-4k/tokenized-9',
 'tokenized-4k/tokenized-10',
 'tokenized-4k/tokenized-11',
 'tokenized-4k/tokenized-12',
 'tokenized-4k/tokenized-13',
 'tokenized-4k/tokenized-14',
 'tokenized-4k/tokenized-15',
 'tokenized-4k/tokenized-16',
 'tokenized-4k/tokenized-17']

In [12]:
len(data)

226522

In [14]:
!rm -rf packing-4k

In [15]:
with MDSWriter(out='packing-4k', columns=columns, compression=None, hashes=hashes) as out:
    for f in folders:
        try:
            dataset = LocalDataset(local=f)
            for i in tqdm(range(len(dataset))):
                out.write(dataset[i])
        except Exception as e:
            print(e)
            pass
    
    for d in tqdm(data):
        out.write(d)

100%|██████████████████████████████████████████████████████████████████████████████████| 5253/5253 [00:00<00:00, 15395.65it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 15367/15367 [00:01<00:00, 13458.10it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 7390/7390 [00:00<00:00, 15311.48it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 18664/18664 [00:01<00:00, 13917.32it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 4046/4046 [00:00<00:00, 13867.46it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 1396/1396 [00:00<00:00, 10340.08it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 1402/1402 [00:00<00:00, 10588.55it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 2078/2078 [00:00<00:00

In [16]:
dataset = LocalDataset('packing-4k')
dataset[-1]

{'attention_mask': array([], dtype=uint32),
 'audio': 'output-audio/0-2758-27.mp3',
 'input_ids': array([], dtype=uint32),
 'position_ids': array([], dtype=uint32),
 'text': '<|im_start|>system\nsentiasa balas dalam bahasa Melayu, awak adalah assistant suara yang menggunakan teknologi suara sintesis untuk berkomunikasi<|im_end|>\n<|im_start|>user\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>\nApakah peranan Allah dalam menentukan rezeki seseorang (nafkah atau rezeki)?<|im_end|>\n<|im_start|>assistant\nDalam petikan yang anda sediakan, penceramah menyatakan kepercayaan bahawa "rezeki" (rezeki atau rezeki) mereka berada di tangan Allah, dewa Muslim. Mereka mencadangkan bahawa apa sahaja yang dilakukan, jika Allah menghendaki ia berlaku, ia akan berlaku. Penceramah juga menasihati penonton mereka untuk melindungi dan menjaga orang yang bekerja untuk mereka, di dalam dan di luar pejabat.\n\nDalam akidah Islam, Allah dilihat sebagai pemberi rezeki dan penghidupan yang paling utama. Umat Isl

In [23]:
dataset[0]['input_ids'].shape

(4096,)

In [None]:
from huggingface_hub import HfApi
api = HfApi()

api.upload_folder(
    folder_path="packing-4k",
    repo_id="huseinzol05/malaysian-audio-qa-pretraining",
    repo_type="dataset",
)

shard.00000.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00001.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

Upload 79 LFS files:   0%|          | 0/79 [00:00<?, ?it/s]

shard.00002.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00003.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00004.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00005.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00006.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00007.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00008.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00009.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00010.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00011.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00012.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00013.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00014.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00015.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00016.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00017.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00018.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00019.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00020.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00021.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]