In [1]:
# !wget https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/guess_age_gender.wav
# !wget https://huggingface.co/datasets/mesolitica/malaysian-youtube-audio-instructions/resolve/main/mixtral-audio-instruction.jsonl
# !wget https://huggingface.co/datasets/mesolitica/Malaysian-SFT/resolve/main/combine/combined-malaysian-sft-20k-sample.jsonl

In [2]:
import os

os.environ['HF_HOME'] = '/home/husein/ssd3'
os.environ['CUDA_VISIBLE_DEVICES'] = '2'

In [3]:
# !wget https://raw.githubusercontent.com/mesolitica/malaysian-dataset/refs/heads/master/llm-instruction/malaysian-sft/malaysian_sft.py

In [4]:
import librosa
import torch
import torch.nn as nn
import pandas as pd
from datasets import Audio
from peft import LoraConfig, get_peft_model
from transformers import AutoProcessor, Qwen2AudioForConditionalGeneration, AutoConfig, AutoModelForCausalLM
from streaming import MDSWriter
from streaming.base.format.mds.encodings import Encoding, _encodings
from streaming import LocalDataset
from malaysian_sft import accept
import streaming
import numpy as np
from tqdm import tqdm
from glob import glob
import os
import json
import math

class UInt32(Encoding):
    def encode(self, obj) -> bytes:
        return obj.tobytes()

    def decode(self, data: bytes):
        return np.frombuffer(data, np.uint32)

_encodings['uint32'] = UInt32

columns = {
    'input_ids': 'uint32',
    'position_ids': 'uint32',
    'attention_mask': 'uint32',
    'audio': 'str',
    'text': 'str'
}
hashes = 'sha1', 'xxh64'

In [5]:
audio_class = Audio(sampling_rate=16000)
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-Audio-7B-Instruct")
tokenizer = processor.tokenizer
audio_token = "<|AUDIO|>"
audio_bos_token = "<|audio_bos|>"
audio_eos_token = "<|audio_eos|>"
audio_token_id = processor.tokenizer._convert_token_to_id_with_added_voc('<|AUDIO|>')
pad_token_id = processor.tokenizer.pad_token_id
torch_dtype = torch.bfloat16
min_dtype = torch.finfo(torch_dtype).min
sequence_length = 4096

In [6]:
processor.feature_extractor.hop_length

160

In [7]:
# data = []
# df = pd.read_parquet('sample_filtered_gpt_omni-00000-of-00001.parquet')
# for i in tqdm(range(len(df))):
#     conversation = [
#         {"role": "system", "content": df.iloc[i]['system']},
#         {"role": "user", "content": [
#             {"type": "audio", "audio_url": df.iloc[i]['sliced_audio_filename']},
#         ]},
#         {"role": "assistant", "content": df.iloc[i]['answer']},
#     ]
#     text = processor.apply_chat_template(conversation, tokenize=False)
#     data.append({
#         'text': text,
#         'audio': df.iloc[i]['sliced_audio_filename'],
#     })
    
# df = pd.read_parquet('random_question_chunks-00000-of-00001.parquet')
# for i in tqdm(range(len(df))):
#     conversation = [
#         {"role": "system", "content": df.iloc[i]['system']},
#         {"role": "user", "content": [
#             {"type": "audio", "audio_url": df.iloc[i]['sliced_audio_filename']},
#         ]},
#         {"role": "assistant", "content": df.iloc[i]['answer']},
#     ]
#     text = processor.apply_chat_template(conversation, tokenize=False)
#     data.append({
#         'text': text,
#         'audio': df.iloc[i]['sliced_audio_filename'],
#     })

# with open('prepare-mixtral-audio-instruction.json') as fopen:
#     mixtral = json.load(fopen)
    
# for row in tqdm(mixtral):
#     conversation = [
#         {"role": "system", "content": row['system']},
#         {"role": "user", "content": [
#             {"type": "audio", "audio_url": row['audio']},
#             {"type": "text", "text": row['question']},
#         ]},
#         {"role": "assistant", "content": row['answer']},
#     ]
#     text = processor.apply_chat_template(conversation, tokenize=False)
#     data.append({
#         'text': text,
#         'audio': row['audio'],
#     })

# with open('processed.json', 'w') as fopen:
#     json.dump(data, fopen)

In [8]:
with open('processed.json') as fopen:
    data = json.load(fopen)

In [9]:
from datasets import Audio

def collator(batch, batch_position_ids, audio):
    input_ids = []
    position_ids = []
    masks = []
    for i in range(len(batch)):
        l = len(batch[i])
        input_ids.extend(batch[i])
        position_ids.extend(batch_position_ids[i])
        masks.append(l)
        
    input_ids = input_ids + [pad_token_id] * (sequence_length - len(input_ids))
    position_ids = position_ids + [0] * (sequence_length - len(position_ids))
    
    return {
        'input_ids': np.array(input_ids).astype(np.uint32),
        'position_ids': np.array(position_ids).astype(np.uint32),
        'attention_mask': np.array(masks).astype(np.uint32),
        'audio': json.dumps(audio),
        'text': '',
    }

def get_new_token(f, sample):

    audio_ = audio_class.decode_example(audio_class.encode_example(f))['array']

#     inputs_audio = processor.feature_extractor(
#         [audio_], 
#         return_attention_mask=True, 
#         padding="max_length", 
#         sampling_rate=16000,
#         return_tensors = 'pt'
#     )

    audio_lengths = [min(3000, math.ceil(len(audio_) / processor.feature_extractor.hop_length))]

    num_audio_tokens = sample.count(audio_token)
    replace_str = []
    while audio_token in sample:
        audio_length = audio_lengths.pop(0)
        input_length = (audio_length - 1) // 2 + 1
        num_audio_tokens = (input_length - 2) // 2 + 1

        expanded_audio_token = audio_token * num_audio_tokens

        audio_token_start_idx = sample.find(audio_token)
        audio_token_end_idx = audio_token_start_idx + len(audio_token)

        has_bos = (
            sample[audio_token_start_idx - len(audio_bos_token) : audio_token_start_idx]
            == audio_bos_token
        )
        has_eos = (
            sample[audio_token_end_idx : audio_token_end_idx + len(audio_eos_token)]
            == audio_eos_token
        )

        if not has_bos and not has_eos:
            expanded_audio_token = audio_bos_token + expanded_audio_token + audio_eos_token

        replace_str.append(expanded_audio_token)
        sample = sample.replace(audio_token, "<placeholder>", 1)

    while "<placeholder>" in sample:
        sample = sample.replace("<placeholder>", replace_str.pop(0), 1)
    
    return sample

In [10]:
!rm -rf tokenized-audio-4096
!mkdir tokenized-audio-4096

In [11]:
import time

def loop(files, block_size = sequence_length):
    rows, index = files
    out_root = f'tokenized-audio-4096/tokenized-{index}'
    os.system(f'rm -rf {out_root}')
    count = 0
    temp = []
    position_ids = []
    audio = []
    with MDSWriter(out=out_root, columns=columns, compression=None, hashes=hashes) as out:
        for row in tqdm(rows, desc=f'loop {index}'):
            prompt = get_new_token(row['audio'], row['text'])
            outputs = tokenizer(prompt, add_special_tokens = False)
            length = len(outputs['input_ids'])
            if length > block_size:
                continue
            
            if count + length > block_size:
                o = collator(temp, position_ids, audio)
                out.write(o)
                temp = [outputs['input_ids']]
                position_ids = [range(len(outputs['input_ids']))]
                audio = [row['audio']]
                count = length
            
            else:
                temp.append(outputs['input_ids'])
                position_ids.append(range(len(outputs['input_ids'])))
                audio.append(row['audio'])
                count += length
        
        if len(temp):
            o = collator(temp, position_ids, audio)
            out.write(o)
    return [1]

In [12]:
# loop((data, 0))

In [13]:
from multiprocess import Pool
import mp

chunks = mp.chunks(data, 20000)
pool = Pool(10)
pooled = pool.map(loop, chunks)
pool.close()
pool.join()

loop 9:   4%|██▊                                                                         | 726/20000 [00:08<03:11, 100.65it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (10176 > 8192). Running this sequence through the model will result in indexing errors
loop 8:  38%|████████████████████████████▌                                              | 7612/20000 [00:40<00:58, 211.59it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (10240 > 8192). Running this sequence through the model will result in indexing errors
loop 6: 100%|██████████████████████████████████████████████████████████████████████████| 20000/20000 [01:40<00:00, 198.61it/s]
loop 5: 100%|██████████████████████████████████████████████████████████████████████████| 20000/20000 [01:40<00:00, 198.06it/s]
loop 7: 100%|██████████████████████████████████████████████████████████████████████████| 20000/20000 [01:40<00:00, 198.77it/

In [14]:
folders = glob('tokenized-audio-4096/tokenized-*') + glob('tokenized-4k/tokenized-*')
len(folders)

36

In [15]:
!rm -rf packing-4k

In [16]:
with MDSWriter(out='packing-4k', columns=columns, compression=None, hashes=hashes) as out:
    for f in folders:
        try:
            dataset = LocalDataset(local=f)
            for i in tqdm(range(len(dataset))):
                out.write(dataset[i])
        except Exception as e:
            print(e)
            pass

100%|██████████████████████████████████████████████████████████████████████████████████| 6527/6527 [00:00<00:00, 13453.82it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 6445/6445 [00:00<00:00, 12821.25it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 5199/5199 [00:00<00:00, 15520.04it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 6440/6440 [00:00<00:00, 11385.00it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 1429/1429 [00:00<00:00, 43104.97it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 1580/1580 [00:00<00:00, 11197.70it/s]
100%|███████████████████████████████████████████████████████████████████████████████████| 1266/1266 [00:00<00:00, 9407.17it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 6434/6434 [00:00<00:00

In [17]:
import torch.nn.functional as F

def block_diagonal_concat_inverted(*masks, dtype=torch.bfloat16):
    total_size = sum(mask.size(0) for mask in masks)
    combined_mask = torch.zeros(total_size, total_size, dtype=dtype)

    current_pos = 0

    for mask in masks:
        size = mask.size(0)
        combined_mask[current_pos:current_pos + size, current_pos:current_pos + size] = mask
        current_pos += size

    min_value = torch.finfo(dtype).min if dtype.is_floating_point else torch.iinfo(dtype).min
    inverted_mask = torch.where(combined_mask == 1, torch.tensor(0, dtype=dtype), min_value)
    return inverted_mask.unsqueeze(0)

def pad_attention_mask_4d(attention_mask, max_size = 4096):
    maxlen_right = max_size
    maxlen_bottom = max_size
    attention_mask = [
        F.pad(
            attention_mask[i],
            (0, maxlen_right - attention_mask[i].shape[-2], 0, maxlen_bottom - attention_mask[i].shape[-1])) for i in range(
            len(attention_mask))]
    return torch.stack(attention_mask)

In [18]:
def collator(batch):
    batch = [b for b in batch if b is not None] 
    input_ids, attention_mask, position_ids, labels = [], [], [], []
    input_features, feature_attention_mask = [], []

    for b in batch:
        if 'input_features' in b:
            input_features.append(b['input_features'])
            feature_attention_mask.append(b['feature_attention_mask'])
        input_ids.append(b['input_ids'][None])
        attention_mask.append(b['attention_mask'])
        position_ids.append(b['position_ids'][None])
        labels.append(b['labels'][None])

    input_ids = {
        'input_ids': torch.concat(input_ids, 0),
        'attention_mask': pad_attention_mask_4d(attention_mask, sequence_length),
        'position_ids': torch.concat(position_ids, 0),
        'labels': torch.concat(labels, 0),
    }
    if len(input_features):
        input_ids['input_features'] = torch.concat(input_features, 0)
        input_ids['feature_attention_mask'] = torch.concat(feature_attention_mask, 0)

    return input_ids

In [19]:
class DatasetFixed(torch.utils.data.Dataset):
    def __init__(self, local):
        self.dataset = LocalDataset(local=local)
        self.audio = Audio(sampling_rate=16000)

    def __getitem__(self, idx):
        data = self.dataset[idx]
        try:
            data.pop('text', None)
            audio_files = data.pop('audio', '')
            data['labels'] = data["input_ids"].copy()
            masking = data.pop('attention_mask')

            data.pop('token_type_ids', None)
            for k in data.keys():
                data[k] = torch.tensor(data[k].astype(np.int64))

            masks = []
            for m in masking:
                masks.append(torch.tril(torch.ones(m, m)))
            attention_mask = block_diagonal_concat_inverted(*masks)
            data['attention_mask'] = attention_mask

            data['labels'][data['labels'] == audio_token_id] = -100
            data['labels'][data['labels'] == pad_token_id] = -100

            if len(audio_files):
                files = json.loads(audio_files)
                print(files)
                audios = []
                for f in files:
                    audio = self.audio.decode_example(
                    self.audio.encode_example(f))['array']
                    audios.append(audio)

                inputs_audio = processor.feature_extractor(
                    audios, return_attention_mask=True, 
                    sampling_rate=16000,
                    padding="max_length", return_tensors = 'pt')

                data['input_features'] = inputs_audio['input_features']
                data['feature_attention_mask'] = inputs_audio['attention_mask']
                
            return data

        except Exception as e:
            print('Exception', e)
            return None

    def __len__(self):
        return len(self.dataset)

In [20]:
dataset = DatasetFixed('packing-4k')

In [21]:
batch = [dataset[0], dataset[-1]]

['filter-audio/3-3061-34.mp3', 'filter-audio/3-3061-34.mp3']


In [22]:
b = collator(batch)
b['feature_attention_mask']

tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]], dtype=torch.int32)

In [23]:
b['feature_attention_mask'].sum()

tensor(6000)

In [24]:
(3000 - 1) // 2 + 1

1500

In [25]:
!du -hs packing-4k

7.0G	packing-4k


In [28]:
from huggingface_hub import HfApi
api = HfApi()

api.upload_folder(
    folder_path="packing-4k",
    repo_id="huseinzol05/malaysian-audio-qa-pretraining",
    repo_type="dataset",
)

shard.00069.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00067.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00062.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00061.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00068.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

Upload 47 LFS files:   0%|          | 0/47 [00:00<?, ?it/s]

shard.00070.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00071.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00072.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00073.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00074.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00075.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00076.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00077.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00078.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00079.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00080.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00081.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00082.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00083.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00084.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00085.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00086.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00087.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00088.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00089.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00090.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00091.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00092.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00093.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00094.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00095.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00096.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00097.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00098.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00099.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00100.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00101.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00102.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00103.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00104.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00105.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00106.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00107.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00108.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00109.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00110.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00111.mds:   0%|          | 0.00/14.1M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/huseinzol05/malaysian-audio-qa-pretraining/commit/3cf23fc9b84f4366714556f43f40789f56b96541', commit_message='Upload folder using huggingface_hub', commit_description='', oid='3cf23fc9b84f4366714556f43f40789f56b96541', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/huseinzol05/malaysian-audio-qa-pretraining', endpoint='https://huggingface.co', repo_type='dataset', repo_id='huseinzol05/malaysian-audio-qa-pretraining'), pr_revision=None, pr_num=None)