In [1]:
# !wget https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/guess_age_gender.wav
# !wget https://huggingface.co/datasets/mesolitica/Malaysian-SFT/resolve/main/combine/combined-malaysian-sft-20k-sample.jsonl

In [2]:
import os

# Set these before the heavy imports in the next cell so the libraries pick them up.
# NOTE(review): both values are machine-specific (local cache path, GPU index);
# adjust them for the host this notebook runs on.
os.environ['HF_HOME'] = '/home/husein/ssd3'
os.environ['CUDA_VISIBLE_DEVICES'] = '2'
# The `!shell` cells further down fork the kernel after the tokenizer has been
# used, which prints the huggingface/tokenizers fork warning. Pin the setting
# explicitly, as that warning recommends.
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

In [3]:
import librosa
import torch
import torch.nn as nn
import pandas as pd
from datasets import Audio
from peft import LoraConfig, get_peft_model
from transformers import AutoProcessor, Qwen2AudioForConditionalGeneration, AutoConfig, AutoModelForCausalLM
from streaming import MDSWriter
from streaming.base.format.mds.encodings import Encoding, _encodings
from streaming import LocalDataset
import streaming
import numpy as np
from tqdm import tqdm
from glob import glob
import os
import json

class UInt32(Encoding):
    """MDS column encoding that stores an array as raw ``uint32`` bytes."""

    def encode(self, obj) -> bytes:
        """Serialize ``obj`` to bytes.

        ``decode`` always interprets the payload as uint32, so the dtype is
        normalized here. Without this, an array of another dtype (e.g. the
        int64 that ``np.array`` produces for Python ints) would be written with
        a different item size and read back as garbage.
        """
        arr = np.asarray(obj)
        if arr.dtype != np.uint32:
            arr = arr.astype(np.uint32)
        return arr.tobytes()

    def decode(self, data: bytes):
        """Rebuild a flat uint32 array from the raw bytes."""
        return np.frombuffer(data, np.uint32)

# Register the encoding under the name used by the `columns` schema below.
_encodings['uint32'] = UInt32

# MDS schema shared by every MDSWriter in this notebook: three arrays stored
# with the custom 'uint32' encoding registered above, plus two plain strings.
columns = {
    'input_ids': 'uint32',
    'position_ids': 'uint32',
    'attention_mask': 'uint32',
    'audio': 'str',
    'text': 'str'
}
# Hash algorithms handed to every MDSWriter through `hashes=`.
hashes = 'sha1', 'xxh64'

In [4]:
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-Audio-7B-Instruct")
tokenizer = processor.tokenizer

# Special tokens that delimit and stand in for audio inside the chat prompt.
audio_token = "<|AUDIO|>"
audio_bos_token = "<|audio_bos|>"
audio_eos_token = "<|audio_eos|>"
# Use the public lookup API rather than the private
# `_convert_token_to_id_with_added_voc`, and reuse `audio_token` so the literal
# is defined in exactly one place.
audio_token_id = tokenizer.convert_tokens_to_ids(audio_token)
pad_token_id = tokenizer.pad_token_id

# dtype of the additive attention masks; `min_dtype` is the value used for
# masked-out positions.
torch_dtype = torch.bfloat16
min_dtype = torch.finfo(torch_dtype).min
# Tokens per packed block / per padded audio prompt.
sequence_length = 512

In [5]:
# Read conversations from the JSONL sample, one JSON document per line.
# The loop stops once the list holds more than 100 entries, i.e. 101 rows.
combine = []
with open('combined-malaysian-sft-20k-sample.jsonl') as fopen:
    for raw_line in fopen:
        combine.append(json.loads(raw_line))
        if len(combine) > 100:
            break

len(combine)

101

In [6]:
import gc

def collator(batch, batch_position_ids):
    """Flatten several token sequences into one MDS record.

    Args:
        batch: list of token-id sequences that together form one packed block.
        batch_position_ids: per-sequence position ids; only the first
            ``len(batch)`` entries are read.

    Returns:
        A dict matching the ``columns`` schema. ``input_ids`` and
        ``position_ids`` are the concatenated sequences, ``attention_mask``
        holds the length of each sequence (not a 0/1 mask), and the two string
        fields are empty.
    """
    flat_tokens = [token for sequence in batch for token in sequence]
    flat_positions = [
        position
        for row in range(len(batch))
        for position in batch_position_ids[row]
    ]
    segment_lengths = [len(sequence) for sequence in batch]

    record = {}
    record['input_ids'] = np.array(flat_tokens).astype(np.uint32)
    record['position_ids'] = np.array(flat_positions).astype(np.uint32)
    record['attention_mask'] = np.array(segment_lengths).astype(np.uint32)
    record['audio'] = ''
    record['text'] = ''
    return record

def slice_and_balance(nested_list, size):
    """Split a list of sequences at a total-length budget.

    Sequences are consumed in order until ``size`` items have been taken. The
    sequence that crosses the budget is cut in two; everything after it goes
    to the remainder untouched.

    Args:
        nested_list: list of sliceable sequences (lists, ranges, ...).
        size: number of items the first part may hold in total.

    Returns:
        ``(first, balance)``: the sequences that fill the budget, and the
        leftover sequences.
    """
    head, tail = [], []
    filled = 0

    for seq in nested_list:
        # Budget already exhausted: the sequence is carried over whole.
        if filled >= size:
            tail.append(seq)
            continue

        room = size - filled
        if len(seq) > room:
            # Straddles the boundary: split at the remaining room.
            head.append(seq[:room])
            tail.append(seq[room:])
            filled = size
        else:
            head.append(seq)
            filled += len(seq)

    return head, tail

In [7]:
!mkdir tokenized-4k

mkdir: cannot create directory ‘tokenized-4k’: File exists


In [8]:
import time

def loop(files, block_size = sequence_length):
    """Tokenize chat rows and write them to an MDS shard as fixed-size packed blocks.

    Rows are rendered with the tokenizer's chat template, tokenized, and
    concatenated. Every time ``block_size`` tokens are available, one record is
    written. The trailing partial block is back-filled with the leading tokens
    of the last full block so that it also holds exactly ``block_size`` tokens.

    Args:
        files: ``(rows, index)`` pair. ``rows`` is an iterable of conversations
            accepted by ``tokenizer.apply_chat_template``; ``index`` selects the
            output directory ``tokenized-4k/tokenized-{index}``.
        block_size: number of tokens per written record.

    Returns:
        The back-filled final record if one was written, otherwise ``None``.
    """
    import shutil

    rows, index = files
    out_root = f'tokenized-4k/tokenized-{index}'
    # Start from a clean output directory, without going through a shell.
    shutil.rmtree(out_root, ignore_errors=True)
    count = 0
    temp = []
    position_ids = []
    last_block, last_position_block = None, None
    with MDSWriter(out=out_root, columns=columns, compression=None, hashes=hashes) as out:
        for row in tqdm(rows):
            prompt = tokenizer.apply_chat_template(row, tokenize=False)
            outputs = tokenizer(prompt, add_special_tokens = False)
            temp.append(outputs['input_ids'])
            # Position ids restart at 0 for every document in the pack.
            position_ids.append(range(len(outputs['input_ids'])))
            count += len(outputs['input_ids'])
            while count >= block_size:
                block, temp = slice_and_balance(temp, block_size)
                block_position, position_ids = slice_and_balance(position_ids, block_size)
                count = count - block_size
                o = collator(block, block_position)
                last_block = block
                last_position_block = block_position
                out.write(o)

        # Skip the tail in two cases:
        # * no full block was ever produced, so there is nothing to borrow
        #   tokens from (previously a TypeError on `None`);
        # * nothing is left over, so back-filling would only write the last
        #   block a second time.
        if last_block is None or count == 0:
            return None

        # Borrow the head of the last full block to top the tail up to block_size.
        block, _ = slice_and_balance(last_block, block_size - count)
        block_position, _ = slice_and_balance(last_position_block, block_size - count)

        block.extend(temp)
        block_position.extend(position_ids)

        o = collator(block, block_position)
        if len(o['input_ids']) == block_size:
            out.write(o)
            return o

In [9]:
# Pack the first 100 conversations into shard index 0; the cell displays the returned record.
loop((combine[:100], 0))

100%|█████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 1072.37it/s]


{'input_ids': array([  8656,  11814,   9810,   3362,    276,    452,  86415,   1728,
         10371,  49935,    272,   1225,    391,   9101,  58554,     74,
          6053,     11,  92619,  67431,  97934,    276,   2953,    709,
            74,   2154,  78549,  10215,   1346,   1574,     75,  43518,
         10582,   1330,   1579,   9101,  14885,  52067,  50977,    278,
         22051,    382,    818,   2267,    595,  12800,    355,    452,
         86415,   1728,    595,  27574,    372,    293,   9247,  78351,
          1207,   5049,   4647,    300,   3101,   1447,     16,     13,
          1207,   3994,    293,   9247,  78351,   1207,   5049,     25,
         19739,    266,   1579,  11806,   1276,    595,  27574,    372,
           708,  73790,  13207,   4554,  29180,   3101,  19593,  10581,
          9810,    276,  19262,   1997,  10215,   1207,   3994,  31811,
          1962,     74,  13229,   2804,  59220,     13,  73767,    708,
         73790,  21249,    391,   2143,  20414,  10

In [10]:
# Audio samples; later cells read the `system`, `sliced_audio_filename` and `answer` columns.
df = pd.read_parquet('sample_filtered_gpt_omni-00000-of-00001.parquet')

In [11]:
# Remove any previous output so the MDSWriter for 'audio-out' below starts from scratch.
!rm -rf audio-out

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [12]:
# Upper bound on the number of audio conversations exported from `df`.
num_audio_samples = 100

with MDSWriter(out='audio-out', columns=columns, compression=None, hashes=hashes) as out:
    # Audio records carry only the rendered prompt and the audio path; the
    # token columns are written empty.
    empty_ids = np.array([], dtype=np.uint32)
    # `head(...)` caps the export without raising when the frame has fewer rows
    # than requested, and each row is looked up once instead of four times.
    for _, row in df.head(num_audio_samples).iterrows():
        conversation = [
            {"role": "system", "content": row['system']},
            {"role": "user", "content": [
                {"type": "audio", "audio_url": row['sliced_audio_filename']},
            ]},
            {"role": "assistant", "content": row['answer']},
        ]
        text = processor.apply_chat_template(conversation, tokenize=False)
        out.write({
            'text': text,
            'audio': row['sliced_audio_filename'],
            'input_ids': empty_ids,
            'position_ids': empty_ids,
            'attention_mask': empty_ids,
        })

In [13]:
def block_diagonal_concat_inverted(*masks, dtype=torch.bfloat16):
    """Combine square masks into one additive block-diagonal attention mask.

    Each input mask is placed on the diagonal of a larger square matrix.
    Entries equal to 1 become 0; every other entry, including everything
    outside the blocks, becomes the minimum representable value of ``dtype``.

    Args:
        *masks: square 2-D tensors, one per packed segment.
        dtype: dtype of the returned mask.

    Returns:
        Tensor of shape ``(1, total, total)``, where ``total`` is the sum of
        the masks' first dimensions.
    """
    sizes = [mask.size(0) for mask in masks]
    side = sum(sizes)

    # Lay the individual masks along the diagonal of a zero matrix.
    stacked = torch.zeros(side, side, dtype=dtype)
    offset = 0
    for mask, size in zip(masks, sizes):
        stop = offset + size
        stacked[offset:stop, offset:stop] = mask
        offset = stop

    if dtype.is_floating_point:
        blocked_value = torch.finfo(dtype).min
    else:
        blocked_value = torch.iinfo(dtype).min

    allowed = stacked == 1
    additive = torch.where(allowed, torch.tensor(0, dtype=dtype), blocked_value)
    return additive.unsqueeze(0)

In [14]:
# Re-open the packed text shard and look at its first record.
dataset = LocalDataset('tokenized-4k/tokenized-0')
dataset[0]

{'attention_mask': array([358, 154], dtype=uint32),
 'audio': '',
 'input_ids': array([151644,   8948,    198,   2610,    525,    264,  10950,  17847,
            13, 151645,    198, 151644,    872,    198,   1514,  86461,
         25441,  64634,  10207,   3741,     72,    738,    287,   8212,
           220,     20,     15,  22962,     13,    422,   2780,   6616,
           277,  25441,  64634,     11,    274,   2367,  84168,   1833,
           261,   9198,     72,  41929,    351,   1103,  87418,  25441,
         64634,  31811,  30737,    332,    220,     21,     15,  15269,
         97396,     13,   8907,    391,  76427,  30737,    332,  72736,
         19262,  49607,  10207,   2712,  41702,  20414,  41929,    351,
          1103,  87418,  25441,  64634,  35691,     30, 151645,    198,
        151644,  77091,    198,  20250,   3101,  51518,   2780,  30737,
           332,    595,  11619,     70,   1103,  49607,  10207,   2712,
         41702,  20414,  41929,    351,   1103,  87418,  

In [15]:
# Re-open the audio shard and look at its first record (prompt text plus audio path).
dataset = LocalDataset('audio-out')
dataset[0]

{'attention_mask': array([], dtype=uint32),
 'audio': 'filter-gpt-omni-voiceassistant-400k/train-00111-of-00325-498.mp3',
 'input_ids': array([], dtype=uint32),
 'position_ids': array([], dtype=uint32),
 'text': "<|im_start|>system\nu are a voice assistant that u use voice to communicate, always reply in english<|im_end|>\n<|im_start|>user\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>\n<|im_end|>\n<|im_start|>assistant\nI'm really sorry, but I can't help with that. It's important to treat others with honesty and respect online, just like in real life. Catfishing can cause a lot of hurt and is unfair to the other person. If you're feeling lonely or want to connect with others, there are many healthy and genuine ways to do so. Let me know if you need suggestions!<|im_end|>\n"}

In [16]:
# Remove any previous merged output before the 'packing-4k' shard is rebuilt below.
!rm -rf packing-4k

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [17]:
# Merge the audio shard and the packed text shard into one MDS dataset.
folders = ['audio-out', 'tokenized-4k/tokenized-0']
with MDSWriter(out='packing-4k', columns=columns, compression=None, hashes=hashes) as out:
    for f in folders:
        try:
            dataset = LocalDataset(local=f)
            for i in tqdm(range(len(dataset))):
                out.write(dataset[i])
        except Exception as e:
            # Best effort: keep merging the remaining folders, but say which
            # one failed so an incomplete merge does not go unnoticed.
            print(f'failed to merge {f!r}: {e!r}')

100%|████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 10146.61it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 137/137 [00:00<00:00, 11550.61it/s]


In [18]:
class DatasetFixed(torch.utils.data.Dataset):
    """Torch dataset over an MDS shard that mixes audio prompts and packed text.

    A stored record is treated as an audio sample when its ``audio`` field is a
    non-empty path, and as a packed text block otherwise.

    ``__getitem__`` returns:
        * audio sample: dict with ``input_ids`` (the prompt *string* with its
          audio placeholder expanded; it is tokenized later by the collator),
          ``input_features`` and ``feature_attention_mask``;
        * text block: dict of int64 tensors ``input_ids``, ``position_ids``,
          ``labels`` plus an additive block-diagonal ``attention_mask``;
        * ``None`` when the audio file is missing or any step raises, so the
          collator can drop the sample.
    """

    def __init__(self, local):
        self.dataset = LocalDataset(local=local)
        # Decodes audio files, resampled to 16 kHz.
        self.audio = Audio(sampling_rate=16000)

    def _expand_audio_tokens(self, sample, audio_lengths):
        """Expand every ``audio_token`` in ``sample`` to its final token count.

        For each occurrence, one length is popped from ``audio_lengths`` and
        reduced by the two integer formulas below; the placeholder is then
        repeated that many times.
        NOTE(review): presumably mirrors the audio encoder's downsampling as
        done in the Qwen2-Audio processor; confirm against the installed
        transformers version.
        """
        replace_str = []
        while audio_token in sample:
            audio_length = audio_lengths.pop(0)
            input_length = (audio_length - 1) // 2 + 1
            num_audio_tokens = (input_length - 2) // 2 + 1

            expanded_audio_token = audio_token * num_audio_tokens

            audio_token_start_idx = sample.find(audio_token)
            audio_token_end_idx = audio_token_start_idx + len(audio_token)

            has_bos = (
                sample[audio_token_start_idx - len(audio_bos_token) : audio_token_start_idx]
                == audio_bos_token
            )
            has_eos = (
                sample[audio_token_end_idx : audio_token_end_idx + len(audio_eos_token)]
                == audio_eos_token
            )

            # Wrap in bos/eos only when the template supplied neither.
            if not has_bos and not has_eos:
                expanded_audio_token = audio_bos_token + expanded_audio_token + audio_eos_token

            replace_str.append(expanded_audio_token)
            # Temporary marker so the next `find` does not hit this occurrence again.
            sample = sample.replace(audio_token, "<placeholder>", 1)

        while "<placeholder>" in sample:
            sample = sample.replace("<placeholder>", replace_str.pop(0), 1)

        return sample

    def __getitem__(self, idx):
        data = self.dataset[idx]
        try:
            f = data['audio']
            if len(f):
                f = f.replace('output-audio/', 'filter-audio/')
                if not os.path.exists(f):
                    return None
                audio = self.audio.decode_example(
                    self.audio.encode_example(f))['array']

                # Pass the sampling rate explicitly: the extractor otherwise
                # warns that a rate mismatch could go undetected.
                inputs_audio = processor.feature_extractor(
                    [audio],
                    sampling_rate=self.audio.sampling_rate,
                    return_attention_mask=True,
                    padding="max_length",
                    return_tensors = 'pt',
                )
                audio_lengths = inputs_audio["attention_mask"].sum(-1).tolist()

                sample = self._expand_audio_tokens(data['text'], audio_lengths)

                inputs = {
                    'input_ids': sample,
                    'input_features': inputs_audio['input_features'],
                    'feature_attention_mask': inputs_audio['attention_mask'],
                }
                return inputs
            else:
                data.pop('text', None)
                data.pop('audio')
                # Labels are a copy of the inputs.
                data['labels'] = data["input_ids"].copy()
                # Stored `attention_mask` holds the length of each packed segment.
                masking = data.pop('attention_mask')

                data.pop('token_type_ids', None)
                for k in data.keys():
                    data[k] = torch.tensor(data[k].astype(np.int64))

                # One causal (lower-triangular) mask per segment, joined
                # block-diagonally.
                masks = [torch.tril(torch.ones(m, m)) for m in masking]
                attention_mask = block_diagonal_concat_inverted(*masks)
                data['attention_mask'] = attention_mask
                return data

        except Exception as e:
            # Best effort: report and skip the sample instead of aborting.
            print(e)
            return None

    def __len__(self):
        return len(self.dataset)

In [19]:
# Open the merged shard (audio records followed by packed text) through the torch dataset wrapper.
dataset = DatasetFixed('packing-4k')

In [20]:
# First record: an audio sample, returned as a prompt string with its audio placeholder expanded.
dataset[0]

It is strongly recommended to pass the `sampling_rate` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


{'input_ids': "<|im_start|>system\nu are a voice assistant that u use voice to communicate, always reply in english<|im_end|>\n<|im_start|>user\nAudio 1: <|audio_bos|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUD

In [21]:
# Last record: a packed text block, returned as tensors.
dataset[-1]

{'input_ids': tensor([  8656,  11814,   9810,   3362,    276,    452,  86415,   1728,  10371,
          49935,    272,   1225,    391,   9101,  58554,     74,   6053,     11,
          92619,  67431,  97934,    276,   2953,    709,     74,   2154,  78549,
          10215,   1346,   1574,     75,  43518,  10582,   1330,   1579,   9101,
          14885,  52067,  50977,    278,  22051,    382,    818,   2267,    595,
          12800,    355,    452,  86415,   1728,    595,  27574,    372,    293,
           9247,  78351,   1207,   5049,   4647,    300,   3101,   1447,     16,
             13,   1207,   3994,    293,   9247,  78351,   1207,   5049,     25,
          19739,    266,   1579,  11806,   1276,    595,  27574,    372,    708,
          73790,  13207,   4554,  29180,   3101,  19593,  10581,   9810,    276,
          19262,   1997,  10215,   1207,   3994,  31811,   1962,     74,  13229,
           2804,  59220,     13,  73767,    708,  73790,  21249,    391,   2143,
          20414

In [22]:
def collator(batch):
    """Collate ``DatasetFixed`` samples into one batch of model inputs.

    NOTE: this redefines the name ``collator`` that an earlier cell uses for
    the packing helper called by ``loop``. Re-running ``loop`` after this cell
    would call this function instead.

    Args:
        batch: list of samples as returned by ``DatasetFixed.__getitem__``.
            ``None`` entries are dropped.

    Returns:
        Dict with ``input_ids``, ``attention_mask``, ``position_ids`` and
        ``labels`` concatenated along dim 0, plus ``input_features`` and
        ``feature_attention_mask`` when the batch has at least one audio sample.
    """
    # Skip samples the dataset could not load.
    batch = [b for b in batch if b is not None] 
    # NOTE(review): if every sample was None, the lists below stay empty and
    # torch.concat presumably raises; confirm the desired behavior.
    input_ids, attention_mask, position_ids, labels = [], [], [], []
    input_features, feature_attention_mask = [], []

    for b in batch:
        if 'input_features' in b:
            # Audio sample: `input_ids` is still a prompt string. Tokenize it,
            # padded/truncated to exactly `sequence_length` tokens.
            input_ids_ = processor.tokenizer(
                [b['input_ids']], 
                return_tensors = 'pt', 
                truncation = True, 
                max_length = sequence_length,
                padding = 'max_length',
            )
            attention_mask_ = input_ids_['attention_mask']
            labels_ = input_ids_['input_ids'].clone()
            # Audio placeholders and padding do not contribute to the loss.
            labels_[labels_ == audio_token_id] = -100
            labels_[labels_ == pad_token_id] = -100
            input_ids.append(input_ids_['input_ids'])
            labels.append(labels_)
            cache_position = torch.arange(0, input_ids_['input_ids'].shape[1])
            # Additive causal mask: start with min_dtype everywhere, then keep
            # it only where column index > row index (future positions); all
            # other entries become 0.
            causal_mask = torch.full(
                (sequence_length, sequence_length), fill_value=min_dtype, dtype=torch_dtype
            )
            causal_mask *= torch.arange(sequence_length) > cache_position.reshape(-1, 1)
            causal_mask = causal_mask[None, None, :, :]
            causal_mask = causal_mask.clone()
            mask_length = attention_mask_.shape[-1]
            # The sum is 0 exactly where the causal mask allows attention (0)
            # and the tokenizer mask marks padding (0); block those too.
            padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask_[:, None, None, :]
            padding_mask = padding_mask == 0
            causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
                padding_mask, min_dtype
            )
            attention_mask.append(causal_mask)
            position_ids.append(cache_position[None])
            input_features.append(b['input_features'])
            feature_attention_mask.append(b['feature_attention_mask'])
        else:
            # Packed text sample: already tensors; only add the batch dimension.
            input_ids.append(b['input_ids'][None])
            attention_mask.append(b['attention_mask'][None])
            position_ids.append(b['position_ids'][None])
            labels.append(b['labels'][None])

    input_ids = {
        'input_ids': torch.concat(input_ids, 0),
        'attention_mask': torch.concat(attention_mask, 0),
        'position_ids': torch.concat(position_ids, 0),
        'labels': torch.concat(labels, 0),
    }
    # Audio tensors are attached only when at least one audio sample is present.
    if len(input_features):
        input_ids['input_features'] = torch.concat(input_features, 0)
        input_ids['feature_attention_mask'] = torch.concat(feature_attention_mask, 0)

    return input_ids

In [23]:
class Model(Qwen2AudioForConditionalGeneration):
    """Qwen2-Audio wrapper that computes the next-token loss itself.

    The parent forward is called without labels. The loss is then computed
    here from the returned logits against labels shifted by one position.
    """

    def __init__(self, config):
        super().__init__(config)

    def forward(self, input_ids, attention_mask, position_ids, input_features = None, feature_attention_mask = None, labels = None, **kwargs):
        """Run the model and, when ``labels`` is given, return the LM loss.

        Args:
            input_ids, attention_mask, position_ids: forwarded unchanged to the
                parent model.
            input_features, feature_attention_mask: audio inputs. Optional,
                because the collator omits them for text-only batches.
            labels: token ids aligned with ``input_ids``; ``-100`` entries are
                ignored by the loss.
            **kwargs: accepted for call compatibility and ignored.

        Returns:
            ``{'loss': tensor}`` when ``labels`` is given, otherwise the parent
            model's output (previously ``None``).
        """
        super_out = super().forward(
            input_ids = input_ids,
            attention_mask = attention_mask,
            position_ids = position_ids,
            input_features = input_features,
            feature_attention_mask = feature_attention_mask,
        )
        if labels is None:
            return super_out

        logits = super_out.logits
        vocab_size = logits.shape[-1]
        logits = logits.float()
        labels = labels.to(logits.device)
        # Shift left by one: position t is scored against token t + 1. The
        # appended -100 keeps the length equal to the logits' and is ignored.
        labels = nn.functional.pad(labels, (0, 1), value=-100)
        shift_labels = labels[..., 1:].contiguous()
        logits = logits.view(-1, vocab_size)
        shift_labels = shift_labels.view(-1)
        shift_labels = shift_labels.to(logits.device)
        loss = nn.functional.cross_entropy(logits, shift_labels, ignore_index=-100, reduction='mean')
        return {'loss': loss}

In [24]:
# Load the pretrained checkpoint into the `Model` subclass, in bfloat16.
model = Model.from_pretrained(
    "Qwen/Qwen2-Audio-7B-Instruct", torch_dtype = torch.bfloat16)

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

In [25]:
# Name fragments of the modules that should receive LoRA adapters; the next
# cell matches them by substring against the model's module names.
selected = ["q_proj", "k_proj", "v_proj", "o_proj",
                  "gate_proj", "up_proj", "down_proj", "embed_tokens", "lm_head"]

In [26]:
# Fully-qualified names of the language-model submodules to adapt.
# A comprehension keeps its loop variables local; the previous
# `for id, (name, param) in enumerate(...)` rebound the builtin `id` in the
# notebook namespace and never used `id` or `param`.
target_modules = [
    module_name
    for module_name, _ in model.named_modules()
    if 'language_model.model' in module_name
    and any(s in module_name for s in selected)
]
# `lm_head` sits outside `language_model.model`, so it is added explicitly.
if 'lm_head' in selected:
    target_modules.append('language_model.lm_head')

In [27]:
# LoRA settings applied to the modules collected in `target_modules`.
# NOTE(review): with PEFT's default scaling of lora_alpha / r this gives a
# factor of 2; confirm against the installed PEFT version.
peft_config = LoraConfig(
    lora_alpha=128,
    lora_dropout=0.0,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=target_modules
)

In [28]:
# Wrap the model with the LoRA adapters described by `peft_config`.
# Note: this rebinds `model`, so re-running the cell wraps an already wrapped model.
model = get_peft_model(model, peft_config)

In [29]:
# Move the model to the GPU; assigning to `_` keeps the model repr out of the cell output.
_ = model.cuda()

In [30]:
# Smoke test: push one mixed batch (a later sample and the first one) through
# the model without tracking gradients. `o` ends up holding the model output.
with torch.no_grad():
    batch = [dataset[1], dataset[0]]
    o = {k: v.cuda() for k, v in collator(batch).items()}
    o = model(**o)

It is strongly recommended to pass the `sampling_rate` argument to this function. Failing to do so can result in silent errors that might be hard to debug.
It is strongly recommended to pass the `sampling_rate` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


In [31]:
# Display the output of the smoke-test forward pass (a dict holding the loss).
o

{'loss': tensor(5.0512, device='cuda:0')}