In [1]:
import json
from collections.abc import Mapping
from typing import List

import numpy as np
import torch
from datasets import Audio
from PIL import Image
from streaming import MDSWriter, LocalDataset
from streaming.base.format.mds.encodings import Encoding, _encodings
from transformers import AutoProcessor, AutoTokenizer

class ListOfDict(Encoding):
    """Encoding that stores a list of dicts as UTF-8 encoded JSON bytes."""

    def encode(self, obj: List[dict]) -> bytes:
        """Serialize ``obj`` with ``json.dumps`` and return the UTF-8 bytes."""
        return json.dumps(obj).encode('utf-8')

    def decode(self, data: bytes) -> List[dict]:
        """Decode UTF-8 bytes and parse them back with ``json.loads``."""
        return json.loads(data.decode('utf-8'))

# Register the JSON codec under the 'list_of_dict' key of streaming's MDS encoding table.
_encodings['list_of_dict'] = ListOfDict

In [21]:
class MMDataset(torch.utils.data.Dataset):
    """Map-style dataset yielding tokenized chat samples with optional audio/image features.

    Every record is read through two keys:

    * ``filename``: iterable of media paths. Entries ending in ``.mp3`` are
      decoded as audio, entries ending in ``.jpg`` are opened as images, and
      anything else is skipped.
    * ``conversations``: list of chat messages rendered with the tokenizer's
      chat template.

    Args:
        folder: Path to a ``.json`` file, a ``.jsonl`` file, or (any other
            value) a directory opened with ``streaming.LocalDataset``.
        image_processor_name: Checkpoint passed to ``AutoProcessor`` for images.
        audio_processor_name: Checkpoint passed to ``AutoProcessor`` for audio.
        tokenizer_name: Checkpoint passed to ``AutoTokenizer``.
        sampling_rate: Sampling rate used to decode audio and handed to the
            audio processor.
        max_length: Token budget; longer conversations are truncated.
    """

    def __init__(
        self,
        folder,
        image_processor_name='google/siglip-base-patch16-224',
        audio_processor_name='mesolitica/malaysian-whisper-small',
        tokenizer_name='mesolitica/malaysian-tinyllama-1.1b-16k-instructions-v3',
        sampling_rate=16000,
        max_length=4096,
    ):
        self.dataset = self._load_records(folder)

        self.image_processor = AutoProcessor.from_pretrained(image_processor_name)
        self.audio_processor = AutoProcessor.from_pretrained(audio_processor_name)
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

        # The chat template below emits BOS/EOS itself, so the tokenizer must
        # not add them a second time. Padding reuses the <unk> token.
        self.tokenizer.pad_token = self.tokenizer.unk_token
        self.tokenizer.add_bos_token = False
        self.tokenizer.add_eos_token = False
        self.tokenizer.padding_side = "right"
        self.tokenizer.chat_template = "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}"
        self.sr = sampling_rate
        self.max_length = max_length
        self.audio = Audio(sampling_rate=self.sr)

    @staticmethod
    def _load_records(folder):
        """Load records from a ``.json`` file, a ``.jsonl`` file, or a LocalDataset directory."""
        if folder.endswith('.json'):
            with open(folder, encoding='utf-8') as fopen:
                return json.load(fopen)
        if folder.endswith('.jsonl'):
            with open(folder, encoding='utf-8') as fopen:
                # Blank lines (e.g. a trailing newline) are not valid JSON; skip them.
                return [json.loads(line) for line in fopen if line.strip()]
        return LocalDataset(folder)

    def _load_audio(self, path):
        """Decode one audio file and return the audio processor's ``input_features``."""
        audio = self.audio.decode_example(self.audio.encode_example(path))['array']
        features = self.audio_processor(audio, sampling_rate=self.sr, return_tensors='pt')
        return features['input_features']

    def _load_image(self, path):
        """Open one image file and return the image processor's ``pixel_values``."""
        # The context manager releases the file handle; convert() forces the
        # pixel data to be read before the file is closed and normalises
        # grayscale/CMYK/RGBA inputs to three channels.
        with Image.open(path) as image:
            image = image.convert('RGB')
        return self.image_processor(images=image, return_tensors='pt')['pixel_values']

    def __getitem__(self, idx):
        """Return the tokenizer output for record ``idx`` plus ``audios`` and ``images``.

        ``audios`` / ``images`` are the per-file features concatenated along
        dim 0, or ``None`` when the record has no file of that kind.
        """
        data = self.dataset[idx]
        audio_list = []
        image_list = []

        for x in data['filename']:
            if x.endswith('.mp3'):
                audio_list.append(self._load_audio(x))
            elif x.endswith('.jpg'):
                image_list.append(self._load_image(x))

        full_text = self.tokenizer.apply_chat_template(data['conversations'], tokenize=False)

        outputs = self.tokenizer(
            full_text,
            return_tensors='pt',
            truncation=True,
            max_length=self.max_length,
            return_overflowing_tokens=False,
            return_length=False,
        )

        outputs['audios'] = torch.cat(audio_list, dim=0) if audio_list else None
        outputs['images'] = torch.cat(image_list, dim=0) if image_list else None

        return outputs

    def __len__(self):
        """Return the number of records in the underlying dataset."""
        return len(self.dataset)

In [22]:
# 'mosaic-multimodal' ends in neither .json nor .jsonl, so MMDataset opens it with LocalDataset.
dataset = MMDataset('mosaic-multimodal')
# Display the number of samples as the cell output.
len(dataset)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


823859

In [4]:
class DataCollator():
    """Collate per-sample mappings into one right-padded batch dictionary.

    Each feature is expected to expose ``input_ids`` (indexed with ``[0]``, so
    presumably shaped ``(1, seq_len)``) plus ``audios`` and ``images`` entries
    that are either ``None`` or tensors whose first dimension counts the media
    items of that sample.

    Args:
        tokenizer: Tokenizer providing ``pad`` and ``convert_tokens_to_ids``.
    """

    def __init__(self, tokenizer):
        self.tokenizer = tokenizer

    def __call__(self, features):
        """Build the batch.

        Args:
            features: Non-empty sequence of mappings, or of objects whose
                ``vars()`` gives such a mapping.

        Returns:
            dict with ``audio_index`` / ``image_index`` (sample position of each
            media item), ``audios`` / ``images`` (concatenated media tensors or
            ``None``), ``input_ids``, ``attention_mask``, ``labels`` (padding
            replaced by -100) and the ``<image>``/``</image>``/``<audio>``/
            ``</audio>`` token ids repeated once per sample.
        """
        if not isinstance(features[0], Mapping):
            features = [vars(f) for f in features]

        batch = {}
        bs = len(features)
        first = features[0]

        # Sample positions wrap every `bs // device_count` samples (presumably
        # so they are relative to a per-device shard). Clamp both terms to at
        # least 1 so CPU-only hosts and batches smaller than the device count
        # do not divide by zero.
        n_devices = max(torch.cuda.device_count(), 1)
        per_device = max(bs // n_devices, 1)

        audio_index = []
        image_index = []
        for index, feature in enumerate(features):
            local_index = index % per_device
            if feature['audios'] is not None:
                audio_index.extend([local_index] * len(feature['audios']))
            if feature['images'] is not None:
                image_index.extend([local_index] * len(feature['images']))

        batch['audio_index'] = torch.tensor(audio_index, dtype=torch.int)
        batch['image_index'] = torch.tensor(image_index, dtype=torch.int)

        for k, v in first.items():
            if k in ("audios", "images"):
                # Look at every sample, not only the first one: the first
                # sample may have no media while later samples do.
                present = [f[k] for f in features if f[k] is not None]
                batch[k] = torch.cat(present) if present else None
            elif k not in ("input_ids", "attention_mask") and not isinstance(v, str):
                if v is None:
                    batch[k] = None
                elif isinstance(v, torch.Tensor):
                    batch[k] = torch.stack([f[k] for f in features]).squeeze(1)
                elif isinstance(v, np.ndarray):
                    batch[k] = torch.tensor(np.stack([f[k] for f in features])).squeeze(1)

        input_ids = [{'input_ids': f['input_ids'][0]} for f in features]
        padded = self.tokenizer.pad(input_ids, return_tensors='pt')
        batch['input_ids'] = padded['input_ids']
        batch['attention_mask'] = padded['attention_mask']

        # Mask padding through the attention mask instead of a hard-coded id 0,
        # so genuine tokens that share the pad id are still trained on.
        labels = padded['input_ids'].clone()
        labels[padded['attention_mask'] == 0] = -100
        batch['labels'] = labels

        batch['image_starts'] = torch.tensor(
            [self.tokenizer.convert_tokens_to_ids('<image>')] * bs, dtype=torch.int)
        batch['image_ends'] = torch.tensor(
            [self.tokenizer.convert_tokens_to_ids('</image>')] * bs, dtype=torch.int)
        batch['audio_starts'] = torch.tensor(
            [self.tokenizer.convert_tokens_to_ids('<audio>')] * bs, dtype=torch.int)
        batch['audio_ends'] = torch.tensor(
            [self.tokenizer.convert_tokens_to_ids('</audio>')] * bs, dtype=torch.int)

        return batch

In [65]:
# Reuse the dataset's tokenizer so padding matches the tokenization done in MMDataset.
collator = DataCollator(dataset.tokenizer)

In [66]:
# Smoke test: collate the first ten samples into a single batch.
b = collator([dataset[i] for i in range(10)])

In [67]:
# Display the whole collated batch; this prints full tensor reprs, so the output is long.
b

{'audio_index': tensor([0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9],
        dtype=torch.int32),
 'image_index': tensor([], dtype=torch.int32),
 'audios': tensor([[[-0.1519, -0.3853, -0.3853,  ..., -0.3260, -0.0303, -0.1203],
          [-0.0874, -0.3744, -0.3799,  ..., -0.3853,  0.1363,  0.1269],
          [ 0.0319, -0.1669, -0.1714,  ..., -0.0601,  0.2068,  0.3885],
          ...,
          [-0.3853, -0.3853, -0.3853,  ..., -0.2202, -0.1848, -0.1255],
          [-0.3853, -0.3853, -0.3853,  ..., -0.3853, -0.3853, -0.3520],
          [-0.3853, -0.3853, -0.3853,  ..., -0.3853, -0.3853, -0.3853]],
 
         [[ 0.6540,  0.6983,  0.6168,  ...,  0.0700,  0.6128,  0.6401],
          [ 0.4211,  0.4417,  0.3808,  ...,  0.7511,  0.8172,  0.8000],
          [ 0.2938,  0.3439,  0.2266,  ...,  0.9006,  1.0369,  1.0333],
          ...,
          [-0.3682, -0.4997, -0.5829,  ..., -0.1041, -0.1240, -0.2027],
          [-0.2645, -0.6108, -0.6108,  ..., -0.2516, -0.2545, -0.5817],
     

In [35]:
# Decode the first padded row back to text; padding shows up as <unk> because pad_token is set to unk_token.
dataset.tokenizer.decode(b['input_ids'][0])

'<s> [INST] <audio><audio> What is related between audio 1 and audio 2 [/INST]Audio 1 and Audio 2 are unrelated as they discuss different topics. In Audio 1, the speaker is discussing the issue of using handphones while driving and its contribution to accidents. In Audio 2, the speaker is talking about making a cover song for Raya and the challenges they faced in creating the image for the video.</s><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><u

In [36]:
# Decode the second padded row back to text to compare it with the first one.
dataset.tokenizer.decode(b['input_ids'][1])

'<s> [INST] <audio><audio> What is related between audio 1 and audio 2 [/INST]Audio 1 and Audio 2 are not related. Audio 1 is about a person expressing their interest in making a drama film in the movie industry and discussing the challenges that come with becoming a role model. Audio 2 is about a person discussing their preference for certain flavors of pot and explaining how they plan to try different varieties.</s><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk