In [3]:
from streaming.base.format.mds.encodings import Encoding, _encodings
from streaming import MDSWriter, LocalDataset
from tqdm import tqdm
from typing import List
import torch
import json

class ListOfDict(Encoding):
    """Custom MDS encoding that stores a JSON-serializable list as UTF-8 bytes.

    Values are serialized with ``json.dumps`` and restored with ``json.loads``,
    so anything the ``json`` module can round-trip is accepted.
    """

    def encode(self, obj: List[dict]) -> bytes:
        """Return ``obj`` rendered as JSON text and encoded as UTF-8 bytes."""
        return json.dumps(obj).encode('utf-8')

    def decode(self, data: bytes) -> List[dict]:
        """Return the Python object parsed from UTF-8 encoded JSON ``data``."""
        return json.loads(data.decode('utf-8'))

# Register the custom encoding under the name 'list_of_dict' so that column
# specifications can refer to it by that string.
# NOTE(review): `_encodings` is an underscore-prefixed (private) name imported
# from streaming.base.format.mds.encodings — confirm it is still the right
# registration hook for the installed streaming version.
_encodings['list_of_dict'] = ListOfDict

In [4]:
from glob import glob

# Shell-style patterns for the prepared JSONL inputs, listed in the order the
# groups should be written to the dataset.
FILE_PATTERNS = (
    'prepared-llava*.jsonl',
    'prepared-audio*.jsonl',
    'prepared-relationship*.jsonl',
    # Carries the same '.jsonl' suffix as the other patterns so that unrelated
    # files sharing the 'prepared-malay' prefix are not picked up.
    'prepared-malay*.jsonl',
)

# glob() returns matches in arbitrary order, so sort within each pattern: the
# file list (and therefore the order rows are written in) is then the same on
# every run and every machine.
files = [path for pattern in FILE_PATTERNS for path in sorted(glob(pattern))]

files

['prepared-llava-en.jsonl',
 'prepared-llava-ms.jsonl',
 'prepared-audio-en.jsonl',
 'prepared-audio-ms.jsonl',
 'prepared-relationship-en.jsonl',
 'prepared-relationship-ms.jsonl',
 'prepared-malay.jsonl']

In [5]:
# Column name -> encoding name. Both columns use the custom JSON-based
# 'list_of_dict' encoding; 'filename' rows are lists of path strings, which the
# same JSON round-trip handles.
columns = dict.fromkeys(('conversations', 'filename'), 'list_of_dict')

# Hash algorithm names handed to MDSWriter through its `hashes` argument.
hashes = ('sha1', 'xxh64')

In [23]:
# Delete any previous 'mosaic-multimodal' output directory so the writer cell
# starts from an empty location.
# NOTE(review): presumably needed because MDSWriter refuses or mixes into an
# existing non-empty directory — confirm against the streaming docs.
!rm -rf mosaic-multimodal

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [13]:
from transformers import AutoTokenizer
import numpy as np

# Load the pretrained tokenizer by name and add the four media marker tokens
# that the conversation text uses; the cell displays the value returned by
# add_tokens().
# NOTE(review): `tokenizer` is not referenced by any other cell visible in this
# part of the notebook — confirm it is still needed here.
tokenizer = AutoTokenizer.from_pretrained('mesolitica/malaysian-tinyllama-1.1b-16k-instructions-v3')
tokenizer.add_tokens(["<image>", "</image>", "<audio>", "</audio>"])

4

In [24]:
import random

In [25]:
# Fraction of rows kept from every input whose file name does not contain
# 'malay', and the seed that makes that random subsample repeatable.
KEEP_FRACTION = 0.4
SAMPLING_SEED = 42


def _normalize_media_tags(text):
    """Put <image>/<audio> tags inline and give each one a closing tag.

    A newline directly before or after a tag becomes a single space, the result
    is stripped, and then every '<image>' becomes '<image> </image>' and every
    '<audio>' becomes '<audio> </audio>'.
    """
    inline = (
        text.replace('\n<image>', ' <image>')
        .replace('<image>\n', '<image> ')
        .replace('\n<audio>', ' <audio>')
        .replace('<audio>\n', '<audio> ')
        .strip()
    )
    return inline.replace('<image>', '<image> </image>').replace('<audio>', '<audio> </audio>')


def _prepare_row(row):
    """Rewrite the paths and conversation text of one parsed JSONL row in place.

    Every entry of ``row['filename']`` has '/output-audio' replaced with
    '/filter-audio', and the 'content' of every entry of
    ``row['conversations']`` is passed through ``_normalize_media_tags``.
    Returns the same ``row`` object. Raises (KeyError, TypeError, ...) when the
    row does not have that structure.
    """
    paths = row['filename']
    for index, path in enumerate(paths):
        paths[index] = path.replace('/output-audio', '/filter-audio')
    for turn in row['conversations']:
        turn['content'] = _normalize_media_tags(turn['content'])
    return row


# A dedicated, seeded generator keeps the subsample identical across runs and
# independent of whatever else has consumed the global `random` state.
rng = random.Random(SAMPLING_SEED)
skipped = 0

with MDSWriter(out='mosaic-multimodal', columns=columns, compression=None, hashes=hashes) as out:
    for path in files:
        # Files with 'malay' in their name are written in full; all other
        # files are randomly subsampled.
        keep_all = 'malay' in path
        # JSONL is UTF-8 text; do not depend on the platform default encoding.
        with open(path, encoding='utf-8') as fopen:
            for line in tqdm(fopen, desc=path):
                try:
                    row = _prepare_row(json.loads(line))
                except Exception as e:
                    # Best effort: report the malformed line and carry on.
                    skipped += 1
                    print(line, e)
                    continue

                if not keep_all and rng.random() > KEEP_FRACTION:
                    continue
                # Deliberately outside the try block: a writer or disk failure
                # must stop the run rather than be printed once per row while
                # the dataset is silently truncated.
                out.write(row)

if skipped:
    print(f'skipped {skipped} malformed rows')

148657it [00:03, 41097.50it/s]
148657it [00:03, 39260.70it/s]
293752it [00:11, 26198.48it/s]
293752it [00:12, 24097.69it/s]
184500it [00:02, 66309.04it/s]
184500it [00:02, 65428.63it/s]
3238it [00:00, 43098.23it/s]


In [26]:
# Open the 'mosaic-multimodal' directory produced by the writer cell and display
# how many samples it contains. The count depends on the random subsample drawn
# while writing.
dataset = LocalDataset('mosaic-multimodal')
len(dataset)

503704

In [30]:
# Spot-check a single sample to see the stored 'conversations' and 'filename'
# values after reading them back.
# NOTE(review): 100001 looks like an arbitrary index — confirm nothing depends
# on this particular row.
dataset[100001]

{'conversations': [{'role': 'user',
   'content': 'Apa yang mungkin menjadi sebab di sebalik wanita yang memegang payung? <image> </image>'},
  {'role': 'assistant',
   'content': 'Wanita dalam imej itu mungkin memegang payung biru terbuka di atas kepalanya untuk melindungi dirinya daripada hujan atau matahari. Memandangkan payung terbuka, kemungkinan besar dia sedang mengalami atau menjangkakan keadaan cuaca sedemikian. Walaupun ia juga boleh menjadi kenyataan fesyen kerana warna dan reka bentuk yang menarik, tujuan utamanya adalah untuk melindunginya daripada unsur-unsur, memastikan dia kekal selesa dan rambutnya, dihiasi dengan bunga putih yang cantik, kekal utuh tanpa mengira cuaca.'}],
 'filename': ['/home/ubuntu/translated-LLaVA-Instruct-150K/filtered-llava-images/000000278174.jpg']}