In [1]:
from streaming.base.format.mds.encodings import Encoding, _encodings
from streaming import MDSWriter, LocalDataset
from tqdm import tqdm
from typing import List
import torch
import json

class ListOfDict(Encoding):
    def encode(self, obj: List[dict]) -> bytes:
        # Convert the list of dictionaries to a JSON-encoded string
        json_str = json.dumps(obj)
        return json_str.encode('utf-8')

    def decode(self, data: bytes) -> List[dict]:

        # Decode the JSON-encoded string back to a list of dictionaries
        json_str = data.decode('utf-8')
        return json.loads(json_str)

# Register the custom encoding for 'list_of_dict'
_encodings['list_of_dict'] = ListOfDict

In [5]:
from glob import glob

files = glob('prepared-llava*.jsonl')
files.extend(glob('prepared-combine*.jsonl'))
files

['prepared-llava-en.jsonl',
 'prepared-llava-ms.jsonl',
 'prepared-combine-ms.jsonl',
 'prepared-combine-en.jsonl']

In [6]:
columns = {
    'conversations': 'list_of_dict',
    'filename': 'list_of_dict'
}

hashes = 'sha1', 'xxh64'

In [11]:
!rm -rf mosaic-multimodal-vision

In [12]:
with MDSWriter(out='mosaic-multimodal-vision', columns=columns, compression=None, hashes=hashes) as out:
    for f in files:
        with open(f) as fopen:
            for l in tqdm(fopen):
                try:
                    l = json.loads(l)
                    for i in range(len(l['filename'])):
                        l['filename'][i] = l['filename'][i].replace('/output-audio', '/filter-audio')
                    for i in range(len(l['conversations'])):
                        l['conversations'][i]['content'] = l['conversations'][i]['content'].replace('\n<image>', ' <image>').replace('<image>\n', '<image>').replace('\n<audio>', ' <audio>').replace('<audio>\n', '<audio>').strip()
                        l['conversations'][i]['content'] = l['conversations'][i]['content'].replace('<image>', '<image> </image>').replace('<audio>', '<audio> </audio>')
                    out.write(l)
                except Exception as e:
                    print(l, e)

156564it [00:05, 28894.29it/s]
156564it [00:05, 27681.14it/s]
2it [00:00, 16163.02it/s]


{'role': 'user', 'content': 'Provide a brief description of the given image. <image> </image>'} 'filename'
{'role': 'assistant', 'content': 'us03498792 figure 1 from a patent drawing of a battery system'} 'filename'


2it [00:00, 6109.69it/s]


{'role': 'user', 'content': 'Provide a brief description of the given image. <image> </image>'} 'filename'
{'role': 'assistant', 'content': 'us03498792 figure 1 from a patent drawing of a battery system'} 'filename'


In [13]:
dataset = LocalDataset('mosaic-multimodal-vision')

In [14]:
dataset[0]

{'conversations': [{'role': 'user',
   'content': 'What do you see happening in this image? <image> </image>'},
  {'role': 'assistant',
   'content': 'The scene depicts a lively plaza area with several people walking and enjoying their time. A man is standing in the plaza with his legs crossed, holding a kite in his hand. The kite has multiple sections attached to it, spread out in various directions as if ready for flight.\n\nNumerous people are scattered throughout the plaza, walking and interacting with others. Some of these individuals are carrying handbags, and others have backpacks. The image captures the casual, social atmosphere of a bustling plaza on a nice day.'}],
 'filename': ['/home/ubuntu/translated-LLaVA-Instruct-150K/filtered-llava-images/000000442786.jpg']}

In [23]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('mesolitica/malaysian-tinyllama-1.1b-16k-instructions-v3')

tokenizer_config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/552 [00:00<?, ?B/s]

In [26]:
tokenizer.apply_chat_template

<bound method PreTrainedTokenizerBase.apply_chat_template of LlamaTokenizerFast(name_or_path='mesolitica/malaysian-tinyllama-1.1b-16k-instructions-v3', vocab_size=32000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<unk>'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}>