In [1]:
from streaming.base.format.mds.encodings import Encoding, _encodings
from streaming import MDSWriter, LocalDataset
from tqdm import tqdm
from typing import List
import torch
import json

class ListOfDict(Encoding):
    def encode(self, obj: List[dict]) -> bytes:
        # Convert the list of dictionaries to a JSON-encoded string
        json_str = json.dumps(obj)
        return json_str.encode('utf-8')

    def decode(self, data: bytes) -> List[dict]:

        # Decode the JSON-encoded string back to a list of dictionaries
        json_str = data.decode('utf-8')
        return json.loads(json_str)

# Register the custom encoding for 'list_of_dict'
_encodings['list_of_dict'] = ListOfDict

In [2]:
from glob import glob

files = glob('prepared*.jsonl')
files

['prepared-combine-ms.jsonl',
 'prepared-combine-en.jsonl',
 'prepared-llava-en.jsonl',
 'prepared-llava-ms.jsonl',
 'prepared-audio-en.jsonl',
 'prepared-malay.jsonl',
 'prepared-audio-ms.jsonl']

In [3]:
columns = {
    'conversations': 'list_of_dict',
    'filename': 'list_of_dict'
}

hashes = 'sha1', 'xxh64'

In [11]:
!rm -rf mosaic-multimodal

In [12]:
with MDSWriter(out='mosaic-multimodal-audio', columns=columns, compression=None, hashes=hashes) as out:
    for f in files:
        with open(f) as fopen:
            for l in tqdm(fopen):
                l = json.loads(l)
                for i in range(len(l['filename'])):
                    l['filename'][i] = l['filename'][i].replace('/output-audio', '/filter-audio')
                out.write(l)

44669it [00:01, 28401.17it/s]
156564it [00:03, 48070.91it/s]


In [13]:
!head -n 1 prepared-llava-en.jsonl

{"filename": ["/home/ubuntu/filtered-llava-images/000000033471.jpg"], "conversations": [{"role": "user", "content": "What are the colors of the bus in the image?\n<image>"}, {"role": "assistant", "content": "The bus in the image is white and red."}, {"role": "user", "content": "What feature can be seen on the back of the bus?"}, {"role": "assistant", "content": "The back of the bus features an advertisement."}, {"role": "user", "content": "Is the bus driving down the street or pulled off to the side?"}, {"role": "assistant", "content": "The bus is driving down the street, which is crowded with people and other vehicles."}]}
