In [1]:
import json
import shutil
from typing import List

import torch
from streaming import MDSWriter, LocalDataset
from streaming.base.format.mds.encodings import Encoding, _encodings
from tqdm import tqdm

class ListOfDict(Encoding):
    """Column encoding that stores a JSON-serialisable list as UTF-8 JSON text."""

    def encode(self, obj: List[dict]) -> bytes:
        """Serialise ``obj`` with ``json.dumps`` and return the UTF-8 bytes."""
        return json.dumps(obj).encode('utf-8')

    def decode(self, data: bytes) -> List[dict]:
        """Inverse of ``encode``: parse UTF-8 JSON bytes back into Python objects."""
        return json.loads(data.decode('utf-8'))

# Register the custom encoding under the name 'list_of_dict', which is the
# encoding name used for both entries of the `columns` spec given to MDSWriter.
# NOTE(review): `_encodings` is a private name of the streaming package (leading
# underscore) -- presumably its name -> class registry; confirm on upgrades.
_encodings['list_of_dict'] = ListOfDict

In [2]:
from glob import glob

# Collect the prepared inputs pattern by pattern; the pattern order below fixes
# the order of the file groups inside `files` (order within a group is whatever
# glob returns).
files = [
    path
    for pattern in (
        'prepared-llava*.jsonl',
        'prepared-combine*.jsonl',
        'prepared-relationship*.jsonl',
        'prepared-malay*',
    )
    for path in glob(pattern)
]
files

['prepared-llava-en.jsonl',
 'prepared-llava-ms.jsonl',
 'prepared-combine-ms.jsonl',
 'prepared-combine-en.jsonl',
 'prepared-relationship-en.jsonl',
 'prepared-relationship-ms.jsonl',
 'prepared-malay.jsonl']

In [3]:
# Column name -> encoding name. Both columns use the custom 'list_of_dict'
# (JSON) encoding.
columns = dict(
    conversations='list_of_dict',
    filename='list_of_dict',
)

# Hash algorithm names handed to MDSWriter(hashes=...).
hashes = ('sha1', 'xxh64')

In [4]:
# Delete the directory that the MDSWriter cell writes to (`out=` below), so a
# re-run starts from an empty location.
# Pure-Python replacement for `!rm -rf`, so the cell does not depend on a POSIX
# shell; ignore_errors=True makes it a no-op when the directory does not exist.
shutil.rmtree('mosaic-multimodal-vision', ignore_errors=True)

In [5]:
def _normalize_media_tags(text):
    """Normalise ``<image>`` / ``<audio>`` placeholders in one message.

    A newline directly before or after a tag is turned into a single space,
    the result is stripped, and every tag is then expanded into an open/close
    pair (``<image> </image>``, ``<audio> </audio>``).
    """
    text = (
        text
        .replace('\n<image>', ' <image>')
        .replace('<image>\n', '<image> ')
        .replace('\n<audio>', ' <audio>')
        .replace('<audio>\n', '<audio> ')
        .strip()
    )
    return text.replace('<image>', '<image> </image>').replace('<audio>', '<audio> </audio>')


def _prepare_sample(sample):
    """Clean one parsed JSONL record in place.

    Rewrites '/output-audio' to '/filter-audio' in every entry of
    ``sample['filename']`` and normalises the media tags of each conversation
    turn. Returns False (sample must be skipped) when the first turn has role
    'user' but contains no image tag; returns True otherwise. Turns after a
    rejected first turn are left untouched.
    """
    names = sample['filename']
    for idx, name in enumerate(names):
        names[idx] = name.replace('/output-audio', '/filter-audio')

    for idx, turn in enumerate(sample['conversations']):
        turn['content'] = _normalize_media_tags(turn['content'])
        if idx == 0 and turn['role'] == 'user' and '<image> </image>' not in turn['content']:
            return False
    return True


with MDSWriter(out='mosaic-multimodal-vision', columns=columns, compression=None, hashes=hashes) as out:
    for f in files:
        # JSON text is UTF-8; do not rely on the platform's default encoding.
        with open(f, encoding='utf-8') as fopen:
            for line in tqdm(fopen):
                # Keep the raw line around so the except branch can still show
                # something useful when json.loads itself fails.
                sample = line
                try:
                    sample = json.loads(line)
                    if not _prepare_sample(sample):
                        # Rejected: print the record and move on without writing it.
                        print(sample)
                        continue
                    out.write(sample)
                except Exception as e:
                    # Best effort: report the offending record and keep going.
                    print(sample, e)

148657it [00:05, 29286.02it/s]
148657it [00:05, 26959.38it/s]
20000it [00:01, 14236.31it/s]
20000it [00:01, 14740.18it/s]
100000it [00:02, 44768.93it/s]
100000it [00:02, 43693.78it/s]
3238it [00:00, 44624.21it/s]


In [6]:
# Open the directory written by the MDSWriter cell as a LocalDataset and
# display its length as a quick sanity check.
dataset = LocalDataset('mosaic-multimodal-vision')
len(dataset)

540552

In [10]:
# Spot-check the third-from-last sample of the dataset.
dataset[-3]

{'conversations': [{'role': 'user',
   'content': '<image> </image>Ini makanan apa?'},
  {'role': 'assistant',
   'content': 'Ini ialah Waffle Rangup & Lembut Gebu, Kalah Kedai Punya!.'},
  {'role': 'user', 'content': 'apakah bahan yang diperlukan untuk masak?'},
  {'role': 'assistant',
   'content': '1. Bahan-bahan waffle:\n2. 2 cawan susu krim penuh\n3. 2 sudu besar jus lemon\n4. 2/3 cawan minyak jagung (boleh kurangkan kpd 1/2 cawan jika suka)\n5. 2 biji telur\n6. 2 1/4 cawan tepung gandum\n7. 1/2 cawan tepung jagung\n8. 2 sudu besar gula halus\n9. 1 sudu kecil serbuk penaik\n10. 1/2 sudu kecil soda bikarbonat\n11. 1/2 sudu kecil garam\n12. 1 1/2 sudu kecil esen vanila\n13. Bahan-bahan sos coklat:\n14. 1 cawan kepingan coklat\n15. 1 sudu besar mentega\n16. 1/4 cawan susu krim penuh'},
  {'role': 'user', 'content': 'macam mana nak masak?'},
  {'role': 'assistant',
   'content': '1. Cara-cara untuk adunan waffle:\n2. Perah dan ambil jus lemon. Masukkan jus lemon dan susu full cream ke