In [1]:
from streaming.base.format.mds.encodings import Encoding, _encodings
from streaming import MDSWriter, LocalDataset
from tqdm import tqdm
from typing import List
import torch
import json

class ListOfDict(Encoding):
    """Column encoding that stores a list of dictionaries as UTF-8 JSON text."""

    def encode(self, obj: List[dict]) -> bytes:
        """Serialize ``obj`` with ``json.dumps`` and return the UTF-8 bytes."""
        return json.dumps(obj).encode('utf-8')

    def decode(self, data: bytes) -> List[dict]:
        """Parse UTF-8 encoded JSON bytes back into the Python objects they describe."""
        return json.loads(data.decode('utf-8'))

# Expose the custom encoding under the column type name 'list_of_dict'
_encodings.update(list_of_dict=ListOfDict)

In [2]:
# !wget https://huggingface.co/datasets/mesolitica/translated-LLaVA-Pretrain/resolve/main/blip_laion_cc_sbu_558k.translated.jsonl

In [3]:
# !wget https://huggingface.co/datasets/mesolitica/translated-LLaVA-Pretrain/resolve/main/filtered-blip-images.7z.001
# !wget https://huggingface.co/datasets/mesolitica/translated-LLaVA-Pretrain/resolve/main/filtered-blip-images.7z.002
# !wget https://huggingface.co/datasets/mesolitica/translated-LLaVA-Pretrain/resolve/main/filtered-blip-images.7z.003

In [4]:
# !~/7zz x filtered-blip-images.7z.001

In [5]:
import json
import os

In [6]:
# Maps the speaker tag found in each turn's 'from' field to a chat role name.
roles = dict(human='user', gpt='assistant')

In [12]:
# Build parallel English / Malay chat records for every sample whose image
# file is present on disk.
data_en, data_ms = [], []
with open('blip_laion_cc_sbu_558k.translated.jsonl') as fopen:
    for line in tqdm(fopen):
        row = json.loads(line)
        image_path = os.path.join('/home/ubuntu/filtered-blip-images', row['image'])
        if not os.path.exists(image_path):
            # The image is not available locally, so the sample is dropped.
            continue
        en, ms = [], []
        for turn in row['conversations']:
            # Map the translated image placeholders back to '<image>'.
            en_text, ms_text = turn['value'], turn['value_ms']
            for alias in ('<imej>', '<img>', '<gambar>'):
                en_text = en_text.replace(alias, '<image>')
                ms_text = ms_text.replace(alias, '<image>')

            # NOTE(review): this skips any single turn lacking '<image>' in
            # either language, so turns that never carried the placeholder
            # are dropped as well - confirm that this is intended.
            if not ('<image>' in en_text and '<image>' in ms_text):
                continue

            role = roles[turn['from']]
            en.append({'role': role, 'content': en_text})
            ms.append({'role': role, 'content': ms_text})
        if en:
            data_en.append({'filename': [image_path], 'conversations': en})
        if ms:
            data_ms.append({'filename': [image_path], 'conversations': ms})

558128it [00:07, 74695.53it/s]


In [8]:
# Record counts for the English and Malay lists.
tuple(len(part) for part in (data_en, data_ms))

(238135, 238135)

In [13]:
# Record counts for the English and Malay lists.
tuple(len(part) for part in (data_en, data_ms))

(238135, 238135)

In [14]:
# Collect the text that follows the first '<' of every turn (up to the end of
# that line) to see which tag names occur in the data.
a = set()
for record in data_en + data_ms:
    for turn in record['conversations']:
        pieces = turn['content'].split('<')
        if len(pieces) <= 1:
            # No '<' in this turn, so there is nothing to inspect.
            continue
        a.add(pieces[1].strip().split('\n')[0])
a

{'image>'}

In [16]:
# Both dataset columns use the 'list_of_dict' encoding name.
columns = dict.fromkeys(('conversations', 'filename'), 'list_of_dict')

# Hash algorithm names passed to MDSWriter.
hashes = ('sha1', 'xxh64')

In [17]:
# Remove any existing 'mosaic-vision' directory so the writer below starts from a clean output path.
!rm -rf mosaic-vision

In [18]:
# English records first, then Malay; the new list holds the same dict objects.
data = [*data_en, *data_ms]

In [19]:
# Normalise the media placeholders of every turn, then write each record to
# the 'mosaic-vision' MDS output.
# NOTE: turns are edited in place, so the dicts reachable through data_en /
# data_ms change too. Running this cell a second time would append another
# closing tag after each placeholder.
with MDSWriter(out='mosaic-vision', columns=columns, compression=None, hashes=hashes) as writer:
    for record in tqdm(data):
        for turn in record['conversations']:
            text = turn['content']
            # Detach the placeholders from adjacent newlines.
            for old, new in (
                ('\n<image>', ' <image>'),
                ('<image>\n', '<image>'),
                ('\n<audio>', ' <audio>'),
                ('<audio>\n', '<audio>'),
            ):
                text = text.replace(old, new)
            text = text.strip()
            # Follow every placeholder with an explicit closing tag.
            for tag, closed in (('<image>', '<image> </image>'), ('<audio>', '<audio> </audio>')):
                text = text.replace(tag, closed)
            turn['content'] = text
        writer.write(record)

100%|██████████| 476270/476270 [00:06<00:00, 74200.16it/s]


In [20]:
# Export the English records as JSON Lines (one JSON object per line).
# Iterate data_en, the full record list; `en` is only the per-record scratch
# list left behind by the loading loop.
with open('prepared-combine-en.jsonl', 'w') as fopen:
    for d in data_en:
        fopen.write(f'{json.dumps(d)}\n')

In [21]:
# Export the Malay records as JSON Lines (one JSON object per line).
# Iterate data_ms so the '-ms' file holds the Malay conversations rather
# than the leftover English scratch list `en`.
with open('prepared-combine-ms.jsonl', 'w') as fopen:
    for d in data_ms:
        fopen.write(f'{json.dumps(d)}\n')