In [2]:
from streaming.base.format.mds.encodings import Encoding, _encodings
from streaming import MDSWriter, LocalDataset
from tqdm import tqdm
from typing import List
import torch
import json

class ListOfDict(Encoding):
    def encode(self, obj: List[dict]) -> bytes:
        # Convert the list of dictionaries to a JSON-encoded string
        json_str = json.dumps(obj)
        return json_str.encode('utf-8')

    def decode(self, data: bytes) -> List[dict]:

        # Decode the JSON-encoded string back to a list of dictionaries
        json_str = data.decode('utf-8')
        return json.loads(json_str)

# Register the custom encoding for 'list_of_dict'
_encodings['list_of_dict'] = ListOfDict

In [3]:
from glob import glob

files = glob('prepared-llava*.jsonl')
files.extend(glob('prepared-audio*.jsonl'))
files.extend(glob('prepared-relationship*.jsonl'))
files.extend(glob('prepared-malay*'))

files

['prepared-llava-en.jsonl',
 'prepared-llava-ms.jsonl',
 'prepared-audio-en.jsonl',
 'prepared-audio-ms.jsonl',
 'prepared-relationship-en.jsonl',
 'prepared-relationship-ms.jsonl',
 'prepared-malay.jsonl']

In [7]:
for f in files:
    with open(f) as fopen:
        for l in tqdm(fopen):
            try:
                l = json.loads(l)
                if len(l['filename']) > 2:
                    print(l)
            except:
                pass

148657it [00:01, 136536.32it/s]
148657it [00:01, 131254.08it/s]
293752it [00:03, 76067.92it/s]
293752it [00:03, 76063.72it/s]
184500it [00:00, 219085.72it/s]
184500it [00:00, 220085.90it/s]
3238it [00:00, 199775.77it/s]


In [8]:
columns = {
    'conversations': 'list_of_dict',
    'filename': 'list_of_dict'
}

hashes = 'sha1', 'xxh64'

In [9]:
from transformers import AutoTokenizer
import numpy as np

tokenizer = AutoTokenizer.from_pretrained('mesolitica/malaysian-tinyllama-1.1b-16k-instructions-v3')
tokenizer.add_tokens(["<image>", "</image>", "<audio>", "</audio>"])

4

In [13]:
tokenizer('a', return_tensors='pt')['input_ids'][0]

tensor([[263]])

In [5]:
import random

In [28]:
!rm -rf mosaic-multimodal

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [29]:
with MDSWriter(out='mosaic-multimodal', columns=columns, compression=None, hashes=hashes) as out:
    for f in files:
        with open(f) as fopen:
            for l in tqdm(fopen):
                try:
                    l = json.loads(l)
                    for i in range(len(l['filename'])):
                        l['filename'][i] = l['filename'][i].replace('/output-audio', '/filter-audio')
                    for i in range(len(l['conversations'])):
                        l['conversations'][i]['content'] = l['conversations'][i]['content'].replace('\n<image>', ' <image>').replace('<image>\n', '<image> ').replace('\n<audio>', ' <audio>').replace('<audio>\n', '<audio> ').strip()
                        l['conversations'][i]['content'] = l['conversations'][i]['content'].replace('<image>', '<image> </image>').replace('<audio>', '<audio> </audio>')
                    
                    try:
                        a = tokenizer.apply_chat_template(l['conversations'], tokenize = False)
                    except Exception as e:
                        continue
                        
                    a = tokenizer(a, return_tensors = 'np')['input_ids'][0]
                    
                    if len(a[(a == 32000) | (a == 32002)]) != len(l['filename']):
                        print(l)
                        continue
                        
                    if len(a[(a == 32001) | (a == 32003)]) != len(l['filename']):
                        print(l)
                        continue
                        
                    if 'malay' not in f and random.random() > 0.5:
                        continue
                        
                    out.write(l)
                except Exception as e:
                    print(l, e)

148657it [01:47, 1377.63it/s]
148657it [02:00, 1236.91it/s]
293752it [08:07, 602.88it/s]
293752it [09:05, 538.53it/s]
184500it [01:10, 2614.99it/s]
184500it [01:10, 2602.95it/s]
3238it [00:01, 2650.20it/s]


In [10]:
dataset = LocalDataset('mosaic-multimodal')
len(dataset)

630301

In [30]:
dataset = LocalDataset('mosaic-multimodal')
len(dataset)

629476

In [14]:
dataset[-10000]

{'conversations': [{'role': 'user',
   'content': '<image> </image> <image> </image> What is related between picture 1 and picture 2'},
  {'role': 'assistant',
   'content': 'There is no direct relation between Picture 1: the NCAA tournament bracket for March 29, and Picture 2: the Murray River. The first picture is a diagram of the NCAA basketball tournament bracket, while the second picture is a photograph of the Murray River. They are unrelated.'}],
 'filename': ['/home/ubuntu/filtered-blip-images/00398/003989575.jpg',
  '/home/ubuntu/filtered-blip-images/00043/000438421.jpg']}