In [1]:
import json
import os
import random
from glob import glob
from tqdm import tqdm

In [2]:
files = glob('translated-LLaVA-Instruct-150K/*.jsonl')
files

['translated-LLaVA-Instruct-150K/detail_23k.translated.jsonl',
 'translated-LLaVA-Instruct-150K/conversation_58k.translated.jsonl',
 'translated-LLaVA-Instruct-150K/complex_reasoning_77k.translated.jsonl']

In [3]:
roles = {
    'human': 'user',
    'gpt': 'assistant'
}

In [4]:
data_en, data_ms = [], []
for f in files:
    with open(f) as fopen:
        for l in tqdm(fopen):
            l = json.loads(l)
            f = os.path.join('/home/ubuntu/translated-LLaVA-Instruct-150K/filtered-llava-images', l['image'])
            if not os.path.exists(f):
                continue
            en, ms = [], []
            for no, c in enumerate(l['conversations']):
                en_ = c['value'].replace('<imej>','<image>').replace('<img>','<image>').replace('<gambar>','<image>')
                ms_ = c['value_ms'].replace('<imej>','<image>').replace('<img>','<image>').replace('<gambar>','<image>')
                
                if no == 0 and roles[c['from']] == 'user' and ('<image>' not in en_ or '<image>' not in ms_):
                    break
                    
                en.append({
                    'role': roles[c['from']],
                    'content': en_
                })
                ms.append({
                    'role': roles[c['from']],
                    'content': ms_
                })
            
            if len(en):
                data_en.append({'filename': [f], 'conversations': en})
            if len(ms):
                data_ms.append({'filename': [f], 'conversations': ms})

23240it [00:00, 43904.40it/s]
56681it [00:02, 20125.61it/s]
76643it [00:02, 38155.83it/s]


In [8]:
data_en[-100000]

{'filename': ['/home/ubuntu/translated-LLaVA-Instruct-150K/filtered-llava-images/000000532583.jpg'],
 'conversations': [{'role': 'user',
   'content': '<image>\nWhat is the primary action happening in the image, and who are the central figures?'},
  {'role': 'assistant',
   'content': 'The primary action happening in the image is a baseball game in play. The central figures include a batter who just hit the ball and is preparing to run, a catcher, and an umpire. There are also fans in the background watching the game.'},
  {'role': 'user', 'content': "What is the color of the batter's outfit?"},
  {'role': 'assistant',
   'content': 'The batter is wearing white pants and a blue shirt.'},
  {'role': 'user',
   'content': 'Is the baseball player holding a bat or running on the field after hitting the ball?'},
  {'role': 'assistant',
   'content': 'The baseball player has just hit the ball and is going to run, while still holding the bat.'},
  {'role': 'user', 'content': 'What is the role

In [9]:
data_ms[-100000]

{'filename': ['/home/ubuntu/translated-LLaVA-Instruct-150K/filtered-llava-images/000000532583.jpg'],
 'conversations': [{'role': 'user',
   'content': '<image>\nApakah tindakan utama yang berlaku dalam imej, dan siapakah tokoh utama?'},
  {'role': 'assistant',
   'content': 'Tindakan utama yang berlaku dalam imej adalah permainan besbol dalam permainan. Angka-angka utama termasuk pemukul yang baru memukul bola dan sedang bersedia untuk berlari, penangkap dan pengadil. Terdapat juga peminat di latar belakang menonton permainan.'},
  {'role': 'user', 'content': 'Apakah warna pakaian adunan?'},
  {'role': 'assistant',
   'content': 'Pukulan itu memakai seluar putih dan baju biru.'},
  {'role': 'user',
   'content': 'Adakah pemain besbol memegang kelawar atau berlari di atas padang selepas memukul bola?'},
  {'role': 'assistant',
   'content': 'Pemain besbol baru saja memukul bola dan akan berlari, sambil masih memegang pemukul.'},
  {'role': 'user', 'content': 'Apakah peranan pengadil dal

In [10]:
a = set()
for l in data_en + data_ms:
    for i in range(len(l['conversations'])):
        splitted = l['conversations'][i]['content'].split('<')
        if len(splitted) > 1:
            a.add(splitted[1].strip().split('\n')[0])
a

{'image>'}

In [11]:
len(data_en), len(data_ms)

(148657, 148657)

In [12]:
data_ms[2]

{'filename': ['/home/ubuntu/translated-LLaVA-Instruct-150K/filtered-llava-images/000000281721.jpg'],
 'conversations': [{'role': 'user',
   'content': 'Terangkan imej berikut.\n<image>'},
  {'role': 'assistant',
   'content': 'Imej itu memaparkan jalan bandar yang sibuk dengan kereta yang memandu di jalan raya dan pejalan kaki berjalan di sepanjang kaki lima. Pelbagai kenderaan melalui lampu hijau di persimpangan, bergerak ke kedua-dua arah. Sesetengah kereta lebih dekat dengan kamera, manakala yang lain boleh dilihat lebih jauh di jalan.\n\nTerdapat beberapa pejalan kaki yang kelihatan berhampiran bahagian kiri imej, mungkin menavigasi jalan mereka melalui bandar. Pelbagai lampu isyarat diletakkan di seluruh tempat kejadian, memastikan aliran trafik yang selamat untuk kedua-dua kenderaan dan pejalan kaki. Suasana menunjukkan persekitaran bandar yang sibuk dan meriah.'}]}

In [None]:
with open('prepared-llava-en.jsonl', 'w') as fopen:
    for d in data_en:
        fopen.write(f'{json.dumps(d)}\n')

In [None]:
with open('prepared-llava-ms.jsonl', 'w') as fopen:
    for d in data_ms:
        fopen.write(f'{json.dumps(d)}\n')