In [1]:
import json
from pprint import pprint
from transformers import AutoModelForCausalLM, AutoTokenizer
from tqdm import tqdm
import torch
import random

In [2]:
# Load the tokenizer and the fp16 causal-LM weights for the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained('mistralai/Mistral-7B-Instruct-v0.2')
model = AutoModelForCausalLM.from_pretrained(
    'mistralai/Mistral-7B-Instruct-v0.2',
    # `use_flash_attention_2=True` is deprecated in transformers; the
    # supported way to request FlashAttention-2 is `attn_implementation`.
    attn_implementation = 'flash_attention_2',
    torch_dtype = torch.float16
)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token
# Move the weights to the GPU; the trailing `;` stops the notebook from
# printing the whole module tree as the cell output.
model.cuda();

### Audio

In [3]:
# Audio-instruction records, one JSON object per line (JSONL).
data = []

# Explicit UTF-8 so decoding does not depend on the platform's default locale.
with open('mixtral-audio-instruction.jsonl', encoding='utf-8') as fopen:
    for x in fopen:
        # A blank line (e.g. a stray newline at end of file) is not a record
        # and would make json.loads raise.
        if not x.strip():
            continue
        data.append(json.loads(x))

In [4]:
# De-duplicate the audio records on their 'context' value: the first record
# seen for each distinct context is kept, in the original order.
conversation_set = set()
unique_conversations = []

for record in data:
    context = record['context']
    if context in conversation_set:
        # Already have a record with this context; skip the repeat.
        continue
    conversation_set.add(context)
    unique_conversations.append(record)

len(unique_conversations)

59404

### Image

In [5]:
# Image caption records, one JSON object per line (JSONL).
image = []

# Explicit UTF-8 so decoding does not depend on the platform's default locale.
with open('dataset/llava/blip_laion_cc_sbu_558k.translated.jsonl', encoding='utf-8') as fopen:
    for x in fopen:
        # A blank line (e.g. a stray newline at end of file) is not a record
        # and would make json.loads raise.
        if not x.strip():
            continue
        image.append(json.loads(x))

len(image)

558128

In [6]:
# Display the first image record to see which fields each entry carries.
image[0]

{'id': '004539375',
 'image': '00453/004539375.jpg',
 'conversations': [{'from': 'human',
   'value': 'Render a clear and concise summary of the photo.\n<image>',
   'value_ms': 'Render ringkasan foto yang jelas dan ringkas.\n<imej>'},
  {'from': 'gpt',
   'value': 'select luxury furniture 3 - inch gel memory foam mattress topper',
   'value_ms': 'pilih perabot mewah 3 inci memori gel buih tilam topper'}]}

In [20]:
# Build chat-template prompts that pair one audio transcript with one image
# caption, and write them out in batches (one JSON object per line).
batch_size = 100
num_pairs = 100000  # number of audio/image pairs requested
prompt_list = []  # not used in this cell; left defined in case other cells read it
selected_audio = set()
selected_image = set()

# Indices are drawn without replacement, so the number of pairs can never
# exceed the smaller pool. Without this cap the draw runs out of unused
# indices part-way through and fails with IndexError.
num_pairs = min(num_pairs, len(unique_conversations), len(image))

# Draw every index once, up front. This replaces rebuilding the full
# "not yet selected" set for each single draw, which was quadratic in the
# pool size. A locally seeded generator makes the pairing reproducible.
rng = random.Random(42)
audio_ids = rng.sample(range(len(unique_conversations)), num_pairs)
image_ids = rng.sample(range(len(image)), num_pairs)

# The prompt text is reproduced exactly as before, including the leading
# newline, the 12-space indentation and the space after the audio text, so
# the generated prompts are unchanged.
prompt_template = (
    "\n"
    "            Audio 1: {audio_context} \n"
    "            Picture 1: {image_context}\n"
    "            What is related between audio 1 and picture 1."
)

with open('llava-audio-chat-template.jsonl', 'w', encoding='utf-8') as fopen:
    for x in tqdm(range(0, num_pairs, batch_size)):

        audios = []
        images = []
        prompts = []
        # The final batch may be shorter than batch_size when num_pairs is
        # not a multiple of it.
        batch_ids = zip(audio_ids[x:x + batch_size], image_ids[x:x + batch_size])
        for audio_id, image_id in batch_ids:
            selected_audio.add(audio_id)
            selected_image.add(image_id)

            audio_record = unique_conversations[audio_id]
            image_record = image[image_id]

            audio_context = audio_record['context']
            # conversations[1] is the 'gpt' turn in the sample record shown
            # earlier in this notebook; its 'value' is used as the caption.
            image_context = image_record['conversations'][1]['value']

            messages = [{
                'role': 'user',
                'content': prompt_template.format(
                    audio_context=audio_context,
                    image_context=image_context,
                ),
            }]

            # tokenize=False returns the rendered prompt string, not token ids.
            prompt = tokenizer.apply_chat_template(messages, tokenize = False)
            prompts.append(prompt)
            audios.append((audio_context, audio_record['filename']))
            images.append((image_context, image_record['image']))

        d = {
            'prompts': prompts,
            'audios': audios,
            'images': images,
        }
        fopen.write(f'{json.dumps(d)}\n')