In [10]:
import json
from transformers import AutoModelForCausalLM, AutoTokenizer
from tqdm import tqdm
import torch
import random

In [11]:
tokenizer = AutoTokenizer.from_pretrained('mistralai/Mistral-7B-Instruct-v0.2')
model = AutoModelForCausalLM.from_pretrained('mistralai/Mistral-7B-Instruct-v0.2',torch_dtype = torch.float16
)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [12]:
model.cuda()

OutOfMemoryError: CUDA out of memory. Tried to allocate 250.00 MiB. GPU 0 has a total capacty of 79.15 GiB of which 156.56 MiB is free. Process 50730 has 59.36 GiB memory in use. Process 442461 has 14.76 GiB memory in use. Process 760118 has 4.84 GiB memory in use. Of the allocated memory 14.00 GiB is allocated by PyTorch, and 269.35 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [4]:
data = []

with open('dataset/blip_laion_cc_sbu_558k.translated.jsonl') as fopen:
    
    for x in fopen:
        data.append(json.loads(x))

In [5]:
data[0:5]

[{'id': '004539375',
  'image': '00453/004539375.jpg',
  'conversations': [{'from': 'human',
    'value': 'Render a clear and concise summary of the photo.\n<image>',
    'value_ms': 'Render ringkasan foto yang jelas dan ringkas.\n<imej>'},
   {'from': 'gpt',
    'value': 'select luxury furniture 3 - inch gel memory foam mattress topper',
    'value_ms': 'pilih perabot mewah 3 inci memori gel buih tilam topper'}]},
 {'id': '002239345',
  'image': '00223/002239345.jpg',
  'conversations': [{'from': 'human',
    'value': 'Write a terse but informative summary of the picture.\n<image>',
    'value_ms': 'Tulis ringkasan ringkas tetapi bermaklumat tentang gambar.\n<imej>'},
   {'from': 'gpt',
    'value': 'a grey watch with an army style strap',
    'value_ms': 'jam tangan kelabu dengan tali gaya tentera'}]},
 {'id': '005947502',
  'image': '00594/005947502.jpg',
  'conversations': [{'from': 'human',
    'value': '<image>\nWhat is this?',
    'value_ms': '<imej>\nApa ini?'},
   {'from': 'gp

In [6]:
def parse_mistral_chat(messages, function_call = None):

    user_query = messages[-1]['content']

    users, assistants = [], []
    for q in messages[:-1]:
        if q['role'] == 'user':
            users.append(q['content'])
        elif q['role'] == 'assistant':
            assistants.append(q['content'])

    texts = ['<s>']
    
    for u, a in zip(users, assistants):
        texts.append(f'[INST] {u.strip()} [/INST] {a.strip()}</s>')

    texts.append(f'[INST] {user_query.strip()} [/INST]')
    prompt = ''.join(texts).strip()
    return prompt

In [7]:
tokenizer.pad_token = tokenizer.unk_token

In [8]:
def predict(prompt):
    inputs = tokenizer(prompt, return_tensors='pt', add_special_tokens=False,padding=True).to('cuda')
    generate_kwargs = dict(
        inputs,
        max_new_tokens=1024,
        top_p=0.95,
        top_k=50,
        temperature=0.3,
        do_sample=True,
        num_beams=1,
    )
    response = model.generate(**generate_kwargs).to('cpu')
    return response

In [9]:
prompt_list = []
pair_set = set()

for x in tqdm(range(0, 1000)):
    prompt = []

    random_pairs = random.sample(data, 2)
    id_pair = [f['id'] for f in random_pairs]

    if tuple(id_pair) not in pair_set:  # Use tuple(id_pair) instead of id_pair
        
        picture_1 = random_pairs[0]["conversations"][1]['value']
        picture_2 = random_pairs[1]["conversations"][1]['value']

        messages = [{'role': 'user',
                      'content': f"""
                      Picture 1: {picture_1} 
                      Picture 2: {picture_2}
                      What is related between picture 1 and picture 2."""}]

        prompt.append(parse_mistral_chat(messages))

        response = predict(prompt)

        for i in range(len(prompt)):
            decoded_response = tokenizer.decode(response[i], skip_special_tokens=True)

            conversations = [
                {"role": "user", "content": f"<image><image>What is related between picture 1 and picture 2?"},
                {"role": "assistant", "content": decoded_response.split('[/INST]')[1]}
            ]

            data_input = {
                "image": [random_pairs[0]['image'], random_pairs[1]['image']],
                "conversations": conversations
            }

            with open('blip_laion_cc_sbu_558k_multiimage.jsonl', 'a') as fopen:
                json.dump(data_input, fopen)
                fopen.write('\n')

        pair_set.add(tuple(id_pair))  # Add the tuple to pair_set


  0%|          | 0/1000 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  0%|          | 1/1000 [00:06<1:56:26,  6.99s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  0%|          | 2/1000 [00:08<1:06:49,  4.02s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  0%|          | 3/1000 [00:11<58:40,  3.53s/it]  Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  0%|          | 4/1000 [00:14<51:44,  3.12s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  0%|          | 5/1000 [00:30<2:11:07,  7.91s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  1%|          | 6/1000 [00:33<1:39:13,  5.99s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  1%|          | 7/1000 [00:35<1:22:05,  4.96s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  1%|          | 8/1000 [00:37<1:05:10,  3.94s/it]Setting `pad_token