In [1]:
import json
from transformers import AutoModelForCausalLM, AutoTokenizer
from tqdm import tqdm
import torch

In [2]:
tokenizer = AutoTokenizer.from_pretrained('mistralai/Mistral-7B-Instruct-v0.2')
model = AutoModelForCausalLM.from_pretrained('mistralai/Mistral-7B-Instruct-v0.2',torch_dtype = torch.float16
)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
model.cuda()

MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralSdpaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): MistralRotaryEmbedding()
        )
        (mlp): MistralMLP(
          (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): MistralRMSNorm()
        (post_attention_layernorm): MistralRMSNorm()
      )
    )
    (norm): MistralRMSNorm(

In [4]:
data = []

with open('dataset/wapcar-chat.json') as fopen:
    data = json.load(fopen)

In [5]:
unique_conversations = []
conversation_set = set()

for data_ in data:
    conversations = data_['conversations'][1]['content']
    if conversations not in conversation_set:
        unique_conversations.append(data_)
        conversation_set.add(conversations)

In [6]:
len(data),len(unique_conversations)

(51136, 1530)

In [7]:
unique_conversations[0]

{'image': 'pic/2023 Bugatti Veyron 16.4_7.jpg',
 'image_url': 'https://images.wapcar.my/file/4de4c6d733c24bf5949dda63a90b4b73.jpg',
 'conversations': [{'role': 'user', 'content': '<image>Ini model apa?'},
  {'role': 'assistant', 'content': '2023 Bugatti Veyron 16.4'},
  {'role': 'user', 'content': 'apa spesifikasi kereta tu?'},
  {'role': 'assistant',
   'content': ['Retail Price : TBC',
    'Insurance: TBC',
    'Road Tax: TBC',
    'Monthly Payment: TBC',
    'Brand : Bugatti',
    'Body Type : Convertible',
    'Segment : Sports Car',
    'Fuel Type : Petrol',
    'Model : Bugatti Veyron',
    'Launched Year : 2023',
    'Horsepower (ps) : 1014',
    'Torque (Nm) : 1250',
    'Engine : 8L 1014ps',
    'Engine Power(PS) : 1014',
    'Electric Engine(PS) : -',
    'Transmission : AT',
    'Length*Width*Heigh(mm) : 4462 x 1998 x 1204',
    '0-100 km/h (s) : 2.7',
    'Manufacturers Claim(L/100km) : -',
    'As Tested(L/100km) : -',
    'On Sale : yes',
    'Warranty Manufacturer : -',


In [8]:
unique_conversations[1]

{'image': 'pic/2023 Bugatti Veyron 16.4 Grand Sport Vitesse_7.jpg',
 'image_url': 'https://images.wapcar.my/file/0468b2fae3d547ef96144fd53fcd1288.jpg',
 'conversations': [{'role': 'user', 'content': '<image>Ini model apa?'},
  {'role': 'assistant',
   'content': '2023 Bugatti Veyron 16.4 Grand Sport Vitesse'},
  {'role': 'user', 'content': 'apa spesifikasi kereta tu?'},
  {'role': 'assistant',
   'content': ['Retail Price : TBC',
    'Insurance: TBC',
    'Road Tax: TBC',
    'Monthly Payment: TBC',
    'Brand : Bugatti',
    'Body Type : Convertible',
    'Segment : Sports Car',
    'Fuel Type : Petrol',
    'Model : Bugatti Veyron',
    'Launched Year : 2023',
    'Horsepower (ps) : 1217',
    'Torque (Nm) : 1500',
    'Engine : 8L 1217ps',
    'Engine Power(PS) : 1217',
    'Electric Engine(PS) : -',
    'Transmission : AT',
    'Length*Width*Heigh(mm) : 4462 x 1998 x 1204',
    '0-100 km/h (s) : 2.6',
    'Manufacturers Claim(L/100km) : 0 L/100km',
    'As Tested(L/100km) : -',
   

In [9]:
def parse_mistral_chat(messages, function_call = None):

    user_query = messages[-1]['content']

    users, assistants = [], []
    for q in messages[:-1]:
        if q['role'] == 'user':
            users.append(q['content'])
        elif q['role'] == 'assistant':
            assistants.append(q['content'])

    texts = ['<s>']
    
    for u, a in zip(users, assistants):
        texts.append(f'[INST] {u.strip()} [/INST] {a.strip()}</s>')

    texts.append(f'[INST] {user_query.strip()} [/INST]')
    prompt = ''.join(texts).strip()
    return prompt

In [10]:
tokenizer.pad_token = tokenizer.unk_token

In [11]:
def predict(prompt):
    inputs = tokenizer(prompt, return_tensors='pt', add_special_tokens=False,padding=True).to('cuda')
    generate_kwargs = dict(
        inputs,
        max_new_tokens=1024,
        top_p=0.95,
        top_k=50,
        temperature=0.3,
        do_sample=True,
        num_beams=1,
    )
    response = model.generate(**generate_kwargs).to('cpu')
    return response

In [13]:
prompt_list = []

for x in tqdm(range(0, len(unique_conversations), 3)):
    
    prompt = []
    
    for i in range(x, x + 3):
        
        picture_1 = unique_conversations[i]["conversations"][1]['content']
        picture_2 = unique_conversations[i + 1]["conversations"][1]['content']    

        messages = [{'role': 'user',
                      'content': f"""
                      Picture 1: {picture_1} 
                      Picture 2: {picture_2}
                      What is related between picture 1 and picture 2."""}]
        
        prompt.append(parse_mistral_chat(messages))

    response = predict(prompt)

    for i in range(len(prompt)):
        decoded_response = tokenizer.decode(response[i],skip_special_tokens=True)
        
        conversations = [
            {"role": "user", "content": "<image><image>What is related between picture 1 and picture 2?"},
            {"role": "assistant", "content": decoded_response.split('[/INST]')[1]}
        ]
        
        data_input = {"image": [unique_conversations[x + i]['image'], unique_conversations[x + i + 1]['image']],
                      
                      "conversations": conversations}
        
        with open('wapcar-multiimage-chat-2.jsonl', 'a') as fopen:
            json.dump(data_input, fopen)
            fopen.write('\n')

  0%|          | 0/510 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  0%|          | 1/510 [00:07<1:02:50,  7.41s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  0%|          | 2/510 [00:12<49:45,  5.88s/it]  Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  1%|          | 3/510 [00:20<57:13,  6.77s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  1%|          | 4/510 [00:25<52:26,  6.22s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  1%|          | 5/510 [00:35<1:03:47,  7.58s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  1%|          | 6/510 [00:39<54:58,  6.54s/it]  Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  1%|▏         | 7/510 [00:43<47:51,  5.71s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  2%|▏         | 8/510 [00:50<49:20,  5.90s/it]Setting `pad_token_id` to `eos_to

IndexError: list index out of range

In [8]:
car = []

with open('wapcar-multiimage-chat.jsonl') as fopen:
    
    for x in fopen:
        car.append(json.loads(x))

In [9]:
len(car)

1527

In [5]:
car[10:15]

[{'id': '001649122',
  'image': '00164/001649122.jpg',
  'conversations': [{'from': 'human',
    'value': "<image>\nPresent a compact description of the photo's key features.",
    'value_ms': '<imej>\nHadirkan penerangan padat tentang ciri utama foto.'},
   {'from': 'gpt',
    'value': 'a illustration of a dj in a crowd with the text so dop',
    'value_ms': 'Ilustrasi DJ dalam orang ramai dengan teks yang begitu jelas'}]},
 {'id': '002326944',
  'image': '00232/002326944.jpg',
  'conversations': [{'from': 'human',
    'value': 'Share a concise interpretation of the image provided.\n<image>',
    'value_ms': 'Kongsi tafsiran ringkas imej yang disediakan.\n<img>'},
   {'from': 'gpt',
    'value': 'floor plan of cottage type house, 1 bedroom',
    'value_ms': 'pelan lantai rumah jenis kotej, 1 bilik tidur'}]},
 {'id': '000093116',
  'image': '00009/000093116.jpg',
  'conversations': [{'from': 'human',
    'value': 'Render a clear and concise summary of the photo.\n<image>',
    'value_m

In [18]:
car[10:15]

[{'image': ['pic/2018 Bentley Mulsanne Speed_7.jpg',
   'pic/2018 Aston Martin Vanquish S Super GT_7.jpg'],
  'conversations': [{'role': 'user',
    'content': '<image><image>What is related between picture 1 and picture 2?'},
   {'role': 'assistant',
    'content': ' Both picture 1 and picture 2 feature luxury sports cars produced in 2018. The Bentley Mulsanne Speed and the Aston Martin Vanquish S Super GT are high-end automobiles known for their power, comfort, and style. However, they belong to different automobile manufacturers - Bentley and Aston Martin - and have distinct designs and features. While they share the year of production, there is no direct relation between picture 1 and picture 2 other than they are both luxury sports cars from 2018.'}]},
 {'image': ['pic/2018 Aston Martin Vanquish S Super GT_7.jpg',
   'pic/2019 Alfa Romeo Glulietta_7.jpg'],
  'conversations': [{'role': 'user',
    'content': '<image><image>What is related between picture 1 and picture 2?'},
   {'ro