In [1]:
import json
from transformers import AutoModelForCausalLM, AutoTokenizer
from tqdm import tqdm
import torch

In [2]:
tokenizer = AutoTokenizer.from_pretrained('mistralai/Mistral-7B-Instruct-v0.2')
model = AutoModelForCausalLM.from_pretrained('mistralai/Mistral-7B-Instruct-v0.2',torch_dtype = torch.float16
)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
model.cuda()

MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralSdpaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): MistralRotaryEmbedding()
        )
        (mlp): MistralMLP(
          (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): MistralRMSNorm()
        (post_attention_layernorm): MistralRMSNorm()
      )
    )
    (norm): MistralRMSNorm(

In [4]:
data = []

with open('dataset/motomalaysia-chat.json') as fopen:
    data = json.load(fopen)

In [5]:
unique_conversations = []
conversation_set = set()

for data_ in data:
    conversations = data_['conversations'][1]['content']
    if conversations not in conversation_set:
        unique_conversations.append(data_)
        conversation_set.add(conversations)

In [6]:
def parse_mistral_chat(messages, function_call = None):

    user_query = messages[-1]['content']

    users, assistants = [], []
    for q in messages[:-1]:
        if q['role'] == 'user':
            users.append(q['content'])
        elif q['role'] == 'assistant':
            assistants.append(q['content'])

    texts = ['<s>']
    
    for u, a in zip(users, assistants):
        texts.append(f'[INST] {u.strip()} [/INST] {a.strip()}</s>')

    texts.append(f'[INST] {user_query.strip()} [/INST]')
    prompt = ''.join(texts).strip()
    return prompt

In [7]:
tokenizer.pad_token = tokenizer.unk_token

In [8]:
def predict(prompt):
    inputs = tokenizer(prompt, return_tensors='pt', add_special_tokens=False,padding=True).to('cuda')
    generate_kwargs = dict(
        inputs,
        max_new_tokens=1024,
        top_p=0.95,
        top_k=50,
        temperature=0.3,
        do_sample=True,
        num_beams=1,
    )
    response = model.generate(**generate_kwargs).to('cpu')
    return response

In [10]:
prompt_list = []

for x in tqdm(range(0, len(unique_conversations), 2)):
    
    prompt = []
    
    for i in range(x, x + 2):
        
        picture_1 = unique_conversations[i]["conversations"][1]['content']
        picture_2 = unique_conversations[i + 1]["conversations"][1]['content']    

        messages = [{'role': 'user',
                      'content': f"""
                      Picture 1: {picture_1} 
                      Picture 2: {picture_2}
                      What is related between picture 1 and picture 2."""}]
        
        prompt.append(parse_mistral_chat(messages))

    response = predict(prompt)

    for i in range(len(prompt)):
        decoded_response = tokenizer.decode(response[i],skip_special_tokens=True)
        
        conversations = [
            {"role": "user", "content": "<image><image>What is related between picture 1 and picture 2?"},
            {"role": "assistant", "content": decoded_response.split('[/INST]')[1]}
        ]
        
        data_input = {"image": [unique_conversations[x + i]['image'], unique_conversations[x + i + 1]['image']],
                      
                      "conversations": conversations}
        
        with open('moto-multiimage-chat.jsonl', 'a') as fopen:
            json.dump(data_input, fopen)
            fopen.write('\n')

  0%|          | 0/112 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  1%|          | 1/112 [00:02<05:31,  2.99s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  2%|▏         | 2/112 [00:06<05:50,  3.19s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  3%|▎         | 3/112 [00:15<11:06,  6.12s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  4%|▎         | 4/112 [00:19<09:01,  5.02s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  4%|▍         | 5/112 [00:22<07:50,  4.40s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  5%|▌         | 6/112 [00:26<07:34,  4.29s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  6%|▋         | 7/112 [00:29<06:37,  3.78s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  7%|▋         | 8/112 [00:31<05:49,  3.36s/it]Setting `pad_token_id` to `eos_token_id`:

IndexError: list index out of range

In [13]:
moto = []

with open('moto-multiimage-chat.jsonl') as fopen:
    
    for x in fopen:
        moto.append(json.loads(x))

In [16]:
len(moto)

222

In [21]:
moto[0:8]

[{'image': ['pic/Suzuki GSX-S1000GT (2022)_0.jpg',
   'pic/Suzuki Burgman 400 (2021)_0.jpg'],
  'conversations': [{'role': 'user',
    'content': '<image><image>What is related between picture 1 and picture 2?'},
   {'role': 'assistant',
    'content': ' Both picture 1 and picture 2 feature motorcycles manufactured by Suzuki. The first picture shows the Suzuki GSX-S1000GT (2022), which is a sports bike model. The second picture shows the Suzuki Burgman 400 (2021), which is a scooter model.\n\nAlthough they are different types of motorcycles, they are related in that they are both produced by the same manufacturer, Suzuki.'}]},
 {'image': ['pic/Suzuki Burgman 400 (2021)_0.jpg',
   'pic/Suzuki Katana (2021)_0.jpg'],
  'conversations': [{'role': 'user',
    'content': '<image><image>What is related between picture 1 and picture 2?'},
   {'role': 'assistant',
    'content': ' Both picture 1 and picture 2 feature motorcycles manufactured by Suzuki. The Suzuki Burgman 400 is a scooter, while

In [22]:
from huggingface_hub import notebook_login

In [23]:
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [27]:
from huggingface_hub import HfApi
api = HfApi()
api.upload_file(
    path_or_fileobj="moto-multiimage-chat.jsonl",
    path_in_repo = "moto-multiimage-chat.jsonl",
    repo_id="malaysia-ai/motomalaysia.com-multiturn",
    repo_type="dataset",
)

CommitInfo(commit_url='https://huggingface.co/datasets/malaysia-ai/motomalaysia.com-multiturn/commit/1269bd08ea63c341d04911af43ae6b57b21b89c8', commit_message='Upload moto-multiimage-chat.jsonl with huggingface_hub', commit_description='', oid='1269bd08ea63c341d04911af43ae6b57b21b89c8', pr_url=None, pr_revision=None, pr_num=None)

In [5]:
import json 

data = []

with open('mixtral_audio_instruction_multiaudio.jsonl') as fopen:
    
    for x in fopen:
    
        data.append(json.loads(x))

In [6]:
data[0]

{'filename': ['output-audio/3-3095-4.mp3', 'output-audio/3-2622-40.mp3'],
 'context': [{'context_audio_1': 'Dan kat sini dia ada tulis masa dan juga speed. Yang ni kita nak kisar dulu bahan-bahan yang kita dah letak tadi. Untuk ini kita tak perlu nak tukar apa-apa pun. Just tekan je terus button tu dan kita kisar semua bahan-bahan tadi. Okey jom kita tengok kat dalam dia. Okey ni lah hasil dia. Bawang-bawang yang dah dikisar tadi. Tapi kalau nak lagi halus kita boleh ulang semula langkah tadi untuk dikisar sekali lagi. Okey kat sini dia ada tulis scrap down. Maksudnya kita perlu jatuhkan'},
  {'context_audio_2': 'ke kanan, kiri ke kanan tapi still tak jumpa permukaan atas sampailah aku nampak ada air yang berkucak kat atas kepala aku waktu tu dia macam ada pecikan air lah dan aku pun jenguhkan kepala dalam keadaan yang tak tak apa ni tak nampak sangat kabur and aku nampak aku waktu tu macam pemandangan tu macam kena block tau air kan aku macam aku tengok macam tu kat atas bila aku teng