In [4]:
import json
from transformers import AutoModelForCausalLM, AutoTokenizer
from tqdm import tqdm
import torch
import random

In [5]:
tokenizer = AutoTokenizer.from_pretrained('mistralai/Mistral-7B-Instruct-v0.2')
model = AutoModelForCausalLM.from_pretrained('mistralai/Mistral-7B-Instruct-v0.2',torch_dtype = torch.float16
)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [9]:
model.cuda()

MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralSdpaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): MistralRotaryEmbedding()
        )
        (mlp): MistralMLP(
          (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): MistralRMSNorm()
        (post_attention_layernorm): MistralRMSNorm()
      )
    )
    (norm): MistralRMSNorm(

In [10]:
data = []

with open('mixtral-audio-instruction.jsonl') as fopen:
    
    for x in fopen:
        data.append(json.loads(x))

In [15]:
data[0]

{'context': 'anda tahu keuntungan boleh lebih tinggi daripada keuntungan kewangan rumah maka saya tidak akan mencari dalam akaun saya akan mencari ke dalam ethereum atau beberapa crypto punks bergantung pada faktor risiko anda kerana rumah kajang dihantar tidak mengganggu dsr saya sejauh ini jadi sekarang apa posisi saya untuk mendapatkan kewangan ketiga jadi mungkin setelah melihat sekeliling saya menemui seorang penjual yang dapat menutupi perhubungan tetapi bank hanya menerima 70% dari itu saya boleh membayar perbezaan dengan menggunakan wang ini kerana sekali lagi ia menyusahkan saya dan aset tetapi jika anda tidak selesa dengan mencari',
 'chat': [{'role': 'user',
   'content': 'Why might someone consider investing in cryptocurrencies like Ethereum instead of traditional financial investments such as real estate?',
   'content_ms': 'Mengapakah seseorang mungkin mempertimbangkan untuk melabur dalam mata wang kripto seperti Ethereum dan bukannya pelaburan kewangan tradisional sepert

In [12]:
def parse_mistral_chat(messages, function_call = None):

    user_query = messages[-1]['content']

    users, assistants = [], []
    for q in messages[:-1]:
        if q['role'] == 'user':
            users.append(q['content'])
        elif q['role'] == 'assistant':
            assistants.append(q['content'])

    texts = ['<s>']
    
    for u, a in zip(users, assistants):
        texts.append(f'[INST] {u.strip()} [/INST] {a.strip()}</s>')

    texts.append(f'[INST] {user_query.strip()} [/INST]')
    prompt = ''.join(texts).strip()
    return prompt

In [13]:
tokenizer.pad_token = tokenizer.unk_token

In [14]:
def predict(prompt):
    inputs = tokenizer(prompt, return_tensors='pt', add_special_tokens=False,padding=True).to('cuda')
    generate_kwargs = dict(
        inputs,
        max_new_tokens=1024,
        top_p=0.95,
        top_k=50,
        temperature=0.3,
        do_sample=True,
        num_beams=1,
    )
    response = model.generate(**generate_kwargs).to('cpu')
    return response

In [None]:
prompt_list = []
pair_set = set()

for x in tqdm(range(0, 1000)):
    prompt = []

    random_pairs = random.sample(data, 2)
    id_pair = [f['context'] for f in random_pairs]

    if tuple(id_pair) not in pair_set:  # Use tuple(id_pair) instead of id_pair
        
        audio_1 = random_pairs[0]["context"]
        audio_2 = random_pairs[1]["context"]

        messages = [{'role': 'user',
                      'content': f"""
                      Audio 1: {audio_1} 
                      Audio 2: {audio_2}
                      What is related between audio 1 and audio 2."""}]

        prompt.append(parse_mistral_chat(messages))

        response = predict(prompt)

        for i in range(len(prompt)):
            decoded_response = tokenizer.decode(response[i], skip_special_tokens=True)

            conversations = [
                {"role": "user", "content": f"<audio><audio>What is related between audio 1 and audio 2?"},
                {"role": "assistant", "content": decoded_response.split('[/INST]')[1]}
            ]

            data_input = {
                "filename": [random_pairs[0]['filename'], random_pairs[1]['filename']],
                "context": [{'context_audio_1' : audio_1},{'context_audio_2':audio_2}],
                "conversations": conversations
            }

            with open('mixtral_audio_instruction_multiaudio.jsonl', 'a') as fopen:
                json.dump(data_input, fopen)
                fopen.write('\n')

        pair_set.add(tuple(id_pair))  # Add the tuple to pair_set


  0%|          | 0/1000 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  0%|          | 1/1000 [00:09<2:32:06,  9.14s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  0%|          | 2/1000 [00:10<1:18:50,  4.74s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  0%|          | 3/1000 [00:12<59:05,  3.56s/it]  Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  0%|          | 4/1000 [00:18<1:13:04,  4.40s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  0%|          | 5/1000 [00:21<1:06:17,  4.00s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  1%|          | 6/1000 [00:30<1:33:02,  5.62s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  1%|          | 7/1000 [00:37<1:40:44,  6.09s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  1%|          | 8/1000 [00:43<1:40:52,  6.10s/it]Setting `pad_tok

In [None]:
data_input