In [1]:
# List the GPUs and their current load before a device is selected in the next cell.
!nvidia-smi

Sat Feb 17 00:03:18 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.54.03              Driver Version: 535.54.03    CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100-SXM4-80GB          On  | 00000001:00:00.0 Off |                    0 |
| N/A   59C    P0             371W / 400W |  69997MiB / 81920MiB |    100%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA A100-SXM4-80GB          On  | 00000002:00:00.0 Off |  

In [2]:
import os

# Restrict this process to the GPU with index 2. This is set before torch is
# imported in the following cell, so the restriction is in place when CUDA starts.
os.environ['CUDA_VISIBLE_DEVICES'] = '2'

In [3]:
from modeling_combine import MM_LLMs, MM_LLMs_Config
from transformers import AutoModelForCausalLM, CLIPProcessor, CLIPModel,AutoModel, AutoTokenizer, AutoProcessor,AutoConfig,CLIPConfig, LlamaConfig, WhisperConfig, WhisperModel, LlamaModel, LlamaTokenizer
from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize
import torch
import numpy as np
from torch import nn
from streaming import LocalDataset
from typing import List

In [4]:
# Call register_for_auto_class() on the custom model class first, then on its
# config class (presumably so saved/pushed checkpoints reference the custom
# code - confirm against modeling_combine).
for custom_cls in (MM_LLMs, MM_LLMs_Config):
    custom_cls.register_for_auto_class()

In [5]:
from transformers.trainer_utils import get_last_checkpoint

# Pick the most recent checkpoint directory inside the training output folder.
latest = get_last_checkpoint('multimodal-tinyllama')

# get_last_checkpoint yields None when the folder contains no checkpoint.
# Stop here with a clear message instead of handing None to from_pretrained later.
if latest is None:
    raise FileNotFoundError("no checkpoint found under 'multimodal-tinyllama'")

latest

'multimodal-tinyllama/checkpoint-6700'

In [6]:
# Load the combined multimodal model from the checkpoint resolved above.
# NOTE(review): the load log below still warns that no torch dtype was specified
# for Flash Attention even though both `dtype` and `torch_dtype` are passed here;
# check how MM_LLMs forwards these arguments to its sub-models.
model = MM_LLMs.from_pretrained(
    latest,
    flash_attention=True,
    dtype=torch.bfloat16,
    torch_dtype=torch.bfloat16,
)

The model was loaded with use_flash_attention_2=True, which is deprecated and may be removed in a future release. Please use `attn_implementation="flash_attention_2"` instead.
You are attempting to use Flash Attention 2.0 without specifying a torch dtype. This might lead to unexpected behaviour
You are attempting to use Flash Attention 2.0 without specifying a torch dtype. This might lead to unexpected behaviour


In [7]:
# Move the loaded model to the GPU; the return value is bound to `_` so the
# cell does not echo the module repr.
_ = model.cuda()

In [8]:
# Hub ids of the pretrained processors used to featurize images and audio.
IMAGE_PROCESSOR_ID = 'google/siglip-base-patch16-384'
AUDIO_PROCESSOR_ID = 'mesolitica/malaysian-whisper-small'

image_processor = AutoProcessor.from_pretrained(IMAGE_PROCESSOR_ID)
audio_processor = AutoProcessor.from_pretrained(AUDIO_PROCESSOR_ID)

# The tokenizer is read from the same checkpoint directory as the model.
tokenizer = AutoTokenizer.from_pretrained(latest)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [11]:
from PIL import Image
import librosa

In [16]:
def _open_rgb(path):
    """Open an image file and return an RGB copy, closing the file handle afterwards."""
    with Image.open(path) as handle:
        return handle.convert('RGB')


def prepare_dataset(messages, images: List[str] = None, audio: List[str] = None, sr = 16000):
    """
    Build the inputs for one chat conversation with optional images and audio.

    Relies on the notebook-level `tokenizer`, `image_processor` and
    `audio_processor` objects.

    Parameters
    ----------
    messages:
        Chat messages, passed unchanged to `tokenizer.apply_chat_template`.
    images:
        Paths of image files, or None. Each file is opened, converted to RGB
        and featurized by `image_processor`.
    audio:
        Paths of audio files, or None. Each file is loaded with librosa at
        sampling rate `sr` and featurized by `audio_processor`.
    sr:
        Sampling rate used both for loading and for `audio_processor`.

    Returns
    -------
    The tokenizer output for the rendered prompt, extended with:
      - 'images' / 'audios': processor features, or None when not supplied
      - 'image_index' / 'audio_index': one zero per image / audio item
      - 'image_starts' / 'audio_starts': the `<image>` / `<audio>` token id
        repeated (number of items + 1) times
      - 'where_is_b' / 'where_is_k': row and column positions in 'input_ids'
        that hold an `<image>` or `<audio>` token
      - 'ls': the token id found at each of those positions

    All index-like tensors are int64, including when they are empty.
    """
    if images is not None:
        pil_images = [_open_rgb(f) for f in images]
        image_output = image_processor(images=pil_images, return_tensors='pt')['pixel_values']
    else:
        image_output = None

    if audio is not None:
        waveforms = [librosa.load(f, sr=sr)[0] for f in audio]
        audio_features = audio_processor(waveforms, sampling_rate=sr, return_tensors='pt',)['input_features']
    else:
        audio_features = None

    # Render the chat template to text first, then tokenize it as a batch of one.
    prompt = tokenizer.apply_chat_template(messages, tokenize = False)
    outputs = tokenizer(
                    prompt,
                    return_tensors='pt',
                    return_overflowing_tokens=False,
                    return_length=False
    )

    outputs['images'] = image_output
    outputs['audios'] = audio_features

    image_token = tokenizer.convert_tokens_to_ids('<image>')
    audio_token = tokenizer.convert_tokens_to_ids('<audio>')

    len_image = len(image_output) if image_output is not None else 0
    len_audio = len(audio_features) if audio_features is not None else 0

    # torch.zeros with an explicit dtype keeps these int64 even when the count is 0;
    # torch.tensor([]) would silently produce a float32 tensor in that case.
    outputs['image_index'] = torch.zeros(len_image, dtype=torch.long)
    outputs['image_starts'] = torch.tensor([image_token] * (len_image + 1))
    outputs['audio_index'] = torch.zeros(len_audio, dtype=torch.long)
    outputs['audio_starts'] = torch.tensor([audio_token] * (len_audio + 1))

    # Positions (row, column) of every <image>/<audio> marker token in the prompt.
    input_ids = outputs['input_ids']
    where_is = torch.where((input_ids == image_token) | (input_ids == audio_token))

    outputs['where_is_b'] = where_is[0]
    outputs['where_is_k'] = where_is[1]
    # Token id at each marker position, gathered in one indexing step; the result
    # keeps the dtype of input_ids, including when no marker is present.
    outputs['ls'] = input_ids[where_is]

    return outputs

In [17]:
# !wget https://cdn.beautifulnara.net/wp-content/uploads/2017/12/10201620/Persian-cat-breed.jpg
# !wget https://www.jocooks.com/wp-content/uploads/2023/09/nasi-goreng-1-23.jpg

In [18]:
# Local image files used as fixtures by the inference cells below.
test_image = 'translated-LLaVA-Instruct-150K/filtered-llava-images/000000033471.jpg'
test_image2 = 'Persian-cat-breed.jpg'
test_image3 = 'abang-gay.png'
test_image4 = 'nasi-goreng-1-23.jpg'

# All image fixtures, in the order in which they are described one by one.
images = [test_image, test_image2, test_image3, test_image4]

# Single audio fixture.
audio = 'test.mp3'

In [19]:
# Ask the model to describe each fixture image on its own (one image per prompt).
for img in images:
    messages = [
        {'role': 'user', 'content': '<image> </image> ini gambar apa'},
    ]
    outputs = prepare_dataset(messages, images = [img])

    # Cast the image/audio features to the model dtype; other entries keep theirs.
    if outputs['images'] is not None:
        outputs['images'] = outputs['images'].type(model.dtype)
    if outputs['audios'] is not None:
        outputs['audios'] = outputs['audios'].type(model.dtype)

    # Move every populated entry to the GPU; entries left as None are skipped.
    for k in outputs.keys():
        if outputs[k] is not None:
            outputs[k] = outputs[k].cuda()

    with torch.no_grad():
        model_inputs = model.prepare_inputs_for_generation(**outputs, inference = True)

    # Drop 'input_ids' from the generation inputs. Only the removal matters here,
    # so the popped value is not kept (presumably generation then runs from the
    # remaining entries - confirm in modeling_combine).
    model_inputs.pop('input_ids', None)

    # Sampling is enabled and no seed is set in this cell, so the text varies per run.
    generate_kwargs = dict(
        model_inputs,
        max_new_tokens=300,
        top_p=0.95,
        top_k=50,
        temperature=0.9,
        do_sample=True,
        num_beams=1,
    )

    r = model.llm.generate(**generate_kwargs)
    print(img, tokenizer.decode(r[0]))

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


translated-LLaVA-Instruct-150K/filtered-llava-images/000000033471.jpg <s>Dalam imej, terdapat bas bandar pada waktu siang. Ia mempunyai grafiti di atasnya, mungkin untuk meningkatkan penampilannya atau mengubahnya menjadi promosi. Bas itu juga mempunyai iklan yang ditayangkan pada sisi untuk promosi.</s>


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Persian-cat-breed.jpg <s>Imej ini mempamerkan kucing putih yang terletak dalam kedudukan yang selesa, dengan kepalanya menghadap ke luar di atas sofa hitam. Kucing sedang berehat di atas sofa, sama seperti semasa ia berehat.</s>


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


abang-gay.png <s>Dalam imej ini, terdapat seorang lelaki muda yang bergambar di sebelah beg pakaian yang dipakai di atas leher.</s>
nasi-goreng-1-23.jpg <s>Imej ini adalah imej makan malam dengan kuali nasi di atas meja. Nasi dihidangkan pada pinggan putih dan telah dipotong menjadi kepingan yang lebih kecil. Terdapat beberapa lobak merah, kedua-duanya dihidangkan pada pinggan dan di atas meja. Makan malam termasuk berapa jenis sayur-sayuran dan hidangan utama?</s>


In [20]:
# Two-image prompt: ask how the two pictures relate to each other.
messages = [
    {'role': 'user', 'content': '<image> </image> <image> </image> apa kaitan 2 gambar ni'},
]
outputs = prepare_dataset(messages, images = [test_image, test_image2])

# Cast the image/audio features to the model dtype; other entries keep theirs.
if outputs['images'] is not None:
    outputs['images'] = outputs['images'].type(model.dtype)
if outputs['audios'] is not None:
    outputs['audios'] = outputs['audios'].type(model.dtype)

# Move every populated entry to the GPU; entries left as None are skipped.
for k in outputs.keys():
    if outputs[k] is not None:
        outputs[k] = outputs[k].cuda()

with torch.no_grad():
    model_inputs = model.prepare_inputs_for_generation(**outputs, inference = True)

# Drop 'input_ids' from the generation inputs. Only the removal matters here,
# so the popped value is not kept (presumably generation then runs from the
# remaining entries - confirm in modeling_combine).
model_inputs.pop('input_ids', None)

# Sampling is enabled and no seed is set in this cell, so the text varies per run.
generate_kwargs = dict(
    model_inputs,
    max_new_tokens=300,
    top_p=0.95,
    top_k=50,
    temperature=0.9,
    do_sample=True,
    num_beams=1,
)

r = model.llm.generate(**generate_kwargs)
print(tokenizer.decode(r[0]))

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


<s>Imej pertama adalah tentang bas pelancongan tertentu yang dibawa di jalan. Imej kedua, pula, merakamkan kucing putih yang terletak di atas permaidani, mungkin memerhatikan persekitarannya atau cuba melindunginya daripada pemergiannya yang akan datang.</s>


In [21]:
# Audio prompt: ask what issue the audio clip is about.
messages = [
    {'role': 'user', 'content': '<audio> </audio> apa isu audio ni'},
]
# NOTE(review): an image is passed here although the prompt contains no <image>
# tag - confirm whether the model requires image features or this is a leftover.
outputs = prepare_dataset(messages, images = [test_image], audio = [audio])

# Cast the image/audio features to the model dtype; other entries keep theirs.
if outputs['images'] is not None:
    outputs['images'] = outputs['images'].type(model.dtype)
if outputs['audios'] is not None:
    outputs['audios'] = outputs['audios'].type(model.dtype)

# Move every populated entry to the GPU; entries left as None are skipped.
for k in outputs.keys():
    if outputs[k] is not None:
        outputs[k] = outputs[k].cuda()

with torch.no_grad():
    model_inputs = model.prepare_inputs_for_generation(**outputs, inference = True)

# Drop 'input_ids' from the generation inputs. Only the removal matters here,
# so the popped value is not kept (presumably generation then runs from the
# remaining entries - confirm in modeling_combine).
model_inputs.pop('input_ids', None)

# Sampling is enabled and no seed is set in this cell, so the text varies per run.
generate_kwargs = dict(
    model_inputs,
    max_new_tokens=300,
    top_p=0.95,
    top_k=50,
    temperature=0.9,
    do_sample=True,
    num_beams=1,
)

r = model.llm.generate(**generate_kwargs)
print(tokenizer.decode(r[0]))

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


<s> mondiat lagi boleh bagi RM300-RM500 kepada 500 orang. Dan sekarang, mereka tidak faham tujuan objektifnya. Ini untuk menggalakkan orang menggunakan e-wallet. Tetapi, di Malaysia, jika seseorang menghadapi masalah, mereka tidak dapat mengeluarkan wang e-wallet mereka. Sebab itu, tiada sistem yang betul dalam sistem e-wallet di Malaysia.</s>


In [22]:
# Mixed prompt: one image and one audio clip, asking how they relate.
messages = [
    {'role': 'user', 'content': '<image> </image> <audio> </audio> apa kaitan gambar dan audio ni'},
]
outputs = prepare_dataset(messages, images = [test_image], audio = [audio])

# Cast the image/audio features to the model dtype; other entries keep theirs.
if outputs['images'] is not None:
    outputs['images'] = outputs['images'].type(model.dtype)
if outputs['audios'] is not None:
    outputs['audios'] = outputs['audios'].type(model.dtype)

# Move every populated entry to the GPU; entries left as None are skipped.
for k in outputs.keys():
    if outputs[k] is not None:
        outputs[k] = outputs[k].cuda()

with torch.no_grad():
    model_inputs = model.prepare_inputs_for_generation(**outputs, inference = True)

# Drop 'input_ids' from the generation inputs. Only the removal matters here,
# so the popped value is not kept (presumably generation then runs from the
# remaining entries - confirm in modeling_combine).
model_inputs.pop('input_ids', None)

# Sampling is enabled and no seed is set in this cell, so the text varies per run.
# The generation is capped at 300 new tokens.
generate_kwargs = dict(
    model_inputs,
    max_new_tokens=300,
    top_p=0.95,
    top_k=50,
    temperature=0.9,
    do_sample=True,
    num_beams=1,
)

r = model.llm.generate(**generate_kwargs)
print(tokenizer.decode(r[0]))

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


<s>Potongan gambar dan audio yang anda berikan berkata "Anda boleh memberi 300 ringgit ke dalam sistem pemikiran anda, 500 ringgit ke arah penciptaan e-wallet, mengapa orang tidak memahami objektifnya? Mengapa orang tidak mengikut nasihat? Berapa banyak masalahnya di Malaysia? Anda pergi tempat yang tidak selamat. Anda pergi tempat yang tidak selamat, anda pergi tempat yang tidak selamat, anda pergi tempat yang tidak selamat. Anda pergi ke pasaran, anda pergi ke pasaran, anda pergi ke pasaran, anda pergi ke pasaran, anda pergi ke pasaran, anda pergi ke pasaran, anda pergi ke pasaran, anda pergi ke pasaran, anda pergi ke pasaran, anda pergi ke pasaran, anda pergi ke pasaran, anda pergi ke pasaran, anda pergi ke pasaran, anda pergi ke pasaran, anda pergi ke pasaran, anda pergi ke pasaran, anda pergi ke pasaran,


In [23]:
# Image.open('abang-gay.png')

In [None]:
# Upload the model to the Hub. The organization is part of the repo id because
# the separate `organization=` argument is deprecated in Transformers.
model.push_to_hub('mesolitica/malaysian-tinyllama-1.1b-mmmmodal', safe_serialization=True)

In [None]:
# Upload the image processor to the same Hub repo as the model. The organization
# is part of the repo id because `organization=` is deprecated in Transformers.
image_processor.push_to_hub('mesolitica/malaysian-tinyllama-1.1b-mmmmodal', safe_serialization=True)

In [None]:
# Upload the audio processor. The organization is part of the repo id because
# `organization=` is deprecated in Transformers.
# NOTE(review): this targets the same repo as image_processor; if both processors
# serialize a file with the same name (e.g. a preprocessor config), the later
# upload replaces the earlier one - confirm the final repo contents.
audio_processor.push_to_hub('mesolitica/malaysian-tinyllama-1.1b-mmmmodal', safe_serialization=True)

In [None]:
# Upload the tokenizer loaded from the checkpoint. The organization is part of
# the repo id because `organization=` is deprecated in Transformers.
tokenizer.push_to_hub('mesolitica/malaysian-tinyllama-1.1b-mmmmodal', safe_serialization=True)