In [1]:
# Show the GPUs on this machine and their current memory usage before picking one.
!nvidia-smi

Thu Feb 15 17:21:32 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.54.03              Driver Version: 535.54.03    CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100-SXM4-80GB          On  | 00000001:00:00.0 Off |                    0 |
| N/A   35C    P0              70W / 400W |  17245MiB / 81920MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA A100-SXM4-80GB          On  | 00000002:00:00.0 Off |  

In [2]:
import os

# Restrict this process to GPU index 6. This is set before torch is imported
# in a later cell, so the restriction is already in place when CUDA is first used.
os.environ['CUDA_VISIBLE_DEVICES'] = '6'

In [3]:
from modeling_combine import MM_LLMs, MM_LLMs_Config
from transformers import AutoModelForCausalLM, CLIPProcessor, CLIPModel,AutoModel, AutoTokenizer, AutoProcessor,AutoConfig,CLIPConfig, LlamaConfig, WhisperConfig, WhisperModel, LlamaModel, LlamaTokenizer
from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize
import torch
import numpy as np
from torch import nn
from streaming import LocalDataset
from typing import List

In [4]:
# Register the custom model and config classes with transformers' auto-class
# machinery.
# NOTE(review): presumably required so the code from `modeling_combine` is
# exported together with the weights by `push_to_hub` at the end of the
# notebook — confirm.
MM_LLMs.register_for_auto_class()
MM_LLMs_Config.register_for_auto_class()

In [5]:
from transformers.trainer_utils import get_last_checkpoint

# Resolve the most recent checkpoint directory inside the training output
# folder; `latest` is reused below to load both the model and the tokenizer.
latest = get_last_checkpoint('multimodal-tinyllama')
latest

'multimodal-tinyllama/checkpoint-6700'

In [6]:
# Load the multimodal model from the latest checkpoint, requesting bfloat16
# weights and flash attention.
# NOTE(review): the recorded output still warns that no torch dtype was
# specified for Flash Attention 2.0, which suggests the dtype may not be
# forwarded to every sub-model inside MM_LLMs — confirm in `modeling_combine`.
model = MM_LLMs.from_pretrained(
    latest,
    flash_attention=True,
    dtype=torch.bfloat16,
    torch_dtype=torch.bfloat16,
)

The model was loaded with use_flash_attention_2=True, which is deprecated and may be removed in a future release. Please use `attn_implementation="flash_attention_2"` instead.
You are attempting to use Flash Attention 2.0 without specifying a torch dtype. This might lead to unexpected behaviour
You are attempting to use Flash Attention 2.0 without specifying a torch dtype. This might lead to unexpected behaviour


In [7]:
# Move the model to the GPU; binding the result to `_` keeps the notebook from
# echoing the returned module as the cell output.
_ = model.cuda()

In [8]:
# Pre-processors for the image and audio inputs.
# NOTE(review): presumably these match the vision/audio encoders embedded in
# MM_LLMs — confirm against the training configuration.
image_processor = AutoProcessor.from_pretrained('google/siglip-base-patch16-384')
audio_processor = AutoProcessor.from_pretrained('mesolitica/malaysian-whisper-small')
# The tokenizer is loaded from the same checkpoint directory as the model.
tokenizer = AutoTokenizer.from_pretrained(latest)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [9]:
# Image height configured on the image processor.
# NOTE(review): no later cell visible in this notebook reads `default_height`.
default_height = image_processor.image_processor.size['height']

In [10]:
from PIL import Image
import librosa

In [11]:
def prepare_dataset(messages, images: List[str] = None, audio: List[str] = None, sr = 16000):
    """Build the input dictionary for one chat prompt plus optional media.

    Relies on the notebook-level `tokenizer`, `image_processor` and
    `audio_processor` objects.

    Args:
        messages: Chat messages passed to `tokenizer.apply_chat_template`.
        images: Paths of image files to load, or None. An empty list is
            treated the same as None.
        audio: Paths of audio files to load, or None. An empty list is
            treated the same as None.
        sr: Sampling rate used both when loading the audio with librosa and
            when calling the audio processor.

    Returns:
        The tokenizer output for the rendered prompt, extended with:
          * `images` / `audios`: processor outputs (`pixel_values` /
            `input_features`), or None when that modality is absent.
          * `image_index` / `audio_index`: one zero per supplied item, or an
            empty tensor when the modality is absent.
          * `image_starts` / `audio_starts`: the `<image>` / `<audio>` token id
            repeated once per supplied item, or a single id when absent.
          * `where_is_b` / `where_is_k`: row and column indices of every
            `<image>` or `<audio>` token inside `input_ids`.
          * `ls`: the token id found at each of those positions.
        NOTE(review): how MM_LLMs consumes these keys is not visible here —
        confirm against `modeling_combine`.
    """
    # Treat an empty list like "modality not supplied" instead of handing an
    # empty batch to the processor.
    if images is not None and len(images) > 0:
        loaded_images = []
        for path in images:
            # The context manager closes the file handle deterministically;
            # `convert` returns an independent in-memory image.
            with Image.open(path) as im:
                loaded_images.append(im.convert('RGB'))
        image_output = image_processor(images=loaded_images, return_tensors='pt')['pixel_values']
    else:
        image_output = None

    if audio is not None and len(audio) > 0:
        waveforms = [librosa.load(path, sr=sr)[0] for path in audio]
        audio_features = audio_processor(waveforms, sampling_rate=sr, return_tensors='pt',)['input_features']
    else:
        audio_features = None

    prompt = tokenizer.apply_chat_template(messages, tokenize = False)
    outputs = tokenizer(
        prompt,
        return_tensors='pt',
        return_overflowing_tokens=False,
        return_length=False
    )

    outputs['images'] = image_output
    outputs['audios'] = audio_features

    image_token = tokenizer.convert_tokens_to_ids('<image>')
    audio_token = tokenizer.convert_tokens_to_ids('<audio>')

    if image_output is not None:
        n_images = len(image_output)
        outputs['image_index'] = torch.tensor([0] * n_images)
        outputs['image_starts'] = torch.tensor([image_token] * n_images)
    else:
        outputs['image_index'] = torch.tensor([])
        outputs['image_starts'] = torch.tensor([image_token])

    if audio_features is not None:
        n_audios = len(audio_features)
        outputs['audio_index'] = torch.tensor([0] * n_audios)
        outputs['audio_starts'] = torch.tensor([audio_token] * n_audios)
    else:
        outputs['audio_index'] = torch.tensor([])
        outputs['audio_starts'] = torch.tensor([audio_token])

    # Locate every media placeholder token in the tokenised prompt.
    input_ids = outputs['input_ids']
    where_is = torch.where((input_ids == image_token) | (input_ids == audio_token))
    outputs['where_is_b'] = where_is[0]
    outputs['where_is_k'] = where_is[1]
    # Token id at each placeholder position, gathered with one indexing
    # operation. Round-tripping through a Python list keeps the dtype exactly
    # as before (int64 when placeholders exist, default float when none do).
    outputs['ls'] = torch.tensor(input_ids[where_is].tolist())

    return outputs

In [12]:
# !wget https://cdn.beautifulnara.net/wp-content/uploads/2017/12/10201620/Persian-cat-breed.jpg
# !wget https://www.jocooks.com/wp-content/uploads/2023/09/nasi-goreng-1-23.jpg

In [13]:
# Local sample files used by the inference cells below.
test_image = 'translated-LLaVA-Instruct-150K/filtered-llava-images/000000033471.jpg'
test_image2 = 'Persian-cat-breed.jpg'
test_image3 = 'abang-gay.png'
test_image4 = 'nasi-goreng-1-23.jpg'

# All sample images, in the order they are evaluated one by one.
images = [test_image, test_image2, test_image3, test_image4]

# Single sample audio clip.
audio = 'test.mp3'

In [14]:
# Ask the same question about every sample image, one image per prompt.
# The prompt and the sampling settings do not depend on the image, so they are
# built once instead of on every iteration.
messages = [
    {'role': 'user', 'content': '<image> </image> ini gambar apa'},
]
sampling_kwargs = dict(
    max_new_tokens=300,
    top_p=0.95,
    top_k=50,
    temperature=0.9,
    do_sample=True,
    num_beams=1,
)

for img in images:
    outputs = prepare_dataset(messages, images = [img])

    # Cast the media features to the model dtype; a modality that was not
    # supplied is stored as None and is skipped.
    for modality in ('images', 'audios'):
        if outputs[modality] is not None:
            outputs[modality] = outputs[modality].type(model.dtype)
    for k in outputs.keys():
        if outputs[k] is not None:
            outputs[k] = outputs[k].cuda()

    with torch.no_grad():
        model_inputs = model.prepare_inputs_for_generation(**outputs, inference = True)

    # `input_ids` is dropped so it is not forwarded to `generate` together with
    # the other prepared inputs (presumably embeddings — confirm in MM_LLMs).
    # The popped value itself is not needed.
    model_inputs.pop('input_ids', None)
    generate_kwargs = dict(model_inputs, **sampling_kwargs)

    r = model.llm.generate(**generate_kwargs)
    print(img, tokenizer.decode(r[0]))

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


translated-LLaVA-Instruct-150K/filtered-llava-images/000000033471.jpg <s>Imej ini adalah pandangan gambar hidupan liar yang berkemungkinan menghalakan matamanya di jalan bandar. Terdapat juga gambar bas jalanan yang lalu.</s>


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Persian-cat-breed.jpg <s>Ini adalah imej seekor kucing putih berkaki panjang dan gebu dengan mata yang berkibir dan ekspresi yang mesra dan tenang.</s>


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


abang-gay.png <s>Imej itu menunjukkan seorang lelaki yang memakai baju merah dan hitam. Dia berdiri teguh di kawasan yang dipenuhi dengan batu.</s>
nasi-goreng-1-23.jpg <s>Imej ini adalah gambar makan malam. Ia menangkap mangkuk nasi, sayur-sayuran, dan roti yang dihidangkan bersama hidangan. Hidangan nasi dan sayur-sayuran menjadi persembahan yang menarik dan berwarna-warp.

Secara keseluruhan, terdapat sekurang-kurangnya lapan keping sayur-sayuran dan tujuh keping roti dalam gambar ini. Sama ada di atas atau di bawah mangkuk, ia semuanya sama dan menyediakan pilihan persembahan yang hampir sama.</s>


In [15]:
# Two images in a single prompt: each `<image> </image>` pair in the message
# corresponds to one entry of `images`, in order.
messages = [
    {'role': 'user', 'content': '<image> </image> <image> </image> apa kaitan 2 gambar ni'},
]
outputs = prepare_dataset(messages, images = [test_image, test_image2])

# Cast the media features to the model dtype; a modality that was not supplied
# is stored as None and is skipped.
for modality in ('images', 'audios'):
    if outputs[modality] is not None:
        outputs[modality] = outputs[modality].type(model.dtype)
for k in outputs.keys():
    if outputs[k] is not None:
        outputs[k] = outputs[k].cuda()

with torch.no_grad():
    model_inputs = model.prepare_inputs_for_generation(**outputs, inference = True)

# `input_ids` is dropped so it is not forwarded to `generate` together with the
# other prepared inputs (presumably embeddings — confirm in MM_LLMs). The
# popped value itself is not needed.
model_inputs.pop('input_ids', None)
generate_kwargs = dict(
    model_inputs,
    max_new_tokens=300,
    top_p=0.95,
    top_k=50,
    temperature=0.9,
    do_sample=True,
    num_beams=1,
)

r = model.llm.generate(**generate_kwargs)
print(tokenizer.decode(r[0]))

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


<s>Gambar-gambar ini tidak berkaitan, kerana mereka merupakan penerangan tentang tiga haiwan yang berbeza.</s>


In [16]:
# Audio-only question.
# NOTE(review): an image is still passed to `prepare_dataset` although the
# prompt contains no `<image>` placeholder; this looks like a leftover from the
# neighbouring cells. It is kept so the model inputs stay as they were —
# confirm whether `images` should be omitted here.
messages = [
    {'role': 'user', 'content': '<audio> </audio> apa isu audio ni'},
]
outputs = prepare_dataset(messages, images = [test_image], audio = [audio])

# Cast the media features to the model dtype; a modality that was not supplied
# is stored as None and is skipped.
for modality in ('images', 'audios'):
    if outputs[modality] is not None:
        outputs[modality] = outputs[modality].type(model.dtype)
for k in outputs.keys():
    if outputs[k] is not None:
        outputs[k] = outputs[k].cuda()

with torch.no_grad():
    model_inputs = model.prepare_inputs_for_generation(**outputs, inference = True)

# `input_ids` is dropped so it is not forwarded to `generate` together with the
# other prepared inputs (presumably embeddings — confirm in MM_LLMs). The
# popped value itself is not needed.
model_inputs.pop('input_ids', None)
generate_kwargs = dict(
    model_inputs,
    max_new_tokens=300,
    top_p=0.95,
    top_k=50,
    temperature=0.9,
    do_sample=True,
    num_beams=1,
)

r = model.llm.generate(**generate_kwargs)
print(tokenizer.decode(r[0]))

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


<s>powerlift lagi boleh bagi RM300.</s>


In [17]:
# One image and one audio clip in the same prompt.
messages = [
    {'role': 'user', 'content': '<image> </image> <audio> </audio> apa kaitan gambar dan audio ni'},
]
outputs = prepare_dataset(messages, images = [test_image], audio = [audio])

# Cast the media features to the model dtype; a modality that was not supplied
# is stored as None and is skipped.
for modality in ('images', 'audios'):
    if outputs[modality] is not None:
        outputs[modality] = outputs[modality].type(model.dtype)
for k in outputs.keys():
    if outputs[k] is not None:
        outputs[k] = outputs[k].cuda()

with torch.no_grad():
    model_inputs = model.prepare_inputs_for_generation(**outputs, inference = True)

# `input_ids` is dropped so it is not forwarded to `generate` together with the
# other prepared inputs (presumably embeddings — confirm in MM_LLMs). The
# popped value itself is not needed.
model_inputs.pop('input_ids', None)
generate_kwargs = dict(
    model_inputs,
    max_new_tokens=300,
    top_p=0.95,
    top_k=50,
    temperature=0.9,
    do_sample=True,
    num_beams=1,
)

r = model.llm.generate(**generate_kwargs)
print(tokenizer.decode(r[0]))

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


<s>Gambar dan audio ini nampaknya berkaitan antara satu sama lain. Gambar menunjukkan sebuah bas dua tingkat dalam perjalanan menuruni jalan yang sibuk. Adegan ini mewujudkan rasa mendesak, kerana penumpang mencari pilihan pembayaran yang lebih berkesan untuk mendapatkan bantuan kewangan.

Sementara itu, audio ini memaparkan perbincangan tentang membuat pembayaran besar seperti 500 ringgit kepada perkhidmatan kewangan, dengan menyatakan bahawa tidak ada sistem e-watak yang tersedia di Malaysia. Penceramah juga membincangkan kelemahan sistem sedemikian, kerana mereka tidak menyediakan bantuan yang diperlukan untuk isu kewangan.</s>


In [18]:
# Image.open('abang-gay.png')

In [19]:
# Upload the model weights as safetensors. The organisation is given as part of
# the repo id because the separate `organization=` keyword is deprecated in
# transformers.
model.push_to_hub('mesolitica/malaysian-tinyllama-1.1b-mmmmodal', safe_serialization=True)



model.safetensors:   0%|          | 0.00/3.24G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/mesolitica/malaysian-tinyllama-1.1b-mmmmodal/commit/494b2c7c7b9e80da0fefdca1e5dd53688ce38457', commit_message='Upload MM_LLMs', commit_description='', oid='494b2c7c7b9e80da0fefdca1e5dd53688ce38457', pr_url=None, pr_revision=None, pr_num=None)

In [20]:
# Upload the image processor config to the same repo. The organisation is given
# as part of the repo id because the separate `organization=` keyword is
# deprecated in transformers.
image_processor.push_to_hub('mesolitica/malaysian-tinyllama-1.1b-mmmmodal', safe_serialization=True)



CommitInfo(commit_url='https://huggingface.co/mesolitica/malaysian-tinyllama-1.1b-mmmmodal/commit/49977969dc09c58de42c0cc32727c67f72f87c6e', commit_message='Upload processor', commit_description='', oid='49977969dc09c58de42c0cc32727c67f72f87c6e', pr_url=None, pr_revision=None, pr_num=None)

In [21]:
# Upload the audio processor config to the same repo. The organisation is given
# as part of the repo id because the separate `organization=` keyword is
# deprecated in transformers.
audio_processor.push_to_hub('mesolitica/malaysian-tinyllama-1.1b-mmmmodal', safe_serialization=True)

CommitInfo(commit_url='https://huggingface.co/mesolitica/malaysian-tinyllama-1.1b-mmmmodal/commit/81bca0f61f1ac646a10829344ad4d3c87172a22a', commit_message='Upload processor', commit_description='', oid='81bca0f61f1ac646a10829344ad4d3c87172a22a', pr_url=None, pr_revision=None, pr_num=None)

In [22]:
# Upload the tokenizer files to the same repo. The organisation is given as
# part of the repo id because the separate `organization=` keyword is
# deprecated in transformers.
tokenizer.push_to_hub('mesolitica/malaysian-tinyllama-1.1b-mmmmodal', safe_serialization=True)

CommitInfo(commit_url='https://huggingface.co/mesolitica/malaysian-tinyllama-1.1b-mmmmodal/commit/b1f6d37be94bdc29fe4df78813b8af9f6e86412c', commit_message='Upload tokenizer', commit_description='', oid='b1f6d37be94bdc29fe4df78813b8af9f6e86412c', pr_url=None, pr_revision=None, pr_num=None)