In [1]:
!nvidia-smi

Thu Feb  1 06:17:16 2024       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100 80G...  On   | 00000001:00:00.0 Off |                    0 |
| N/A   32C    P0    71W / 300W |  64856MiB / 81920MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA A100 80G...  On   | 00000002:00:00.0 Off |                    0 |
| N/A   33C    P0    70W / 300W |  61188MiB / 81920MiB |      0%      Default |
|       

In [2]:
import os

# Pin this kernel to one GPU. This has to be set before torch initialises CUDA,
# which is why it sits in its own cell ahead of the torch import.
# NOTE(review): the nvidia-smi listing above is truncated after GPU 1 — confirm
# that device index 3 actually exists on this host.
os.environ['CUDA_VISIBLE_DEVICES'] = '3'

In [12]:
from modeling import MM_LLMs, MM_LLMs_Config
from transformers import CLIPProcessor, CLIPModel,AutoModel, AutoTokenizer, AutoProcessor,AutoConfig,CLIPConfig, LlamaConfig, WhisperConfig, WhisperModel, LlamaModel, LlamaTokenizer
from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize
from processing_multimodal import MMProcessor
import torch
import numpy as np
from torch import nn
from streaming import LocalDataset
from typing import List

In [4]:
!ls multimodal-tinyllama-whisper-small-siglip

checkpoint-2450  checkpoint-2500


In [5]:
# Load the fine-tuned multimodal checkpoint in bfloat16 with Flash Attention 2.
# `attn_implementation` is the supported replacement for the deprecated
# `use_flash_attention_2=True` flag (see the deprecation warning it used to emit).
# The weights are created on CPU here and moved to the GPU in the next cell.
model = MM_LLMs.from_pretrained(
    'multimodal-tinyllama-whisper-small-siglip/checkpoint-2500',
    attn_implementation = 'flash_attention_2',
    torch_dtype = torch.bfloat16
)

The model was loaded with use_flash_attention_2=True, which is deprecated and may be removed in a future release. Please use `attn_implementation="flash_attention_2"` instead.
You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.


In [6]:
# Move the model to the GPU, as the Flash Attention warning printed at load time requests.
# Assigning to `_` keeps the returned module's repr out of the cell output.
_ = model.cuda()

In [7]:
# Preprocessors for each modality. The image/audio processors are loaded from the
# upstream SigLIP and Whisper repos (presumably the encoders this checkpoint was
# trained with — confirm against the training config); the tokenizer comes from the
# checkpoint itself so that it carries the added special tokens.
image_processor = AutoProcessor.from_pretrained('google/siglip-base-patch16-224')
audio_processor = AutoProcessor.from_pretrained('mesolitica/malaysian-whisper-small')
tokenizer = AutoTokenizer.from_pretrained('multimodal-tinyllama-whisper-small-siglip/checkpoint-2500')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [37]:
from PIL import Image
import librosa
import torch
import numpy as np
from collections.abc import Mapping

class DataCollator():
    """Collate per-sample multimodal features into one batch dictionary.

    Each feature is a mapping (or an object whose ``vars()`` is one) that
    contains the keys ``'audios'`` and ``'images'`` (either may be ``None``)
    plus any other tensor / ndarray fields.

    The returned batch contains:

    * ``audio_index`` / ``image_index``: int tensors with one entry per audio /
      image item, holding the per-device position of the sample it belongs to.
    * ``audios`` / ``images``: all non-``None`` items concatenated on dim 0, or
      ``None`` when the first sample has none.
    * every other tensor / ndarray field, stacked over the batch and squeezed
      on dim 1; ``None`` fields stay ``None``; string fields and values of any
      other type are skipped.
    * ``image_starts`` / ``image_ends`` / ``audio_starts`` / ``audio_ends``:
      the ids of the modality marker tokens, repeated once per sample.
    """

    def __init__(self, tokenizer):
        """Store the tokenizer used to resolve the modality marker token ids."""
        self.tokenizer = tokenizer

    def __call__(self, features):
        """Build the batch dictionary for a list of per-sample features."""
        if not isinstance(features[0], Mapping):
            features = [vars(f) for f in features]

        batch = {}
        bs = len(features)
        first = features[0]

        # Number of samples each device receives. Both terms are clamped to 1 so
        # that a CPU-only process (device_count() == 0) or a batch smaller than
        # the device count does not end up dividing / taking a modulo by zero.
        device_count = max(torch.cuda.device_count(), 1)
        per_device = max(bs // device_count, 1)

        audio_index = []
        image_index = []
        for index, feature in enumerate(features):
            local_index = index % per_device
            if feature['audios'] is not None:
                audio_index.extend([local_index] * len(feature['audios']))
            if feature['images'] is not None:
                image_index.extend([local_index] * len(feature['images']))

        # Build each index tensor once instead of re-concatenating per sample.
        batch['audio_index'] = torch.tensor(audio_index, dtype=torch.int)
        batch['image_index'] = torch.tensor(image_index, dtype=torch.int)

        # The first sample decides which keys exist and how each is collated.
        for k, v in first.items():
            if k in ("audios", "images"):
                if v is None:
                    batch[k] = None
                else:
                    # Samples may carry different numbers of items, hence cat.
                    batch[k] = torch.cat([f[k] for f in features if f[k] is not None])
            elif not isinstance(v, str):
                if v is None:
                    batch[k] = None
                elif isinstance(v, torch.Tensor):
                    # NOTE(review): stack requires identical shapes across
                    # samples; no padding is applied here.
                    batch[k] = torch.stack([f[k] for f in features]).squeeze(1)
                elif isinstance(v, np.ndarray):
                    batch[k] = torch.tensor(np.stack([f[k] for f in features])).squeeze(1)

        # Marker token ids, one copy per sample.
        marker_tokens = (
            ('image_starts', '<image>'),
            ('image_ends', '</image>'),
            ('audio_starts', '<audio>'),
            ('audio_ends', '</audio>'),
        )
        for key, token in marker_tokens:
            token_id = self.tokenizer.convert_tokens_to_ids(token)
            batch[key] = torch.tensor([token_id] * bs, dtype=torch.int)

        return batch

# Collator bound to the checkpoint tokenizer, which it uses to look up the
# <image>/</image>/<audio></audio> marker token ids.
collator = DataCollator(tokenizer)

In [25]:
def prepare_dataset(messages, images: List[str] = None, audio: List[str] = None, sr: int = 16000):
    """Turn a chat plus optional image/audio files into model-ready features.

    Args:
        messages: chat messages passed to ``tokenizer.apply_chat_template``.
        images: paths of image files to load, or ``None`` to feed an all-zero
            placeholder array of shape ``(1, 3, 224, 224)``.
        audio: paths of audio files to load, or ``None`` to feed ``sr * 10``
            zero samples as a placeholder.
        sr: sampling rate used both to load the audio and by the audio processor.

    Returns:
        The tokenizer output for the rendered prompt, extended with
        ``'audios'`` (audio processor ``input_features``) and ``'images'``
        (image processor ``pixel_values``).
    """
    if images is not None:
        loaded_images = []
        for f in images:
            # Close the file handle promptly and normalise to 3-channel RGB so
            # RGBA / greyscale / palette files do not reach the processor with
            # an unexpected channel count.
            with Image.open(f) as im:
                loaded_images.append(im.convert('RGB'))
        images = loaded_images
    else:
        images = np.zeros((1, 3, 224, 224))
    image_output = image_processor(images=images, return_tensors='pt')['pixel_values']

    if audio is not None:
        # librosa.load returns (samples, sampling_rate); only the samples are kept.
        audio = [librosa.load(f, sr=sr)[0] for f in audio]
    else:
        audio = np.zeros((sr * 10,))
    audio_features = audio_processor(audio, sampling_rate=sr, return_tensors='pt',)['input_features']

    # Render the chat as text first, then tokenize it.
    prompt = tokenizer.apply_chat_template(messages, tokenize = False)
    outputs = tokenizer(
                    prompt,
                    return_tensors='pt',
                    return_overflowing_tokens=False,
                    return_length=False)

    outputs['audios'] = audio_features
    outputs['images'] = image_output
    return outputs

In [49]:
# Single-turn prompt referencing one image and one audio clip.
messages = [
    {'role': 'user', 'content': '<image> <audio> apa kaitan dengan picture 1 dan audio 1 ni'},
]
outputs = prepare_dataset(messages, ['motosikal.jpeg'], ['test.mp3'])
ok = collator([outputs])

# Move every tensor in the batch to the GPU; None entries are left alone.
for k in list(ok):
    if ok[k] is None:
        continue
    ok[k] = ok[k].cuda()

# The modality inputs must share the model's dtype.
for k in ('audios', 'images'):
    if ok[k] is None:
        continue
    ok[k] = ok[k].type(model.dtype)

In [51]:
# Special token ids come from the wrapped LLM's config.
llm_config = model.llm.config
eos_token_id = llm_config.eos_token_id
pad_token_id = llm_config.pad_token_id
if pad_token_id is None:
    # No pad token is configured, so pad with EOS explicitly rather than
    # leaving generate() to pick the fallback and warn about it.
    pad_token_id = eos_token_id[0] if isinstance(eos_token_id, (list, tuple)) else eos_token_id

# The collated batch supplies the model inputs; the rest are sampling settings.
generate_kwargs = dict(
    ok,
    max_new_tokens=300,
    top_p=0.95,
    top_k=50,
    temperature=0.1,
    do_sample=True,
    num_beams=1,
    eos_token_id=eos_token_id,
    bos_token_id=llm_config.bos_token_id,
    pad_token_id=pad_token_id
)

r = model.generate(**generate_kwargs)
tokenizer.decode(r[0])

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


'<s> [INST]<image><audio> apa kaitan dengan picture 1 dan audio 1 ni [/INST]Berdasarkan teks yang diberikan, nampaknya terdapat dua imej yang disertakan dengan audio. Imej pertama ialah imej yang diambil oleh pembesar suara, dan imej kedua ialah imej yang diambil oleh pembesar suara. Audio yang disertakan dengan imej kedua ialah audio yang diambil oleh pembesar suara.\n\nTanpa lebih banyak konteks, sukar untuk memberikan jawapan yang lebih spesifik. Walau bagaimanapun, berdasarkan maklumat yang diberikan, nampaknya pembesar suara mengambil dua imej dan audio yang berkaitan dengan kedua-dua imej.</s>'

In [52]:
# Upload the model weights as safetensors. The organisation is part of the repo id;
# the separate `organization=` keyword is deprecated.
model.push_to_hub('mesolitica/malaysian-tinyllama-multimodal', safe_serialization=True)



README.md:   0%|          | 0.00/232 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.51G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/mesolitica/malaysian-tinyllama-multimodal/commit/612bd36595c5e39f9daed6e14c7b78f2840199c8', commit_message='Upload MM_LLMs', commit_description='', oid='612bd36595c5e39f9daed6e14c7b78f2840199c8', pr_url=None, pr_revision=None, pr_num=None)

In [53]:
# Upload the image processor config to the same repo. The organisation is part of
# the repo id; the separate `organization=` keyword is deprecated.
image_processor.push_to_hub('mesolitica/malaysian-tinyllama-multimodal', safe_serialization=True)



CommitInfo(commit_url='https://huggingface.co/mesolitica/malaysian-tinyllama-multimodal/commit/cfc7fd9d19f95284ad82ee990047adef08afccbb', commit_message='Upload processor', commit_description='', oid='cfc7fd9d19f95284ad82ee990047adef08afccbb', pr_url=None, pr_revision=None, pr_num=None)

In [54]:
# Upload the audio processor config to the same repo. The organisation is part of
# the repo id; the separate `organization=` keyword is deprecated.
audio_processor.push_to_hub('mesolitica/malaysian-tinyllama-multimodal', safe_serialization=True)

CommitInfo(commit_url='https://huggingface.co/mesolitica/malaysian-tinyllama-multimodal/commit/657a79b51041e009b141a1b97254f0b53c6925dd', commit_message='Upload processor', commit_description='', oid='657a79b51041e009b141a1b97254f0b53c6925dd', pr_url=None, pr_revision=None, pr_num=None)

In [55]:
# Upload the tokenizer files to the same repo. The organisation is part of the
# repo id; the separate `organization=` keyword is deprecated.
tokenizer.push_to_hub('mesolitica/malaysian-tinyllama-multimodal', safe_serialization=True)

CommitInfo(commit_url='https://huggingface.co/mesolitica/malaysian-tinyllama-multimodal/commit/e564d994984589ee1ea0ef1737ad70f081409282', commit_message='Upload tokenizer', commit_description='', oid='e564d994984589ee1ea0ef1737ad70f081409282', pr_url=None, pr_revision=None, pr_num=None)