In [1]:
!nvidia-smi

Thu Feb  8 06:50:55 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.54.03              Driver Version: 535.54.03    CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100 80GB PCIe          On  | 00000001:00:00.0 Off |                    0 |
| N/A   62C    P0              53W / 300W |      9MiB / 81920MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                         

In [2]:
import os

os.environ['CUDA_VISIBLE_DEVICES'] = '0'

In [3]:
from modeling_v2 import MM_LLMs, MM_LLMs_Config
from transformers import CLIPProcessor, CLIPModel,AutoModel, AutoTokenizer, AutoProcessor,AutoConfig,CLIPConfig, LlamaConfig, WhisperConfig, WhisperModel, LlamaModel, LlamaTokenizer
from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize
from processing_multimodal import MMProcessor
import torch
import numpy as np
from torch import nn
from streaming import LocalDataset
from typing import List

In [4]:
from transformers.trainer_utils import get_last_checkpoint

latest = get_last_checkpoint('multimodal-tinyllama-whisper-small-siglip')
latest

'multimodal-tinyllama-whisper-small-siglip/checkpoint-16200'

In [5]:
model = MM_LLMs.from_pretrained(
    latest,
    use_flash_attention_2 = True,
    torch_dtype = torch.bfloat16
)

The model was loaded with use_flash_attention_2=True, which is deprecated and may be removed in a future release. Please use `attn_implementation="flash_attention_2"` instead.
You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.


In [6]:
_ = model.cuda()

In [7]:
image_processor = AutoProcessor.from_pretrained('google/siglip-base-patch16-384')
audio_processor = AutoProcessor.from_pretrained('mesolitica/malaysian-whisper-small')
tokenizer = AutoTokenizer.from_pretrained(latest)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [8]:
from PIL import Image
import librosa
import torch
import numpy as np
from collections.abc import Mapping

class DataCollator():

    def __init__(self, tokenizer):

        self.tokenizer = tokenizer

    def __call__(self, features):

        if not isinstance(features[0], Mapping):
            features = [vars(f) for f in features]

        batch = {}
        bs = len(features)
        first = features[0]

        batch['audio_index'] = torch.tensor([],dtype=torch.int)
        batch['image_index'] = torch.tensor([],dtype=torch.int)
        
        for index, feature in enumerate(features):
            local_index = index % (bs // torch.cuda.device_count()) if bs > 1 else index % (bs) 
            if feature['audios'] is not None:
                batch['audio_index'] = torch.cat([batch['audio_index'], torch.tensor([local_index] * len(feature['audios']), dtype=torch.int)])

            if feature['images'] is not None:
                batch['image_index'] = torch.cat([batch['image_index'], torch.tensor([local_index] * len(feature['images']), dtype=torch.int)])

        for k, v in first.items():

            if k not in ("audios","images") and not isinstance(v, str):
                if v is None:
                    batch[k] = None
                elif isinstance(v, torch.Tensor):
                    batch[k] = torch.stack([f[k] for f in features]).squeeze(1)
                elif isinstance(v, np.ndarray):
                    batch[k] = torch.tensor(np.stack([f[k] for f in features])).squeeze(1)
            elif k in ("audios","images"):
                if v is None:
                    batch[k] = None
                else:         
                    batch[k] = torch.cat([f[k] for f in features if f[k] is not None])

        batch['image_starts'] = torch.tensor([self.tokenizer.convert_tokens_to_ids('<image>')] * bs, dtype=torch.int)
        batch['image_ends'] = torch.tensor([self.tokenizer.convert_tokens_to_ids('</image>')] * bs, dtype=torch.int)
        batch['audio_starts'] = torch.tensor([self.tokenizer.convert_tokens_to_ids('<audio>')] * bs, dtype=torch.int)
        batch['audio_ends'] = torch.tensor([self.tokenizer.convert_tokens_to_ids('</audio>')] * bs, dtype=torch.int)

        return batch

collator = DataCollator(tokenizer)

In [9]:
height = image_processor.image_processor.size['height']

In [10]:
def prepare_dataset(messages, images: List[str] = None, audio: List[str] = None, sr: int = 16000):
    if images is not None:
        images = [Image.open(f) for f in images]
        image_output = image_processor(images=images, return_tensors='pt')['pixel_values']
    else:
        image_output = None
    
    
    if audio is not None:
        audio = [librosa.load(f, sr=sr)[0] for f in audio]
        audio_features = audio_processor(audio, sampling_rate=sr, return_tensors='pt',)['input_features']
    else:
        audio_features = None
    prompt = tokenizer.apply_chat_template(messages, tokenize = False)
    outputs = tokenizer(
                    prompt,
                    return_tensors='pt',
                    return_overflowing_tokens=False,
                    return_length=False)

    outputs['audios'] = audio_features
    outputs['images'] = image_output
    return outputs

In [11]:
!ls test.*mp3

test.mp3


In [17]:
messages = [
    {'role': 'user', 'content': '<image> </image> ini gambar apa'},
]
outputs = prepare_dataset(messages, images = ['motosikal.jpeg'])
ok = collator([outputs])
ok['labels'] = ok['input_ids']

for k in ok.keys():
    if ok[k] is not None:
        ok[k] = ok[k].cuda()
        
for k in ['audios', 'images']:
    if ok[k] is not None:
        ok[k] = ok[k].type(model.dtype)

In [18]:
with torch.no_grad():
    model_inputs = model.prepare_inputs_for_generation(**ok)
r = model_inputs.pop('input_ids', None)
label = model_inputs.pop('labels', None)
label = label.detach().cpu().numpy()
ok['input_ids'].shape, model_inputs['inputs_embeds'].shape

(torch.Size([1, 17]), torch.Size([1, 594, 2048]))

In [20]:
generate_kwargs = dict(
    model_inputs,
    max_new_tokens=300,
    top_p=0.95,
    top_k=50,
    temperature=0.1,
    do_sample=True,
    num_beams=1,
)

r = model.llm.generate(**generate_kwargs)
tokenizer.decode(r[0])

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


'<s>Ini adalah gambar hitam putih seorang lelaki yang memakai baju putih dan seluar pendek hitam, berdiri di atas papan selaju di kaki lima.</s>'

In [70]:
model.llm.generate(inputs_embeds=model_inputs['inputs_embeds'])

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


tensor([[    1, 29968, 29893,   567,  5291,   474,   284,   801,   409,  4142,
           275,  2136,   273,   273,  9228,   273,   343,   574,  5972,   652]],
       device='cuda:0')

In [71]:
model.llm.generate(input_ids=ok['input_ids'])

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


tensor([[    1,   518, 25580, 29962,  9049,   567,  5291,  3095, 29874,   518,
         29914, 25580, 29962, 29968, 29893,   567,  5291,   474,   284,   801]],
       device='cuda:0')

In [67]:
model.llm.generate()

<bound method GenerationMixin.generate of LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32004, 2048)
    (layers): ModuleList(
      (0-21): 22 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (up_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (down_proj): Linear(in_features=5632, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (n

In [66]:
tokenizer.decode([29968, 29956])

'KW'

In [65]:
r

tensor([[    1, 29968, 29956,  7024, 29892,   472,   585,   476,  1295,   375,
           399,  1175,   747,   349,   355,   333,  7941, 29892,   474,   284,
           801,   413,  1295,   375,   281,  1175,   747,   343,   574,   652,
         22032,  2679,   273,   443, 29873,  2679,  3031,  3357,  4639,  1175,
           279,   652,   409, 10028,   801,  7697,   801,   652, 26417,   423,
         29889,   306, 29874,   594,   284,   801,   413,  1295,   375,   343,
           574,   286,   996,  1175,   279,  4639,  1175,   279, 12033,   574,
           413,   267,   295,   314, 23402, 29892,   413, 10100,  2455,   273,
         29892,  6025,   413,   331,   801,   381,   273, 20552,   786,   343,
           574, 11137,   292, 29889,   476, 29956,  7024, 24003, 20912,  2778,
           786,   557,   273,   413,  1295,   375,   281,  1175,   747,   343,
           574,   652,  1117,   309,   288,   280, 29882,  3031,  3357,  4639,
          1175,   279,   652,   409, 10028,   801,  

In [51]:
model.push_to_hub('malaysian-tinyllama-multimodal', organization='mesolitica', safe_serialization=True)



model.safetensors:   0%|          | 0.00/3.24G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/mesolitica/malaysian-tinyllama-multimodal/commit/c6bba1f03376a5eb417c90f24f340e0762db1a83', commit_message='Upload MM_LLMs', commit_description='', oid='c6bba1f03376a5eb417c90f24f340e0762db1a83', pr_url=None, pr_revision=None, pr_num=None)

In [55]:
image_processor.push_to_hub('malaysian-tinyllama-multimodal', organization='mesolitica', safe_serialization=True)

CommitInfo(commit_url='https://huggingface.co/mesolitica/malaysian-tinyllama-multimodal/commit/30316426c30ffe66da74ae59d76690436e45ff2f', commit_message='Upload processor', commit_description='', oid='30316426c30ffe66da74ae59d76690436e45ff2f', pr_url=None, pr_revision=None, pr_num=None)

In [56]:
audio_processor.push_to_hub('malaysian-tinyllama-multimodal', organization='mesolitica', safe_serialization=True)

CommitInfo(commit_url='https://huggingface.co/mesolitica/malaysian-tinyllama-multimodal/commit/3d7ea69a7020b6ddaee04bf671df743a2df98fe3', commit_message='Upload processor', commit_description='', oid='3d7ea69a7020b6ddaee04bf671df743a2df98fe3', pr_url=None, pr_revision=None, pr_num=None)

In [57]:
tokenizer.push_to_hub('malaysian-tinyllama-multimodal', organization='mesolitica', safe_serialization=True)

CommitInfo(commit_url='https://huggingface.co/mesolitica/malaysian-tinyllama-multimodal/commit/e125c93e4cebf300afb60ec5ed3784142f485d8e', commit_message='Upload tokenizer', commit_description='', oid='e125c93e4cebf300afb60ec5ed3784142f485d8e', pr_url=None, pr_revision=None, pr_num=None)