In [1]:
from modeling import MM_LLMs, MM_LLMs_Config
from transformers import CLIPProcessor, CLIPModel,AutoModel, AutoTokenizer, AutoProcessor,AutoConfig,CLIPConfig, LlamaConfig, WhisperConfig, WhisperModel, LlamaModel, LlamaTokenizer
from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize
from processing_multimodal import MMProcessor
import torch
from torch import nn
from streaming import LocalDataset

In [2]:
# List the contents of the training output directory to see which checkpoints exist.
!ls multimodal-tinyllama-whisper-small-siglip

checkpoint-160	checkpoint-180


In [3]:
# Restore the multimodal model (MM_LLMs) from the checkpoint-180 directory.
model = MM_LLMs.from_pretrained('multimodal-tinyllama-whisper-small-siglip/checkpoint-180')

In [4]:
# Show the configuration stored with the checkpoint (bare last expression -> cell output).
model.config

MM_LLMs_Config {
  "_name_or_path": "multimodal-tinyllama-whisper-small-siglip/checkpoint-180",
  "architectures": [
    "MM_LLMs"
  ],
  "attention_heads": 8,
  "audio_config": {
    "_name_or_path": "mesolitica/malaysian-whisper-small",
    "activation_dropout": 0.0,
    "activation_function": "gelu",
    "add_cross_attention": false,
    "apply_spec_augment": false,
    "architectures": [
      "WhisperForConditionalGeneration"
    ],
    "attention_dropout": 0.0,
    "bad_words_ids": null,
    "begin_suppress_tokens": [
      220,
      50257
    ],
    "bos_token_id": 50257,
    "chunk_size_feed_forward": 0,
    "classifier_proj_size": 256,
    "cross_attention_hidden_size": null,
    "d_model": 768,
    "decoder_attention_heads": 12,
    "decoder_ffn_dim": 3072,
    "decoder_layerdrop": 0.0,
    "decoder_layers": 12,
    "decoder_start_token_id": 50258,
    "diversity_penalty": 0.0,
    "do_sample": false,
    "dropout": 0.0,
    "early_stopping": false,
    "encoder_attention_he

In [5]:
# Move the model to the GPU. The result is bound to `_` so the cell does not
# display the returned module.
_ = model.cuda()

In [6]:
# Pre-processors for each modality, loaded from the named pretrained repositories.
image_processor = AutoProcessor.from_pretrained('google/siglip-base-patch16-224')
audio_processor = AutoProcessor.from_pretrained('mesolitica/malaysian-whisper-small')
# The tokenizer is loaded from the same checkpoint directory as the model.
tokenizer = AutoTokenizer.from_pretrained('multimodal-tinyllama-whisper-small-siglip/checkpoint-180')

# Disabled: the combined MMProcessor wrapper is not used in this notebook; the three
# objects above are called individually instead.
# processor = MMProcessor(image_processor=image_processor,tokenizer = tokenizer,audio_processor=audio_processor)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [7]:
from PIL import Image

# Image example: a user question whose text starts with the `<image>` marker.
s = "\n<image>gambar ni gambar apa?\n"

# Load the picture and turn it into the pixel tensor expected by the vision encoder.
image = Image.open('pic/1.jpg')
image_output = image_processor(images=image, return_tensors='pt')['pixel_values']

# Single-turn conversation; surrounding newlines are stripped from the question.
messages = [dict(role='user', content=s.strip())]

# Render the chat template to text first, then tokenize that text.
prompt = tokenizer.apply_chat_template(messages, tokenize=False)
outputs = tokenizer(prompt, return_tensors='pt', return_overflowing_tokens=False, return_length=False)

# Attach the modalities next to the token ids: no audio, one image.
outputs['audios'] = None
outputs['images'] = torch.cat([image_output], dim=0)

In [8]:
from PIL import Image
import librosa

# Audio example: a user question whose text starts with the `<audio>` marker.
s = "\n<audio>apa yg dibincangkan?\n"

# Decode the clip at 16 kHz (the returned sample rate is discarded) and compute
# the input features with the audio processor.
audio, _ = librosa.load('../filter-audio/3-2480-2.mp3', sr=16000)
audio_features = audio_processor(audio, sampling_rate=16_000, return_tensors='pt')['input_features']

# Single-turn conversation; surrounding newlines are stripped from the question.
messages = [dict(role='user', content=s.strip())]

# Render the chat template to text first, then tokenize that text.
prompt = tokenizer.apply_chat_template(messages, tokenize=False)
outputs = tokenizer(prompt, return_tensors='pt', return_overflowing_tokens=False, return_length=False)

# Attach the modalities next to the token ids: one audio clip, no image.
outputs['audios'] = torch.cat([audio_features], dim=0)
outputs['images'] = None


In [9]:
import IPython.display as ipd
# Render an inline player for the audio file so it can be listened to in the notebook.
ipd.Audio('../filter-audio/3-2480-2.mp3')

In [10]:
from collections.abc import Mapping

class DataCollator():
    """Collate multimodal samples into a single batch dictionary.

    Every sample is a mapping (or an object whose ``vars()`` is one) that
    contains, besides its other fields, the keys ``audios`` and ``images``.
    Each of those two is either ``None`` or a tensor whose first dimension
    counts the clips / pictures belonging to the sample.
    """

    def __init__(self):
        # The tokenizer is picked up from the enclosing (notebook) namespace. It
        # is only used to look up the ids of the modality marker tokens.
        self.tokenizer = tokenizer

    def __call__(self, features):
        """Build the batch.

        Args:
            features: non-empty sequence of samples (see class docstring).

        Returns:
            dict with:
              * ``audio_index`` / ``image_index``: int tensors with one entry per
                clip / picture, holding the (per-device) position of the sample
                it belongs to;
              * ``audios`` / ``images``: all non-``None`` tensors of that
                modality concatenated along dim 0, or ``None`` when no sample
                carries the modality;
              * every other tensor / ndarray field of the first sample, stacked
                over the batch (string fields and other types are skipped,
                ``None`` is passed through);
              * ``image_starts``, ``image_ends``, ``audio_starts``,
                ``audio_ends``: the marker-token id repeated once per sample.
        """
        if not isinstance(features[0], Mapping):
            features = [vars(f) for f in features]

        bs = len(features)
        first = features[0]

        # Number of samples per device. Both the device count and the quotient
        # are clamped to >= 1: without that, a CPU-only machine (0 devices) or a
        # batch smaller than the number of GPUs made the modulo below divide by
        # zero. Presumably the index is taken relative to each GPU's shard of
        # the batch - confirm against the model's forward pass.
        per_device = max(bs // max(torch.cuda.device_count(), 1), 1)

        audio_index, image_index = [], []
        for index, feature in enumerate(features):
            local_index = index % per_device
            if feature['audios'] is not None:
                audio_index.extend([local_index] * len(feature['audios']))
            if feature['images'] is not None:
                image_index.extend([local_index] * len(feature['images']))

        batch = {
            'audio_index': torch.tensor(audio_index, dtype=torch.int),
            'image_index': torch.tensor(image_index, dtype=torch.int),
        }

        for k, v in first.items():
            if k in ("audios", "images"):
                # Decide on the whole batch, not on the first sample only, so the
                # data stays consistent with the *_index tensors built above.
                present = [f[k] for f in features if f[k] is not None]
                batch[k] = torch.cat(present) if present else None
            elif isinstance(v, str):
                continue
            elif v is None:
                batch[k] = None
            elif isinstance(v, torch.Tensor):
                # Assumes each sample's tensor has a leading singleton dimension
                # (e.g. a tokenizer called on one prompt); it is dropped after
                # stacking.
                batch[k] = torch.stack([f[k] for f in features]).squeeze(1)
            elif isinstance(v, np.ndarray):
                batch[k] = torch.tensor(np.stack([f[k] for f in features])).squeeze(1)

        # Ids of the marker tokens, repeated once per sample.
        markers = (
            ('image_starts', '<image>'),
            ('image_ends', '</image>'),
            ('audio_starts', '<audio>'),
            ('audio_ends', '</audio>'),
        )
        for name, token in markers:
            token_id = self.tokenizer.convert_tokens_to_ids(token)
            batch[name] = torch.tensor([token_id] * bs, dtype=torch.int)

        return batch


In [11]:
# Instantiate the collator; it picks up the global `tokenizer` in its constructor.
collator = DataCollator()

In [12]:
import torch
import numpy as np

# Collate the single prepared sample into a batch of size one.
ok = collator([outputs])

In [13]:
# Transfer every tensor of the collated batch to the GPU. Entries that are None
# (a modality that is absent from the batch) are left untouched.
for k, v in list(ok.items()):
    if v is None:
        continue
    ok[k] = v.cuda()

In [14]:
# Arguments for generation: the collated batch plus the sampling settings and the
# special-token ids taken from the language model's config.
generate_kwargs = {
    **ok,
    'max_new_tokens': 1000,
    'temperature': 0.2,
    'do_sample': True,
    'eos_token_id': model.llm.config.eos_token_id,
    'bos_token_id': model.llm.config.bos_token_id,
    'pad_token_id': model.llm.config.pad_token_id,
}

# Run sampling-based generation on the prepared batch.
r = model.generate(**generate_kwargs)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


In [15]:
# Cast the weights to bfloat16. `.to(dtype)` converts only floating-point
# parameters and buffers, whereas `.type(dtype)` converts every parameter and
# buffer - integer buffers included - which is not what is wanted here.
model = model.to(torch.bfloat16)

In [17]:
# Upload the model weights as safetensors. The organisation is part of the repo id
# because the separate `organization` keyword of `push_to_hub` is deprecated.
model.push_to_hub('mesolitica/malaysian-tinyllama-multimodal', safe_serialization=True)



model.safetensors:   0%|          | 0.00/3.51G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/mesolitica/malaysian-tinyllama-multimodal/commit/e97828cf6ca9c46575995ac875a4f24906088e9f', commit_message='Upload MM_LLMs', commit_description='', oid='e97828cf6ca9c46575995ac875a4f24906088e9f', pr_url=None, pr_revision=None, pr_num=None)

In [18]:
# Upload the image processor to the same repository. The organisation is part of the
# repo id because the separate `organization` keyword of `push_to_hub` is deprecated.
image_processor.push_to_hub('mesolitica/malaysian-tinyllama-multimodal', safe_serialization=True)



CommitInfo(commit_url='https://huggingface.co/mesolitica/malaysian-tinyllama-multimodal/commit/0816b2d924b1e56ee1f5466073e26cb2cff54673', commit_message='Upload processor', commit_description='', oid='0816b2d924b1e56ee1f5466073e26cb2cff54673', pr_url=None, pr_revision=None, pr_num=None)

In [19]:
# Upload the audio processor to the same repository. The organisation is part of the
# repo id because the separate `organization` keyword of `push_to_hub` is deprecated.
audio_processor.push_to_hub('mesolitica/malaysian-tinyllama-multimodal', safe_serialization=True)

CommitInfo(commit_url='https://huggingface.co/mesolitica/malaysian-tinyllama-multimodal/commit/8275dba370f2dc9454534379d4eda3d840a26eb4', commit_message='Upload processor', commit_description='', oid='8275dba370f2dc9454534379d4eda3d840a26eb4', pr_url=None, pr_revision=None, pr_num=None)

In [20]:
# Upload the tokenizer to the same repository. The organisation is part of the
# repo id because the separate `organization` keyword of `push_to_hub` is deprecated.
tokenizer.push_to_hub('mesolitica/malaysian-tinyllama-multimodal', safe_serialization=True)

CommitInfo(commit_url='https://huggingface.co/mesolitica/malaysian-tinyllama-multimodal/commit/36f9227eabe457ae8dee468c9eb73bc83173e4c9', commit_message='Upload tokenizer', commit_description='', oid='36f9227eabe457ae8dee468c9eb73bc83173e4c9', pr_url=None, pr_revision=None, pr_num=None)