In [1]:
!nvidia-smi

Fri Feb  9 03:32:33 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.54.03              Driver Version: 535.54.03    CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100-SXM4-80GB          On  | 00000001:00:00.0 Off |                    0 |
| N/A   55C    P0             342W / 400W |  63249MiB / 81920MiB |     71%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA A100-SXM4-80GB          On  | 00000002:00:00.0 Off |  

In [2]:
from modeling_vision import MM_LLMs, MM_LLMs_Config
from transformers import CLIPProcessor, CLIPModel,AutoModel, AutoTokenizer, AutoProcessor,AutoConfig,CLIPConfig, LlamaConfig, WhisperConfig, WhisperModel, LlamaModel, LlamaTokenizer
from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize
import torch
import numpy as np
from torch import nn
from streaming import LocalDataset
from typing import List

In [3]:
from transformers.trainer_utils import get_last_checkpoint

# Folder that holds the `checkpoint-*` sub-directories of the alignment run.
CHECKPOINT_DIR = 'vision-alignment-qwen0.5'

# `get_last_checkpoint` returns None when the folder contains no checkpoint;
# fail here with a clear message instead of passing None to `from_pretrained`.
latest = get_last_checkpoint(CHECKPOINT_DIR)
if latest is None:
    raise FileNotFoundError(f'no checkpoint-* directory found in {CHECKPOINT_DIR!r}')
latest

'vision-alignment-qwen0.5/checkpoint-400'

In [4]:
# Restore the multimodal model from the most recent training checkpoint.
# The loader output reports `llm.lm_head.weight` as newly initialized for this
# checkpoint. NOTE(review): presumably the head is tied to the input
# embeddings and therefore absent from the saved weights — confirm before
# relying on generated text.
model = MM_LLMs.from_pretrained(latest)

Some weights of MM_LLMs were not initialized from the model checkpoint at vision-alignment-qwen0.5/checkpoint-400 and are newly initialized: ['llm.lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
# Hub id of the SigLIP processor used to turn images into pixel tensors.
VISION_PROCESSOR_ID = 'google/siglip-base-patch16-384'

image_processor = AutoProcessor.from_pretrained(VISION_PROCESSOR_ID)
# The tokenizer is read from the checkpoint directory itself.
tokenizer = AutoTokenizer.from_pretrained(latest)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [6]:
# Display the tokenizer repr to inspect its special tokens and the added
# `<image>` / `</image>` / `<audio>` marker tokens.
tokenizer

Qwen2TokenizerFast(name_or_path='vision-alignment-qwen0.5/checkpoint-400', vocab_size=151643, model_max_length=32768, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '<|im_end|>', 'pad_token': '<|endoftext|>', 'additional_special_tokens': ['<|im_start|>', '<|im_end|>']}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	151643: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	151644: AddedToken("<|im_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	151645: AddedToken("<|im_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	151646: AddedToken("<image>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
	151647: AddedToken("</image>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
	151648: AddedToken("<audio>", rstrip=False, lstrip=Fal

In [7]:
# Make the language model stop on the tokenizer's end-of-sequence token, then
# show the resulting generation config for verification.
generation_config = model.llm.generation_config
generation_config.eos_token_id = tokenizer.eos_token_id
generation_config

GenerationConfig {
  "bos_token_id": 151643,
  "eos_token_id": 151645
}

In [8]:
from PIL import Image
import librosa
import torch
import numpy as np
from collections.abc import Mapping

class DataCollator():
    """Collate per-sample feature mappings into a single batch dict.

    Each feature is a mapping (or an object whose ``vars()`` is one) that may
    hold tensors / numpy arrays to stack, plus optional ``"images"`` and
    ``"audios"`` entries with a variable number of media items per sample.

    The returned dict contains, in order:

    * ``audio_index`` / ``image_index``: int tensors giving, for every media
      item in the batch, the position of the sample it belongs to.
    * one entry per non-string key of the first feature (stacked tensors,
      concatenated media, or ``None``).
    * ``image_starts`` / ``image_ends``: the ids of the ``<image>`` and
      ``</image>`` tokens repeated once per sample.
    """

    # Keys holding a variable number of media items per sample; these are
    # concatenated along dim 0 instead of stacked.
    _MEDIA_KEYS = ("audios", "images")

    def __init__(self, tokenizer):
        """Keep the tokenizer used to look up the image marker token ids."""
        self.tokenizer = tokenizer

    @staticmethod
    def _sample_index(features, key):
        """Return an int tensor mapping every media item under ``key`` to its sample position."""
        owners = []
        for position, feature in enumerate(features):
            # `.get` so that samples without the key count as "no media".
            items = feature.get(key)
            if items is not None:
                owners.extend([position] * len(items))
        # Built in one go rather than growing a tensor with repeated `torch.cat`.
        return torch.tensor(owners, dtype=torch.int)

    def __call__(self, features):
        """Build the batch dict from a non-empty list of features.

        Raises:
            IndexError: if ``features`` is empty.
        """
        if not isinstance(features[0], Mapping):
            features = [vars(f) for f in features]

        batch = {}
        bs = len(features)

        batch['audio_index'] = self._sample_index(features, 'audios')
        batch['image_index'] = self._sample_index(features, 'images')

        # The first feature decides which keys are collated.
        for k, v in features[0].items():
            if k in self._MEDIA_KEYS:
                # Decide on the whole batch, not only on the first sample:
                # otherwise media from later samples is dropped while the
                # index tensors above still reference it.
                present = [f[k] for f in features if f.get(k) is not None]
                batch[k] = torch.cat(present) if present else None
            elif isinstance(v, str):
                # Raw strings are not part of the tensor batch.
                continue
            elif v is None:
                batch[k] = None
            elif isinstance(v, torch.Tensor):
                # Stacking requires equally shaped tensors; squeeze(1) drops
                # the singleton dim of per-sample tensors shaped (1, ...).
                batch[k] = torch.stack([f[k] for f in features]).squeeze(1)
            elif isinstance(v, np.ndarray):
                batch[k] = torch.tensor(np.stack([f[k] for f in features])).squeeze(1)

        image_start_id = self.tokenizer.convert_tokens_to_ids('<image>')
        image_end_id = self.tokenizer.convert_tokens_to_ids('</image>')
        batch['image_starts'] = torch.tensor([image_start_id] * bs, dtype=torch.int)
        batch['image_ends'] = torch.tensor([image_end_id] * bs, dtype=torch.int)

        return batch

# Collator bound to the checkpoint tokenizer; it needs the tokenizer to look
# up the ids of the `<image>` / `</image>` marker tokens.
collator = DataCollator(tokenizer)

In [9]:
# Input image height configured on the underlying image processor.
# NOTE(review): not referenced by any later cell visible in this notebook.
height = image_processor.image_processor.size['height']

In [10]:
def prepare_dataset(messages, images: List[str] = None):
    """Turn a chat conversation plus optional image files into model features.

    Uses the notebook-level ``tokenizer`` and ``image_processor``.

    Args:
        messages: chat messages in the format accepted by
            ``tokenizer.apply_chat_template``.
        images: paths of image files to attach; ``None`` or an empty list
            means the sample has no images.

    Returns:
        The tokenizer output for the rendered prompt, with an extra
        ``'images'`` entry holding the processor's ``pixel_values`` (or
        ``None`` when no image was given).
    """
    if images:
        loaded_images = []
        for path in images:
            # The context manager closes the file handle; `copy()` decodes the
            # pixels first so the image stays usable after the file is closed.
            with Image.open(path) as img:
                loaded_images.append(img.copy())
        image_output = image_processor(images=loaded_images, return_tensors='pt')['pixel_values']
    else:
        image_output = None

    # Render the conversation to text first, then tokenize it as one sequence.
    prompt = tokenizer.apply_chat_template(messages, tokenize = False)
    outputs = tokenizer(
                    prompt,
                    return_tensors='pt',
                    return_overflowing_tokens=False,
                    return_length=False)

    outputs['images'] = image_output
    return outputs

In [11]:
# Build a one-sample batch for a smoke test: a single user turn containing the
# image markers followed by a Malay question (roughly "what picture is this").
messages = [
    {'role': 'user', 'content': '<image> </image> ini gambar apa'},
]
outputs = prepare_dataset(messages, images = ['motosikal.jpeg'])
ok = collator([outputs])
# Reuse the input ids as labels (same tensor object, not a copy).
ok['labels'] = ok['input_ids']

# Disabled: move every non-None batch entry to the GPU.
# for k in ok.keys():
#     if ok[k] is not None:
#         ok[k] = ok[k].cuda()

# Disabled: cast the media tensors to the model's dtype.
# for k in ['audios', 'images']:
#     if ok[k] is not None:
#         ok[k] = ok[k].type(model.dtype)

In [12]:
# Sanity check: run the model's input preparation on the example batch and
# compare the token count with the length of the resulting embedding sequence.
with torch.no_grad():
    model_inputs = model.prepare_inputs_for_generation(**ok)
r = model_inputs.pop('input_ids', None)
label = model_inputs.pop('labels', None)
# `labels` is popped with a None default, so guard before converting it.
if label is not None:
    label = label.detach().cpu().numpy()
ok['input_ids'].shape, model_inputs['inputs_embeds'].shape

(torch.Size([1, 12]), torch.Size([1, 588, 1024]))

In [13]:
# Cast the weights to bfloat16 before uploading. `Module.to(dtype)` converts
# only floating-point parameters and buffers, whereas `Module.type(dtype)`
# converts every parameter and buffer, integer ones included (e.g. index
# buffers, which must stay integral to remain usable for lookups).
model = model.to(torch.bfloat16)

In [14]:
# Upload the model weights. The organization is part of `repo_id`; the
# separate `organization=` keyword is deprecated.
model.push_to_hub('mesolitica/malaysian-Qwen1.5-0.5B-vision-alignment', safe_serialization=True)



model.safetensors:   0%|          | 0.00/1.65G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/mesolitica/malaysian-Qwen1.5-0.5B-vision-alignment/commit/95621446b08c1b4e877db9df08dcc536d566a5b7', commit_message='Upload MM_LLMs', commit_description='', oid='95621446b08c1b4e877db9df08dcc536d566a5b7', pr_url=None, pr_revision=None, pr_num=None)

In [15]:
# Upload the image processor config to the same repo. The organization is part
# of `repo_id`; the separate `organization=` keyword is deprecated.
image_processor.push_to_hub('mesolitica/malaysian-Qwen1.5-0.5B-vision-alignment', safe_serialization=True)



CommitInfo(commit_url='https://huggingface.co/mesolitica/malaysian-Qwen1.5-0.5B-vision-alignment/commit/9e75b4e0b1774489eb1a61658bb1ae1ed817a13b', commit_message='Upload processor', commit_description='', oid='9e75b4e0b1774489eb1a61658bb1ae1ed817a13b', pr_url=None, pr_revision=None, pr_num=None)

In [16]:
# Upload the tokenizer files to the same repo. The organization is part of
# `repo_id`; the separate `organization=` keyword is deprecated.
tokenizer.push_to_hub('mesolitica/malaysian-Qwen1.5-0.5B-vision-alignment', safe_serialization=True)

CommitInfo(commit_url='https://huggingface.co/mesolitica/malaysian-Qwen1.5-0.5B-vision-alignment/commit/32005ef017a0627a52b5545f8f962bbf20a31bdc', commit_message='Upload tokenizer', commit_description='', oid='32005ef017a0627a52b5545f8f962bbf20a31bdc', pr_url=None, pr_revision=None, pr_num=None)