In [1]:
!nvidia-smi

Mon Feb 12 02:42:08 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.54.03              Driver Version: 535.54.03    CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100-SXM4-80GB          On  | 00000001:00:00.0 Off |                    0 |
| N/A   33C    P0              64W / 400W |      4MiB / 81920MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA A100-SXM4-80GB          On  | 00000002:00:00.0 Off |  

In [2]:
from modeling_vision import MM_LLMs, MM_LLMs_Config
from transformers import AutoModelForCausalLM, CLIPProcessor, CLIPModel,AutoModel, AutoTokenizer, AutoProcessor,AutoConfig,CLIPConfig, LlamaConfig, WhisperConfig, WhisperModel, LlamaModel, LlamaTokenizer
from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize
import torch
import numpy as np
from torch import nn
from streaming import LocalDataset
from typing import List
from transformers import set_seed

set_seed(42)

In [3]:
from transformers.trainer_utils import get_last_checkpoint

latest = get_last_checkpoint('vision-alignment-qwen0.5')
latest

'vision-alignment-qwen0.5/checkpoint-13000'

In [None]:
model = MM_LLMs.from_pretrained(
    latest,flash_attention = True, dtype = torch.bfloat16, torch_dtype = torch.bfloat16
)

In [5]:
model.llm.lm_head.weight = model.llm.model.embed_tokens.weight

In [6]:
_ = model.cuda()

In [7]:
image_processor = AutoProcessor.from_pretrained('google/siglip-base-patch16-384')
tokenizer = AutoTokenizer.from_pretrained(latest)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [8]:
from PIL import Image
import librosa
import torch
import numpy as np
from collections.abc import Mapping

class DataCollator():

    def __init__(self, tokenizer):

        self.tokenizer = tokenizer

    def __call__(self, features):

        if not isinstance(features[0], Mapping):
            features = [vars(f) for f in features]

        batch = {}
        bs = len(features)
        first = features[0]

        batch['audio_index'] = torch.tensor([],dtype=torch.int)
        batch['image_index'] = torch.tensor([],dtype=torch.int)
        
        for index, feature in enumerate(features):
            local_index = index % (bs) 

            if feature['images'] is not None:
                batch['image_index'] = torch.cat([batch['image_index'], torch.tensor([local_index] * len(feature['images']), dtype=torch.int)])

        for k, v in first.items():

            if k not in ("audios","images") and not isinstance(v, str):
                if v is None:
                    batch[k] = None
                elif isinstance(v, torch.Tensor):
                    batch[k] = torch.stack([f[k] for f in features]).squeeze(1)
                elif isinstance(v, np.ndarray):
                    batch[k] = torch.tensor(np.stack([f[k] for f in features])).squeeze(1)
            elif k in ("audios","images"):
                if v is None:
                    batch[k] = None
                else:         
                    batch[k] = torch.cat([f[k] for f in features if f[k] is not None])

        batch['image_starts'] = torch.tensor([self.tokenizer.convert_tokens_to_ids('<image>')] * bs, dtype=torch.int)
        batch['image_ends'] = torch.tensor([self.tokenizer.convert_tokens_to_ids('</image>')] * bs, dtype=torch.int)

        return batch

collator = DataCollator(tokenizer)

In [9]:
height = image_processor.image_processor.size['height']

In [10]:
def prepare_dataset(messages, images: List[str] = None):
    if images is not None:
        images = [Image.open(f).convert('RGB') for f in images]
        image_output = image_processor(images=images, return_tensors='pt')['pixel_values']
    else:
        image_output = None
    
    prompt = tokenizer.apply_chat_template(messages, tokenize = False)
    outputs = tokenizer(
                    prompt,
                    return_tensors='pt',
                    return_overflowing_tokens=False,
                    return_length=False)

    outputs['images'] = image_output
    return outputs

In [11]:
messages = [
    {'role': 'user', 'content': '<image> </image> ini gambar apa'},
]
outputs = prepare_dataset(messages, images = ['motosikal.jpeg'])
ok = collator([outputs])
ok['labels'] = ok['input_ids']

for k in ok.keys():
    if ok[k] is not None:
        ok[k] = ok[k].cuda()
        
for k in ['images']:
    if ok[k] is not None:
        ok[k] = ok[k].type(model.dtype)

In [12]:
with torch.no_grad():
    model_inputs = model.prepare_inputs_for_generation(**ok)
r = model_inputs.pop('input_ids', None)
label = model_inputs.pop('labels', None)
label = label.detach().cpu().numpy()
ok['input_ids'].shape, model_inputs['inputs_embeds'].shape

(torch.Size([1, 12]), torch.Size([1, 588, 1024]))

In [13]:
r = model.llm.generate(input_ids = ok['input_ids'], max_new_tokens=20,
    top_p=0.95,
    top_k=50,
    temperature=0.9,
    do_sample=True,
    num_beams=1,
)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


In [14]:
tokenizer.decode(r[0])

'<|im_start|>user\n<image> </image> ini gambar apa<|im_end|>\n<|im_start|>assistant\nSaya tidak dapat memberikan anda kod yang berkaitan dengan jenis khusus'

In [15]:
model.push_to_hub('malaysian-Qwen1.5-0.5B-siglip-base-384-vision-alignment', 
                  organization='mesolitica', safe_serialization=False)



pytorch_model.bin:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/mesolitica/malaysian-Qwen1.5-0.5B-siglip-base-384-vision-alignment/commit/1f02e3d8eea28541affa4e5a68fb133c9450f5e7', commit_message='Upload MM_LLMs', commit_description='', oid='1f02e3d8eea28541affa4e5a68fb133c9450f5e7', pr_url=None, pr_revision=None, pr_num=None)

In [16]:
image_processor.push_to_hub('malaysian-Qwen1.5-0.5B-siglip-base-384-vision-alignment', organization='mesolitica', safe_serialization=True)



CommitInfo(commit_url='https://huggingface.co/mesolitica/malaysian-Qwen1.5-0.5B-siglip-base-384-vision-alignment/commit/35e8fca1972a6ace330c1237da34735e326e7284', commit_message='Upload processor', commit_description='', oid='35e8fca1972a6ace330c1237da34735e326e7284', pr_url=None, pr_revision=None, pr_num=None)

In [17]:
tokenizer.push_to_hub('malaysian-Qwen1.5-0.5B-siglip-base-384-vision-alignment', organization='mesolitica', safe_serialization=True)

CommitInfo(commit_url='https://huggingface.co/mesolitica/malaysian-Qwen1.5-0.5B-siglip-base-384-vision-alignment/commit/f569f4d76186bf4d6f4cd8356d02a4bb1c71cfaa', commit_message='Upload tokenizer', commit_description='', oid='f569f4d76186bf4d6f4cd8356d02a4bb1c71cfaa', pr_url=None, pr_revision=None, pr_num=None)