In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoProcessor, AutoConfig
from PIL import Image
import torch
from train_intern_qwen_debug import VLMConfig, VLM
import warnings
warnings.filterwarnings('ignore')

def setup_seed(seed):
    """Seed PyTorch's CPU and CUDA generators and make cuDNN deterministic.

    Args:
        seed: Integer handed unchanged to each of the torch seeding calls.
    """
    # Turn off the cuDNN autotuner and request deterministic kernels, so the
    # choice of convolution algorithm does not vary between runs.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

    # Same seed for the default generator, the current CUDA device, and every
    # CUDA device, in that order.
    for seed_fn in (
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    ):
        seed_fn(seed)

# Seed PyTorch up front so the sampling further down is repeatable.
setup_seed(123)


model_path = "/root/autodl-tmp/checkpoint-873"
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Register the project-local classes with the Auto* factories so that the
# model type "vlm_model" resolves to VLMConfig, and VLMConfig to VLM.
AutoConfig.register("vlm_model", VLMConfig)
AutoModelForCausalLM.register(VLMConfig, VLM)

config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_path, config=config)
# .eval() returns the module itself, so the move to the GPU and the switch to
# evaluation mode can be chained.
model = model.to("cuda").eval()

from processing_intern_vit import load_image

# Constants taken from the original arithmetic: each tile side is split into
# patches of 14 pixels, and the patch count is divided by 4 to obtain the
# number of image tokens.
# NOTE(review): presumably the model merges each 2x2 group of patches into a
# single token -- confirm against the VLM implementation.
PATCH_SIZE = 14
PATCHES_PER_TOKEN = 4

image_path = "/root/autodl-tmp/image.png"
pixel_values = load_image(image_file=image_path)
# Unpacked as (tiles, channels, height, width); the recorded run printed
# torch.Size([7, 3, 448, 448]) for this image.
num_tiles, _, image_size, _ = pixel_values.shape
print(pixel_values.shape)

num_image_patches = image_size // PATCH_SIZE  # 448 // 14 = 32 patches per side
# Integer floor division keeps the count an exact int; the previous true
# division produced a float that was only rescued by the long cast below.
# The value stays a 0-dim long tensor so the print output is unchanged.
num_image_tokens = torch.tensor(
    num_tiles * num_image_patches * num_image_patches // PATCHES_PER_TOKEN,
    dtype=torch.long,
)
print(num_image_tokens)

messages = [
    {"role": "system", "content": 'You are a helpful assistant.'},
    {"role": "user", "content": 'Render a clear and concise summary of the photo \n<image>'},
]
chat_prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)
# Expand the single <image> placeholder into one <|image_pad|> per image
# token. int(...) makes the tensor-to-int conversion explicit instead of
# relying on the tensor's implicit __index__ during string repetition.
q_text = chat_prompt.replace('<image>', '<|image_pad|>' * int(num_image_tokens))

print(q_text)
inputs = tokenizer(q_text, return_tensors='pt')
print(inputs['input_ids'])

from torch.nn import functional as F

# ---- Sampling configuration -------------------------------------------------
max_new_tokens = 20       # upper bound on the number of generated tokens
temperature = 1           # 0.0 selects greedy decoding
repetition_penalty = 1.0  # 1.0 disables the penalty
top_k = None              # None disables top-k filtering
eos = tokenizer.eos_token_id

input_ids = inputs['input_ids']
s = input_ids.shape[1]  # prompt length; everything past it is generated text
input_ids = input_ids.to("cuda")
pixel_values = pixel_values.to("cuda")

# Inference only: without no_grad every forward pass would build an autograd
# graph for the whole (image-padded) sequence.
# NOTE(review): each step re-runs the model over the full sequence and the
# image; if VLM.forward supports a key/value cache, using it would avoid the
# repeated work -- confirm against the model code.
with torch.no_grad():
    # range(max_new_tokens) yields exactly max_new_tokens steps; the previous
    # `while len < s + max_new_tokens - 1` condition stopped one token short.
    for _ in range(max_new_tokens):
        # Second positional argument is passed as None, as in the original
        # call (presumably the labels slot -- confirm against VLM.forward).
        inference_res = model(input_ids, None, pixel_values)
        logits = inference_res.logits[:, -1, :]  # scores for the next token

        # Penalise tokens that already occur in the sequence. Skipped at 1.0,
        # where it would be a no-op. Positive scores are divided and negative
        # scores multiplied so that both move toward "less likely".
        if repetition_penalty != 1.0:
            seen = input_ids[0]
            seen_logits = logits[:, seen]
            logits[:, seen] = torch.where(
                seen_logits > 0,
                seen_logits / repetition_penalty,
                seen_logits * repetition_penalty,
            )

        if temperature == 0.0:
            # Greedy: take the single highest-scoring token.
            _, idx_next = torch.topk(logits, k=1, dim=-1)
        else:
            logits = logits / temperature
            if top_k is not None:
                # Mask everything below the k-th largest score.
                v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
                logits[logits < v[:, [-1]]] = -float('Inf')

            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1, generator=None)

        # Batch size is 1 here, so idx_next holds exactly one token id.
        if idx_next.item() == eos:
            break

        input_ids = torch.cat((input_ids, idx_next), dim=1)

print(input_ids[:, s:])
print(tokenizer.decode(input_ids[:, s:][0]))


  from .autonotebook import tqdm as notebook_tqdm


FlashAttention is not installed.
torch.Size([7, 3, 448, 448])
tensor(1792)
<|im_start|>system
You are a helpful assistant.<|im_end|>
<|im_start|>user
Render a clear and concise summary of the photo 
<|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_