In [1]:
import os
import sys
import torch

from PIL import Image

sys.path.append("../")
from vigc.models import load_model_and_preprocess

  from .autonotebook import tqdm as notebook_tqdm



Welcome to bitsandbytes. For bug reports, please submit your error trace to: https://github.com/TimDettmers/bitsandbytes/issues


In [2]:
class DotDict(dict):
    """Dictionary whose items can also be read and written as attributes.

    * ``d.key`` returns ``d["key"]``, or ``None`` when the key is absent.
    * ``d.key = value`` stores ``value`` under ``d["key"]``.
    * Nested plain dicts are converted to ``DotDict`` on first attribute
      access and stored back, so ``d.outer.inner = x`` updates ``d`` itself
      instead of a temporary copy.
    * Missing double-underscore names raise ``AttributeError`` so that
      ``hasattr``, ``copy`` and ``pickle`` protocol probes behave normally.
    """

    def __getattr__(self, key):
        """Return the item stored under ``key``, or ``None`` if it is absent.

        Raises:
            AttributeError: if ``key`` is a dunder name (``__x__``).
        """
        # __getattr__ only runs after regular attribute lookup has failed, so
        # a dunder name reaching this point is a protocol probe. Answering it
        # with None would make callers try to call None.
        if key.startswith("__") and key.endswith("__"):
            raise AttributeError(key)
        if key not in self:
            return None
        value = self[key]
        if isinstance(value, dict) and not isinstance(value, DotDict):
            # Wrap once and keep the wrapper, so later attribute writes on the
            # nested mapping are not lost.
            value = DotDict(value)
            self[key] = value
        return value

    def __setattr__(self, key, value):
        """Store ``value`` as the dictionary item ``key``."""
        self[key] = value

## 1. Load the model

In [4]:
# Checkpoint locations default to the original cluster paths. Set the
# VIGC_PRETRAINED_CKPT / VIGC_FINETUNED_CKPT environment variables to use
# checkpoints stored elsewhere without editing this cell.
model_args = {
    "arch": "blip2_vicuna_instruct",                     # model arch
    "model_type": "vicuna7b",                            # model type

    "pretrained": os.environ.get(
        "VIGC_PRETRAINED_CKPT",
        "/mnt/petrelfs/hanxiao/input/instruct-blip/blip2_pretrained_flant5xxl.pth",
    ),
    "finetuned": os.environ.get(
        "VIGC_FINETUNED_CKPT",
        "/mnt/lustre/wufan/project/vigc/vigc/output/ckpt/vigc7b_stage2/add_detail_rep2/20230811214/checkpoint_2.pth",
    ),
}

# Attribute-style access (args.arch, args.pretrained, ...) for the cells below.
args = DotDict(model_args)

# The model and the preprocessed image tensors are placed on this device.
device = torch.device("cuda:0")

In [5]:
# Build the model and its image processors in eval mode on `device`; the
# third return value is not used in this notebook.
model, vis_processors, _ = load_model_and_preprocess(
    device=device,
    is_eval=True,
    model_type=args.model_type,
    name=args.arch,
)

# Load the checkpoints in order: pretrained first, then the fine-tuned one.
for checkpoint_path in (args.pretrained, args.finetuned):
    _ = model.load_checkpoint(checkpoint_path)

Loading checkpoint shards: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:20<00:00, 10.21s/it]


## 2. Chat demo

In [6]:
def chat_demo(model, img_path, prompt):
    """Run one image + prompt through ``model`` and return the first output.

    Uses the notebook-level ``vis_processors`` and ``device`` globals.

    Args:
        model: object exposing ``generate(samples, **kwargs)``.
        img_path: path of the image file to open.
        prompt: text prompt passed to the model alongside the image.

    Returns:
        The first element of whatever ``model.generate`` returns.
    """
    # Image.open is lazy and keeps the file open; the context manager closes
    # it once the pixels have been read. Converting to RGB gives grayscale,
    # palette and RGBA files the same three-channel layout as ordinary JPEGs.
    with Image.open(img_path) as raw_image:
        rgb_image = raw_image.convert("RGB")

    # Preprocess, add a leading batch dimension, and move to the target device.
    image = vis_processors["eval"](rgb_image).unsqueeze(0).to(device)

    samples = {
        "image": image,
        "prompt": prompt,
    }

    # NOTE(review): top_p presumably only takes effect when
    # use_nucleus_sampling is True — confirm against model.generate.
    output = model.generate(
        samples,
        length_penalty=1.0,
        repetition_penalty=1.0,
        num_beams=5,
        max_length=250,
        min_length=1,
        top_p=0.9,
        use_nucleus_sampling=False,
    )
    return output[0]

In [7]:
# Ask the model to produce its own question/answer pair for demo_1.jpg.
chat_demo(
    model,
    img_path="demo_1.jpg",
    prompt="Generate a question based on the content of the given image and then answer it.",
)



'Question: What color is the motor scooter in the image? Answer: The motor scooter in the image is silver.'

In [8]:
# Ask for a free-form description of demo_1.jpg.
chat_demo(
    model,
    img_path="demo_1.jpg",
    prompt="Describe the image for me.",
)

'The image features a silver motor scooter or moped parked inside a garage, next to a brick wall. The motor scooter appears to be parked in a clean and well-maintained environment, with no visible dirt or debris around it. The silver color of the motor scooter adds a sleek and modern touch to the overall appearance of the image.'

In [10]:
# Ask a counting question about demo_2.jpg.
chat_demo(
    model,
    img_path="demo_2.jpg",
    prompt="How many people in this image?",
)

'There are two people in this image.'