<a href="https://colab.research.google.com/github/nijisakai/ai_assistant/blob/main/VisRAG_Pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install Required Libraries


In [None]:
!pip install transformers==4.40.2 sentencepiece==0.1.99 decord==0.6.0

# Define Weighted Mean Pooling
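In this block, we define the `weighted_mean_pooling` function, which collapses the model's per-token `hidden states` into a single vector per sequence using a position-weighted mean: each attended token is weighted by its running position among the attended tokens (1, 2, 3, …), so later tokens contribute more, while masked (padding) tokens get weight 0.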


In [None]:
import torch

def weighted_mean_pooling(hidden, attention_mask):
    """Pool per-token hidden states into one vector per sequence.

    Each attended token is weighted by its running count among the attended
    tokens of its row (1, 2, 3, ...), so later tokens contribute more to the
    mean; tokens whose mask value is 0 get weight 0.

    Args:
        hidden: Tensor of token representations. The code sums over dim 1 and
            broadcasts the weights over the last dim, so it is expected to be
            laid out as (batch, sequence, feature).
        attention_mask: Tensor laid out as (batch, sequence); non-zero entries
            mark the tokens to include.

    Returns:
        A float tensor with the sequence dimension reduced away. Rows whose
        mask is entirely zero yield a zero vector instead of NaN.
    """
    # mask * cumsum(mask) gives weights 1..k for attended tokens, 0 for padding
    position_weights = attention_mask * attention_mask.cumsum(dim=1)

    # Weighted sum of the hidden states over the sequence dimension
    weighted_sum = torch.sum(hidden * position_weights.unsqueeze(-1).float(), dim=1)
    weight_total = position_weights.sum(dim=1, keepdim=True).float()

    # A fully-masked row has a zero total; divide by 1 there so the result is
    # a zero vector rather than 0/0 = NaN (which would poison later matmuls).
    safe_total = torch.where(weight_total == 0, torch.ones_like(weight_total), weight_total)
    return weighted_sum / safe_total

# Define the Encoding Function
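Here, we define `encode`, a function that turns a list of inputs into L2-normalized embeddings. The list must be homogeneous — either all text strings or all images — because the input type is decided by inspecting only the first element.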


In [None]:
import torch.nn.functional as F

@torch.no_grad()
def encode(text_or_image_list):
    """Embed a batch of texts or a batch of images.

    The batch type is decided from the first element only: a ``str`` means
    the whole list is treated as text, anything else means the whole list is
    treated as images. The unused modality is filled with placeholders
    (``None`` images for text batches, empty strings for image batches).

    Relies on the module-level ``model``, ``tokenizer`` and
    ``weighted_mean_pooling`` being defined before it is called.

    Returns:
        The unit-length (L2-normalized) embeddings, one row per input,
        moved to the CPU and converted with ``.numpy()``.
    """
    first_is_text = isinstance(text_or_image_list[0], str)

    # Build the two parallel lists the model expects, padding the modality
    # that is not in use with placeholders of matching length.
    if first_is_text:
        texts = text_or_image_list
        images = [None] * len(text_or_image_list)
    else:
        texts = [''] * len(text_or_image_list)
        images = text_or_image_list

    # Single forward pass; the tokenizer is handed to the model as a kwarg
    outputs = model(text=texts, image=images, tokenizer=tokenizer)
    mask = outputs.attention_mask
    token_states = outputs.last_hidden_state

    # Pool the tokens into one vector per input, then scale to unit length
    pooled = weighted_mean_pooling(token_states, mask)
    unit_vectors = F.normalize(pooled, p=2, dim=1)
    return unit_vectors.detach().cpu().numpy()


# Load VisRAG-Ret
This block loads the `VisRAG-Ret` model and tokenizer from Hugging Face. We also specify `torch.float16` as the data type for compatibility with T4 GPUs.

In [None]:
# Load Model and Tokenizer
# NOTE: `tokenizer` and `model` are module-level names that encode() reads
# at call time, so this cell must run before encode() is called.
from transformers import AutoModel, AutoTokenizer

# Load the VisRAG-Ret model and tokenizer
# trust_remote_code=True lets transformers run the custom modeling code
# shipped in the model repository.
model_name_or_path = "openbmb/VisRAG-Ret"
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
# Since the T4 GPU doesn't support torch.bfloat16, we use torch.float16 instead.
# .cuda() moves the weights to the GPU, so a CUDA device must be available.
model = AutoModel.from_pretrained(model_name_or_path, torch_dtype=torch.float16, trust_remote_code=True).cuda()
# Switch to inference mode (as the last expression of a notebook cell this
# also displays the model).
model.eval()


# Prepare Input Query and Download Test Images
This block defines sample queries and downloads test images for evaluating the model's capability to match queries with relevant images.


In [None]:
from PIL import Image
import requests
from io import BytesIO

# Define sample query
queries = ["What does a dog look like?"]
# Retrieval instruction prepended to every query before it is embedded
INSTRUCTION = "Represent this query for retrieving relevant documents: "
queries = [INSTRUCTION + query for query in queries]

# Test images, in the order they appear in `passages` (index 0: cat, index 1: dog)
_TEST_IMAGE_URLS = [
    'https://github.com/OpenBMB/VisRAG/raw/refs/heads/master/scripts/demo/retriever/test_image/cat.jpeg',
    'https://github.com/OpenBMB/VisRAG/raw/refs/heads/master/scripts/demo/retriever/test_image/dog.jpg',
]


def _download_image(url, timeout=30):
    """Fetch *url* and return it as an RGB PIL image.

    Raises:
        requests.HTTPError: if the server answers with an error status, so a
            failed download is reported as such instead of surfacing later as
            an unreadable-image error from PIL.
        requests.Timeout: if the server does not respond within *timeout*
            seconds (without a timeout the request could hang indefinitely).
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return Image.open(BytesIO(response.content)).convert('RGB')


# Download sample images
print("Downloading images...")
passages = [_download_image(url) for url in _TEST_IMAGE_URLS]
print("Images downloaded.")


# Compute Embeddings and Calculate Similarity Scores
In this section, we encode the queries and images, then compute similarity scores between the query embedding and each image embedding.


In [None]:
# Encode the queries and images to get embeddings
# encode() L2-normalizes its output, so each row below is a unit vector.
embeddings_query = encode(queries)
embeddings_doc = encode(passages)

# Calculate similarity scores
# With unit-length rows the dot product is the cosine similarity;
# scores[i][j] compares query i with passage j.
scores = (embeddings_query @ embeddings_doc.T)
print("Similarity scores:", scores.tolist())  # example output from one run: [[0.25753140449523926, 0.3385779857635498]], higher score for the dog image


# Use VisRAG-Gen for Generation with Image
Finally, we use the `MiniCPM-V-2` model to generate a response based on the image that best matches the query.


In [None]:
# Load VisRAG-Gen model and tokenizer for generation
# Since the T4 GPU doesn't support torch.bfloat16, we use torch.float16 instead.
# NOTE: this rebinds the module-level `model` and `tokenizer` that encode()
# reads, so encode() must not be called again after this cell unless the
# retriever is reloaded first.
model = AutoModel.from_pretrained('openbmb/MiniCPM-V-2', trust_remote_code=True, torch_dtype=torch.float16).to(device='cuda', dtype=torch.float16)
tokenizer = AutoTokenizer.from_pretrained('openbmb/MiniCPM-V-2', trust_remote_code=True)
model.eval()

# Choose the best matching image from the similarity scores instead of
# hard-coding an index, so the cell stays correct if the passages or the
# query change. scores[0] holds the first query's score for each passage.
best_index = int(scores[0].argmax())
image = passages[best_index]

# `queries` carries the retrieval instruction prefix, which is meant for the
# retriever only; strip it so the generator is asked the plain question.
question = queries[0]
if question.startswith(INSTRUCTION):
    question = question[len(INSTRUCTION):]
msgs = [{'role': 'user', 'content': question}]

# Generate response based on the query and image
res, context, _ = model.chat(
    image=image,
    msgs=msgs,
    context=None,
    tokenizer=tokenizer,
    sampling=True,
    temperature=0.7
)
print("Generated response:", res)