In [None]:
from transformers import BlipProcessor, BlipForConditionalGeneration
import torch
import cv2

# Load the BLIP captioning processor and model (weights are fetched on first use).
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

# Open the video.
# The path must be a raw string: in a normal literal "\U" starts a
# \UXXXXXXXX unicode escape, so "C:\Users\..." is a SyntaxError.
video_path = r"C:\Users\wama\Downloads\output_detected (2).mp4"
cap = cv2.VideoCapture(video_path)
if not cap.isOpened():
    # Without this check a bad path silently yields an empty summary.
    raise OSError(f"Could not open video: {video_path}")

captions = []
frame_count = 0
sample_rate = 30  # Caption every 30th frame (one per second for 30fps video)

try:
    while True:
        ret, frame = cap.read()
        if not ret:
            # End of stream (or a read error): stop sampling.
            break
        if frame_count % sample_rate == 0:
            # OpenCV decodes frames as BGR; convert to RGB before preprocessing.
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

            # Preprocess the frame, generate token ids, decode to text.
            inputs = processor(images=frame_rgb, return_tensors="pt")
            out = model.generate(**inputs)
            caption = processor.decode(out[0], skip_special_tokens=True)
            captions.append(caption)

        frame_count += 1
finally:
    # Release the capture handle even if captioning raises midway.
    cap.release()

# Output the video description
print("Video Description Summary:")
print(" | ".join(captions))


In [None]:
from transformers import AutoProcessor, AutoModelForCausalLM
import torch
import cv2
from PIL import Image

# Load the Florence-2 processor and model.
# Half precision is used only when a GPU is available.
model_id = "microsoft/Florence-2-large"
device = "cuda" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch_dtype, trust_remote_code=True)
model = model.to(device)
model.eval()

# Load video
video_path = r"C:\Users\wama\Downloads\output_detected (2).mp4"
cap = cv2.VideoCapture(video_path)
if not cap.isOpened():
    # Without this check a bad path silently yields an empty summary.
    raise OSError(f"Could not open video: {video_path}")

captions = []
frame_count = 0
sample_rate = 30  # one frame per second for 30fps

# Task prompt for captioning; constant, so defined once outside the loop.
prompt = "<CAPTION>"

try:
    while True:
        ret, frame = cap.read()
        if not ret:
            # End of stream (or a read error): stop sampling.
            break
        if frame_count % sample_rate == 0:
            # OpenCV decodes frames as BGR; convert to a PIL RGB image.
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            pil_image = Image.fromarray(frame_rgb)

            # Preprocess. Passing the dtype along with the device casts the
            # floating-point pixel_values to the model's precision; moving to
            # the device alone leaves them float32 and mismatches a float16
            # model on GPU.
            inputs = processor(text=prompt, images=pil_image, return_tensors="pt").to(device, torch_dtype)

            # Generate caption
            with torch.no_grad():
                generated_ids = model.generate(
                    input_ids=inputs["input_ids"],
                    pixel_values=inputs["pixel_values"],
                    max_new_tokens=50,
                    num_beams=3
                )
            caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
            captions.append(caption)

        frame_count += 1
finally:
    # Release the capture handle even if captioning raises midway.
    cap.release()

# Output the video description
print("Video Description Summary:")
print(" | ".join(captions))


In [3]:
from transformers import BlipProcessor, BlipForConditionalGeneration
import torch
import cv2

# Load the BLIP captioning processor and model (weights are fetched on first use).
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

# Load the video
video_path = "/content/bedroom.mp4"
cap = cv2.VideoCapture(video_path)
if not cap.isOpened():
    # Otherwise width/height/fps all read as 0 and an unusable file is written.
    raise OSError(f"Could not open video: {video_path}")

# Get original video properties.
# fps is kept as a float so fractional rates (e.g. 29.97, 23.976) are written
# back unchanged instead of being truncated.
fps = cap.get(cv2.CAP_PROP_FPS)
if not fps > 0:
    # The backend could not report a usable rate (0, negative or NaN);
    # fall back to a common default so the stride below is never zero.
    fps = 30.0
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

# Output video writer setup
output_path = "/content/bedroom_captioned.mp4"
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
out_video = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
if not out_video.isOpened():
    cap.release()
    raise OSError(f"Could not open video writer: {output_path}")

captions = []
frame_count = 0
# Caption roughly one frame per second; at least 1 so the modulo is defined.
sample_rate = max(1, int(round(fps)))
current_caption = ""

# Caption overlay styling; constant for every frame, so set up once.
font = cv2.FONT_HERSHEY_SIMPLEX
font_scale = 0.9
thickness = 2
padding = 10
alpha = 0.6  # weight of the white box when blended over the frame
x, y = 30, height - 40  # text baseline origin: bottom-left of the frame

try:
    while True:
        ret, frame = cap.read()
        if not ret:
            # End of stream (or a read error): stop processing.
            break

        if frame_count % sample_rate == 0:
            # Convert BGR -> RGB and generate a fresh caption; frames in
            # between reuse the most recent caption.
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            inputs = processor(images=frame_rgb, return_tensors="pt")
            out = model.generate(**inputs)
            current_caption = processor.decode(out[0], skip_special_tokens=True)
            captions.append(current_caption)

        # --- Draw a background rectangle for caption ---
        text_size, _ = cv2.getTextSize(current_caption, font, font_scale, thickness)
        text_w, text_h = text_size

        # Rectangle enclosing the text with padding on every side.
        rect_x1 = x - padding
        rect_y1 = y - text_h - padding
        rect_x2 = x + text_w + padding
        rect_y2 = y + padding

        # Draw filled white rectangle (background) on a copy of the frame.
        overlay = frame.copy()
        cv2.rectangle(overlay, (rect_x1, rect_y1), (rect_x2, rect_y2), (255, 255, 255), -1)

        # Blend overlay with original frame for transparency
        frame = cv2.addWeighted(overlay, alpha, frame, 1 - alpha, 0)

        # Put the caption text in black
        cv2.putText(frame, current_caption, (x, y), font, font_scale, (0, 0, 0), thickness, cv2.LINE_AA)

        # Write the frame to the output video
        out_video.write(frame)
        frame_count += 1
finally:
    # Release both handles even if captioning or drawing raises, so the
    # output container is finalized.
    cap.release()
    out_video.release()

# Print collected captions
print("Video Description Summary:")
print(" | ".join(captions))


Video Description Summary:
two children jumping on a bed | a little girl standing on top of a bed | a little girl jumping on a bed | a little girl jumping on a bed | two young girls jumping on a bed | a little girl jumping on a bed | a little girl jumping on a bed
