In [None]:
# 🔁 Install dependencies, upload video, extract frames, caption, and summarize
!pip install transformers torch torchvision accelerate opencv-python Pillow --quiet

import os
import cv2
import torch
from PIL import Image
from collections import Counter
from transformers import BlipProcessor, BlipForConditionalGeneration
from google.colab import files

# 🔼 Upload video
# The prompt lists every extension the filter below accepts.
print("📤 Please upload your video file (.mp4, .mov, .avi)...")
uploaded = files.upload()
# Pick the first uploaded name with a supported extension. The comparison is
# done on a lower-cased copy so names such as "CLIP.MP4" are accepted, while
# the original name is kept so the file can still be opened.
video_path = next(
    (name for name in uploaded if name.lower().endswith(('.mp4', '.mov', '.avi'))),
    None,
)
if not video_path:
    raise ValueError("❌ No supported video file uploaded.")

# 🧠 Setup device and model
# Default to the CPU and switch to the GPU only when CUDA is usable.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

# The processor and the captioning model are loaded from the same checkpoint.
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
model = model.to(device)

# 🎞️ Extract keyframes
def extract_keyframes(video_path, interval=2):
    """Save one frame every ``interval`` seconds of a video into ``frames/``.

    The first frame is taken from the start of the video; each following
    frame is read after seeking ``interval`` seconds further. Extraction
    stops at the first read that fails.

    Frames are written as ``frames/frameNNNNN.jpg`` with a zero-padded index
    so that sorting the file names alphabetically also sorts them in time
    order. ``frame*.jpg`` files already present in ``frames/`` are deleted
    first, so the directory only ever holds frames from the latest call.

    Args:
        video_path: Path of the video file to read.
        interval: Seconds between two saved frames; must be positive.

    Returns:
        The list of frame paths that were written, in time order.

    Raises:
        ValueError: If ``interval`` is not positive.
    """
    if interval <= 0:
        raise ValueError("interval must be a positive number of seconds")

    os.makedirs("frames", exist_ok=True)
    # Remove frames left by a previous run; otherwise anything that lists the
    # directory afterwards would mix two videos together.
    for leftover in os.listdir("frames"):
        if leftover.startswith("frame") and leftover.endswith(".jpg"):
            os.remove(os.path.join("frames", leftover))

    frame_paths = []
    vidcap = cv2.VideoCapture(video_path)
    try:
        success, image = vidcap.read()
        sec = 0
        while success:
            frame_path = f"frames/frame{len(frame_paths):05d}.jpg"
            # imwrite reports failure through its return value; only count
            # frames that were written.
            if cv2.imwrite(frame_path, image):
                frame_paths.append(frame_path)
            # Seek straight to the next wanted timestamp instead of decoding
            # the in-between seconds that would be thrown away.
            sec += interval
            vidcap.set(cv2.CAP_PROP_POS_MSEC, sec * 1000)
            success, image = vidcap.read()
    finally:
        # Always free the capture handle, even if reading or writing raised.
        vidcap.release()

    print(f"✅ Extracted {len(frame_paths)} keyframes.")
    return frame_paths

# 🧾 Caption image
def caption_image(image_path):
    """Return the caption generated for the image stored at ``image_path``.

    Relies on the module-level ``processor``, ``model`` and ``device``.
    """
    # The image is converted to RGB before it is handed to the processor.
    rgb_frame = Image.open(image_path).convert("RGB")
    model_inputs = processor(rgb_frame, return_tensors="pt").to(device)
    generated = model.generate(**model_inputs)
    # Only the first generated sequence is decoded into text.
    return processor.decode(generated[0], skip_special_tokens=True)

# 🧹 Clean and merge captions
def deduplicate_and_merge(captions, threshold=1):
    """Collapse repeated captions into a single multi-sentence description.

    Captions that occur more than ``threshold`` times are kept, once each,
    in order of first appearance. If no caption is that frequent, every
    distinct caption is kept instead, again in order of first appearance.

    Each kept caption becomes one sentence: its first character is
    upper-cased (the rest is left untouched) and the sentences are joined
    with ``". "`` and terminated with a final ``"."``.

    Args:
        captions: Iterable of caption strings.
        threshold: A caption must occur more than this many times to be kept.

    Returns:
        The merged description, or ``""`` when there are no captions.
    """
    freq = Counter(captions)
    kept = [cap for cap, count in freq.items() if count > threshold]
    if not kept:
        # Counter keeps first-seen order, so the fallback is deterministic
        # (a set would scramble the captions between runs).
        kept = list(freq)
    if not kept:
        return ""
    # Upper-case only the first character of each sentence; str.capitalize()
    # would also lower-case everything after it.
    return ". ".join(cap[:1].upper() + cap[1:] for cap in kept) + "."

# 🧠 Main execution
def generate_video_description(video_path):
    """Extract frames from a video, caption each one and print a summary.

    Frames are extracted every 2 seconds into ``frames/``, every ``.jpg``
    found there is captioned in frame-number order, each caption is printed,
    and finally the merged description is printed.

    Args:
        video_path: Path of the video file to describe.
    """
    extract_keyframes(video_path, interval=2)

    def frame_order(path):
        # Order by the number embedded in the file name so that "frame10"
        # comes after "frame2"; plain string sorting would put it before.
        stem = os.path.splitext(os.path.basename(path))[0]
        digits = "".join(ch for ch in stem if ch.isdigit())
        return (int(digits) if digits else -1, path)

    frame_files = sorted(
        (f"frames/{f}" for f in os.listdir("frames") if f.endswith(".jpg")),
        key=frame_order,
    )
    captions = []

    print("\n🔍 Generating captions for frames...\n")
    for frame in frame_files:
        cap = caption_image(frame)
        print(f"{frame} ➜ {cap}")
        captions.append(cap)

    description = deduplicate_and_merge(captions)
    print("\n🎬 Final Video Description:\n")
    print(description)

# Run the whole pipeline on the video selected from the upload step.
generate_video_description(video_path)