In [None]:
import torch
from unsloth import FastLanguageModel
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image
import cv2
import numpy as np

processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

max_seq_length = 4096
dtype = None
load_in_4bit = True

model_llm, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Meta-Llama-3.1-8B-bnb-4bit",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)
FastLanguageModel.for_inference(model_llm)

video_path = '/content/footage_113622770.mp4'
cap = cv2.VideoCapture(video_path)


frame_count = 0
captions = []
frame_context = []

def are_frames_similar(frame1, frame2, threshold=0.95):
    # Convert frames to grayscale
    frame1_gray = cv2.cvtColor(frame1, cv2.COLOR_BGR2GRAY)
    frame2_gray = cv2.cvtColor(frame2, cv2.COLOR_BGR2GRAY)

    from skimage.metrics import structural_similarity as ssim
    score, _ = ssim(frame1_gray, frame2_gray, full=True)

    return score > threshold

while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break

    if prev_frame is not None and are_frames_similar(prev_frame, frame):
        frame_count += 1
        continue

    img_filename = f"frame_{frame_count}.jpg"
    cv2.imwrite(img_filename, frame)

    image = Image.open(img_filename)
    inputs = processor(image, return_tensors="pt")
    out = model.generate(**inputs)
    caption = processor.decode(out[0], skip_special_tokens=True)

    captions.append(caption)
    print(f"Frame {frame_count}: {caption}")

    prev_frame = frame
    frame_count += 1

cap.release()


==((====))==  Unsloth 2024.11.7: Fast Llama patching. Transformers = 4.46.2.
   \\   /|    GPU: NVIDIA A100-SXM4-40GB. Max memory: 39.564 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.1+cu121. CUDA = 8.0. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Frame 0: a bunch of flowers in a vase
Frame 3: a bunch of flowers in a vase
Frame 6: a bunch of flowers in a vase
Frame 8: a bunch of flowers in a vase
Frame 10: a bunch of flowers in a vase
Frame 12: a bunch of flowers in a vase
Frame 14: a bunch of flowers in a vase
Frame 16: a bunch of flowers in a vase
Frame 18: a bunch of flowers in a vase
Frame 20: a bunch of flowers in a vase
Frame 22: a bunch of flowers in a vase
Frame 24: a bunch of flowers in a vase
Frame 26: a bunch of flowers in a vase
Frame 28: a bunch of flowers in a vase

In [95]:
def filtered_captions(captions):
  filtered_captions = []
  for i, frame in enumerate(captions):
      if i == 0 or frame != captions[i - 1]:
          filtered_captions.append(frame)
  return filtered_captions

def generate_keywords_with_llm(filtered_captions):
    context = " ".join(captions)
    alpaca_prompt = """
I have the following descriptions of video frames, each representing a unique scene:

{context}

From these descriptions, generate a list of distinct, descriptive keywords that best represent the content of the video frames. The keywords should reflect the primary elements, actions, and objects in the scenes described. Ensure that the keywords are specific and avoid redundancy. The list should include things like objects, actions, environments, and any other important details from the captions.
Please output the keywords as a LIST of singular word, and ENSURE THAT the keywords are DISTINCT and NOT REPEATED. Make sure the words are NOT THE SAME AND REDUNDANT.

Descriptive keywords:
"""
    formatted_prompt = alpaca_prompt.format(context=context)

    inputs = tokenizer([formatted_prompt], return_tensors="pt").to("cuda")

    outputs = model_llm.generate(**inputs, max_new_tokens=64, use_cache=True)
    response = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]


    pattern = r"Descriptive keywords:\s*(.*)"

    match = re.search(pattern, response)

    if match:
        keywords = match.group(1)
    else:
        print("No match found.")

    return keywords


final_keywords = generate_keywords_with_llm(filtered_captions(captions))
print("Final Descriptive Keywords:", final_keywords)

def generate_captions_with_llm(filtered_captions):
    context = " ".join(captions)
    alpaca_prompt = """
I have the following descriptions of video frames, each representing a unique scene:

{context}

From these descriptions, generate a caption for the whole duration of the video that best represent the content of the video frames. The caption should reflect the primary elements, actions, and objects in the scenes described.
Please output the caption in a text format. Make sure it is desciptive and tells a story about the video in 2 sentences ONLY in a paragraph.

Response:
"""
    formatted_prompt = alpaca_prompt.format(context=context)

    inputs = tokenizer([formatted_prompt], return_tensors="pt").to("cuda")

    outputs = model_llm.generate(**inputs, max_new_tokens=64, use_cache=True)
    response = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]

    pattern = r"Response:\s*(.*)"

    match = re.search(pattern, response)

    if match:
        keywords = match.group(1)
    else:
        print("No match found.")

    return keywords


final_captions = generate_captions_with_llm(filtered_captions(captions))
print("Final Captions:", final_captions)

Final Descriptive Keywords: flowers, vase, bouquet, table, candle, greene, white, pink, roses, player, sitting, video
Final Captions: The video shows a bunch of flowers in a vase. The vase is filled with a bouquet of flowers. The flowers are in a variety of colors, including white, pink, and green. The flowers are arranged in a bouquet on a table. The table is in a room with a video player. The video player is


In [None]:
pip install gradio transformers langchain opencv-python openai


In [None]:
!pip install unsloth
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"