In [1]:
import cv2
import numpy as np

In [2]:
def _gray_histogram(frame):
    """Return the 256-bin histogram of the grayscale version of a BGR frame."""
    gray_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    return cv2.calcHist([gray_frame], [0], None, [256], [0, 256])


def save_keyframes(video_path, output_folder, threshold=0.9):
    """Save frames at which the picture changes noticeably as JPEG key frames.

    Consecutive frames are compared through the correlation
    (``cv2.HISTCMP_CORREL``) of their grayscale histograms. Whenever the
    correlation between a frame and its successor is below ``threshold``, the
    earlier frame of the pair is written to
    ``<output_folder>/keyframe_<n>.jpg`` with ``n`` counting from 1.

    Args:
        video_path: Path of the video file passed to ``cv2.VideoCapture``.
        output_folder: Directory receiving the JPEG files; created if missing.
        threshold: Correlation below which a frame pair counts as a change.
            Defaults to 0.9.

    Returns:
        The number of key frames written.

    Raises:
        OSError: If the video cannot be opened or a key frame cannot be
            written.
    """
    import os  # local import: `os` is not among the notebook's imports

    videoCapture = cv2.VideoCapture(video_path)
    i = 0
    try:
        if not videoCapture.isOpened():
            raise OSError(f"Could not open video: {video_path}")

        # cv2.imwrite only returns False when the target directory is missing,
        # so make sure it exists before any frame is written.
        if output_folder:
            os.makedirs(output_folder, exist_ok=True)

        success, frame = videoCapture.read()
        if not success:
            return i
        hist = _gray_histogram(frame)

        while True:
            success, next_frame = videoCapture.read()
            if not success:
                break

            next_hist = _gray_histogram(next_frame)
            similarity = cv2.compareHist(hist, next_hist, cv2.HISTCMP_CORREL)

            if similarity < threshold:
                out_path = os.path.join(output_folder, f"keyframe_{i + 1}.jpg")
                # Count and report a key frame only once it is really on disk.
                if not cv2.imwrite(out_path, frame):
                    raise OSError(f"Could not write key frame: {out_path}")
                i += 1
                print(f"Saved keyframe {i}")

            # Carry the histogram forward so each frame is converted only once.
            frame, hist = next_frame, next_hist
    finally:
        # Release the capture even when reading or writing fails.
        videoCapture.release()
    return i

In [3]:
# Extract key frames from the sample video; `num` is the count returned by save_keyframes.
num = save_keyframes(video_path='./video/car.mp4', output_folder='./output')

Saved keyframe 1
Saved keyframe 2
Saved keyframe 3
Saved keyframe 4
Saved keyframe 5
Saved keyframe 6
Saved keyframe 7
Saved keyframe 8
Saved keyframe 9
Saved keyframe 10
Saved keyframe 11
Saved keyframe 12


In [4]:
from PIL import Image
import requests, base64

In [5]:
# Open every saved key frame and build one "<|image_N|>" placeholder line
# (1-based) per image; the placeholders are later prepended to the prompt.
images = []
placeholder = ""
for i in range(1, num + 1):
    # Image.open takes the path itself, so no separate open() handle is needed.
    images.append(Image.open(f"./output/keyframe_{i}.jpg"))
    placeholder += f"<|image_{i}|>\n"

In [6]:
import mlx.core as mx
from mlx_vlm import load, generate

In [7]:
# Local directory holding the model checkpoint loaded below.
model_path = "./phi-3.5-vision-mlx-int4"
# trust_remote_code is a boolean flag: pass True, not the string "True" (any
# non-empty string, including "False", would be truthy).
# NOTE(review): enabling it allows custom code shipped with the checkpoint to
# run, so only use checkpoints from a source you trust.
model, processor = load(model_path, processor_config={"trust_remote_code": True})

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [8]:
# Single-turn chat history: one user message asking for a summary of the video.
messages = [{"role": "user", "content": "Summarize the video."}]

In [9]:
# Render the chat messages with the tokenizer's chat template. tokenize=False
# requests text rather than token ids; add_generation_prompt=True asks the
# template to append the generation prompt.
prompt = processor.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

In [10]:
# Display the list of opened key-frame images.
images

[<PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=640x360>,
 <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=640x360>,
 <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=640x360>,
 <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=640x360>,
 <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=640x360>,
 <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=640x360>,
 <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=640x360>,
 <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=640x360>,
 <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=640x360>,
 <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=640x360>,
 <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=640x360>,
 <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=640x360>]

In [11]:
# Run the vision-language model on all key frames. The image placeholders are
# prepended to the chat prompt; arguments stay positional, in the original order.
output = generate(
    model,
    processor,
    images,
    placeholder + prompt,
    verbose=False,
    max_tokens=1024,
)


In [12]:
# Display the text returned by generate().
output

"The video showcases a red Ferrari B12 Superfast car, highlighting its sleek design and luxurious features. The car is displayed from various angles, emphasizing its aerodynamic shape and the Ferrari logo on the front grille. The video also includes a close-up of the car's interior, showcasing the leather seats and the modern dashboard. The car is then seen driving on a winding road, demonstrating its performance capabilities.<|end|>"


---

**Отказ от ответственности**:  
Этот документ был переведен с помощью сервиса автоматического перевода [Co-op Translator](https://github.com/Azure/co-op-translator). Несмотря на наши усилия по обеспечению точности, автоматические переводы могут содержать ошибки или неточности. Авторитетным источником следует считать исходный документ на языке оригинала. Для критически важной информации рекомендуется профессиональный перевод, выполненный человеком. Мы не несем ответственности за любые недоразумения или неправильные толкования, возникшие в результате использования данного перевода.
