In [None]:
!pip install -q moviepy
!pip install -q llama-index langchain
!pip install -q llama-index-embeddings-huggingface

In [None]:
from tqdm import tqdm
import os
from PIL import Image
import cv2
import moviepy.editor as mp
import zipfile
from llama_index.core import Document, Settings, VectorStoreIndex
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from langchain.text_splitter import RecursiveCharacterTextSplitter
from llama_index.core.node_parser import LangchainNodeParser
from transformers import pipeline

In [None]:
# Image-captioning pipeline backed by the checkpoint stored in the Kaggle
# input dataset; it runs on the GPU.
image_captioner = pipeline(
    task="image-to-text",
    model="/kaggle/input/fyp-dataset-ego4d/image-captioning-output",
    device="cuda",
)

# Sentence-embedding model used to index the generated captions (also on GPU).
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5", device="cuda")

In [None]:
# Path of the input video processed by the cells below; empty by default, so it
# has to be filled in before the notebook is run.
video_path = ""
# Query text handed to the caption retriever further down.
query = ''

In [None]:
def total_frames(video_path):
    """Return the approximate number of frames in a video.

    The count is computed as ``int(fps * duration)`` from the values reported
    by the MoviePy clip, so it is an estimate based on metadata rather than a
    count of decoded frames.

    Args:
        video_path: Path of the video file to inspect.

    Returns:
        int: ``fps * duration`` truncated towards zero.
    """
    clip = mp.VideoFileClip(video_path)
    try:
        return int(clip.fps * clip.duration)
    finally:
        # Release the underlying reader even when the metadata lookup fails.
        clip.close()

# Report the frame count of the input video (computed by total_frames above).
total_frames_count = total_frames(video_path)
print("Total number of frames:", total_frames_count)

In [None]:
def extract_frames(video_path, output_folder, frame_step=15):
    """Sample frames from a video and save them as JPEG files.

    Every ``frame_step``-th frame is decoded and written to
    ``<output_folder>/<video_id>_<frame_number>.jpg``, where ``video_id`` is
    the video file name without its extension and ``frame_number`` is the
    index of the sampled frame.

    NOTE: a later cell of this notebook defines another function that is also
    called ``extract_frames`` (taking ``frame_number, video_path``); once that
    cell has run, this definition is shadowed.

    Args:
        video_path: Path of the video to sample.
        output_folder: Existing directory that receives the JPEG files; a
            trailing slash is optional.
        frame_step: Distance, in frames, between two sampled frames.
    """
    video_id = os.path.splitext(os.path.basename(video_path))[0]
    clip = mp.VideoFileClip(video_path)
    try:
        frame_rate = clip.fps
        frame_count = int(clip.duration * frame_rate)

        for frame_number in tqdm(range(0, frame_count, frame_step)):
            # MoviePy addresses frames by time, so convert the index to seconds.
            frame = clip.get_frame(frame_number / frame_rate)
            frame_path = os.path.join(output_folder, f"{video_id}_{frame_number}.jpg")
            # get_frame already yields RGB data, which is what PIL expects;
            # converting it with a BGR->RGB swap would exchange the red and
            # blue channels in the saved image.
            Image.fromarray(frame).save(frame_path)
    finally:
        # Release the reader even if decoding or saving fails midway.
        clip.close()

In [None]:
# Create the frame directory up front (no error if it already exists), then
# sample frames from the input video into it.
output_folder = "/kaggle/working/output/"
os.makedirs(output_folder, exist_ok=True)
extract_frames(video_path, output_folder)

In [None]:
# Collect the extracted frame images, ordered by frame number.
folder_path = "/kaggle/working/output"
files = os.listdir(folder_path)
image_files = [file for file in files if file.endswith((".jpg", ".jpeg", ".png", ".gif"))]
# os.listdir returns names in arbitrary order, while the captioning step drops
# a caption that equals the previous one - which only makes sense when the
# frames are visited in temporal order. File names end in "_<frame>.<ext>".
image_files.sort(key=lambda name: int(name.split("_")[-1].split(".")[0]))
test_image_paths = [os.path.join(folder_path, file) for file in image_files]

In [None]:
def _frame_number(image_path):
    """Parse the frame index out of a path ending in ``_<frame>.<ext>``."""
    return int(image_path.split("_")[-1].split(".")[0])


# Caption the frames in temporal order and keep a caption only when it differs
# from the one produced for the preceding frame. Sorting here matters: the
# comparison with `prev_caption` is meaningless if the paths arrive in an
# arbitrary order.
pred = {}
prev_caption = None
for image_path in tqdm(sorted(test_image_paths, key=_frame_number)):
    caption = image_captioner(image_path)
    if caption != prev_caption:
        prev_caption = caption
        pred[_frame_number(image_path)] = caption

In [None]:
# Same captions as `pred`, re-inserted in ascending frame-number order.
sorted_dict = dict(sorted(pred.items(), key=lambda item: item[0]))

In [None]:
# One Document per generated caption, tagged with the frame it came from.
documents = [
    Document(
        text=caption_entry['generated_text'],
        metadata={'frame': frame_number},
        metadata_template="{key}=>{value}",
    )
    for frame_number, caption_entries in sorted_dict.items()
    for caption_entry in caption_entries
]


# Register the embedding model globally, then turn the documents into nodes
# through the LangChain text splitter.
Settings.embed_model = embed_model
splitter = RecursiveCharacterTextSplitter()
parser = LangchainNodeParser(splitter)
nodes = parser.get_nodes_from_documents(documents)

In [None]:
# Index the caption nodes and expose a retriever configured with
# similarity_top_k=20.
index = VectorStoreIndex(nodes)
retriever = index.as_retriever(similarity_top_k=20)

In [None]:
# Run the query against the caption index and echo every hit.
response = retriever.retrieve(query)
for hit in response:
    print(hit.text, hit.metadata)

# Frame numbers of the hits, in the order the retriever returned them.
r_frames = [hit.metadata['frame'] for hit in response]

In [None]:
# Half-width, in frames, of the clip that is later cut around each retrieved
# frame (the extraction step reads [frame - 120, frame + 120)).
half_window = 120

# Space the retrieved frames so that the clips cut around them never overlap:
# walking the frames in ascending order, any frame closer than one full window
# (2 * half_window) to its predecessor is moved forward to exactly that
# distance. The previous arithmetic shifted by abs(next - current + 120) + 120,
# which could still leave overlapping windows (e.g. [0, 100, 400] became
# [0, 440, 600]) and pushed frames further on every re-run of the cell; this
# version is idempotent.
r_frames = sorted(r_frames)
for i in tqdm(range(1, len(r_frames))):
    earliest_allowed = r_frames[i - 1] + 2 * half_window
    if r_frames[i] < earliest_allowed:
        r_frames[i] = earliest_allowed

In [None]:
# Drop repeated frame numbers, keeping the first occurrence of each.
seen_frames = set()
unique_frames = []
for candidate in r_frames:
    if candidate not in seen_frames:
        seen_frames.add(candidate)
        unique_frames.append(candidate)
r_frames = unique_frames

# Keep at most the first ten entries.
# NOTE(review): r_frames was sorted by frame number in an earlier cell, so this
# takes the ten earliest frames rather than the ten best-scoring hits - confirm
# that this is intended.
top_10_frames = r_frames[:10]

In [None]:
def frame_generator(video_path, start_frame, end_frame):
    """Yield the decoded frames of a video in ``[start_frame, end_frame)``.

    Args:
        video_path: Path of the video to read.
        start_frame: Index of the first frame to yield; values below zero are
            treated as zero, since a negative index is not a valid position.
        end_frame: Index one past the last frame to yield.

    Yields:
        Each frame that could be read, in ascending frame order. Indices that
        cannot be read (for instance beyond the end of the video) are skipped.
    """
    video = cv2.VideoCapture(video_path)
    try:
        first_frame = max(0, start_frame)
        # Seek once and then read sequentially; repositioning the capture
        # before every single frame is far slower.
        video.set(cv2.CAP_PROP_POS_FRAMES, first_frame)
        for idx in range(first_frame, end_frame):
            ret, frame = video.read()
            if ret:
                yield frame
            else:
                # Keep the skip-on-failure behaviour: reposition on the next
                # index so one unreadable frame does not derail the rest.
                video.set(cv2.CAP_PROP_POS_FRAMES, idx + 1)
    finally:
        # Runs on exhaustion, on error, and when the generator is closed early.
        video.release()
    
def extract_frames(frame_number, video_path):
    """Return the frames of the window ``[frame_number - 120, frame_number + 120)``.

    NOTE: this shares its name with the frame-sampling function defined in an
    earlier cell (``extract_frames(video_path, output_folder)``) and replaces
    it once this cell has run.

    Args:
        frame_number: Index of the frame the window is centred on.
        video_path: Path of the video to read from.

    Returns:
        list: The frames produced by ``frame_generator`` for the window.
    """
    # A centre closer than 120 frames to the start would otherwise request
    # negative frame indices, so the window is clamped at the first frame.
    start_frame = max(0, frame_number - 120)
    return list(frame_generator(video_path, start_frame, frame_number + 120))


# Gather the clip frames of every selected centre frame into one flat list.
e_frames = []
for centre_frame in tqdm(top_10_frames):
    e_frames.extend(extract_frames(centre_frame, video_path))

len(e_frames)

In [None]:
def create_video(frames, output_path, fps):
    """Write a sequence of frames to a video file using the ``mp4v`` codec.

    Args:
        frames: Non-empty sequence of image arrays whose ``shape`` unpacks to
            three values, the first two being height and width; the first
            frame determines the size of the output video.
        output_path: Destination path of the video file.
        fps: Frame rate of the output video.

    Raises:
        ValueError: If ``frames`` is empty.
        RuntimeError: If the video writer cannot be opened for ``output_path``.
    """
    if len(frames) == 0:
        raise ValueError("create_video needs at least one frame")

    height, width, _ = frames[0].shape

    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    # The writer takes the frame size as (width, height).
    out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
    try:
        # An unopened writer would silently discard every frame.
        if not out.isOpened():
            raise RuntimeError(f"could not open video writer for {output_path!r}")
        for frame in frames:
            out.write(frame)
    finally:
        # Finalise the file even when writing fails.
        out.release()

output_path = '/kaggle/working/output_video.mp4'

# Write the result at the frame rate of the source video so that playback
# speed is preserved; fall back to 30 when the source does not report a usable
# rate.
source_video = cv2.VideoCapture(video_path)
source_fps = source_video.get(cv2.CAP_PROP_FPS)
source_video.release()
fps = source_fps if source_fps and source_fps > 0 else 30

create_video(e_frames, output_path, fps)