# Project Phase 2: Video Dialog 

## Imports

In [None]:
import json
from pprint import pprint

# OpenSearch client
from opensearchpy import OpenSearch

#Embeddings neighborhood
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
import pickle
import spacy

#Contextual embeddings and self-attention
import numpy as np
from sklearn.decomposition import PCA
from transformers import AutoConfig, AutoModelForSequenceClassification
from bertviz import model_view, head_view

# Plotting: Matplotlib (ggplot style) and seaborn
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import seaborn as sns

import clip
from PIL import Image
import av
import av.datasets

import os
import yt_dlp

from pathlib import Path
import math

## 2.2 Text-based Search

### Load the video captions

In [None]:
def load_captions_data(file_path):
    """Load a captions JSON file and normalise it to one record per video.

    Args:
        file_path: Path to a JSON file whose top level is an object keyed by
            video id.

    Returns:
        dict mapping each video id to ``{"segments": ...}``. When an entry
        already has a ``'segments'`` member that member is used; otherwise
        the entry itself is stored as the segments.
    """
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    return {
        video_id: {
            "segments": captions['segments'] if 'segments' in captions else captions,
        }
        for video_id, captions in data.items()
    }

# Read both validation caption files.
val_data1 = load_captions_data('captions/val_1.json')
val_data2 = load_captions_data('captions/val_2.json')

# Merge them into one dict keyed by video_id; when an id appears in both
# files, the entry from val_2 replaces the one from val_1.
all_captions_data = dict(val_data1)
all_captions_data.update(val_data2)

# The count printed here is the number of video ids in the merged dict.
pprint(f"Number of captions: {len(all_captions_data)}")
pprint(f"Example Captions: {all_captions_data}")

### Load the videos

In [None]:
# Read the ActivityNet metadata file.
with open('activity_net.v1-3.min.json', 'r') as json_data:
    data = json.load(json_data)

# Re-key every database entry as "v_<id>" (the id form the caption files are
# later matched against).
database = {}
for video_id, entry in data['database'].items():
    database["v_" + video_id] = entry

# (video_id, entry) pairs ordered by annotation count, most annotated first.
# Negating the count gives the same stable descending order as reverse=True.
sorted_database = sorted(
    database.items(),
    key=lambda item: -len(item[1]['annotations']),
)

# Keep the 27 most annotated videos.
top_videos = dict(sorted_database[:27])

pprint(top_videos)

In [None]:
# Video ids that appear both in the metadata database and in the captions.
matching_ids = database.keys() & all_captions_data.keys()
print(f"Número de IDs correspondentes: {len(matching_ids)}")

# Show a small sample of ids from each source for a visual comparison.
print(f"IDs no top_videos: {list(top_videos)[:5]}...")
print(f"IDs em all_captions_data: {list(all_captions_data)[:5]}...")

### Compute the final captions dataset

In [None]:
final_dataset_captions = {}

# Keep the captions of every selected top video that has a caption entry.
# A missing entry is expected and skipped explicitly, rather than relying on
# a blanket exception handler that would also hide unrelated errors.
for video_id in top_videos:
    captions = all_captions_data.get(video_id)
    if captions is not None:
        final_dataset_captions[video_id] = captions

final_dataset_captions.pop("v_PJ72Yl0B1rY", None)  # This video has no available URL

pprint(final_dataset_captions)
pprint(len(final_dataset_captions))

### Keyframe extraction

In [None]:
def download_video(video_url, output_path):
    """Download one video with yt-dlp.

    Args:
        video_url: URL handed to the downloader.
        output_path: Value used as yt-dlp's ``outtmpl`` option.
    """
    options = dict(format='mp4', outtmpl=output_path, quiet=True)
    with yt_dlp.YoutubeDL(options) as downloader:
        downloader.download([video_url])

In [None]:
def _select_segment(timestamp_sec, bounds):
    """Return the index of the segment that should own a frame, or None.

    Only segments with ``start <= timestamp_sec <= end`` are candidates.
    Among them, the one with the smallest summed distance to both bounds wins
    (for a containing segment that sum is its length, so the shortest one is
    picked); the first candidate wins ties.
    """
    best_index = None
    best_score = math.inf
    for index, (start, end) in enumerate(bounds):
        if not start <= timestamp_sec <= end:
            continue
        score = abs(timestamp_sec - start) + abs(timestamp_sec - end)
        if score < best_score:
            best_score = score
            best_index = index
    return best_index


def extract_segment_keyframes(video_path, output_dir, t):
    """Save the key frames of a video that fall inside caption segments.

    Each decoded key frame is assigned to at most one segment and written to
    ``output_dir`` as ``frame_<start>_<end>_<timestamp>.jpg``; frames outside
    every segment are ignored.

    Args:
        video_path: Path of the video file to open with PyAV.
        output_dir: Folder for the JPEG files; created if missing.
        t: Sequence of ``[start, end]`` pairs in seconds (anything accepted by
            ``float``).

    Returns:
        None. Any error raised while processing is printed, not propagated.
    """
    os.makedirs(output_dir, exist_ok=True)

    try:
        # Convert the bounds once instead of once per decoded frame.
        bounds = [(float(s[0]), float(s[1])) for s in t]
        if not bounds:
            # No segment can match, so there is nothing to extract.
            return

        with av.open(video_path) as container:
            stream = container.streams.video[0]
            # Ask the decoder to skip everything except key frames.
            stream.codec_context.skip_frame = "NONKEY"
            time_base = stream.time_base  # Needed to convert pts to seconds

            for frame in container.decode(stream):
                if frame.pts is None:
                    # Without a presentation timestamp the frame cannot be
                    # placed in a segment; skip it rather than abort the video.
                    continue

                timestamp_sec = float(frame.pts * time_base)
                index = _select_segment(timestamp_sec, bounds)
                if index is None:
                    continue

                start, end = bounds[index]
                out_path = os.path.join(
                    output_dir,
                    f"frame_{start}_{end}_{round(timestamp_sec, 4)}.jpg"
                )
                frame.to_image().save(out_path, quality=80)

    except Exception as e:
        # Best effort: report the problem and let the caller continue with
        # the next video.
        print(f"Error in {video_path}: {e}")

# Base folders
video_dir = "videos"
output_base = "keyframes"
os.makedirs(output_base, exist_ok=True)

processed_count = 0
missing_count = 0

for video_id, metadata in final_dataset_captions.items():
    video_path = os.path.join(video_dir, f"{video_id}.mp4")
    output_dir = os.path.join(output_base, video_id)
    t = metadata['segments']['timestamps']

    # Download only when the file is not already on disk.
    if not os.path.exists(video_path):
        video_url = top_videos[video_id]['url']
        print(f"[Download] {video_id} → {video_url}")
        try:
            download_video(video_url, video_path)
        except yt_dlp.utils.DownloadError as e:
            # One unavailable video must not stop the whole run; the check
            # below counts it as missing.
            print(f"[Download failed] {video_id}: {e}")

    if os.path.exists(video_path):
        print(f"[Processing] Extracting keyframes from: {video_id}")
        extract_segment_keyframes(video_path, output_dir, t)
        processed_count += 1
    else:
        print(f"[Missing] Could not find video after download: {video_id}")
        missing_count += 1

print("\nKeyframe extraction completed.")
print(f"    Processed videos: {processed_count}")
print(f"    Missing videos: {missing_count}")
print(f"    Keyframes saved in: {output_base}/<video_id>/")

### OpenSearch connection settings

In [None]:
# Connection settings for the OpenSearch server.
# Every value can be overridden through an environment variable; the literal
# is the fallback used when the variable is not set.
host = os.environ.get('OPENSEARCH_HOST', 'api.novasearch.org')
port = int(os.environ.get('OPENSEARCH_PORT', 443))

# NOTE(review): the fallback credentials are stored in plain text in this
# file. Prefer supplying OPENSEARCH_USER / OPENSEARCH_PASSWORD from the
# environment and removing the literals before sharing the notebook.
user = os.environ.get('OPENSEARCH_USER', 'user09')
password = os.environ.get('OPENSEARCH_PASSWORD', 'grupo09fct')

# The index is named after the user.
index_name = user

Test if OpenSearch is up and running

In [None]:
# Create the client with SSL/TLS enabled.
# NOTE(review): verify_certs=False and ssl_assert_hostname=False switch off
# both certificate and hostname verification, and ssl_show_warn=False hides
# the resulting warning. Confirm this is acceptable for the target server.
client = OpenSearch(
    hosts = [{'host': host, 'port': port}],
    http_compress = True, # enables gzip compression for request bodies
    http_auth = (user, password),
    use_ssl = True,
    url_prefix = 'opensearch_v2',
    verify_certs = False,
    ssl_assert_hostname = False,
    ssl_show_warn = False
)

# Pass the index by keyword, like every other index call in this notebook;
# client versions with keyword-only API arguments reject the positional form.
if client.indices.exists(index = index_name):

    # Open the index before inspecting it.
    resp = client.indices.open(index = index_name)
    print(resp)

    print('\n----------------------------------------------------------------------------------- INDEX SETTINGS')
    settings = client.indices.get_settings(index = index_name)
    pprint(settings)

    print('\n----------------------------------------------------------------------------------- INDEX MAPPINGS')
    mappings = client.indices.get_mapping(index = index_name)
    pprint(mappings)

    print('\n----------------------------------------------------------------------------------- INDEX #DOCs')
    print(client.count(index = index_name))
else:
    print("Index does not exist.")

In [None]:
# Delete the index so it can be recreated from scratch. Status codes 400 and
# 404 are passed as `ignore`, so a missing index is presumably not an error.
client.indices.delete(
    index=index_name,
    ignore=[400, 404],
)

### Create the index mappings

In [None]:
# Index definition: k-NN search enabled, plus one document field per value
# stored for a keyframe (ids, segment bounds, caption and two 512-d vectors).
index_body = {
    "settings": {
        "index": {
            "knn": True
        }
    },
    "mappings": {
        "properties": {
            "video_id": {"type": "keyword"},
            # NOTE(review): the indexing code sends these two values as
            # floats while they are mapped as text. Confirm that is intended
            # (numeric range queries would need a numeric type).
            "start_timestamp": {"type": "text"},
            "end_timestamp": {"type": "text"},
            "caption": {"type": "text"},
            # Vector of the caption text.
            "caption_vector": {
                "type": "knn_vector",
                "dimension": 512,
                "method": {
                    "name": "hnsw",
                    "space_type": "innerproduct",
                    "engine": "faiss",
                    "parameters": {
                        "ef_construction": 256,
                        "m": 48
                    }
                }
            },
            # Vector of the keyframe image; same k-NN settings as above.
            "image_clip_vector": {
                "type": "knn_vector",
                "dimension": 512,
                "method": {
                    "name": "hnsw",
                    "space_type": "innerproduct",
                    "engine": "faiss",
                    "parameters": {
                        "ef_construction": 256,
                        "m": 48
                    }
                }
            }
        }
    }
}


if client.indices.exists(index=index_name):
    print("Index already existed. You may force the new mappings.")
else:
    # Keyword form, consistent with the exists() call above; client versions
    # with keyword-only API arguments reject a positional index name.
    response = client.indices.create(index=index_name, body=index_body)
    print('\nCreating index:')
    print(response)

### Encode images and text using CLIP

In [None]:
# Run on the GPU when CUDA is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load CLIP ViT-B/32; `preprocess` is the callable applied to PIL images
# before they are passed to the model.
model, preprocess = clip.load("ViT-B/32", device=device)

In [None]:
def encode_image(image_path):
    """Encode an image file into a CLIP image embedding.

    Args:
        image_path: Path of the image file, opened with PIL.

    Returns:
        NumPy array holding the embedding of the image (first row of the
        ``model.encode_image`` output, copied to the CPU).
    """
    # Open through a context manager so the file handle is released once
    # preprocessing has turned the picture into a tensor.
    with Image.open(image_path) as picture:
        image = preprocess(picture).unsqueeze(0).to(device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        image_embedding = model.encode_image(image)
    return image_embedding[0].cpu().numpy()
    
def encode_text(text):
    """Encode a text string into a CLIP text embedding.

    Args:
        text: The string to tokenize and encode.

    Returns:
        NumPy array holding the embedding of the text (first row of the
        ``model.encode_text`` output, copied to the CPU).
    """
    # The tokenizer is given a one-element batch.
    token_batch = clip.tokenize([text]).to(device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        embeddings = model.encode_text(token_batch)
    return embeddings[0].cpu().numpy()

In [None]:
def index_clip_data(video_id, start_timestamp, end_timestamp, caption, image_path):
    """Index one keyframe document with its caption and CLIP vectors.

    Args:
        video_id: Id of the video the keyframe belongs to.
        start_timestamp: Start of the caption segment.
        end_timestamp: End of the caption segment.
        caption: Caption sentence; stored as text and encoded as a vector.
        image_path: Path of the keyframe image to encode.
    """
    # Vectors are converted to plain lists for the request body.
    caption_embedding = encode_text(caption).tolist()
    image_embedding = encode_image(image_path).tolist()

    document = dict(
        video_id=video_id,
        start_timestamp=start_timestamp,
        end_timestamp=end_timestamp,
        caption=caption,
        caption_vector=caption_embedding,
        image_clip_vector=image_embedding,
    )

    client.index(index=index_name, body=document)

### Index the images and captions

In [None]:
keyframes_root = Path("./keyframes")

for video_folder in keyframes_root.iterdir():
    video_id = video_folder.name

    # Skip stray files and folders left over for videos that are not in the
    # current selection; looking them up would raise KeyError.
    if not video_folder.is_dir() or video_id not in final_dataset_captions:
        print(f"Skipped: {video_folder} (no captions for this entry)")
        continue

    # These lookups depend only on the video, so do them once per folder.
    segments = final_dataset_captions[video_id]['segments']
    timestamp_array = segments['timestamps']
    sentences_array = segments['sentences']

    for img in video_folder.glob("*.jpg"):
        # File names look like frame_<start>_<end>_<timestamp>.jpg
        filename_parts = img.stem.split("_")
        start_ts = float(filename_parts[1])
        end_ts = float(filename_parts[2])

        img_path = str(img)

        # The segment bounds in the file name identify the caption sentence.
        i = timestamp_array.index([start_ts, end_ts])

        sentence = sentences_array[i]

        index_clip_data(video_id, start_ts, end_ts, sentence, img_path)

        print(f"Indexed: {video_id} {img_path} {timestamp_array[i]} {sentences_array[i]}")

In [None]:
# Refresh the index after the bulk of documents has been sent.
client.indices.refresh(
    index=index_name,
)