<img src="../DLSU-ALTDSI-logo.png" width="100%" style="margin-bottom:10px; margin-top:0px;"/>

**This notebook contains the context-aware video retrieval pipeline used in the study:**

## *Comparing Modality Representation Schemes in Video Retrieval for More Context-Aware Auto-Annotation of Trending Short-Form Videos*

**By the following researchers from the Andrew L. Tan Data Science Institute:**
1. Ong, Matthew Kristoffer Y. (matthew_kristoffer_ong@dlsu.edu.ph)
2. Presas, Shanette Giane G. (shanette_giane_presas@dlsu.edu.ph)
3. Sarreal, Sophia Althea R. (sophia_sarreal@dlsu.edu.ph)
4. To, Jersey Jaclyn K. (jers_to@dlsu.edu.ph)

---

Note to thesismates:
1. Navigate first to the similarity pipeline folder
2. Run this to activate venv for the terminal instance: .venv\Scripts\activate
3. NOTE: you will also need the ff files:
    1. 'class_labels_indices.csv'
    2. 'Cnn14_mAP=0.431.pth' (these are the model weights to be used) from https://zenodo.org/records/3987831
    3. This specific torchaudio/torchvision model: pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121

## Dependencies

In [1]:
import os
from pathlib import Path

# audio
import numpy as np
import matplotlib.pyplot as plt
import ffmpeg
import torch
import librosa
from panns_inference import AudioTagging

# visuals
import torchvision.models as models
import torchvision.transforms as transforms
import cv2
import argparse
from tqdm import tqdm
from PIL import Image
import time

#text
import easyocr
import pandas as pd
import re
import json
from collections import defaultdict
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
import warnings
import string
from ftfy import fix_text
from wordsegment import load as ws_load, segment as ws_segment
from spellchecker import SpellChecker
from transformers import pipeline
import wordninja
from wordfreq import zipf_frequency
import glob
import pathlib
import subprocess
import sys
from faster_whisper import WhisperModel
from openai import OpenAI
from difflib import SequenceMatcher
from sentence_transformers import SentenceTransformer

#similarity
from numpy.linalg import norm

#gemini
import getpass
from typing import List, Dict, Any
import google.generativeai as genai

import faulthandler
faulthandler.enable()

# Make sure cuda (gpu) is active!
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)

warnings.filterwarnings('ignore')



Using device: cpu


---
## **AUDIO MODALITY**
**Goal**: Produce embeddings representing the audio modality of a given set of videos.

**Preprocessing step:** extracts 32kHz waveform files from the input videos.

In [2]:
def extract_audio_to_wavs(video_path: str, out32: str, overwrite: bool=True):
    extract_32k=(
        ffmpeg.input(video_path).output(out32, format='wav', acodec='pcm_s16le', ac=1, ar=32000)
    )
    if overwrite:
        extract_32k = extract_32k.overwrite_output()
    
    extract_32k.run(quiet=True)
    print("Wrote 32kHz", out32)

In [3]:
def process_video(video_path: str, out_dir: str ="proc_out"):
    out_dir = Path(out_dir)
    audio_dir = out_dir.parent / (out_dir.name + "_32kHz")
    audio_dir.mkdir(parents=True, exist_ok=True) # 32kHz goes to audio_dir

    video = Path(video_path)
    out32 = audio_dir / (video.stem + "_32k.wav") # 32kHz output

    # Extract audio
    extract_audio_to_wavs(str(video), str(out32))

In [None]:
media_dir = Path("media")
videos = list(media_dir.glob("*.mp4"))
print(f"{len(videos)} videos found!")

for video in videos:
    print(f"\nProcessing: {video.name}")
    process_video(video)

100 videos found!

Processing: trend10vid1.mp4
Wrote 32kHz proc_out_32kHz\trend10vid1_32k.wav

Processing: trend10vid10.mp4
Wrote 32kHz proc_out_32kHz\trend10vid10_32k.wav

Processing: trend10vid2.mp4
Wrote 32kHz proc_out_32kHz\trend10vid2_32k.wav

Processing: trend10vid3.mp4
Wrote 32kHz proc_out_32kHz\trend10vid3_32k.wav

Processing: trend10vid4.mp4
Wrote 32kHz proc_out_32kHz\trend10vid4_32k.wav

Processing: trend10vid5.mp4
Wrote 32kHz proc_out_32kHz\trend10vid5_32k.wav

Processing: trend10vid6.mp4
Wrote 32kHz proc_out_32kHz\trend10vid6_32k.wav

Processing: trend10vid7.mp4
Wrote 32kHz proc_out_32kHz\trend10vid7_32k.wav

Processing: trend10vid8.mp4
Wrote 32kHz proc_out_32kHz\trend10vid8_32k.wav

Processing: trend10vid9.mp4
Wrote 32kHz proc_out_32kHz\trend10vid9_32k.wav

Processing: trend1vid1.mp4
Wrote 32kHz proc_out_32kHz\trend1vid1_32k.wav

Processing: trend1vid10.mp4
Wrote 32kHz proc_out_32kHz\trend1vid10_32k.wav

Processing: trend1vid2.mp4
Wrote 32kHz proc_out_32kHz\trend1vid2_32k.

**Feature extraction step:** produces embeddings in the form of a 2048-dimensional feature vector representing the audio of the videos.

In [6]:
proc_out_32kHz_dir = Path("proc_out_32kHz")
emb_out_dir = Path("embeddings_out/audio2048") # 2048-d vectors go here
emb_out_dir.mkdir(parents=True, exist_ok=True)

at_model = AudioTagging(checkpoint_path=None, device=device) #this is the pretrained CNN14

wav_files = sorted(proc_out_32kHz_dir.glob("*_32k.wav"))
print(f"{len(wav_files)} WAV files found!")

for wav_path in wav_files:
    print(f"\nProcessing: {wav_path.name}")
    wav, sr = librosa.load(str(wav_path), sr=32000, mono=True) # just to make sure wav is 32kHz
    audio_batch = np.expand_dims(wav, axis=0) # matches the expected shape of PANN

    _, embedding = at_model.inference(audio_batch) # gets the embedding as numpy array

    embedding_vec = embedding[0] # first element of embedding array

    # just removing the "_32k" for filename consistency
    stem = wav_path.stem
    if stem.endswith("_32k"):
        stem = stem[:-4]

    out_path = emb_out_dir / f"{stem}_emb-audio2048.npy"
    np.save(str(out_path), embedding_vec)
    print("Embedding saved: ", out_path)

    print(embedding_vec) # if you want to see the vector
    print(embedding_vec.shape)

Checkpoint path: C:\Users\Shanette/panns_data/Cnn14_mAP=0.431.pth
Using CPU.
100 WAV files found!

Processing: trend10vid10_32k.wav
Embedding saved:  embeddings_out\audio2048\trend10vid10_emb-audio2048.npy
[0.         0.         0.         ... 0.03439935 0.         0.        ]
(2048,)

Processing: trend10vid1_32k.wav
Embedding saved:  embeddings_out\audio2048\trend10vid1_emb-audio2048.npy
[0.         0.         0.         ... 0.31364843 0.         0.        ]
(2048,)

Processing: trend10vid2_32k.wav
Embedding saved:  embeddings_out\audio2048\trend10vid2_emb-audio2048.npy
[0.         0.         0.         ... 0.3865799  0.04848313 0.        ]
(2048,)

Processing: trend10vid3_32k.wav
Embedding saved:  embeddings_out\audio2048\trend10vid3_emb-audio2048.npy
[0.         0.         0.         ... 0.24189433 0.14089212 0.        ]
(2048,)

Processing: trend10vid4_32k.wav
Embedding saved:  embeddings_out\audio2048\trend10vid4_emb-audio2048.npy
[0.         0.         0.         ... 0.08382463 0

---
## **VISUAL MODALITY**
**Goal**: Produce embeddings representing the visual modality of a given set of videos, a 2048-dimension vector that numerically represents its visual content (objects, scenes, textures).
**Model:** a pre-trained **ResNet-50** model.
**Output:** A `.npy` file for each video (e.g., `trend1vid1_emb-visual2048.npy`) saved to the `embeddings_out/video2048` folder.

This vector will serve as the **"visual"** component for our similarity retrieval pipeline.

**1. Configuration**

This cell sets the key parameters for the visual pipeline.

* `INPUT_DIR`: The local folder where the videos are stored.
* `OUTPUT_DIR`: The local folder where the finished embeddings will be saved.
* `FRAME_SAMPLE_RATE = 30`: Every nth frame processed (e.g., at 30fps, this is one frame per second). This is a trade-off between speed and accuracy.
* `BATCH_SIZE = 32`: For efficiency, we will feed frames to the GPU in batches of 32.

In [None]:
INPUT_DIR = Path("media")
OUTPUT_DIR = Path("embeddings_out/video2048")

FRAME_SAMPLE_RATE = 30
BATCH_SIZE = 32

VIDEO_EXTENSIONS = [".mp4", ".mov", ".avi", ".mkv", ".webm"]

**2. Model Definition: ResNet-50 as a Feature Extractor**
This function loads the ResNet-50 model, which is pre-trained on the ImageNet dataset.

The most important step is in this line:
`model = torch.nn.Sequential(*list(model.children())[:-1])`

This "decapitates" the model by **removing its final classification layer**. Instead of outputting a single word like "dog" or "cat", the model now outputs the **2048-dimension feature vector** from the second-to-last layer. This vector is the rich "visual fingerprint" we use for similarity.

In [12]:
def get_resnet_model(device: str):
    """Loads the pre-trained ResNet-50 model and its associated transforms."""
    weights = models.ResNet50_Weights.DEFAULT
    model = models.resnet50(weights=weights)
    model = torch.nn.Sequential(*list(model.children())[:-1])
    model.eval()
    model.to(device)
    preprocess = weights.transforms()
    return model, preprocess

model, preprocess = get_resnet_model(device)

**3. Core Logic: The Video-to-Vector Function**

This function, `extract_resnet_embeddings`, contains the core logic for processing a single video file. It does three things:

1.  **Sampling:** It opens the video with `cv2.VideoCapture` and loops through it, grabbing one frame every `FRAME_SAMPLE_RATE`.
2.  **Batching:** It collects these frames into a `frame_batch`. When the batch is full (`len(frame_batch) == batch_size`), it stacks them into a single tensor and sends them to the GPU. This is much faster than processing frames one by one.
3.  **Aggregation (Mean Pooling):** After all frames are processed, the function has many 2048-dim vectors. It calculates the **average** (`np.mean`) of all these vectors to create *one single vector* that represents the entire video.

In [14]:
def extract_resnet_embeddings(
    video_path: Path, 
    model, 
    preprocess, 
    device: str, 
    frame_sample_rate: int = 30, 
    batch_size: int = 32
) -> np.ndarray:
    if not video_path.exists():
        raise FileNotFoundError(f"Video file not found: {video_path}")

    cap = cv2.VideoCapture(str(video_path))
    if not cap.isOpened():
        raise IOError(f"Cannot open video file: {video_path}")

    frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    all_features = []
    frame_batch = []
    frame_idx = 0
    
    pbar = tqdm(total=frame_count, desc=f"Frames for {video_path.name}", leave=True, disable=True)

    with torch.no_grad():
        while True:
            ret, frame = cap.read()
            if not ret: break
            pbar.update(1)
            
            if frame_idx % frame_sample_rate == 0:
                frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                pil_img = Image.fromarray(frame_rgb)
                frame_batch.append(pil_img)

                if len(frame_batch) == batch_size:
                    image_inputs = torch.stack(
                        [preprocess(img) for img in frame_batch]
                    ).to(device)
                    image_features = model(image_inputs)
                    all_features.append(image_features.squeeze().cpu().numpy())
                    frame_batch = []
            frame_idx += 1
        
        if frame_batch:
            image_inputs = torch.stack(
                [preprocess(img) for img in frame_batch]
            ).to(device)
            image_features = model(image_inputs)
            all_features.append(image_features.squeeze().cpu().numpy())

    cap.release()
    pbar.close()
    if not all_features:
        raise ValueError(f"No frames sampled for {video_path.name}")

    embeddings = np.vstack(all_features)
    mean_embedding = np.mean(embeddings, axis=0)
    return mean_embedding

In [15]:
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

print(f"Reading videos from: {INPUT_DIR.resolve()}")
print(f"Saving embeddings to: {OUTPUT_DIR.resolve()}")

video_files = []
for ext in VIDEO_EXTENSIONS:
    video_files.extend(INPUT_DIR.glob(f"*{ext}"))
print(f"Found {len(video_files)} videos.")

existing_embeddings = {f.name for f in OUTPUT_DIR.glob('*_resnet.npy')}
print(f"Found {len(existing_embeddings)} existing ResNet embeddings.")

for video_path in tqdm(video_files, desc="Processing Videos (ResNet)"):
    output_filename = f"{video_path.stem}_emb-visual2048.npy"

    if output_filename in existing_embeddings:
        continue
    
    output_path = OUTPUT_DIR / output_filename
    
    try:
        print(f"Processing {video_path.name}...")
        mean_embedding = extract_resnet_embeddings(
            video_path=video_path,
            model=model,
            preprocess=preprocess,
            device=device,
            frame_sample_rate=FRAME_SAMPLE_RATE,
            batch_size=BATCH_SIZE
        )
        np.save(output_path, mean_embedding)

    except Exception as e:
        print(f"\n[ERROR] Failed to process {video_path.name}: {e}")

print("\n--- Batch processing complete. ---")

Reading videos from: C:\Users\Shanette\Downloads\COLLEGE\CSST Y4-T2\THS-ST2\context-aware-video-retrieval\similarity pipeline\media full
Saving embeddings to: C:\Users\Shanette\Downloads\COLLEGE\CSST Y4-T2\THS-ST2\context-aware-video-retrieval\similarity pipeline\embeddings_out\video2048
Found 100 videos.
Found 0 existing ResNet embeddings.


Processing Videos (ResNet):   0%|          | 0/100 [00:00<?, ?it/s]

Processing trend10vid1.mp4...


Processing Videos (ResNet):   1%|          | 1/100 [00:06<10:08,  6.14s/it]

Processing trend10vid10.mp4...


Processing Videos (ResNet):   2%|▏         | 2/100 [00:16<14:24,  8.83s/it]

Processing trend10vid2.mp4...


Processing Videos (ResNet):   3%|▎         | 3/100 [00:21<11:04,  6.85s/it]

Processing trend10vid3.mp4...


Processing Videos (ResNet):   4%|▍         | 4/100 [00:30<12:22,  7.73s/it]

Processing trend10vid4.mp4...


Processing Videos (ResNet):   5%|▌         | 5/100 [00:35<10:56,  6.91s/it]

Processing trend10vid5.mp4...


Processing Videos (ResNet):   6%|▌         | 6/100 [00:42<10:49,  6.91s/it]

Processing trend10vid6.mp4...


Processing Videos (ResNet):   7%|▋         | 7/100 [00:49<10:48,  6.98s/it]

Processing trend10vid7.mp4...


Processing Videos (ResNet):   8%|▊         | 8/100 [01:02<13:39,  8.91s/it]

Processing trend10vid8.mp4...


Processing Videos (ResNet):   9%|▉         | 9/100 [01:08<11:50,  7.81s/it]

Processing trend10vid9.mp4...


Processing Videos (ResNet):  10%|█         | 10/100 [01:14<10:43,  7.15s/it]

Processing trend1vid1.mp4...


Processing Videos (ResNet):  11%|█         | 11/100 [01:15<08:09,  5.50s/it]

Processing trend1vid10.mp4...


Processing Videos (ResNet):  12%|█▏        | 12/100 [01:26<10:22,  7.08s/it]

Processing trend1vid2.mp4...


Processing Videos (ResNet):  13%|█▎        | 13/100 [01:39<13:00,  8.98s/it]

Processing trend1vid3.mp4...


Processing Videos (ResNet):  14%|█▍        | 14/100 [01:42<10:10,  7.10s/it]

Processing trend1vid4.mp4...


Processing Videos (ResNet):  15%|█▌        | 15/100 [01:44<07:47,  5.50s/it]

Processing trend1vid5.mp4...


Processing Videos (ResNet):  16%|█▌        | 16/100 [01:45<05:58,  4.27s/it]

Processing trend1vid6.mp4...


Processing Videos (ResNet):  17%|█▋        | 17/100 [01:47<04:41,  3.39s/it]

Processing trend1vid7.mp4...


Processing Videos (ResNet):  18%|█▊        | 18/100 [01:48<03:36,  2.64s/it]

Processing trend1vid8.mp4...


Processing Videos (ResNet):  19%|█▉        | 19/100 [01:49<03:09,  2.34s/it]

Processing trend1vid9.mp4...


Processing Videos (ResNet):  20%|██        | 20/100 [01:55<04:32,  3.41s/it]

Processing trend2vid1.mp4...


Processing Videos (ResNet):  21%|██        | 21/100 [01:59<04:36,  3.51s/it]

Processing trend2vid10.mp4...


Processing Videos (ResNet):  22%|██▏       | 22/100 [02:00<03:32,  2.72s/it]

Processing trend2vid2.mp4...


Processing Videos (ResNet):  23%|██▎       | 23/100 [02:01<02:48,  2.19s/it]

Processing trend2vid3.mp4...


Processing Videos (ResNet):  24%|██▍       | 24/100 [02:02<02:33,  2.02s/it]

Processing trend2vid4.mp4...


Processing Videos (ResNet):  25%|██▌       | 25/100 [02:03<02:03,  1.64s/it]

Processing trend2vid5.mp4...


Processing Videos (ResNet):  26%|██▌       | 26/100 [02:04<01:51,  1.51s/it]

Processing trend2vid6.mp4...


Processing Videos (ResNet):  27%|██▋       | 27/100 [02:05<01:42,  1.40s/it]

Processing trend2vid7.mp4...


Processing Videos (ResNet):  28%|██▊       | 28/100 [02:07<01:35,  1.33s/it]

Processing trend2vid8.mp4...


Processing Videos (ResNet):  29%|██▉       | 29/100 [02:08<01:31,  1.29s/it]

Processing trend2vid9.mp4...


Processing Videos (ResNet):  30%|███       | 30/100 [02:17<04:09,  3.56s/it]

Processing trend3vid1.mp4...


Processing Videos (ResNet):  31%|███       | 31/100 [02:18<03:22,  2.94s/it]

Processing trend3vid10.mp4...


Processing Videos (ResNet):  32%|███▏      | 32/100 [02:35<08:14,  7.27s/it]

Processing trend3vid2.mp4...


Processing Videos (ResNet):  33%|███▎      | 33/100 [02:37<06:03,  5.43s/it]

Processing trend3vid3.mp4...


Processing Videos (ResNet):  34%|███▍      | 34/100 [02:39<05:01,  4.58s/it]

Processing trend3vid4.mp4...


Processing Videos (ResNet):  35%|███▌      | 35/100 [02:40<03:46,  3.49s/it]

Processing trend3vid5.mp4...


Processing Videos (ResNet):  36%|███▌      | 36/100 [02:42<03:20,  3.14s/it]

Processing trend3vid6.mp4...


Processing Videos (ResNet):  37%|███▋      | 37/100 [02:43<02:36,  2.49s/it]

Processing trend3vid7.mp4...


Processing Videos (ResNet):  38%|███▊      | 38/100 [02:46<02:28,  2.40s/it]

Processing trend3vid8.mp4...


Processing Videos (ResNet):  39%|███▉      | 39/100 [02:46<01:58,  1.94s/it]

Processing trend3vid9.mp4...


Processing Videos (ResNet):  40%|████      | 40/100 [02:48<01:50,  1.84s/it]

Processing trend4vid1.mp4...


Processing Videos (ResNet):  41%|████      | 41/100 [02:51<02:16,  2.31s/it]

Processing trend4vid10.mp4...


Processing Videos (ResNet):  42%|████▏     | 42/100 [03:02<04:31,  4.69s/it]

Processing trend4vid2.mp4...


Processing Videos (ResNet):  43%|████▎     | 43/100 [03:06<04:20,  4.57s/it]

Processing trend4vid3.mp4...


Processing Videos (ResNet):  44%|████▍     | 44/100 [03:08<03:39,  3.92s/it]

Processing trend4vid4.mp4...


Processing Videos (ResNet):  45%|████▌     | 45/100 [03:12<03:22,  3.69s/it]

Processing trend4vid5.mp4...


Processing Videos (ResNet):  46%|████▌     | 46/100 [03:14<02:51,  3.17s/it]

Processing trend4vid6.mp4...


Processing Videos (ResNet):  47%|████▋     | 47/100 [03:15<02:23,  2.71s/it]

Processing trend4vid7.mp4...


Processing Videos (ResNet):  48%|████▊     | 48/100 [03:20<02:53,  3.33s/it]

Processing trend4vid8.mp4...


Processing Videos (ResNet):  49%|████▉     | 49/100 [03:23<02:49,  3.33s/it]

Processing trend4vid9.mp4...


Processing Videos (ResNet):  50%|█████     | 50/100 [03:39<05:49,  6.98s/it]

Processing trend5vid1.mp4...


Processing Videos (ResNet):  51%|█████     | 51/100 [03:40<04:17,  5.26s/it]

Processing trend5vid10.mp4...


Processing Videos (ResNet):  52%|█████▏    | 52/100 [03:44<03:51,  4.82s/it]

Processing trend5vid2.mp4...


Processing Videos (ResNet):  53%|█████▎    | 53/100 [03:45<02:56,  3.75s/it]

Processing trend5vid3.mp4...


Processing Videos (ResNet):  54%|█████▍    | 54/100 [03:46<02:17,  3.00s/it]

Processing trend5vid4.mp4...


Processing Videos (ResNet):  55%|█████▌    | 55/100 [03:50<02:23,  3.19s/it]

Processing trend5vid5.mp4...


Processing Videos (ResNet):  56%|█████▌    | 56/100 [03:51<01:55,  2.63s/it]

Processing trend5vid6.mp4...


Processing Videos (ResNet):  57%|█████▋    | 57/100 [03:52<01:28,  2.06s/it]

Processing trend5vid7.mp4...


Processing Videos (ResNet):  58%|█████▊    | 58/100 [03:53<01:10,  1.67s/it]

Processing trend5vid8.mp4...


Processing Videos (ResNet):  59%|█████▉    | 59/100 [03:53<00:56,  1.39s/it]

Processing trend5vid9.mp4...


Processing Videos (ResNet):  60%|██████    | 60/100 [03:54<00:47,  1.19s/it]

Processing trend6vid1.mp4...


Processing Videos (ResNet):  61%|██████    | 61/100 [03:55<00:41,  1.06s/it]

Processing trend6vid10.mp4...


Processing Videos (ResNet):  62%|██████▏   | 62/100 [03:58<01:02,  1.66s/it]

Processing trend6vid2.mp4...


Processing Videos (ResNet):  63%|██████▎   | 63/100 [04:00<01:01,  1.65s/it]

Processing trend6vid3.mp4...


Processing Videos (ResNet):  64%|██████▍   | 64/100 [04:00<00:49,  1.39s/it]

Processing trend6vid4.mp4...


Processing Videos (ResNet):  65%|██████▌   | 65/100 [04:03<00:56,  1.62s/it]

Processing trend6vid5.mp4...


Processing Videos (ResNet):  66%|██████▌   | 66/100 [04:04<00:55,  1.65s/it]

Processing trend6vid6.mp4...


Processing Videos (ResNet):  67%|██████▋   | 67/100 [04:06<00:57,  1.75s/it]

Processing trend6vid7.mp4...


Processing Videos (ResNet):  68%|██████▊   | 68/100 [04:07<00:50,  1.57s/it]

Processing trend6vid8.mp4...


Processing Videos (ResNet):  69%|██████▉   | 69/100 [04:10<00:54,  1.77s/it]

Processing trend6vid9.mp4...


Processing Videos (ResNet):  70%|███████   | 70/100 [04:12<00:54,  1.82s/it]

Processing trend7vid1.mp4...


Processing Videos (ResNet):  71%|███████   | 71/100 [04:13<00:45,  1.56s/it]

Processing trend7vid10.mp4...


Processing Videos (ResNet):  72%|███████▏  | 72/100 [04:13<00:38,  1.36s/it]

Processing trend7vid2.mp4...


Processing Videos (ResNet):  73%|███████▎  | 73/100 [04:16<00:45,  1.70s/it]

Processing trend7vid3.mp4...


Processing Videos (ResNet):  74%|███████▍  | 74/100 [04:17<00:42,  1.64s/it]

Processing trend7vid4.mp4...


Processing Videos (ResNet):  75%|███████▌  | 75/100 [04:19<00:38,  1.54s/it]

Processing trend7vid5.mp4...


Processing Videos (ResNet):  76%|███████▌  | 76/100 [04:21<00:42,  1.76s/it]

Processing trend7vid6.mp4...


Processing Videos (ResNet):  77%|███████▋  | 77/100 [04:23<00:43,  1.91s/it]

Processing trend7vid7.mp4...


Processing Videos (ResNet):  78%|███████▊  | 78/100 [04:25<00:38,  1.76s/it]

Processing trend7vid8.mp4...


Processing Videos (ResNet):  79%|███████▉  | 79/100 [04:26<00:32,  1.55s/it]

Processing trend7vid9.mp4...


Processing Videos (ResNet):  80%|████████  | 80/100 [04:28<00:33,  1.66s/it]

Processing trend8vid1.mp4...


Processing Videos (ResNet):  81%|████████  | 81/100 [04:29<00:28,  1.51s/it]

Processing trend8vid10.mp4...


Processing Videos (ResNet):  82%|████████▏ | 82/100 [04:31<00:32,  1.83s/it]

Processing trend8vid2.mp4...


Processing Videos (ResNet):  83%|████████▎ | 83/100 [04:33<00:29,  1.74s/it]

Processing trend8vid3.mp4...


Processing Videos (ResNet):  84%|████████▍ | 84/100 [04:34<00:25,  1.58s/it]

Processing trend8vid4.mp4...


Processing Videos (ResNet):  85%|████████▌ | 85/100 [04:37<00:29,  1.96s/it]

Processing trend8vid5.mp4...


Processing Videos (ResNet):  86%|████████▌ | 86/100 [04:40<00:33,  2.40s/it]

Processing trend8vid6.mp4...


Processing Videos (ResNet):  87%|████████▋ | 87/100 [04:42<00:26,  2.02s/it]

Processing trend8vid7.mp4...


Processing Videos (ResNet):  88%|████████▊ | 88/100 [04:43<00:20,  1.70s/it]

Processing trend8vid8.mp4...


Processing Videos (ResNet):  89%|████████▉ | 89/100 [04:45<00:21,  1.92s/it]

Processing trend8vid9.mp4...


Processing Videos (ResNet):  90%|█████████ | 90/100 [04:48<00:22,  2.25s/it]

Processing trend9vid1.mp4...


Processing Videos (ResNet):  91%|█████████ | 91/100 [04:50<00:18,  2.07s/it]

Processing trend9vid10.mp4...


Processing Videos (ResNet):  92%|█████████▏| 92/100 [04:51<00:13,  1.74s/it]

Processing trend9vid2.mp4...


Processing Videos (ResNet):  93%|█████████▎| 93/100 [04:53<00:12,  1.82s/it]

Processing trend9vid3.mp4...


Processing Videos (ResNet):  94%|█████████▍| 94/100 [04:55<00:11,  1.88s/it]

Processing trend9vid4.mp4...


Processing Videos (ResNet):  95%|█████████▌| 95/100 [04:55<00:07,  1.50s/it]

Processing trend9vid5.mp4...


Processing Videos (ResNet):  96%|█████████▌| 96/100 [04:57<00:06,  1.72s/it]

Processing trend9vid6.mp4...


Processing Videos (ResNet):  97%|█████████▋| 97/100 [04:58<00:04,  1.51s/it]

Processing trend9vid7.mp4...


Processing Videos (ResNet):  98%|█████████▊| 98/100 [05:02<00:04,  2.06s/it]

Processing trend9vid8.mp4...


Processing Videos (ResNet):  99%|█████████▉| 99/100 [05:03<00:01,  1.83s/it]

Processing trend9vid9.mp4...


Processing Videos (ResNet): 100%|██████████| 100/100 [05:05<00:00,  3.05s/it]


--- Batch processing complete. ---





---
## **TEXT MODALITY**
**Goal**: Produce embeddings representing the text modality of a given set of videos.

In [27]:
OUTPUT_CSV = "video_text_outputs.csv"
WHISPER_MODEL = WhisperModel("base", device="cuda" if torch.cuda.is_available() else "cpu",
                    compute_type="int8_float16" if torch.cuda.is_available() else "int8")

try:
    OPENAI_API_KEY = getpass.getpass("Enter your OpenAI API Key: ")
except ImportError:
    OPENAI_API_KEY = "PASTE_YOUR_OPENAI_KEY_HERE"
except Exception as e:
    print(f"Error getting API key: {e}")
    OPENAI_API_KEY = ""

if not OPENAI_API_KEY or OPENAI_API_KEY == "":
    print("Warning: OpenAI API Key is not set. OCR cleaning (Step 3) will fail.")
else:
    print("OpenAI API Key received.")

OpenAI API Key received.


In [28]:
def _list_local_videos(root_dir):
    exts = ('.mp4', '.mov', '.m4v', '.mkv', '.avi', '.webm')
    paths = []
    for ext in exts:
        paths.extend(glob.glob(os.path.join(root_dir, f"**/*{ext}"), recursive=True))
    return sorted(paths)

def _fetch_videos_from_folder(folder_path):
    if not folder_path or folder_path.strip() == "":
        raise ValueError("Folder path is empty.")

    if os.path.isdir(folder_path):
        return _list_local_videos(folder_path)

    raise ValueError(f"Not a valid local directory: {folder_path}")

### Automatic Speech Recognition (ASR) Extraction  
This cell defines the **audio transcription stage** using the `faster_whisper` model.  
It loads a lightweight Whisper model (`base`) with GPU acceleration if available, and transcribes speech segments into text.  
  
All transcribed text segments are concatenated into one clean string, which becomes the `ASR` output for that video.  
This step ensures spoken content is captured in parallel with on-screen text for multimodal fusion.


In [29]:
def extract_audio_with_whisper(video_path):
    try:
        segments, _ = WHISPER_MODEL.transcribe(video_path, beam_size=1)
        return " ".join(s.text for s in segments).strip()
    except Exception as e:
        print(f"ASR Error: {e}")
        return ""

### Text Validation and Frame Preprocessing for OCR  
This cell prepares each video frame for optimal text detection.  
It defines a validation function that filters out short or meaningless OCR detections, ensuring only text-like content is retained.  
`preprocess_frame_for_ocr()` enhances frames using:
1. **Grayscale conversion** for consistency  
2. **CLAHE (Contrast Limited Adaptive Histogram Equalization)** to amplify local text contrast  
3. **Gaussian blur** to reduce noise  
4. **Sharpening filter** to reinforce text edges  
This preprocessing pipeline improves EasyOCR accuracy for low-contrast overlays typical in short-form videos.


### Optical Character Recognition (OCR) from Video Frames  
This cell also handles the frame-wise extraction of visible text from video overlays.  
It uses **OpenCV** to read frames and **EasyOCR** for text detection and recognition.  
  
For each detected text region, the function records the recognized text and timestamps and positions (up to five samples each per unique text).
 
All detections are aggregated into a structured dictionary keyed by unique text phrases.  
This creates a high-resolution temporal map of textual elements appearing throughout the video.


In [30]:
def is_valid_text(text):
    if not text or len(text.strip()) < 2:
        return False
    clean = re.sub(r'[^\w#@]', '', text)
    return len(clean) > 0

def preprocess_frame_for_ocr(frame):
    gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
    enhanced = clahe.apply(gray)
    denoised = cv2.GaussianBlur(enhanced, (3, 3), 0)
    kernel = np.array([[-1,-1,-1], [-1, 9,-1], [-1,-1,-1]])
    sharpened = cv2.filter2D(denoised, -1, kernel)
    return sharpened

def extract_ocr_from_video(video_path, sample_rate_fps=1):
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print(f"Error: Cannot open video {video_path}")
        return {}

    fps = cap.get(cv2.CAP_PROP_FPS) or 30.0
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT) or 0)
    frame_interval = max(1, int(round(fps / max(0.1, sample_rate_fps))))

    reader = easyocr.Reader(['en'], gpu=torch.cuda.is_available())
    text_detections = defaultdict(lambda: {'count': 0, 'timestamps': [], 'positions': []})

    print("Processing video frames for OCR...")
    processed = 0

    for frame_idx in range(0, total_frames, frame_interval):
        cap.set(cv2.CAP_PROP_POS_FRAMES, frame_idx)
        ret, frame = cap.read()
        if not ret or frame is None:
            continue

        h, w = frame.shape[:2]
        max_w = 960
        if w > max_w:
            scale = max_w / float(w)
            frame = cv2.resize(frame, (int(w * scale), int(h * scale)), interpolation=cv2.INTER_AREA)

        timestamp = frame_idx / fps

        try:
            frame_pp = preprocess_frame_for_ocr(frame)
            results = reader.readtext(frame_pp, detail=1, paragraph=False)
            for (bbox, text, confidence) in results:
                if confidence > 0.5 and is_valid_text(text):
                    xs = [p[0] for p in bbox]
                    ys = [p[1] for p in bbox]
                    x_left = float(min(xs))
                    y_top = float(min(ys))

                    text_detections[text]['count'] += 1
                    if len(text_detections[text]['timestamps']) < 5:
                        text_detections[text]['timestamps'].append(round(timestamp, 2))
                    if len(text_detections[text]['positions']) < 5:
                        text_detections[text]['positions'].append((round(y_top, 2), round(x_left, 2)))

        except Exception as e:
            print(f"OCR error at frame {frame_idx}: {e}")

        processed += 1
        if processed % 10 == 0:
            print(f"Processed {processed} sampled frames...")

    cap.release()
    print(f"OCR processing complete. Found {len(text_detections)} unique text phrases.")
    return dict(text_detections)

### Text Cleaning and OCR Correction  
This cell defines the **OCR post-processing function** used to clean noisy text extracted from video frames.  
It uses OpenAI’s GPT-4o-mini model to fix common OCR errors such as spacing, casing, and misread characters (`v`→`y`, `rn`→`m`, etc.).  
Corrections are strictly constrained — the function does **not** rephrase or alter meaning, ensuring all text remains faithful to the original overlay.  
Progress is logged, cost is tracked, and fallback behavior ensures the pipeline never breaks if API calls fail.


In [31]:
def clean_ocr_with_openai(ocr_phrases, api_key, model="gpt-4o-mini"):
    phrases = list(ocr_phrases.keys())
    if not phrases:
        return []

    print(f"Cleaning {len(phrases)} OCR phrases with {model}...")
    client = OpenAI(api_key=api_key)

    system_prompt = """You are an OCR error correction assistant. Fix only obvious OCR mistakes.

Common OCR errors:
- Character confusion: 'v' → 'y', 'rn' → 'm', '0' → 'O', 'i' → 'l', 'vv' → 'w', '@' → 'a', '@' → 'o'
- Missing spaces: 'helloworld' → 'hello world'
- Extra spaces: 'hel lo' → 'hello'

Rules:
1. ONLY fix clear OCR errors - do not rephrase or change meaning
2. Preserve hashtags (#) exactly
3. Keep original capitalization only for proper nouns, otherwise make everything lowercase.
4. Output ONLY the corrected text (no quotes, explanations, or extra words)
5. If a word has the letter 'v' in it and it looks misspelled, try swapping the 'v' with a 'y' to see if it makes more sense, and vice versa.
6. If a word other than "I" has the letter 'i' in it and it looks misspelled, try swapping the 'i' with a 'l' to see if it makes more sense, and vice versa.
7. Unless an acronym makes sense in the context, make it lowercase."""

    cleaned = []
    total_cost = 0.0

    for i, phrase in enumerate(phrases):
        if i % 10 == 0 and i > 0:
            print(f"  Cleaned {i}/{len(phrases)} (Cost: ${total_cost:.4f})")

        try:
            response = client.chat.completions.create(
                model=model,
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": f"Fix OCR errors: {phrase}"}
                ],
                temperature=0,
                max_tokens=100
            )

            result = response.choices[0].message.content.strip()

            # Track cost
            if hasattr(response, 'usage') and response.usage:
                input_tok = response.usage.prompt_tokens or 0
                output_tok = response.usage.completion_tokens or 0
                total_cost += (input_tok * 0.15 + output_tok * 0.60) / 1_000_000

            # Remove quotes if added
            if (result.startswith('"') and result.endswith('"')) or \
               (result.startswith("'") and result.endswith("'")):
                result = result[1:-1]

            # Fallback if result is empty or way too different
            if not result or len(result) > len(phrase) * 3:
                result = phrase

            cleaned.append(result.strip())

        except Exception as e:
            print(f"  Error cleaning '{phrase}': {e}")
            cleaned.append(phrase)

    print(f"Cleaning complete! Total cost: ${total_cost:.4f}")
    return cleaned

### Deduplication and Phrase Merging  
This cell merges redundant or near-duplicate OCR fragments produced across video frames.  
It compares text using normalized string similarity (SequenceMatcher) and merges phrases that are visually or semantically similar.  
Metadata such as occurrence counts, timestamps, and bounding box positions are aggregated.  
This step ensures each distinct caption fragment appears once, forming a clean set of unique textual units per video.


In [32]:
def normalize_text(text):
    if not text:
        return ""
    text = text.lower()
    text = re.sub(r'[^\w\s#@]', '', text)  # Keep hashtags and mentions
    return re.sub(r'\s+', ' ', text.strip())

def text_similarity(s1, s2):
    return SequenceMatcher(None, normalize_text(s1), normalize_text(s2)).ratio()

def smart_deduplicate_and_merge(ocr_data, cleaned_phrases):
    # Create phrase objects with metadata
    phrases = []
    for orig, clean in zip(ocr_data.keys(), cleaned_phrases):
        data = ocr_data[orig]
        phrases.append({
            'original': orig,
            'clean': clean,
            'normalized': normalize_text(clean),
            'count': data['count'],
            'timestamps': data.get('timestamps', []),
            'positions': data.get('positions', [])
        })

    # Sort by frequency (most common first) and timestamp (earliest first)
    phrases.sort(key=lambda x: (-x['count'], min(x['timestamps']) if x['timestamps'] else float('inf')))

    merged = []
    skip_indices = set()

    for i, phrase1 in enumerate(phrases):
        if i in skip_indices:
            continue

        # Start with this phrase as the canonical version
        canonical = phrase1.copy()

        # Check against remaining phrases
        for j in range(i + 1, len(phrases)):
            if j in skip_indices:
                continue

            phrase2 = phrases[j]

            # Calculate similarity
            similarity = text_similarity(phrase1['clean'], phrase2['clean'])

            # Merge if very similar (likely same text with OCR errors)
            if similarity > 0.85:
                # Choose the better version (longer, more common, or earlier)
                if len(phrase2['clean']) > len(canonical['clean']):
                    canonical['clean'] = phrase2['clean']
                    canonical['normalized'] = phrase2['normalized']

                # Merge metadata
                canonical['count'] += phrase2['count']
                canonical['timestamps'].extend(phrase2['timestamps'])
                canonical['positions'].extend(phrase2['positions'])

                skip_indices.add(j)

            # Check if one is substring of another
            elif canonical['normalized'] in phrase2['normalized']:
                # phrase1 is substring of phrase2, keep phrase2's text
                canonical['clean'] = phrase2['clean']
                canonical['normalized'] = phrase2['normalized']
                canonical['count'] += phrase2['count']
                canonical['timestamps'].extend(phrase2['timestamps'])
                canonical['positions'].extend(phrase2['positions'])
                skip_indices.add(j)

            elif phrase2['normalized'] in canonical['normalized']:
                # phrase2 is substring of phrase1, keep canonical and merge counts
                canonical['count'] += phrase2['count']
                canonical['timestamps'].extend(phrase2['timestamps'])
                canonical['positions'].extend(phrase2['positions'])
                skip_indices.add(j)

        # Clean up merged data
        canonical['timestamps'] = sorted(set(canonical['timestamps']))[:10]
        canonical['positions'] = list(set(map(tuple, canonical['positions'])))[:10]

        merged.append(canonical)

    return merged

### Final Assembly of OCR Fragments  
This cell converts all cleaned and merged OCR phrases into a single **chronologically ordered text line** per video.  
The fragments are first arranged by their earliest timestamps and concatenated.  
Then, an LLM-based refinement (GPT-4o-mini) attempts to minimally join the pieces into a coherent but faithful sentence — no paraphrasing or addition of new words.  
If the LLM output fails validation (too long or dissimilar), the code automatically falls back to the raw chronological assembly.


In [33]:
def assemble_final_text(merged_phrases, api_key, model="gpt-4o-mini"):
    if not merged_phrases:
        return ""

    # Sort by timestamp (chronological order)
    sorted_phrases = sorted(merged_phrases,
                           key=lambda x: min(x['timestamps']) if x['timestamps'] else float('inf'))

    # Simple fallback assembly
    simple_assembly = " ".join(p['clean'] for p in sorted_phrases)

    # Try LLM assembly for better coherence
    try:
        client = OpenAI(api_key=api_key)

        phrases_list = [p['clean'] for p in sorted_phrases]

        system_prompt = """You are a Gen-Z person familiar with Tiktok culture assembling OCR text fragments into one coherent sentence or phrase.

Rules:
1. Arrange the fragments by timestamp; if it doesn't make sense, then you can rearrange it minimally.
2. Remove duplicate or very similar fragments
3. Add minimal punctuation ONLY where clearly needed
4. Do NOT add new words or rephrase or change existing words
5. Preserve all hashtags
6. Output ONE clean line of text"""

        user_prompt = f"""Assemble these OCR fragments in order into one coherent line:

{chr(10).join(f'{i+1}. {p}' for i, p in enumerate(phrases_list))}

Assembled text:"""

        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ],
            temperature=0,
            max_tokens=200
        )

        result = response.choices[0].message.content.strip()

        # Remove quotes if present
        if (result.startswith('"') and result.endswith('"')) or \
           (result.startswith("'") and result.endswith("'")):
            result = result[1:-1]

        # Validate result isn't too different from source material
        if result and len(result) > 10 and len(result) < len(simple_assembly) * 2:
            print("Using LLM-assembled text")
            return result
        else:
            print("LLM assembly invalid, using simple assembly")
            return simple_assembly

    except Exception as e:
        print(f"LLM assembly failed ({e}), using simple assembly")
        return simple_assembly
    
def create_output_csv(asr_text, merged_phrases, final_text, output_csv):
    rows = []

    # Add ASR
    if asr_text:
        rows.append({
            "source": "ASR",
            "text": asr_text,
            "count": 1,
            "timestamps": "[]",
            "original_text": ""
        })

    # Add individual OCR phrases
    for phrase in merged_phrases:
        rows.append({
            "source": "OCR_PHRASE",
            "text": phrase['clean'],
            "count": phrase['count'],
            "timestamps": json.dumps(phrase['timestamps'][:5]),
            "original_text": phrase['original']
        })

    # Add final assembled text
    if final_text:
        rows.append({
            "source": "OCR_FINAL",
            "text": final_text,
            "count": sum(p['count'] for p in merged_phrases),
            "timestamps": "[]",
            "original_text": ""
        })

    df = pd.DataFrame(rows)
    df.to_csv(output_csv, index=False)
    return df

In [None]:
def _final_cleaned_phrase_list(merged_phrases):
    # Order by earliest timestamp first
    ordered = sorted(
        merged_phrases,
        key=lambda x: min(x['timestamps']) if x.get('timestamps') else float('inf')
    )

    seen = set()
    out = []
    for p in ordered:
        key = normalize_text(p.get('clean', ''))
        if key and key not in seen:
            out.append(p['clean'])
            seen.add(key)
    return out

def _process_one_video(video_path):
    print(f"\n==============================")
    print(f"Processing video: {video_path}")
    print(f"==============================\n")

    # === STEP 1: ASR ===
    print("=== STEP 1: Audio Transcription ===")
    asr_text = extract_audio_with_whisper(video_path)
    print(f"ASR Result: {asr_text[:200]}{'...' if len(asr_text) > 200 else ''}\n")

    # === STEP 2: OCR ===
    print("=== STEP 2: OCR Extraction ===")
    ocr_data = extract_ocr_from_video(video_path, sample_rate_fps=1)
    print(f"Extracted {len(ocr_data)} unique text phrases\n")

    # === STEP 3: Clean OCR ===
    print("=== STEP 3: OCR Cleaning ===")
    cleaned_phrases = clean_ocr_with_openai(ocr_data, OPENAI_API_KEY)

    print("\nOCR Corrections (sample):")
    for orig, clean in list(zip(ocr_data.keys(), cleaned_phrases))[:10]:
        if orig != clean:
            print(f"  ✓ '{orig}' → '{clean}'")
    print()

    # === STEP 4: Dedup & Merge ===
    print("=== STEP 4: Deduplication & Merging ===")
    merged_phrases = smart_deduplicate_and_merge(ocr_data, cleaned_phrases)
    print(f"Consolidated to {len(merged_phrases)} unique phrases\n")

    # === STEP 5: Final Assembly ===
    print("=== STEP 5: Final Text Assembly ===")
    final_text = assemble_final_text(merged_phrases, OPENAI_API_KEY)
    print(f"Final Text: {final_text}\n")

    return asr_text, merged_phrases, final_text

def main():
    print("Discovering videos...\n")

    MEDIA_FOLDER = r"./media"

    video_paths = _fetch_videos_from_folder(MEDIA_FOLDER)
    if not video_paths:
        print("No videos found. Please check the media folder")
        return

    print(f"Found {len(video_paths)} video(s).")
    for v in video_paths:
        print(" -", v)
    print()

    rows = []
    for vp in video_paths:
        asr_text, merged_phrases, final_text = _process_one_video(vp)
        phrases_list = _final_cleaned_phrase_list(merged_phrases)  # simple list of final cleaned phrases

        rows.append({
            "video": os.path.basename(vp),
            "asr": asr_text,
            "ocr_final": final_text,
            "cleaned_phrases": json.dumps(phrases_list, ensure_ascii=False)
        })

    df = pd.DataFrame(rows, columns=["video", "asr", "ocr_final", "cleaned_phrases"])
    df.to_csv(OUTPUT_CSV, index=False)
    print(f"\n✓ Batch results saved to: {OUTPUT_CSV}")
    print(f"Total videos processed: {len(df)}")
    return df

# if __name__ == "__main__":
#     result_df = main()

### Objective Quality Scoring of Assembled Text  
This cell introduces a **quantitative metric** to evaluate how meaningful each video’s `ocr_final` text is.  
The score integrates three sub-factors:
- **Completeness** – Does the text have enough tokens (up to 25)?  
- **Language-likeness** – Are the tokens real, frequent English words (Zipf ≥ 2.5)?  
- **Diversity** – How varied are the tokens (unique / total)?  
A weighted sum (0.2 · C + 0.6 · L + 0.2 · D) × 100 yields a normalized 0–100 score, enabling automated quality control over OCR outputs.

The quality score, along with the final ASR, final OCR string, cleaned phrases, are recorded into a CSV.

In [45]:
def _compute_quality_score(text):
    if not isinstance(text, str) or not text.strip():
        return 0.0

    tokens = re.findall(r"[A-Za-z#@']+", text.lower())
    if not tokens:
        return 0.0

    num_tokens = len(tokens)
    unique_tokens = len(set(tokens))

    completeness = min(num_tokens, 20) / 20.0

   
    valid_count = 0
    for tok in tokens:
        clean_tok = tok.lstrip("#@")
        if not clean_tok:
            continue
        z = zipf_frequency(clean_tok, "en")
        if z >= 2.5: 
            valid_count += 1
    lang_like = valid_count / num_tokens if num_tokens else 0.0

    diversity = unique_tokens / num_tokens if num_tokens else 0.0

    score = 100.0 * (0.2 * completeness + 0.6 * lang_like + 0.2 * diversity)
    score = max(0.0, min(score, 100.0))
    return round(score, 1)

if __name__ == "__main__":
    # result_df = main()
    result_df = pd.read_csv(OUTPUT_CSV)

    if isinstance(result_df, pd.DataFrame) and not result_df.empty and "ocr_final" in result_df.columns:
        result_df["quality_score"] = [
            _compute_quality_score(text) for text in result_df["ocr_final"]
        ]
        result_df.to_csv(OUTPUT_CSV, index=False)
        print("✓ Added 'quality_score' column to CSV based on ocr_final text.")

✓ Added 'quality_score' column to CSV based on ocr_final text.


**Loading csv and metadata JSON step:**

This step loads the CSV containing video information and extracts base names for each video. It then reads matching JSON files to get descriptions and hashtags, joining hashtags into a single string. Missing files or fields are replaced with empty strings, and the results are added to the DataFrame to be used in the following steps.

In [46]:
CSV_PATH = "video_text_outputs.csv"
JSON_DIR = "meta"
MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"
SAVE_DIR = "embeddings_out/text768"
DEVICE = device
os.makedirs(SAVE_DIR, exist_ok=True)

In [47]:
# loads the csv
df = pd.read_csv(CSV_PATH).fillna("")
df["video_base"] = df["video"].apply(lambda x: os.path.splitext(os.path.basename(str(x)))[0])
print(f"{len(df)} video entries found in CSV!")

# loads the json
json_map = {}
for fname in os.listdir(JSON_DIR):
    if fname.lower().endswith(".json"):
        json_map[os.path.splitext(fname)[0]] = os.path.join(JSON_DIR, fname)

descs, hashtags_texts = [], []
for base in df["video_base"]:
    path = json_map.get(base)
    if not path:
        descs.append("")
        hashtags_texts.append("")
        continue
    try:
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
        vm = data.get("video_metadata", data)
        descs.append(vm.get("description", "") or "")
        hashtags = vm.get("hashtags", [])
        if isinstance(hashtags, list):
            hashtags_texts.append(" ".join(f"#{h}" for h in hashtags))
        else:
            hashtags_texts.append(str(hashtags))
    except Exception:
        descs.append("")
        hashtags_texts.append("")

df["description"] = descs
df["hashtags_text"] = hashtags_texts

100 video entries found in CSV!


**Model loading and text field encoding step:**

- Load Sentence-BERT model (all-mpnet-base-v2) for text encoding.

- Encode four text types: OCR results, hashtags, ASR transcripts, descriptions → 768-dimensional embeddings.

- Combine embeddings using a weighted sum to form a single representation per video.

- The best weights [0.4, 0.1, 0.3, 0.2] were found via grid search, selecting the combination that maximizes mean average precision (mAP) using inferred video labels.

In [48]:
model = SentenceTransformer(MODEL_NAME, device=DEVICE)
weights = np.array([0.4, 0.1, 0.3, 0.2])
weights = weights / weights.sum()

print(f"Loaded model: {MODEL_NAME}")

modalities = ["ocr_final", "hashtags_text", "asr", "description"]
embs = {}

for m in modalities:
    print(f"Encoding {m}...")
    texts = df[m].fillna("").astype(str).tolist()
    embs[m] = model.encode(texts, batch_size=8, convert_to_numpy=True,
                           show_progress_bar=True, normalize_embeddings=True)
    
# concatenate all text
df["concatenated_text"] = df.apply(
    lambda row: " ".join([
        str(row.get("ocr_final", "")),
        str(row.get("hashtags_text", "")),
        str(row.get("asr", "")),
        str(row.get("description", ""))
    ]).strip(),
    axis=1
)

print("\n===== CONCATENATED TEXTS =====")
for i, text in enumerate(df["concatenated_text"].tolist(), start=1):
    print(f"[{i}] {text}\n")

Loaded model: sentence-transformers/all-mpnet-base-v2
Encoding ocr_final...


Batches:   0%|          | 0/13 [00:00<?, ?it/s]

Encoding hashtags_text...


Batches:   0%|          | 0/13 [00:00<?, ?it/s]

Encoding asr...


Batches:   0%|          | 0/13 [00:00<?, ?it/s]

Encoding description...


Batches:   0%|          | 0/13 [00:00<?, ?it/s]


===== CONCATENATED TEXTS =====
[1] told him i love vintage bags he said "oh like old bags" #holy yfckinairballll #coachvintage #y2kvintage #coachbag #vintagecoachbag 2.  PARA  3.  2.  3.  2.  2.  2.  3.  2.  3.  2.  3.  Oh.  3.  2.  2.  3.  3  1.  3.  3.  2. old but gold baybeh! 🥰 #coachvintage #y2kvintage #coachbag #vintagecoachbag

[2] i'm going out with the girls he said: what restaurant? #holyfuckingairball #fyp #airball 2.  3.  2.  3.  2.  3.  3.  You're gonna crack in the bank, you're gonna crack the price of you sitting on a track down.  Yes.  3.  2.  3.  4.  3.  4.  5.  2.  3.  4.  3.  4.  4.  3.  4. #fyp #airball

[3] Dear sir/madam: congratulations to our September 2024 graduates! Eliza Ann O. Masa, I'm an honor student. Your hard work, dedication, and perseverance have been recognized. PLM is delighted to announce that you have been selected to graduate in September 2024. Latin honor (blank if none): magna cum laude. Class of 2024. #holyfknairball #ohbsougadesarelike852 #na

**Fuse modalities and save:**

In this step, the embeddings from all four text modalities are fused into a single vector using the previously determined weights. Each combined embedding is then saved as a .npy file for each video.

In [49]:
combined_embs = (
    weights[0] * embs["ocr_final"] +
    weights[1] * embs["hashtags_text"] +
    weights[2] * embs["asr"] +
    weights[3] * embs["description"]
)

for i, row in tqdm(df.iterrows(), total=len(df), desc="Saving embeddings"):
    video_name = row["video_base"]
    out_path = os.path.join(SAVE_DIR, f"{video_name}_emb-text768.npy")
    np.save(out_path, combined_embs[i])
    print(f"✓ Saved: {out_path}")

print(f"\nAll text embeddings saved in: {SAVE_DIR}")

Saving embeddings: 100%|██████████| 100/100 [00:00<00:00, 603.48it/s]

✓ Saved: embeddings_out/text768\trend8vid1_emb-text768.npy
✓ Saved: embeddings_out/text768\trend8vid10_emb-text768.npy
✓ Saved: embeddings_out/text768\trend8vid2_emb-text768.npy
✓ Saved: embeddings_out/text768\trend8vid3_emb-text768.npy
✓ Saved: embeddings_out/text768\trend8vid4_emb-text768.npy
✓ Saved: embeddings_out/text768\trend8vid5_emb-text768.npy
✓ Saved: embeddings_out/text768\trend8vid6_emb-text768.npy
✓ Saved: embeddings_out/text768\trend8vid7_emb-text768.npy
✓ Saved: embeddings_out/text768\trend8vid8_emb-text768.npy
✓ Saved: embeddings_out/text768\trend8vid9_emb-text768.npy
✓ Saved: embeddings_out/text768\trend3vid1_emb-text768.npy
✓ Saved: embeddings_out/text768\trend3vid10_emb-text768.npy
✓ Saved: embeddings_out/text768\trend3vid2_emb-text768.npy
✓ Saved: embeddings_out/text768\trend3vid3_emb-text768.npy
✓ Saved: embeddings_out/text768\trend3vid4_emb-text768.npy
✓ Saved: embeddings_out/text768\trend3vid5_emb-text768.npy
✓ Saved: embeddings_out/text768\trend3vid6_emb-text768




---
## **RETRIEVING SIMILAR VIDEOS**
**Goal**: Produce a list of most similar videos based on a weighted combination of modality-specific cosine similarity scores.

**Embedding loading step:** creates a dict of embedding vectors following the below format to keep everything organized and so embedding retrieval for each video is trivial.
$$
video\_name \;\rightarrow\; \{ audio,\; video,\; text \}
$$

In [19]:
def load_all_embeddings(base_dir="embeddings_out"):
    base_dir = Path(base_dir)

    folders = {
        "audio":  base_dir / "audio2048",
        "visual": base_dir / "video2048",
        "text":   base_dir / "text768",
    }

    suffix_map = {
        "audio":  "audio2048",
        "visual": "visual2048",
        "text":   "text768",
    }

    modality_files = {} # collect keys per modality
    for modality, folder in folders.items():
        files = list(folder.glob(f"*emb-{suffix_map[modality]}.npy"))
        modality_files[modality] = {f.stem.split("_emb-")[0]: f for f in files}

    all_video_ids = set()
    for d in modality_files.values():
        all_video_ids.update(d.keys())

    embeddings = {}
    missing = []

    for vid in all_video_ids:
        embeddings[vid] = {}
        for modality in ["audio", "visual", "text"]:
            file = modality_files[modality].get(vid, None)
            if file is None:
                missing.append((vid, modality))
                embeddings[vid][modality] = None
            else:
                embeddings[vid][modality] = np.load(str(file))

    if missing:
        print("WARNING: Missing modality embeddings detected:") # just to be safe
        for vid, modality in missing:
            print(f"  - {vid} missing {modality}")

    return embeddings

embeddings = load_all_embeddings() # get embeddings with embeddings["video_name"]

# to check
for video, emb_vec in embeddings.items():
    print(video, emb_vec)


trend7vid5 {'audio': array([0.        , 0.        , 0.        , ..., 0.05984755, 0.46721104,
       0.        ], shape=(2048,), dtype=float32), 'visual': array([0.03079359, 0.00163207, 0.05173095, ..., 0.03130743, 0.04678841,
       0.01500522], shape=(2048,), dtype=float32), 'text': array([-2.09453646e-02, -1.61567640e-02, -1.05657206e-02, -6.40466381e-03,
        1.40214123e-02,  1.16940191e-02, -4.01438871e-02,  2.67493794e-02,
       -2.45832566e-02,  1.87951780e-02,  4.38726543e-02, -1.87657986e-02,
        9.36517436e-03,  4.63795502e-02, -2.47399770e-03,  2.57340882e-02,
       -1.72232563e-02,  2.52365059e-02,  3.90775479e-02, -7.83252977e-03,
        3.12331285e-02,  5.28351450e-03,  2.44374653e-02,  2.77135938e-02,
       -4.67647683e-02,  4.95857727e-03, -1.84886597e-02,  1.24817891e-02,
        7.13243745e-03,  1.20409144e-02, -1.35146246e-02, -2.20962036e-03,
       -1.42674541e-02, -1.90218222e-02,  1.99226167e-06,  1.16295354e-02,
       -2.64284199e-02,  1.80043713e-02,

>**NOTE: Input the query video here :))**

If testing different queries with the same set of videos, just <u>run the notebook starting at this cell</u> to skip the preprocessing and loading of embeddings.

In [20]:
# please type the EXACT filename of the query video
QUERY = "trend1vid1"

**Cosine similarity computation step:** computes modality-specific cosine similarity scores for each video and a query video, resulting in each video being represented as a vector of 3 similarity scores.

In [21]:
def cosine_similarity(vec1, vec2):
    if vec1 is None or vec2 is None:
        return None
    
    return np.dot(vec1, vec2) / (norm(vec1) * norm(vec2)) # maybe cast to float?  

In [22]:
def compute_modality_similarities(query_video_name: str, embeddings_dir: Path):
    
    if query_video_name not in embeddings:
        raise ValueError(f"Query video '{query_video_name}' not found in embeddings.")
    
    query_emb = embeddings[query_video_name]
    
    similarity_dict = {}
    
    for video_name, video_emb in embeddings.items(): 
        if video_name == query_video_name: # skips self
            continue 
        
        sims = []
        missing_modalities = []
        
        for modality in ["audio", "visual", "text"]:
            if modality not in video_emb or modality not in query_emb:
                missing_modalities.append(modality)
                sims.append(np.nan)  # for missing embeddings
            else:
                sims.append(cosine_similarity(video_emb[modality], query_emb[modality]))
        
        if missing_modalities:
            print(f"[WARNING] {video_name} missing embeddings for: {', '.join(missing_modalities)}")
        
        similarity_dict[video_name] = np.array(sims)
    
    return similarity_dict

similarities = compute_modality_similarities(QUERY, "embeddings_out") # get similarity vector with similarities["video_name"]

# to check
for video, sim_vec in similarities.items():
    print(video, sim_vec)

trend7vid5 [0.66790867 0.54846168 0.30960394]
trend1vid7 [0.63098925 0.49394068 0.50911521]
trend5vid3 [0.67068011 0.41441622 0.38139237]
trend4vid3 [0.56371951 0.59076965 0.16326511]
trend10vid2 [0.56216264 0.59861761 0.2492289 ]
trend10vid8 [0.60133255 0.50390339 0.20095613]
trend2vid7 [0.67953593 0.52063012 0.2967313 ]
trend3vid9 [0.68049598 0.55398464 0.23342834]
trend9vid1 [0.54915684 0.48289904 0.4994475 ]
trend8vid2 [0.59978545 0.53550732 0.32112125]
trend4vid5 [0.61250609 0.39645073 0.21493243]
trend3vid6 [0.59579337 0.39125368 0.2289096 ]
trend6vid4 [0.66096681 0.48560375 0.32995979]
trend6vid8 [0.65869111 0.52957338 0.43260239]
trend5vid2 [0.67115718 0.51602495 0.27998498]
trend7vid6 [0.67350698 0.60280091 0.33880682]
trend2vid1 [0.76490551 0.48547408 0.32955167]
trend4vid1 [0.64098048 0.58948308 0.15646409]
trend2vid6 [0.76625746 0.46787262 0.36396447]
trend5vid7 [0.66599858 0.52965915 0.28553011]
trend2vid10 [0.75364226 0.45338115 0.43487792]
trend5vid5 [0.67362374 0.469651

>**NOTE: Input the weights here :))**

If testing different weights with the same query video and set of videos, just <u>run the notebook starting at this cell</u> to skip computing the modality-specific cosine similarity scores.

In [23]:
CONDITION_NAME = "video-only"
# baseline, audio-only, video-only, text-only, audio-video, audio-text, video-text, audio-video-text
# can also do "custom"

WEIGHT_AUDIO = WEIGHT_VIDEO = WEIGHT_TEXT = 0.0

if CONDITION_NAME == "baseline":
    WEIGHT_AUDIO = WEIGHT_VIDEO = WEIGHT_TEXT = 0.0
elif CONDITION_NAME == "audio-only":
    WEIGHT_AUDIO = 1.0
elif CONDITION_NAME == "video-only":
    WEIGHT_VIDEO = 1.0
elif CONDITION_NAME == "text-only":
    WEIGHT_TEXT = 1.0
elif CONDITION_NAME == "audio-video":
    WEIGHT_AUDIO = 0.5
    WEIGHT_VIDEO = 0.5
elif CONDITION_NAME == "audio-text":
    WEIGHT_AUDIO = 0.5
    WEIGHT_TEXT = 0.5
elif CONDITION_NAME == "video-text":
    WEIGHT_VIDEO = 0.5
    WEIGHT_TEXT = 0.5
elif CONDITION_NAME == "audio-video-text":
    WEIGHT_AUDIO = WEIGHT_VIDEO = WEIGHT_TEXT = 1/3
elif CONDITION_NAME == "custom":
    WEIGHT_AUDIO = 0
    WEIGHT_VIDEO = 0
    WEIGHT_TEXT = 0
else:
    raise ValueError(f"Unknown CONDITION_NAME: {CONDITION_NAME}")

print(f"Weights -> Audio: {WEIGHT_AUDIO}, Video: {WEIGHT_VIDEO}, Text: {WEIGHT_TEXT}")

Weights -> Audio: 0.0, Video: 1.0, Text: 0.0


**Weighted-sum fusion step:** uses weighted linear combination to form a final similarity score for each video and a query video, where the weights can be modified according to the different test cases.

In [24]:
def weighted_sum_fusion(similarity_dict, weight_audio, weight_video, weight_text):

    weights = np.array([weight_audio, weight_video, weight_text])
    weights = weights / weights.sum() # apparently we need to normalize this cuz it might not equal 1
    final_weighted_dict = {}
    
    for video, sim_vec in similarity_dict.items():
        if len(sim_vec) != 3:
            raise ValueError(f"Expected 3 modalities in similarity vector for {video}, got {len(sim_vec)}")
        
        sim_audio, sim_video, sim_text = sim_vec
        weighted_score = (sim_audio*weights[0] + sim_video*weights[1] + sim_text*weights[2])
        final_weighted_dict[video] = float(weighted_score)

    return final_weighted_dict

final_scores = weighted_sum_fusion(similarities, WEIGHT_AUDIO, WEIGHT_VIDEO, WEIGHT_TEXT)

# to check
for video, score in final_scores.items():
    print(video, score)

trend7vid5 0.5484616756439209
trend1vid7 0.49394068121910095
trend5vid3 0.41441622376441956
trend4vid3 0.5907696485519409
trend10vid2 0.5986176133155823
trend10vid8 0.5039033889770508
trend2vid7 0.5206301212310791
trend3vid9 0.5539846420288086
trend9vid1 0.4828990399837494
trend8vid2 0.535507321357727
trend4vid5 0.3964507281780243
trend3vid6 0.39125367999076843
trend6vid4 0.4856037497520447
trend6vid8 0.529573380947113
trend5vid2 0.5160249471664429
trend7vid6 0.6028009057044983
trend2vid1 0.4854740798473358
trend4vid1 0.5894830822944641
trend2vid6 0.46787261962890625
trend5vid7 0.5296591520309448
trend2vid10 0.45338115096092224
trend5vid5 0.46965137124061584
trend7vid8 0.4490763545036316
trend3vid4 0.6126582622528076
trend1vid10 0.6480363011360168
trend9vid10 0.56557297706604
trend1vid9 0.45811593532562256
trend7vid1 0.611046552658081
trend10vid3 0.3231119215488434
trend6vid6 0.4192381203174591
trend3vid3 0.6831607222557068
trend1vid5 0.5497331023216248
trend8vid8 0.5547696948051453
tr

**Ranking step:** uses the final scores from weighted sum fusion to rank all videos by their similarity score with the query video, printed in descending order.

***most_similar_videos*** is the final output which will be fed into the annotation generation section.

In [None]:
k = 9 # no of similar videos to retrieve (3-5 seems best)

def rank_by_score(final_weighted_dict, top_k=None):
    ranked_videos = sorted(final_weighted_dict.items(), key=lambda x: x[1], reverse=True)
    
    if top_k is not None:
        ranked_videos = ranked_videos[:top_k]
    
    return ranked_videos

most_similar_videos = rank_by_score(final_scores, top_k = k)

print(f"Top {k} most similar videos to {QUERY}")
for video, score in most_similar_videos:
    print(f"{video}: {score:.4f}")

similar_videos_output = [QUERY] + [video for video, _ in most_similar_videos]

print("\nGemini output array format:")
print(similar_videos_output)

Top 3 most similar videos to trend1vid1
trend3vid3: 0.6832
trend1vid10: 0.6480
trend8vid9: 0.6346

Gemini output array format:
['trend1vid1', 'trend3vid3', 'trend1vid10', 'trend8vid9']


---
## **EVALUATION: Kendall's Tau**
**Goal**: Compare the pipeline's similarity ranking against the ground truth similarity scores from a human participant study.

Kendall's Tau measures the ordinal correlation between two rankings. A value of +1 means perfect agreement, 0 means no correlation, and -1 means perfect disagreement.

**Steps:**
1. Load the ground truth similarity proportions from `similarity_proportion.xlsx`.
2. For the current `QUERY` video, extract its ground truth scores against all other videos.
3. Align the pipeline's `final_scores` with the ground truth scores for the same set of videos.
4. Compute Kendall's Tau between the two score vectors.

In [26]:
from scipy.stats import kendalltau
import openpyxl

def load_ground_truth(xlsx_path="similarity_proportion.xlsx", sheet_name="Copy of Similarity proportion"):
    wb = openpyxl.load_workbook(xlsx_path, read_only=True, data_only=True)
    ws = wb[sheet_name]
    rows = list(ws.iter_rows(values_only=True))
    wb.close()

    # Row 1 (index 1) has column video IDs in "vid#-trend#" format, starting at column index 2
    col_ids = rows[1][2:]
    col_video_names = []
    for vid_id in col_ids:
        if vid_id is not None:
            vid_num, trend_num = str(vid_id).split("-")
            col_video_names.append(f"trend{trend_num}vid{vid_num}")
        else:
            col_video_names.append(None)

    # Data rows start at index 2; each row has [trend_label, "vid#-trend#", score1, score2, ..., score100]
    gt_matrix = {}
    for row in rows[2:]:
        if row[1] is None:
            continue
        vid_num, trend_num = str(row[1]).split("-")
        row_video = f"trend{trend_num}vid{vid_num}"
        scores = {}
        for j, col_name in enumerate(col_video_names):
            if col_name is not None and col_name != row_video:
                val = row[j + 2]
                scores[col_name] = float(val) if val is not None else 0.0
        gt_matrix[row_video] = scores

    return gt_matrix

gt_matrix = load_ground_truth()
print(f"Ground truth loaded: {len(gt_matrix)} videos")
print(f"Sample keys: {list(gt_matrix.keys())[:5]}")

Ground truth loaded: 100 videos
Sample keys: ['trend1vid1', 'trend1vid2', 'trend1vid3', 'trend1vid4', 'trend1vid5']


In [28]:
def compute_kendall_tau(query, pipeline_scores, gt_matrix):
    if query not in gt_matrix:
        return None, None, []

    gt_scores = gt_matrix[query]

    # Align: only compare videos present in both pipeline and ground truth
    common_videos = sorted(set(pipeline_scores.keys()) & set(gt_scores.keys()))

    if len(common_videos) < 2:
        return None, None, common_videos

    pipeline_vals = [pipeline_scores[v] for v in common_videos]
    gt_vals = [gt_scores[v] for v in common_videos]

    tau, p_value = kendalltau(pipeline_vals, gt_vals)
    return tau, p_value, common_videos

# All 7 conditions and their weights (audio, video, text)
CONDITIONS = {
    "audio-only":       (1.0,   0.0,   0.0),
    "video-only":       (0.0,   1.0,   0.0),
    "text-only":        (0.0,   0.0,   1.0),
    "audio-video":      (0.5,   0.5,   0.0),
    "audio-text":       (0.5,   0.0,   0.5),
    "video-text":       (0.0,   0.5,   0.5),
    "audio-video-text": (1/3,   1/3,   1/3),
}

all_video_ids = sorted(embeddings.keys())

# Precompute modality similarities once (they don't change across conditions)
print("Precomputing modality similarities for all query videos...")
all_sims = {}
for query_vid in all_video_ids:
    all_sims[query_vid] = compute_modality_similarities(query_vid, "embeddings_out")
print(f"Done. {len(all_sims)} query videos ready.\n")

# Store summary across all conditions for a final comparison table
summary_rows = []

for condition_name, (w_audio, w_video, w_text) in CONDITIONS.items():
    print("=" * 60)
    print(f"=== Kendall's Tau Evaluation: {condition_name} ===")
    print(f"Weights -> Audio: {w_audio:.4f}, Video: {w_video:.4f}, Text: {w_text:.4f}")
    print(f"Total videos: {len(all_video_ids)}")
    print("=" * 60)

    results = []

    for query_vid in all_video_ids:
        scores = weighted_sum_fusion(all_sims[query_vid], w_audio, w_video, w_text)
        tau, p_val, common = compute_kendall_tau(query_vid, scores, gt_matrix)

        if tau is not None:
            results.append({
                "query": query_vid,
                "tau": tau,
                "p_value": p_val,
                "n_compared": len(common),
                "significant": p_val < 0.05
            })

    results_df = pd.DataFrame(results)

    # Per-trend summary
    results_df["trend"] = results_df["query"].str.extract(r"(trend\d+)")
    trend_summary = results_df.groupby("trend")["tau"].agg(["mean", "std", "count"]).round(4)
    trend_summary.columns = ["Mean Tau", "Std Tau", "Videos"]
    print("\n--- Per-Trend Summary ---")
    print(trend_summary.to_string())

    # Overall average
    avg_tau = results_df["tau"].mean()
    std_tau = results_df["tau"].std()
    sig_count = results_df["significant"].sum()

    print(f"\n--- Overall Results ---")
    print(f"Average Kendall's Tau: {avg_tau:.4f} (+/- {std_tau:.4f})")
    print(f"Statistically significant (p < 0.05): {sig_count}/{len(results_df)} videos")
    print(f"Final Retrieval Ranking Score: {avg_tau:.4f}")
    print()

    summary_rows.append({
        "Condition": condition_name,
        "Avg Tau": round(avg_tau, 4),
        "Std Tau": round(std_tau, 4),
        "Significant": f"{sig_count}/{len(results_df)}",
    })

# Final comparison table across all conditions
print("=" * 60)
print("               === SUMMARY: All Conditions ===")
print("=" * 60)
summary_df = pd.DataFrame(summary_rows)
print(summary_df.to_string(index=False))

Precomputing modality similarities for all query videos...
Done. 100 query videos ready.

=== Kendall's Tau Evaluation: audio-only ===
Weights -> Audio: 1.0000, Video: 0.0000, Text: 0.0000
Total videos: 100

--- Per-Trend Summary ---
         Mean Tau  Std Tau  Videos
trend                             
trend1     0.0442   0.0702      10
trend10    0.2085   0.0410      10
trend2     0.3855   0.0354      10
trend3     0.2979   0.0356      10
trend4     0.3515   0.0298      10
trend5     0.2378   0.0958      10
trend6     0.3278   0.0481      10
trend7     0.3979   0.0240      10
trend8     0.3851   0.0533      10
trend9     0.3820   0.0929      10

--- Overall Results ---
Average Kendall's Tau: 0.3018 (+/- 0.1198)
Statistically significant (p < 0.05): 88/100 videos
Final Retrieval Ranking Score: 0.3018

=== Kendall's Tau Evaluation: video-only ===
Weights -> Audio: 0.0000, Video: 1.0000, Text: 0.0000
Total videos: 100

--- Per-Trend Summary ---
         Mean Tau  Std Tau  Videos
trend   

---
## **GENERATING ANNOTATIONS**
**Goal**: Produce annotations based on the query video and list of similar videos produced using the earlier weights.

In [None]:
QUERY_VIDEO_PATH = rf"media\{QUERY}.mp4"
BASE_MEDIA_PATH = r"media" 

CONTEXT_VIDEO_TUPLES = most_similar_videos

TOP_K = k

OUTPUT_DIR = f"annotations\{QUERY}"
os.makedirs(OUTPUT_DIR, exist_ok=True)
JSON_OUTPUT_PATH = os.path.join(OUTPUT_DIR, f"{QUERY}_{CONDITION_NAME}.json")
CSV_OUTPUT_PATH = os.path.join(OUTPUT_DIR, f"{QUERY}_{CONDITION_NAME}.csv")

In [None]:
try:
    GEMINI_API_KEY = getpass.getpass("Enter your Gemini API Key: ")
except ImportError:
    GEMINI_API_KEY = "PASTE_YOUR_API_KEY_HERE"
    
GENAI_MODEL_NAME = "gemini-2.5-flash"

In [None]:
def build_paths(similar_list: List, base_path: str, k: int):
    video_names = [video_name for video_name, _ in similar_list]
    top_k_names = video_names[:k]
    return [os.path.join(base_path, f"{name}.mp4") for name in top_k_names]

if CONDITION_NAME == "baseline":
    CONTEXT_VIDEO_PATHS = []
else:
    CONTEXT_VIDEO_PATHS = build_paths(CONTEXT_VIDEO_TUPLES, BASE_MEDIA_PATH, TOP_K)

print(f"--- Preparing Annotation for: {QUERY} ---")
print(f"Condition: {CONDITION_NAME}")
print(f"Context Videos: {CONTEXT_VIDEO_PATHS}")

**Core Logic: Prompts & Helper Functions**

This cell defines the "brains" of the annotation experiment.

* **System Prompts:** `BASELINE_SYSTEM` and `CONTEXT_AWARE_SYSTEM` are the two "personalities" we give the AI. This is the core of our A/B test.
* **`_upload_video`:** This is a crucial optimization. It uploads a video file to the Gemini API and then **caches** the result. If we try to upload the same video (e.g., the query video) 8 times, this cache ensures it only *actually* uploads it the first time, making our runs much faster.
* **`_make_model`:** This helper creates the Gemini model and correctly passes our "system prompt" to it using a **keyword argument** (`system_instruction=...`) to avoid errors.

In [None]:
BASELINE_SYSTEM = "You are an assistant tasked with generating a brief summary of a short video. Use only the information available in the video. Do not rely on any external knowledge or assumptions. Focus on describing what is happening in the video concisely."
CONTEXT_AWARE_SYSTEM = "You are an assistant tasked with generating a summary of a short video. You are provided with the main video and a few additional videos that are semantically related. Use all available information to generate a summary that best describes what is happening in the main video. Focus on enhancing your understanding using the related videos, but ensure the summary reflects the main video."
BASELINE_USER = "Please generate a 2–3 sentence summary of the following video based solely on its content."
CONTEXT_AWARE_USER = "Please summarize the main video using all the information provided. The first video is the main one, and the others are related videos that may provide helpful context. Your summary should describe what is happening in the main video in 2–3 sentences."

_uploaded_cache: Dict[str, Any] = {}
def _upload_video(path: str):
    global _uploaded_cache
    full = str(Path(path).resolve())
    if full not in _uploaded_cache:
        print(f"Uploading: {full}")
        try:
            file_obj = genai.upload_file(path=full)
            print("Uploaded, waiting for processing...")
            while True:
                file_obj = genai.get_file(file_obj.name)
                if file_obj.state.name == "ACTIVE":
                    print(f"File is ACTIVE: {file_obj.name}"); break
                elif file_obj.state.name == "FAILED":
                    raise RuntimeError(f"File {file_obj.name} failed to process.")
                time.sleep(2)
            _uploaded_cache[full] = file_obj
        except Exception as e:
            print(f"Error uploading {path}: {e}"); return None
    return _uploaded_cache.get(full)

def _make_model(system_instruction: str):
    return genai.GenerativeModel(
        model_name=GENAI_MODEL_NAME,
        system_instruction=system_instruction
    )

**Run the Annotation**

This is the final "Run" button for this experiment. This cell takes all the variables you just set up in the configuration cell and:

1.  Configures the Gemini API with a secure `GEMINI_API_KEY`.
2.  Uploads the main `QUERY_VIDEO_PATH` using the cache.
3.  Checks the `CONDITION_NAME`:
    * If it's `"baseline"`, it runs the "control" experiment (video only).
    * If it's anything else, it runs the "context-aware" experiment, uploading and attaching the `CONTEXT_VIDEO_PATHS` to the prompt.
4.  Saves the final annotation text from Gemini into the `.json` and `.csv` files defined (e.g., `trend1vid1_visual_only.csv`).

In [None]:
genai.configure(api_key=GEMINI_API_KEY)
print("--- Starting Annotation ---")

_uploaded_cache = {} 
annotation_text = ""
final_record = {}

query_file = _upload_video(QUERY_VIDEO_PATH)

if query_file is None:
    print(f"Aborting: Failed to upload main query video {QUERY_VIDEO_PATH}")
else:
    if CONDITION_NAME == "baseline":
        print("Running BASELINE annotation...")
        model = _make_model(BASELINE_SYSTEM)
        try:
            response = model.generate_content([query_file, BASELINE_USER])
            annotation_text = response.text.strip()
        except Exception as e:
            print(f"Error in baseline generation: {e}"); annotation_text = f"ERROR: {e}"
        
        final_record = {
            "query_id": QUERY, "condition_name": CONDITION_NAME,
            "context_video_paths": [], "annotation_text": annotation_text,
        }

    else:
        print(f"Running CONTEXT-AWARE annotation for: {CONDITION_NAME}...")
        model = _make_model(CONTEXT_AWARE_SYSTEM)
        
        ctx_files = []
        for p in CONTEXT_VIDEO_PATHS:
            f = _upload_video(p)
            if f: ctx_files.append(f)
        
        contents = [query_file] + ctx_files + [CONTEXT_AWARE_USER]
        
        try:
            response = model.generate_content(contents)
            annotation_text = response.text.strip()
        except Exception as e:
            print(f"Error in context generation: {e}"); annotation_text = f"ERROR: {e}"
        
        final_record = {
            "query_id": QUERY, "condition_name": CONDITION_NAME,
            "context_video_paths": CONTEXT_VIDEO_PATHS, "annotation_text": annotation_text,
        }

    print("\n--- Annotation Complete ---")
    print(f"Result: {annotation_text}")

    if final_record:
        pd.DataFrame([final_record]).to_csv(CSV_OUTPUT_PATH, index=False)
        with open(JSON_OUTPUT_PATH, "w", encoding="utf-8") as f:
            json.dump(final_record, f, indent=2, ensure_ascii=False)
        print(f"Saved results to:\n  {CSV_OUTPUT_PATH}\n  {JSON_OUTPUT_PATH}")
    else:
        print("No result to save.")