In [1]:
from qdrant_client.models import ScalarQuantization, ScalarQuantizationConfig, ScalarType
from qdrant_client.http.models import PointStruct, VectorParams, Distance
from sentence_transformers import SentenceTransformer
from qdrant_client.http.models import SearchParams
from qdrant_client import QdrantClient
from faster_whisper import WhisperModel
from pydub.utils import make_chunks
from datasets import load_dataset
from pydub import AudioSegment
import moviepy.editor as mp
from openai import OpenAI
from copy import deepcopy
import simpleaudio as sa
from tqdm import tqdm
import numpy as np
import time
import uuid
import json
import re
import os

tedlium = load_dataset("LIUM/tedlium", "release1")

  from tqdm.autonotebook import tqdm, trange


Loading dataset shards:   0%|          | 0/20 [00:00<?, ?it/s]

In [2]:
with open("queries.json", "r") as f:
    queries = json.load(f)
    print(queries)



In [3]:
# Point to the local server
client = OpenAI(base_url="http://localhost:1234/v1", api_key="lm-studio")

completion = client.chat.completions.create(
  model="lmstudio-community/Meta-Llama-3-8B-Instruct-GGUF",
  messages=[
    {"role": "system", "content": "You will be given one query and k sentences. For each sentence, you need to determine if it is a similar text to the query. If it is similar semantically, you should mark it as 1. If it is not similar, you should mark it as 0. You have to return a list of 0s and 1s (where 0 means different, and 1 means similar) of length k, like the following: [0, 1, 1, 0, 0, 1]. YOU MUST ONLY RETURN THE LIST OF THE RELEVANCE AS YOUR ANSWER. Note: similarity means that the query and the sentence talk about one things, or have one idea, or similar structurally, or similar semantically, or have one meaning."},
    {}
    {"role": "user", "content": "Query: " + queries[0]["query"] + "\nSentences:\n" + "\n".join(["0: " + queries[0]['original'], "1: " + queries[1]['original']])},
  ],
  temperature=0.4,
  max_tokens=100
)

print(completion.choices[0].message)

SyntaxError: invalid syntax. Perhaps you forgot a comma? (877873185.py, line 8)

In [58]:
print("Query: " + queries[0]["query"] + "\nSentences:\n" + "\n".join([queries[0]['original'], queries[1]['original']]))

Query: why a specific entity like persephone's marriage contract, rather than another arbitrary one?
Sentences:
the explanatory role of  persephone  's marriage contract  could be played equally well by  infinitely many  other  ad hoc  entities  why a marriage contract  and not  any  other reason  for 
here are three questions  that i like to use  to test the truthiness  of our representation  in any  media story  one 


In [52]:
print(queries[0]["query"])
print(queries[0]["original"])

why a specific entity like persephone's marriage contract, rather than another arbitrary one?
the explanatory role of  persephone  's marriage contract  could be played equally well by  infinitely many  other  ad hoc  entities  why a marriage contract  and not  any  other reason  for 


In [4]:
print(tedlium["train"][0])

{'audio': {'path': None, 'array': array([-0.0012207 , -0.00518799,  0.00765991, ...,  0.00442505,
       -0.00942993, -0.01245117]), 'sampling_rate': 16000}, 'text': "<sil> so {SMACK} for(2) example {BREATH} there are(2) doctors in china who believe that it's their job to keep you healthy {BREATH} so any month <sil> you are healthy you pay them(2) <sil> and when(4) you're sick {SMACK} you don't have(2) to pay them because(2) they failed at their job", 'speaker_id': 'DerekSivers_2009I', 'gender': 2, 'file': '/Users/citadel/.cache/huggingface/datasets/downloads/extracted/261c9fe9ed6072b9d54e979afcd1b8bcc2a2f043991ddd463ce8ebe931129998/train/DerekSivers_2009I.sph', 'id': 'DerekSivers_2009I-105.78-114.78-<o,f0,male>'}


In [5]:
def remove_braces_and_lower_from_batch(row):
    # Pattern to match content inside parentheses, curly braces, and square braces including the braces
    pattern = r'\(.*?\)|\{.*?\}|\[.*?\]|\<.*?\>'
    transformed_row = deepcopy(row)
    # Substitute the matches with an empty string
    for i in range(len(row['text'])):
        cleaned_text = re.sub(pattern, '', row['text'][i])
        transformed_row['text'][i] = cleaned_text.lower()
    
    return transformed_row

In [6]:
tedlium.set_transform(remove_braces_and_lower_from_batch)

In [7]:
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
embeddings = model.encode(tedlium["train"]["text"], normalize_embeddings=True)

In [8]:
def dim_reduction(embeddings):
    cov = np.cov(embeddings.T)
    eig_vals, eig_vecs = np.linalg.eig(cov)

    eig_pairs = [(np.abs(eig_vals[i]), eig_vecs[:,i]) for i in range(len(eig_vals))]
    eig_pairs.sort(key=lambda x: x[0], reverse=True)
    tot = sum(eig_vals)
    var_exp = [(i / tot)*100 for i in sorted(eig_vals, reverse=True)]
    cum_var_exp = np.cumsum(var_exp)
    var90_ind = np.argmax((cum_var_exp > 90))
    proj_matrix = np.hstack([eig_pairs[i][1].reshape(-1, 1) for i in range(var90_ind + 1)])
    
    return proj_matrix

In [9]:
proj_matrix = dim_reduction(embeddings)
embeddings = np.dot(embeddings, proj_matrix)

In [10]:
embeddings.shape

(56803, 209)

In [11]:
qdrant_client = QdrantClient("http://localhost:6333")
collection_name = "video_segments"

In [12]:
qdrant_client.recreate_collection(
    collection_name=collection_name,
    vectors_config=VectorParams(
        size=embeddings.shape[1],
        distance=Distance.COSINE,
        quantization_config=ScalarQuantization(
            scalar=ScalarQuantizationConfig(
                type=ScalarType.INT8,
                always_ram=True
            )
        )
    )
)

True

In [13]:
batch_size = 1000 

# Extract video_id and timestamp for the video segments
filenames, starts, ends = [], [], []
for i in range(tedlium["train"].num_rows):
    filename, start, end, _ = tedlium['train'][i]['id'].split('-')
    filenames.append(filename)
    starts.append(float(start))
    ends.append(float(end))

points = []
for i, record in tqdm(enumerate(tedlium["train"]), total=len(tedlium["train"]), desc="Processing points"):
    embedding = embeddings[i]  
    text = record["text"]
    
    point = PointStruct(
        id=i,
        vector=embedding,
        payload={
            "video_id": filenames[i],
            "timestamp": f"{starts[i]:.2f}-{ends[i]:.2f}",
            "text": text
        }
    )
    points.append(point)
    
    if len(points) >= batch_size:
        qdrant_client.upsert(collection_name=collection_name, points=points)
        points = []

# Insert remaining points
if points:
    qdrant_client.upsert(collection_name=collection_name, points=points)

print("Data successfully loaded into Qdrant.")

Processing points: 100%|██████████| 56803/56803 [01:38<00:00, 574.38it/s] 


Data successfully loaded into Qdrant.


In [14]:
def extract_audio(video_path):
    video = mp.VideoFileClip(video_path)
    audio = video.audio
    audio_path = video_path.rsplit(".", 1)[0] + ".wav"
    audio.write_audiofile(audio_path)
    return audio_path

In [15]:
whisper_model = WhisperModel("base.en", device="cpu", compute_type="int8")

In [16]:
def transcribe_audio(audio_path, whisper_model):
    segments, info = whisper_model.transcribe(audio_path, word_timestamps=True)
    return segments

In [21]:
def process_and_upload_video(file_path, whisper_model):
    video_extensions = ('.mp4', '.avi', '.mov', '.mkv')
    audio_extensions = ('.wav', '.mp3', '.flac')

    if file_path.lower().endswith(video_extensions):
        audio_path = extract_audio(file_path)
    elif file_path.lower().endswith(audio_extensions):
        audio_path = file_path
    else:
        raise ValueError("Unsupported file format. Please provide a video or audio file.")
    
    print("Loading audio file...")
    # Use the correct method based on the file extension
    if audio_path.lower().endswith('.wav'):
        audio = AudioSegment.from_wav(audio_path)
    elif audio_path.lower().endswith('.mp3'):
        audio = AudioSegment.from_mp3(audio_path)
    elif audio_path.lower().endswith('.flac'):
        audio = AudioSegment.from_file(audio_path, format="flac")
    else:
        raise ValueError("Unsupported audio format. Please provide a WAV, MP3, or FLAC file.")

    chunk_length_ms = 10 * 1000  # 10 seconds in milliseconds
    chunks = make_chunks(audio, chunk_length_ms)

    points = []
    base_id = uuid.uuid4().int >> 64
    processed_chunks = []
    
    print(f"Processing {len(chunks)} chunks...")
    for i, chunk in tqdm(enumerate(chunks), total=len(chunks), desc="Processing chunks"):
        chunk_path = f"temp_chunk_{i}.wav"
        chunk.export(chunk_path, format="wav")

        segments = transcribe_audio(chunk_path, whisper_model)
        if segments:
            text = " ".join([segment.text for segment in segments])
            start_time = i * 10  # Start time in seconds
            end_time = (i + 1) * 10  # End time in seconds
            
            processed_chunks.append({
                "id": base_id + i,
                "video_id": os.path.basename(file_path),
                "timestamp": f"{start_time:.2f}-{end_time:.2f}",
                "text": text,
                "audio_chunk": chunk
            })

            # Generate embedding
            embedding = model.encode(text, normalize_embeddings=True)
            embedding = np.dot(embedding, proj_matrix)

            point = PointStruct(
                id=base_id + i,
                vector=embedding.tolist(),
                payload={
                    "video_id": os.path.basename(file_path),
                    "timestamp": f"{start_time:.2f}-{end_time:.2f}",
                    "text": text
                }
            )
            points.append(point)

        os.remove(chunk_path)

        if len(points) >= 1000:
            qdrant_client.upsert(collection_name=collection_name, points=points)
            points = []

    if points:
        qdrant_client.upsert(collection_name=collection_name, points=points)

    print(f"Video {file_path} processed and uploaded to Qdrant.")

    if file_path.lower().endswith(video_extensions):
        os.remove(audio_path)
        
    return processed_chunks

In [22]:
def play_and_display_chunk(chunk):
    print(f"Chunk ID: {chunk['id']}")
    print(f"Video ID: {chunk['video_id']}")
    print(f"Timestamp: {chunk['timestamp']}")
    print(f"Text: {chunk['text']}")
    
    # Convert the audio chunk to a numpy array
    audio_data = np.array(chunk['audio_chunk'].get_array_of_samples())
    
    # Play the audio
    play_obj = sa.play_buffer(
        audio_data.tobytes(),
        num_channels=chunk['audio_chunk'].channels,
        bytes_per_sample=chunk['audio_chunk'].sample_width,
        sample_rate=chunk['audio_chunk'].frame_rate
    )
    
    # Wait for playback to finish
    play_obj.wait_done()

In [23]:
def verify_qdrant_entry(chunk_id):
    # Retrieve the point from Qdrant
    points = qdrant_client.retrieve(
        collection_name=collection_name,
        ids=[chunk_id]
    )
    
    if points:
        point = points[0]
        print("Data in Qdrant:")
        print(f"ID: {point.id}")
        print(f"Video ID: {point.payload['video_id']}")
        print(f"Timestamp: {point.payload['timestamp']}")
        print(f"Text: {point.payload['text']}")
    else:
        print(f"No data found for chunk ID {chunk_id}")

In [24]:
video_path = "multimedia/tom_hardy_interview.mp4"
processed_chunks = process_and_upload_video(video_path, whisper_model)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

MoviePy - Writing audio in multimedia/tom_hardy_interview.wav


                                                                      

MoviePy - Done.
Loading audio file...
Processing 19 chunks...


Processing chunks: 100%|██████████| 19/19 [00:35<00:00,  1.84s/it]

Video multimedia/tom_hardy_interview.mp4 processed and uploaded to Qdrant.





In [16]:
# Play and display each chunk, and verify its entry in Qdrant
for chunk in processed_chunks:
    play_and_display_chunk(chunk)
    verify_qdrant_entry(chunk['id'])
    
    # Ask user if they want to continue to the next chunk
    if input("Press Enter to continue to the next chunk, or type 'q' to quit: ").lower() == 'q':
        break

Chunk ID: 3993838373608836783
Video ID: tom_hardy_interview.mp4
Timestamp: 0.00-10.00
Text:  You
Data in Qdrant:
ID: 3993838373608836783
Video ID: tom_hardy_interview.mp4
Timestamp: 0.00-10.00
Text:  You
Chunk ID: 3993838373608836784
Video ID: tom_hardy_interview.mp4
Timestamp: 10.00-20.00
Text:  My relationship with Charlie grew from phone calls only first.
Data in Qdrant:
ID: 3993838373608836784
Video ID: tom_hardy_interview.mp4
Timestamp: 10.00-20.00
Text:  My relationship with Charlie grew from phone calls only first.


In [25]:
# Function to convert from seconds to MM:SS format
def seconds_to_hms(seconds):
    minutes, seconds = divmod(seconds, 60)
    if minutes < 60:
        return f"{int(minutes):02d}:{int(seconds):02d}"
    else:
        hours, minutes = divmod(minutes, 60)
        return f"{int(hours):02d}:{int(minutes):02d}:{int(seconds):02d}"

In [26]:
meta_model = SentenceTransformer("Alibaba-NLP/gte-large-en-v1.5", trust_remote_code=True)

In [27]:
def discounted_cumulative_gain_k(relevance, k=3):
    return np.sum(relevance / np.log2(np.arange(2, k + 2)))

def normalized_discounted_cumulative_gain_k(relevance, k=3):
    dcg_k = discounted_cumulative_gain_k(relevance, k)
    idcg_k = discounted_cumulative_gain_k(np.ones(k), k)
    return dcg_k / idcg_k

In [33]:
def average_precision_k(relevance, k=3):
        r = relevance[:k].sum()
        if r == 0:
            return 0
        numerator = np.cumsum(relevance[:k]) * relevance[:k]
        denominator = r * np.arange(1, k + 1)
        ap_k = (numerator / denominator).sum()
        return ap_k

def compute_metrics(gt, threshold=0.7, list_k=[1, 3, 5]):
    metrics = dict()
    relevance = (gt > threshold).flatten()
    for k in list_k:
        ap = average_precision_k(relevance, k)
        recall = sum(relevance[:k]) / (sum(relevance) + 1e-9)
        hit_rate = sum(relevance[:k]) > 0
        ndcg = normalized_discounted_cumulative_gain_k(relevance[:k], k)
    
        metrics[k] = {'AP': ap, 'recall': recall, 'hit_rate': hit_rate, 'ndcg': ndcg}
    return metrics


In [37]:
def search_similar_segments(query_text, top_k=10, ef=128, list_k=[1, 3, 5], proj_matrix=None):
    start_time = time.time()
    query_embedding = model.encode(query_text).tolist()
    if proj_matrix is not None:
        query_embedding = np.dot(query_embedding, proj_matrix)
    
    search_result = qdrant_client.search(
        collection_name=collection_name,
        query_vector=query_embedding,
        limit=top_k,
        search_params=SearchParams(
            hnsw_ef=ef  
        )
    )
    
    end_time = time.time()
    inference_time = end_time - start_time
    
    similarity = meta_model.similarity(meta_model.encode(query_text, normalize_embeddings=True), meta_model.encode([result.payload["text"] for result in search_result], normalize_embeddings=True))
    
    return search_result, inference_time, compute_metrics(similarity.numpy(), list_k=list_k)


In [38]:
# Example query
query = "sometimes you gotta cut a little piece of yourself"
results, search_time, metrics = search_similar_segments(query, proj_matrix=proj_matrix)

print(f"Search completed in {search_time:.4f} seconds")
for result in results:
    print(f"Score: {result.score:.4f}")
    print(f"Text: {result.payload['text']}")
    print(f"Video ID: {result.payload['video_id']}")
    print(f"Timestamp: {result.payload['timestamp']}")
    print(f"Converted timestamp: {seconds_to_hms(float(result.payload['timestamp'].split('-')[0]))} - {seconds_to_hms(float(result.payload['timestamp'].split('-')[1]))}")
    print()
    
print(metrics)

Search completed in 0.6002 seconds
Score: 0.8502
Text:  And sometimes you've got to cut a little piece of yourself off, no matter how much it
Video ID: tom_hardy_interview.mp4
Timestamp: 150.00-160.00
Converted timestamp: 02:30 - 02:40

Score: 0.5951
Text:   but  this is how i  do work  i do  take pieces and bits and  look at  it  and struggle with it  and  cut it away and of  course it's  not going to look like  that  but  it is  the  crazy way i  tend to
Video ID: FrankGehry_1990
Timestamp: 2512.61-2525.42
Converted timestamp: 41:52 - 42:05

Score: 0.5438
Text: to laser  cut my 
Video ID: MarianBantjes_2010
Timestamp: 729.35-730.91
Converted timestamp: 12:09 - 12:10

Score: 0.5360
Text: you lay down a couple of very simple rules  always cut away from your body  keep the blade sharp  never force it  and  these are things kids can understand and practice with and  yeah they're going to cut themselves  i have some terrible scars on my legs from where i stabbed myself
Video ID: GeverTull

In [None]:
# Example query
list_k = [1, 3, 5]
metrics_list = dict([[k, dict([[m, []] for m in ['AP', 'recall', 'hit_rate', 'ndcg']])] for k in list_k])
times_list = []

for sample in tqdm(queries, total=len(queries), desc="Calculating metrics"):
    query = sample['query']
    results, search_time, sample_metrics = search_similar_segments(query, proj_matrix=proj_matrix)
    times_list.append(search_time)
    for k in sample_metrics:
        for metric in sample_metrics[k]:
            metrics_list[k][metric].append(sample_metrics[k][metric])
            
metrics = dict([[k, dict([[m, 0] for m in ['AP', 'recall', 'hit_rate', 'ndcg']])] for k in list_k])
times = sum(times_list) / len(times_list)
for k in metrics_list:
    for metric in metrics_list[k]:
        metrics[k][metric] = sum(metrics_list[k][metric]) / len(metrics_list[k][metric])