### Convert a video into 30-second audio chunks, each saved with its timestamp range in the filename

In [None]:
# Import necessary modules
import os
from datetime import timedelta

from moviepy.editor import VideoFileClip
from pydub import AudioSegment
from pydub.silence import split_on_silence

# Length of each exported audio chunk, in milliseconds (30 seconds).
CHUNK_LENGTH_MS = 30 * 1000

# Directory the chunks are written to. The transcription cell below reads its
# .mp3 files from a directory named "audio".
output_dir = "audio"

# Intermediate WAV file used to hand the audio track from moviepy to pydub.
temp_file = "temp.wav"


def _format_timestamp(total_ms):
    """Return a millisecond offset formatted as ``HH-MM-SS-mmm``."""
    seconds, millis = divmod(int(total_ms), 1000)
    minutes, seconds = divmod(seconds, 60)
    hours, minutes = divmod(minutes, 60)
    return f"{hours:02d}-{minutes:02d}-{seconds:02d}-{millis:03d}"


try:
    # Open the video file and write its audio track to the temporary file.
    video = VideoFileClip("video.mp4")
    try:
        audio = video.audio
        if audio is None:
            raise ValueError("video.mp4 has no audio track")
        audio.write_audiofile(temp_file)
    finally:
        # Release the reader held by moviepy even if extraction fails.
        video.close()

    # Load the audio file using pydub; len() of a segment is its duration in ms.
    sound = AudioSegment.from_wav(temp_file)

    # Cut fixed-length windows by slicing in milliseconds. The final chunk may
    # be shorter than CHUNK_LENGTH_MS.
    chunks = [
        sound[offset:offset + CHUNK_LENGTH_MS]
        for offset in range(0, len(sound), CHUNK_LENGTH_MS)
    ]

    os.makedirs(output_dir, exist_ok=True)

    # Save each chunk as a separate audio file with the timestamp range in the
    # filename. The range is derived from the chunk's position, because pydub
    # segments do not carry their offset within the source audio.
    for i, chunk in enumerate(chunks):
        start_ms = i * CHUNK_LENGTH_MS
        end_ms = start_ms + len(chunk)

        start_str = _format_timestamp(start_ms)
        end_str = _format_timestamp(end_ms)

        chunk.export(
            os.path.join(output_dir, f"audio_{start_str}_{end_str}.mp3"),
            format="mp3",
        )
finally:
    # Clean up the temporary file, whether or not the steps above succeeded.
    if os.path.exists(temp_file):
        os.remove(temp_file)


### Iterate over a directory of audio files and transcribe each one, saving the result to a text file

In [None]:
import os

from moviepy.editor import AudioFileClip

# Define the directory containing the audio files
audio_dir = "audio"

# Transcripts in filename order, kept in memory so the embedding cell below
# can iterate over them.
transcripts = []

# Iterate over the audio files in sorted order; os.listdir returns entries in
# arbitrary order, which would make the transcript order non-deterministic.
for filename in sorted(os.listdir(audio_dir)):
    # Check if the file is an audio file
    if not filename.endswith(".mp3"):
        continue

    # Construct the full path to the audio file
    filepath = os.path.join(audio_dir, filename)

    # Open the audio file using moviepy
    audio = AudioFileClip(filepath)
    try:
        # NOTE(review): transcribe_audio_file is not defined in this notebook;
        # it must be supplied by your chosen transcription service and is
        # presumably expected to return the transcript as a str.
        transcript = transcribe_audio_file(audio)
    finally:
        # Release the clip's reader even when transcription fails.
        audio.close()

    transcripts.append(transcript)

    # Save the transcript next to the audio file, with a .txt extension
    output_file = os.path.splitext(filepath)[0] + ".txt"
    with open(output_file, "w", encoding="utf-8") as f:
        f.write(transcript)

### Run each segment through an embedding model

In [None]:
from sentence_transformers import SentenceTransformer

# Load the sentence-embedding model used for both transcripts and queries.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Encode every transcript and keep the results. vectors[i] is the embedding of
# the i-th transcript (previously each vector was overwritten and lost).
vectors = []
for transcript in transcripts:
    vector = model.encode(transcript)
    vectors.append(vector)

### Store each segment into a vector search engine

In [None]:
import pymongo

# NOTE(review): mongo_uri, database and collection are not defined in this
# notebook; they must be set before this cell runs.
connection = pymongo.MongoClient(mongo_uri)

# Write to the same database/collection that the similarity-search cell
# queries. Indexing a client by name yields a Database, not a Collection.
vector_collection = connection[database][collection]

# Build one document per transcript. The vector is stored under "embedding"
# because that is the path the search cell queries. Each transcript is encoded
# here so every stored document carries its own embedding; .tolist() turns the
# encoder output into a plain list that can be stored in the document.
# NOTE(review): no timestamp or video id is stored yet - add those fields if
# search results need to point back to a position in the video.
documents = [
    {"text": text, "embedding": model.encode(text).tolist()}
    for text in transcripts
]

# insert_many rejects an empty list, so skip the write when there is nothing
# to store.
if documents:
    vector_collection.insert_many(documents)


### Run a similarity search on "bubble sort", which returns the exact timestamp and video ID

In [None]:
# Embed the search phrase with the same model that encodes the transcripts,
# converted to a plain list so it can be placed in the pipeline document.
query = "bubble sort"
vector_query = model.encode(query).tolist()

# k-nearest-neighbour lookup against the "embedding" field, keeping the ten
# closest matches.
# NOTE(review): MongoDB's documentation lists knnBeta as deprecated in favour
# of $vectorSearch - confirm before relying on it long term.
knn_operator = {"vector": vector_query, "path": "embedding", "k": 10}
pipeline = [{"$search": {"knnBeta": knn_operator}}]

# Run the aggregation and materialise the cursor into a list.
target = connection[database][collection]
results = list(target.aggregate(pipeline))