In [23]:
import json
import os

import boto3
import ffmpeg
import torch
from transformers import pipeline

# Step 1: Transcribe Video
def transcribe_video(video_path):
    """Transcribe a media file with Whisper and request word-level timestamps.

    A fresh ``openai/whisper-tiny`` speech-recognition pipeline is built on
    every call, placed on the first CUDA device when one is available and on
    the CPU otherwise.

    Args:
        video_path: Path of the file handed to the pipeline.

    Returns:
        Whatever the pipeline returns for ``return_timestamps="word"``. Other
        cells in this notebook read ``result["chunks"]``, where each chunk
        carries a ``"text"`` and a ``"timestamp"`` entry.
    """
    use_gpu = torch.cuda.is_available()
    asr = pipeline(
        "automatic-speech-recognition",
        model="openai/whisper-tiny",
        device="cuda:0" if use_gpu else "cpu",
    )
    return asr(video_path, return_timestamps="word")

# Step 2: Extract Frames and Pair with Dialogue
def extract_frames(video_path, interval):
    """Sample one frame every ``interval`` seconds and give each a time window.

    Frames are written by ffmpeg's ``fps`` filter (at ``1/interval`` frames
    per second) as ``frame_0001.png``, ``frame_0002.png``, ... into a
    directory named after ``video_path`` without its extension.

    Args:
        video_path: Path of the video to sample.
        interval: Seconds between sampled frames; must be positive.

    Returns:
        A list of dicts in frame order, each with:
          * ``"frame_path"``: path of the PNG file;
          * ``"timestamp"``: ``(start, end)`` in seconds, where frame ``i``
            (0-based) covers ``(i * interval, (i + 1) * interval)``.
        The end of the last window is replaced by the duration reported by
        ``ffmpeg.probe``. An empty list is returned if no frame was written.

    Raises:
        ValueError: If ``interval`` is not positive.
    """
    # Validate before touching the filesystem: 1/interval would otherwise fail
    # with ZeroDivisionError (0) or hand ffmpeg a negative frame rate.
    if interval <= 0:
        raise ValueError(
            f"interval must be a positive number of seconds, got {interval!r}"
        )

    output_dir = os.path.splitext(video_path)[0]
    os.makedirs(output_dir, exist_ok=True)

    def is_frame_file(name):
        # Only files this function writes itself (see the output pattern below).
        return name.startswith("frame_") and name.endswith(".png")

    # Drop frames left over from an earlier run. A previous call with a
    # shorter interval leaves higher-numbered files behind, which would be
    # listed below and paired with time windows that do not exist.
    for name in os.listdir(output_dir):
        if is_frame_file(name):
            os.remove(os.path.join(output_dir, name))

    # Use ffmpeg to extract frames
    (
        ffmpeg
        .input(video_path)
        .filter('fps', fps=1/interval)
        .output(f'{output_dir}/frame_%04d.png')
        .run()
    )

    # Collect the frame file paths. Sorting by (length, text) keeps numeric
    # order even if the counter outgrows the four digits of %04d.
    frame_files = sorted(
        (
            os.path.join(output_dir, name)
            for name in os.listdir(output_dir)
            if is_frame_file(name)
        ),
        key=lambda path: (len(path), path),
    )
    if not frame_files:
        return []

    # Window i is derived from the index instead of a running sum, so float
    # intervals do not accumulate rounding error.
    frames_and_intervals = [
        {
            "frame_path": frame,
            "timestamp": (index * interval, (index + 1) * interval),
        }
        for index, frame in enumerate(frame_files)
    ]

    # Get the video duration and make the last window end at that duration.
    video_duration = float(ffmpeg.probe(video_path)['format']['duration'])
    final_frame_start_time = frames_and_intervals[-1]["timestamp"][0]
    frames_and_intervals[-1]["timestamp"] = (final_frame_start_time, video_duration)
    return frames_and_intervals





frame=   80 fps= 14 q=-0.0 size=N/A time=00:20:00.00 bitrate=N/A speed= 216x    

In [31]:
# Try the transcription on the sample video in this folder.
test_file = "mlsearch_webinar.mp4"

# Rough timings observed for about 45 min of video:
#   - transcription without timestamps: about 1.5 minutes
#   - transcription with timestamps: about the same time
#   - pulling the frames: about 15 s; getting the results into the right
#     format is fast, basically negligible
# TODO: speculative decoding could speed this up by ~2.2x later, once we have
# a good pipeline going.

transcription = transcribe_video(test_file)

Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
From v4.47 onwards, when a model cache is to be returned, `generate` will return a `Cache` instance instead by default (as opposed to the legacy tuple of tuples format). If you want to keep returning the legacy format, please set `return_legacy_cache=True`.


In [25]:
# Get video frames: sample one frame every INTERVAL seconds. The constant is
# passed to the call so the extraction always follows the configured value.
INTERVAL = 15
frames = extract_frames(test_file, INTERVAL)
# Dialogue/image pairs are built from these frames in the cells below.

ffmpeg version 7.0.2 Copyright (c) 2000-2024 the FFmpeg developers
  built with Apple clang version 15.0.0 (clang-1500.3.9.4)
  configuration: --prefix=/opt/homebrew/Cellar/ffmpeg/7.0.2 --enable-shared --enable-pthreads --enable-version3 --cc=clang --host-cflags= --host-ldflags='-Wl,-ld_classic' --enable-ffplay --enable-gnutls --enable-gpl --enable-libaom --enable-libaribb24 --enable-libbluray --enable-libdav1d --enable-libharfbuzz --enable-libjxl --enable-libmp3lame --enable-libopus --enable-librav1e --enable-librist --enable-librubberband --enable-libsnappy --enable-libsrt --enable-libssh --enable-libsvtav1 --enable-libtesseract --enable-libtheora --enable-libvidstab --enable-libvmaf --enable-libvorbis --enable-libvpx --enable-libwebp --enable-libx264 --enable-libx265 --enable-libxml2 --enable-libxvid --enable-lzma --enable-libfontconfig --enable-libfreetype --enable-frei0r --enable-libass --enable-libopencore-amrnb --enable-libopencore-amrwb --enable-libopenjpeg --enable-libspeex --

In [26]:
# Preview the first three frame records (path and time window).
frames[:3]

[{'frame_path': 'mlsearch_webinar/frame_0001.png', 'timestamp': (0, 15)},
 {'frame_path': 'mlsearch_webinar/frame_0002.png', 'timestamp': (15, 30)},
 {'frame_path': 'mlsearch_webinar/frame_0003.png', 'timestamp': (30, 45)}]

In [41]:
def assign_words_to_frames(transcription, frames, interval=None):
    """Attach to every frame the transcript words that start in its window.

    A word belongs to the frame whose ``(start, end)`` window contains the
    word's start time, using ``start <= t < end``; the last frame also
    accepts ``t == end``. For contiguous windows this puts each word in
    exactly one frame, including a word that runs across a frame boundary
    and a word that begins exactly at a frame start. The word's end time is
    never read, so a chunk whose end timestamp is ``None`` is still assigned.
    Chunks whose start timestamp is ``None`` are skipped.

    Args:
        transcription: Mapping with a ``"chunks"`` list; each chunk has a
            ``"text"`` string and a ``"timestamp"`` pair ``(start, end)``.
        frames: List of dicts with ``"frame_path"`` and a ``"timestamp"``
            pair ``(start, end)``, in chronological order (the shape
            returned by ``extract_frames``).
        interval: Unused. Kept so existing calls that pass it keep working.

    Returns:
        A list with one dict per frame, in the same order, holding
        ``"frame_path"``, ``"words"`` (the texts of the assigned chunks
        concatenated without a separator) and ``"timestamp"``.
    """
    chunks = transcription["chunks"]
    last_index = len(frames) - 1
    frames_and_words = []

    for index, f in enumerate(frames):
        frame_start, frame_end = f["timestamp"][0], f["timestamp"][1]
        is_last_frame = index == last_index

        # Half-open window on the word's start time; only the final window is
        # closed on the right so a word starting exactly at the end is kept.
        words = "".join(
            w["text"]
            for w in chunks
            if w["timestamp"][0] is not None
            and (
                frame_start <= w["timestamp"][0] < frame_end
                or (is_last_frame and w["timestamp"][0] == frame_end)
            )
        )

        frames_and_words.append({
            "frame_path": f["frame_path"],
            "words": words,
            "timestamp": f["timestamp"],
        })
    return frames_and_words



In [42]:
# align the new frames with the dialogue, so we have a list of dictionaries
# each dictionary will be the frame filepath, the dialogue, and the timestamp in a metadata field


def align_frames_with_dialogue(frames, transcription):
    """Pair frames with transcript chunks by position.

    The i-th element of ``frames`` is matched with the i-th entry of
    ``transcription["chunks"]``; pairing stops at the shorter of the two.

    Args:
        frames: Sequence of frame objects; each is stored as-is.
        transcription: Mapping with a ``"chunks"`` list whose entries have
            ``"text"`` and ``"timestamp"`` keys.

    Returns:
        A list of dicts with ``"frame"``, ``"dialogue"`` (the chunk text) and
        ``"metadata"`` (a dict holding the chunk's ``"timestamp"``).
    """
    return [
        {
            "frame": frame,
            "dialogue": chunk["text"],
            "metadata": {"timestamp": chunk["timestamp"]},
        }
        for frame, chunk in zip(frames, transcription["chunks"])
    ]

In [43]:
# Build one record per extracted frame holding the transcript words assigned to it.
frames_and_words = assign_words_to_frames(transcription, frames)

In [44]:
# Inspect the text assigned to the first frame as a sanity check.
frames_and_words[0]["words"]

" All right, welcome everybody. Thanks for coming today. This webinar is going to be on the magic of multi-lingual search. I'm really excited to help everybody learn about using and applying multi-lingual search to whatever you might"

In [46]:
# Write frames and words out as JSON.
# `json` comes from the import cell at the top of the notebook; without that
# import this cell raises NameError on a fresh kernel.

with open("frames_and_words.json", "w", encoding="utf-8") as json_file:
    json.dump(frames_and_words, json_file)