In [1]:
pip install torch transformers librosa moviepy opencv-python-headless sentence-transformers


Collecting transformers
  Downloading transformers-4.45.2-py3-none-any.whl.metadata (44 kB)
     ---------------------------------------- 0.0/44.4 kB ? eta -:--:--
     --------- ------------------------------ 10.2/44.4 kB ? eta -:--:--
     --------- ------------------------------ 10.2/44.4 kB ? eta -:--:--
     --------- ------------------------------ 10.2/44.4 kB ? eta -:--:--
     -------------------------------------- 44.4/44.4 kB 218.8 kB/s eta 0:00:00
Collecting librosa
  Downloading librosa-0.10.2.post1-py3-none-any.whl.metadata (8.6 kB)
Collecting moviepy
  Downloading moviepy-1.0.3.tar.gz (388 kB)
     ---------------------------------------- 0.0/388.3 kB ? eta -:--:--
     - -------------------------------------- 10.2/388.3 kB ? eta -:--:--
     -- ---------------------------------- 30.7/388.3 kB 435.7 kB/s eta 0:00:01
     --- --------------------------------- 41.0/388.3 kB 393.8 kB/s eta 0:00:01
     ------ ------------------------------ 71.7/388.3 kB 393.8 kB/s eta 0:00:0

In [2]:
import torch
import librosa
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

# Load the Wav2Vec2 ASR processor and CTC model once at cell level.
# Both must come from the same checkpoint, so the id is named once
# instead of being repeated in two string literals.
ASR_CHECKPOINT = "facebook/wav2vec2-large-960h"
processor = Wav2Vec2Processor.from_pretrained(ASR_CHECKPOINT)
model = Wav2Vec2ForCTC.from_pretrained(ASR_CHECKPOINT)

def speech_to_text(audio_file):
    """Transcribe an audio file with the notebook-level Wav2Vec2 CTC model.

    Uses the module-level ``processor`` and ``model`` objects.

    Args:
        audio_file: Audio source handed to ``librosa.load``.

    Returns:
        The text produced by ``processor.decode`` for the greedy (argmax)
        token ids of the first (only) item in the batch.
    """
    sampling_rate = 16000

    # Load at the same rate that is declared to the processor below, so the
    # waveform and the advertised sampling rate always agree.
    waveform, _ = librosa.load(audio_file, sr=sampling_rate)
    input_values = processor(
        waveform, return_tensors="pt", sampling_rate=sampling_rate
    ).input_values

    # Inference only: without no_grad the forward pass records an autograd
    # graph that is never used and only costs memory.
    # NOTE(review): the whole recording goes through the model in one pass;
    # long recordings may need to be split into chunks -- confirm with real inputs.
    with torch.no_grad():
        logits = model(input_values).logits

    predicted_ids = torch.argmax(logits, dim=-1)
    return processor.decode(predicted_ids[0])


preprocessor_config.json:   0%|          | 0.00/159 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


tokenizer_config.json:   0%|          | 0.00/163 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/843 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/291 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.26G [00:00<?, ?B/s]

KeyboardInterrupt: 

In [None]:
from transformers import pipeline

# Build the summarization pipeline once at cell level so it is not
# re-created on every call that uses it.
summarizer = pipeline(
    task="summarization",
    model="facebook/bart-large-cnn",
)

def summarize_text(text, max_length=150):
    """Summarize ``text`` with the notebook-level ``summarizer`` pipeline.

    Args:
        text: Text to summarize. ``None``, empty, or whitespace-only input
            yields an empty summary without calling the model.
        max_length: Upper bound on the generated summary length, passed
            through to the pipeline.

    Returns:
        str: The ``summary_text`` of the first pipeline result, or ``""``
        when there is nothing to summarize.
    """
    # Nothing to summarize: avoid asking the model to generate text from
    # an empty prompt.
    if not text or not text.strip():
        return ""

    # Keep the lower bound from exceeding a caller-supplied upper bound.
    min_length = min(50, max_length)

    summary = summarizer(
        text,
        max_length=max_length,
        min_length=min_length,
        do_sample=False,
        # Long transcripts can exceed the model's input limit; cut them to
        # fit instead of failing. Anything past the limit is NOT summarized.
        truncation=True,
    )
    return summary[0]['summary_text']


In [None]:
import cv2

def extract_key_frames(video_path, interval=60):
    """Sample one frame every ``interval`` seconds from a video.

    Args:
        video_path: Path handed to ``cv2.VideoCapture``.
        interval: Spacing between sampled frames, in seconds. Must be positive.

    Returns:
        list: The sampled frames as returned by OpenCV, beginning with the
        first frame of the video. Empty if the video cannot be opened or
        yields no frames.

    Raises:
        ValueError: If ``interval`` is not positive, or the video is open but
            reports no usable frame rate.
    """
    if interval <= 0:
        raise ValueError(f"interval must be positive, got {interval!r}")

    cap = cv2.VideoCapture(video_path)
    key_frames = []
    try:
        if not cap.isOpened():
            return key_frames

        fps = cap.get(cv2.CAP_PROP_FPS)
        # `not fps > 0` also rejects NaN, which a plain `fps <= 0` would let through.
        if not fps > 0:
            raise ValueError(
                f"could not determine a frame rate for {video_path!r} (got {fps!r})"
            )

        # Round fps * interval as a whole instead of truncating fps first:
        # truncating e.g. 29.97 -> 29 makes the sampling drift over time, and
        # any fps below 1 would become a modulo-by-zero.
        step = max(1, round(fps * interval))

        frame_index = 0
        # grab() advances to the next frame without decoding it; only the
        # frames that are actually kept pay for retrieve().
        while cap.grab():
            if frame_index % step == 0:
                ok, frame = cap.retrieve()
                if ok:
                    key_frames.append(frame)
            frame_index += 1
    finally:
        # Release the capture even if reading raises.
        cap.release()

    return key_frames


In [None]:
from moviepy.editor import VideoFileClip

def align_summary_with_video(summary, video_path):
    """Spread the sentences of ``summary`` evenly across the video's duration.

    The alignment is purely positional: the video is divided into as many
    equal-length segments as there are sentences, and sentence ``i`` is
    assigned to segment ``i``. No audio or visual content is inspected.

    Args:
        summary: Summary text. It is split into sentences after ``.``, ``!``
            or ``?`` followed by whitespace; blank pieces are dropped.
        video_path: Path handed to ``VideoFileClip`` to read the duration.

    Returns:
        list[dict]: One ``{"text", "start", "end"}`` dict per sentence, with
        ``start``/``end`` in seconds. Empty if ``summary`` has no sentences,
        in which case the video is not opened.
    """
    import re  # local import: keeps this cell self-contained

    # Split *after* the terminator so every sentence keeps its punctuation
    # (splitting on ". " drops the period from all but the last sentence).
    sentences = [
        piece.strip()
        for piece in re.split(r"(?<=[.!?])\s+", summary or "")
        if piece.strip()
    ]
    if not sentences:
        return []

    clip = VideoFileClip(video_path)
    try:
        duration = clip.duration  # Video duration in seconds
    finally:
        # Only the duration is needed; close the clip so its reader is not
        # left open.
        clip.close()

    segment_duration = duration / len(sentences)

    return [
        {
            "text": sentence,
            "start": i * segment_duration,
            "end": (i + 1) * segment_duration,
        }
        for i, sentence in enumerate(sentences)
    ]


In [None]:
def process_video(video_path, audio_path):
    """Run the whole pipeline for one video and its audio track.

    Args:
        video_path: Video file, passed to ``extract_key_frames`` and
            ``align_summary_with_video``.
        audio_path: Audio file, passed to ``speech_to_text``.

    Returns:
        tuple: ``(aligned_summary, key_frames)`` -- the result of
        ``align_summary_with_video`` followed by the result of
        ``extract_key_frames``.
    """
    # Text side: audio -> transcript -> summary.
    summary_text = summarize_text(speech_to_text(audio_path))

    # Visual side: sample frames from the video.
    frames = extract_key_frames(video_path)

    # Attach start/end times to the summary sentences.
    timed_summary = align_summary_with_video(summary_text, video_path)

    return timed_summary, frames


In [None]:
# Input files, given as relative paths. Edit these to point at the video
# and its audio track before running this cell.
video_path = "video.mp4"
audio_path = "audio.mp3"

# Run the full pipeline. `aligned_summary` is the list of timed summary
# segments; `key_frames` is the separate list of frames sampled from the video.
aligned_summary, key_frames = process_video(video_path, audio_path)

# Print each summary segment with its start/end time in seconds
# (raw float values, not formatted).
for segment in aligned_summary:
    print(f"[{segment['start']} - {segment['end']}]: {segment['text']}")
