In [1]:
import sys
sys.executable

'/data/horse/ws/mehu311f-myproject/VAST/.venv/bin/python'

In [2]:
import inspect
from vast.video_downloader import download_video

print(inspect.getsource(download_video))

def download_video(url, output_dir, audio_dir):
    output_dir.mkdir(parents=True, exist_ok=True)

    ydl_opts = {
        "outtmpl": str(output_dir / "%(title)s.%(ext)s"),
        "format": (
            "bestvideo[vcodec*=avc1][ext=mp4]+"
            "bestaudio[acodec*=mp4a]/best[ext=mp4]"
        ),
        "merge_output_format": "mp4",
        "quiet": False,
        "noplaylist": True,
    }

    print(f"Downloading video: {url}")

    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info = ydl.extract_info(url, download=True)
        video_file = Path(ydl.prepare_filename(info)).with_suffix(".mp4")

    codec = get_video_codec(video_file)
    print(f"Detected video codec: {codec}")

    if codec not in ("h264", "avc1"):
        video_file = convert_to_vscode_compatible(video_file)

    # extract wav audio
    wav_file = extract_wav_audio(video_file, audio_dir)

    print(f"Final video file: {video_file}")
    print(f"Final audio file: {wav_file}")

    return video_file, wav_file

In [3]:
import torch

def get_device():
    if torch.cuda.is_available():
        print("Using NVIDIA GPU:", torch.cuda.get_device_name(0))
        return torch.device("cuda")
    else:
        print("No GPU found → using CPU")
        return torch.device("cpu")

device = get_device()
device


Using NVIDIA GPU: NVIDIA H100


device(type='cuda')

# 1. Load Configuration File

In [4]:
from pathlib import Path
from box import Box  
from vast.utils import load_yaml

# Read configuration file
cfg = Box(load_yaml("config.yaml"))

# Access configuration parameters
print(cfg.video_downloader.video_url)

print(cfg.subtitle_generator.model.whisper_size)
print(cfg.subtitle_generator.model.language)
print(cfg.paths.raw_audios)


https://www.youtube.com/watch?v=jpgku1_n2y4
small
de
data/raw_audios


# 2. Download Videos

In [5]:
from pathlib import Path
from vast.video_downloader import download_video

url = cfg.video_downloader.video_url
output_dir = Path(cfg.paths.raw_videos)
audio_dir = Path(cfg.paths.raw_audios)

video_file, wav_file = download_video(url, output_dir, audio_dir)

print("Video:", video_file)
print("WAV:", wav_file)


Downloading video: https://www.youtube.com/watch?v=jpgku1_n2y4
[youtube] Extracting URL: https://www.youtube.com/watch?v=jpgku1_n2y4
[youtube] jpgku1_n2y4: Downloading webpage




[youtube] jpgku1_n2y4: Downloading android sdkless player API JSON
[youtube] jpgku1_n2y4: Downloading web safari player API JSON




[youtube] jpgku1_n2y4: Downloading m3u8 information




[info] jpgku1_n2y4: Downloading 1 format(s): 137+140
[download] data/raw_videos/Emergency Room in Slow German ｜ Super Easy German 293.mp4 has already been downloaded
Detected video codec: h264
WAV already exists: data/raw_audios/Emergency Room in Slow German ｜ Super Easy German 293.wav
Final video file: data/raw_videos/Emergency Room in Slow German ｜ Super Easy German 293.mp4
Final audio file: data/raw_audios/Emergency Room in Slow German ｜ Super Easy German 293.wav
Video: data/raw_videos/Emergency Room in Slow German ｜ Super Easy German 293.mp4
WAV: data/raw_audios/Emergency Room in Slow German ｜ Super Easy German 293.wav


# 3. Generate Subtitles

In [6]:
from vast.subtitle_generator import generate_subtitle

video_path = video_file
output_dir = Path(cfg.paths.subtitles)
model = cfg.subtitle_generator.model
print(video_path)

generate_subtitle(video_path, output_dir, model)

data/raw_videos/Emergency Room in Slow German ｜ Super Easy German 293.mp4
------------------: {'whisper_size': 'small', 'language': 'de'}
Loading Whisper model: small
Transcribing audio... (language=de)
Subtitle (.srt) created: data/subtitles/Emergency Room in Slow German ｜ Super Easy German 293.srt
Transcript (.json) created: data/subtitles/Emergency Room in Slow German ｜ Super Easy German 293_subtitles.json


{'srt_path': PosixPath('data/subtitles/Emergency Room in Slow German ｜ Super Easy German 293.srt'),
 'json_path': PosixPath('data/subtitles/Emergency Room in Slow German ｜ Super Easy German 293_subtitles.json')}

# 4. Extract Keyframes

In [7]:
from pathlib import Path
from vast.keyframe_extractor import extract_speaker_diarization
#
wav_path = wav_file

# speaker diarization 输出目录
audio_keyframe_dir = Path("data/keyframes/audio")

audio_segments = extract_speaker_diarization(
    wav_path=wav_path,
    output_dir=audio_keyframe_dir
)

print(audio_segments[:3])


  from .autonotebook import tqdm as notebook_tqdm


Running speaker diarization (pyannote.audio 3.4 / 4.x)
Audio: data/raw_audios/Emergency Room in Slow German ｜ Super Easy German 293.wav
Using NVIDIA GPU: NVIDIA H100
Device: cuda


It can be re-enabled by calling
   >>> import torch
   >>> torch.backends.cuda.matmul.allow_tf32 = True
   >>> torch.backends.cudnn.allow_tf32 = True
See https://github.com/pyannote/pyannote-audio/issues/1370 for more details.

  std = sequences.std(dim=-1, correction=1)


Exporting RTTM to: data/keyframes/audio/Emergency_Room_in_Slow_German___Super_Easy_German_293.rttm
RTTM saved to data/keyframes/audio/Emergency_Room_in_Slow_German___Super_Easy_German_293.rttm
Speaker diarization JSON saved to data/keyframes/audio/speaker_diarization.json
Detected 205 segments
[{'index': 0, 'speaker': 'SPEAKER_02', 'start': 0.419, 'end': 6.882}, {'index': 1, 'speaker': 'SPEAKER_02', 'start': 8.35, 'end': 9.261}, {'index': 2, 'speaker': 'SPEAKER_02', 'start': 11.017, 'end': 11.456}]


# 5. Segment Videos

In [6]:
from vast.scene_segmenter import detect_scenes, export_scenes
from pathlib import Path

video_path = Path(cfg["paths"]["raw_videos"]) / "Hitzige Debatte über Alkohol-Konsum in Deutschland ｜ Markus Lanz vom 12. März 2024.mp4"

keyframes = sorted(Path(cfg["paths"]["keyframes"]).glob("*.jpg"))

output_dir = Path(cfg["paths"]["sections"])
# 1️ 检测场景边界
scenes = detect_scenes(keyframes, interval=60.0, method="ssim", threshold=0.6)

# 2️ 导出分段视频 + 自动保存 scene_segments.json
export_scenes(video_path, scenes, output_dir=output_dir)

  from .autonotebook import tqdm as notebook_tqdm
Detecting scene boundaries: 100%|██████████| 32/32 [00:06<00:00,  5.09it/s]


Detected 8 scenes.


Exporting scenes: 100%|██████████| 8/8 [00:04<00:00,  1.94it/s]

Export complete! 8 scenes saved to: data/sections
Scene metadata saved to data/sections/scene_segments.json





[{'scene_id': 0,
  'start': 0.0,
  'end': 660.0,
  'duration': 660.0,
  'video_path': 'data/raw_videos/Hitzige Debatte über Alkohol-Konsum in Deutschland ｜ Markus Lanz vom 12. März 2024.mp4',
  'output_file': 'data/sections/scene_000.mp4'},
 {'scene_id': 1,
  'start': 660.0,
  'end': 720.0,
  'duration': 60.0,
  'video_path': 'data/raw_videos/Hitzige Debatte über Alkohol-Konsum in Deutschland ｜ Markus Lanz vom 12. März 2024.mp4',
  'output_file': 'data/sections/scene_001.mp4'},
 {'scene_id': 2,
  'start': 720.0,
  'end': 960.0,
  'duration': 240.0,
  'video_path': 'data/raw_videos/Hitzige Debatte über Alkohol-Konsum in Deutschland ｜ Markus Lanz vom 12. März 2024.mp4',
  'output_file': 'data/sections/scene_002.mp4'},
 {'scene_id': 3,
  'start': 960.0,
  'end': 1020.0,
  'duration': 60.0,
  'video_path': 'data/raw_videos/Hitzige Debatte über Alkohol-Konsum in Deutschland ｜ Markus Lanz vom 12. März 2024.mp4',
  'output_file': 'data/sections/scene_003.mp4'},
 {'scene_id': 4,
  'start': 102

# 6. Analyze Scenes

In [7]:
from vast.scene_analyzer import analyze_directory

input_dir = Path(cfg.paths.keyframes)
output_dir = Path(cfg.paths.scene_descriptions)
model_name = cfg.scene_analyzer.model.name

print(model_name)

analyze_directory(input_dir, output_dir, model_name)

print("Scene analysis completed!")


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


Salesforce/blip-image-captioning-base
Analyzing: Hitzige Debatte über Alkohol-Konsum in Deutschland ｜ Markus Lanz vom 12. März 2024_0000.jpg
Loading BLIP model: Salesforce/blip-image-captioning-base ...
BLIP model loaded successfully.
Loading sentiment model: cardiffnlp/twitter-roberta-base-sentiment


Device set to use cuda:0


Analyzing: Hitzige Debatte über Alkohol-Konsum in Deutschland ｜ Markus Lanz vom 12. März 2024_0001.jpg
Analyzing: Hitzige Debatte über Alkohol-Konsum in Deutschland ｜ Markus Lanz vom 12. März 2024_0002.jpg
Analyzing: Hitzige Debatte über Alkohol-Konsum in Deutschland ｜ Markus Lanz vom 12. März 2024_0003.jpg
Analyzing: Hitzige Debatte über Alkohol-Konsum in Deutschland ｜ Markus Lanz vom 12. März 2024_0004.jpg
Analyzing: Hitzige Debatte über Alkohol-Konsum in Deutschland ｜ Markus Lanz vom 12. März 2024_0005.jpg
Analyzing: Hitzige Debatte über Alkohol-Konsum in Deutschland ｜ Markus Lanz vom 12. März 2024_0006.jpg
Analyzing: Hitzige Debatte über Alkohol-Konsum in Deutschland ｜ Markus Lanz vom 12. März 2024_0007.jpg
Analyzing: Hitzige Debatte über Alkohol-Konsum in Deutschland ｜ Markus Lanz vom 12. März 2024_0008.jpg
Analyzing: Hitzige Debatte über Alkohol-Konsum in Deutschland ｜ Markus Lanz vom 12. März 2024_0009.jpg
Analyzing: Hitzige Debatte über Alkohol-Konsum in Deutschland ｜ Markus La

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Analyzing: Hitzige Debatte über Alkohol-Konsum in Deutschland ｜ Markus Lanz vom 12. März 2024_0011.jpg
Analyzing: Hitzige Debatte über Alkohol-Konsum in Deutschland ｜ Markus Lanz vom 12. März 2024_0012.jpg
Analyzing: Hitzige Debatte über Alkohol-Konsum in Deutschland ｜ Markus Lanz vom 12. März 2024_0013.jpg
Analyzing: Hitzige Debatte über Alkohol-Konsum in Deutschland ｜ Markus Lanz vom 12. März 2024_0014.jpg
Analyzing: Hitzige Debatte über Alkohol-Konsum in Deutschland ｜ Markus Lanz vom 12. März 2024_0015.jpg
Analyzing: Hitzige Debatte über Alkohol-Konsum in Deutschland ｜ Markus Lanz vom 12. März 2024_0016.jpg
Analyzing: Hitzige Debatte über Alkohol-Konsum in Deutschland ｜ Markus Lanz vom 12. März 2024_0017.jpg
Analyzing: Hitzige Debatte über Alkohol-Konsum in Deutschland ｜ Markus Lanz vom 12. März 2024_0018.jpg
Analyzing: Hitzige Debatte über Alkohol-Konsum in Deutschland ｜ Markus Lanz vom 12. März 2024_0019.jpg
Analyzing: Hitzige Debatte über Alkohol-Konsum in Deutschland ｜ Markus La

# 7. Summarize Video Clips(Texts)

In [2]:
from vast.text_summarizer import summarize_sections

summarize_sections(
    subtitles_json="data/subtitles/Hitzige Debatte über Alkohol-Konsum in Deutschland ｜ Markus Lanz vom 12. März 2024_subtitles.json",
    segments_json="data/sections/scene_segments.json",
    output_json="data/text_analysis/text_summaries.json",
    summarizer_model="facebook/bart-large-cnn",
    language="de"
)


  from .autonotebook import tqdm as notebook_tqdm


Loaded 848 subtitles and 8 segments
Loading summarization model: ml6team/mt5-small-german-finetune-mlsum


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Device set to use cuda:0
Both `max_new_tokens` (=256) and `max_length`(=120) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=256) and `max_length`(=120) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Section 0: summarized 1883 words.


Both `max_new_tokens` (=256) and `max_length`(=120) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Section 1: summarized 156 words.


Both `max_new_tokens` (=256) and `max_length`(=120) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Section 2: summarized 737 words.


Both `max_new_tokens` (=256) and `max_length`(=120) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Section 3: summarized 150 words.


Both `max_new_tokens` (=256) and `max_length`(=120) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Section 4: summarized 347 words.


Both `max_new_tokens` (=256) and `max_length`(=120) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Section 5: summarized 133 words.


Both `max_new_tokens` (=256) and `max_length`(=120) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Section 6: summarized 2166 words.
Section 7: summarized 167 words.
Summaries saved to data/text_analysis/text_summaries.json


[{'start': 0.0,
  'end': 660.0,
  'text': 'Vanessa Grasnickl ist bei uns und Natalie Stüben sitzt bei uns. Und ich würde zunächst mal gerne Frau Grasnickl fragen, weil es um die Frage geht, wie verhält man sich zu verschiedenen Themen? Gibt gerade große Aufregung und vielleicht können wir ihren aktuellen Tweet mal zum Thema Cannabis-Freikabel zeigen. Ich weiß nicht endlich, warum ich auf Frau Stülm-Media bin, weil Sie sie mal lesen, Herr Lanz. Ich bin Ihr treuester Fan. Selten kriegt ihr mal so viel Stofffrahlhaus und in dem Sinne auch noch doppeldeutig gemeint. Frau Grasnickl, wenn Sie dann so etwas lesen wie diesen Tweet hier, vielleicht können wir da einmal ganz kurz zuspielen. Wenn es nach der Ampel geht, dann darf man mit 30 Gramm Cannabis auf dem Spielplatz Kinder zukiffen. Der Staat hat einen Schutzversprechen vor allem Kindern und Jugendlichen gegenüber und so weiter und so weiter. Die Legalisierung darf nicht kommen. Dann habe ich Sie richtig verstanden, auch in Interviews, we

# 8. Generate Narrations

In [3]:
from vast.narration_generator import generate_narration_from_summaries

generate_narration_from_summaries(
    summaries_json="data/text_analysis/text_summaries.json",
    output_dir="data/audio",
    lang="de"  # 德语语音
)


Generating narration for 8 sections...


Generating narration:  12%|█▎        | 1/8 [00:01<00:09,  1.37s/it]

Saved narration: scene_000.mp3


Generating narration:  25%|██▌       | 2/8 [00:04<00:14,  2.38s/it]

Saved narration: scene_001.mp3


Generating narration:  38%|███▊      | 3/8 [00:06<00:10,  2.15s/it]

Saved narration: scene_002.mp3


Generating narration:  50%|█████     | 4/8 [00:07<00:06,  1.74s/it]

Saved narration: scene_003.mp3


Generating narration:  62%|██████▎   | 5/8 [00:08<00:04,  1.34s/it]

Saved narration: scene_004.mp3


Generating narration:  75%|███████▌  | 6/8 [00:09<00:02,  1.41s/it]

Saved narration: scene_005.mp3


Generating narration:  88%|████████▊ | 7/8 [00:11<00:01,  1.43s/it]

Saved narration: scene_006.mp3


Generating narration: 100%|██████████| 8/8 [00:12<00:00,  1.54s/it]

Saved narration: scene_007.mp3
Narration metadata saved to data/audio/narration_metadata.json





[{'start': 0.0,
  'end': 660.0,
  'text': 'Vanessa Grasnickl ist bei uns und Natalie Stüben sitzt bei uns. Und ich würde zunächst mal gerne Frau Grasnickl fragen, weil es um die Frage geht, wie verhält man sich zu verschiedenen Themen? Gibt gerade große Aufregung und vielleicht können wir ihren aktuellen Tweet mal zum Thema Cannabis-Freikabel zeigen. Ich weiß nicht endlich, warum ich auf Frau Stülm-Media bin, weil Sie sie mal lesen, Herr Lanz. Ich bin Ihr treuester Fan. Selten kriegt ihr mal so viel Stofffrahlhaus und in dem Sinne auch noch doppeldeutig gemeint. Frau Grasnickl, wenn Sie dann so etwas lesen wie diesen Tweet hier, vielleicht können wir da einmal ganz kurz zuspielen. Wenn es nach der Ampel geht, dann darf man mit 30 Gramm Cannabis auf dem Spielplatz Kinder zukiffen. Der Staat hat einen Schutzversprechen vor allem Kindern und Jugendlichen gegenüber und so weiter und so weiter. Die Legalisierung darf nicht kommen. Dann habe ich Sie richtig verstanden, auch in Interviews, we

# 9. Generate Sign Language

In [4]:
import json
print(json.dumps(json.load(open("test.ipynb")), indent=2, ensure_ascii=False))


{
  "cells": [
    {
      "cell_type": "markdown",
      "id": "c8ddcb11",
      "metadata": {},
      "source": [
        "# 1. Load Configuration File"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 3,
      "id": "d3ae3af5",
      "metadata": {},
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "https://www.youtube.com/watch?v=oXjxK_X1U28\n",
            "small\n",
            "de\n"
          ]
        }
      ],
      "source": [
        "from pathlib import Path\n",
        "from box import Box  \n",
        "from vast.utils import load_yaml\n",
        "\n",
        "# Read configuration file\n",
        "cfg = Box(load_yaml(\"config.yaml\"))\n",
        "\n",
        "# Access configuration parameters\n",
        "print(cfg.video_downloader.video_url)\n",
        "\n",
        "print(cfg.subtitle_generator.model.whisper_size)\n",
        "print(cfg.subtitle_generator.model.lang