# Применение Whisper для распознавания текста по видео


In [7]:
# Install openai-whisper pinned to the commit this notebook was last run against,
# using the %pip magic so the package lands in the running kernel's environment.
%pip install git+https://github.com/openai/whisper.git@517a43ecd132a2089d85f4ebc044728a71d49f6e

Collecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to /tmp/pip-req-build-o8d97ruo
  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git /tmp/pip-req-build-o8d97ruo
  Resolved https://github.com/openai/whisper.git to commit 517a43ecd132a2089d85f4ebc044728a71d49f6e
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [8]:
import whisper
import os
import torch
from tqdm import tqdm

In [3]:
# Paths used by the notebook: VIDEO_DIR is scanned for .mp4 files,
# OUTPUT_DIR receives the generated .txt transcripts.
VIDEO_DIR = "dataset"
OUTPUT_DIR = "transcriptions"
# Create the output directory up front; exist_ok=True makes re-running this cell safe.
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [12]:
# Extra sanity check that there is a video file to process
def check_directory(directory_path: str) -> None:
    """Verify that ``directory_path`` is a directory with at least one ``.mp4`` file.

    The extension comparison is case-insensitive and only regular files count
    (a sub-directory named ``something.mp4`` does not). On success a
    confirmation message is printed.

    Args:
        directory_path: Path of the directory to inspect.

    Raises:
        FileNotFoundError: The path does not exist or is not a directory.
        ValueError: The directory is empty, or it holds no ``.mp4`` file.
    """
    # Guard: the path has to be an existing directory
    if not os.path.isdir(directory_path):
        raise FileNotFoundError(f"Директория '{directory_path}' не существует или не является директорией")

    # Guard: the directory has to contain something
    entries = os.listdir(directory_path)
    if len(entries) == 0:
        raise ValueError("Директория пуста")

    # Look for the first regular file whose name ends in .mp4 (any letter case)
    for entry in entries:
        looks_like_mp4 = entry.lower().endswith(".mp4")
        if looks_like_mp4 and os.path.isfile(os.path.join(directory_path, entry)):
            break
    else:
        # The loop finished without finding a video
        raise ValueError("Отсутствует видео для обработки")

    print("Проверка пройдена")

In [13]:
# Run the pre-flight check and report input-directory problems as a short
# message instead of a traceback. Only the errors check_directory is expected
# to produce are caught (OSError covers FileNotFoundError and os.listdir
# failures); unrelated mistakes, e.g. a NameError from a cell that was not
# run, are no longer hidden behind the "Ошибка" message.
try:
    check_directory(VIDEO_DIR)
except (OSError, ValueError) as e:
    print(f"Ошибка: {e}")

Проверка пройдена


In [14]:
# Load the Whisper model; after several tests the "turbo" variant proved to be
# the best choice. The GPU is used when CUDA is available, otherwise the CPU.
model = whisper.load_model(
    "turbo",
    device="cuda" if torch.cuda.is_available() else "cpu",
)

In [15]:
def transcribe_video(video_path):
    """Transcribe one video with the loaded Whisper ``model`` and save the text.

    The recognised text is written as UTF-8 into ``OUTPUT_DIR``. The output
    file takes the video's base name with its final extension replaced by
    ``.txt``.

    Args:
        video_path: Path to the video file passed to ``model.transcribe``.
    """
    # Speech recognition (Whisper) with the language fixed to Russian
    result = model.transcribe(
        video_path,
        language="ru",
        temperature=0.0,
    )

    # Replace only the final extension, whatever its letter case.
    # str.replace(".mp4", ".txt") rewrote every ".mp4" inside the name and left
    # names such as "clip.MP4" untouched, so the transcript ended up in a file
    # carrying a video extension.
    stem, _ext = os.path.splitext(os.path.basename(video_path))
    text_file = os.path.join(OUTPUT_DIR, stem + ".txt")

    # Save the transcript
    with open(text_file, "w", encoding="utf-8") as f:
        f.write(result["text"])

In [16]:
# Process every .mp4 file found in the video directory, not just a single one
for video_file in os.listdir(VIDEO_DIR):
    # Skip anything that is not named *.mp4
    if not video_file.endswith(".mp4"):
        continue
    video_path = os.path.join(VIDEO_DIR, video_file)
    transcribe_video(video_path)
    print(f"Обработан файл {video_file}")

Обработан файл video2text.mp4
Обработан файл video.mp4


# Не очень удачные эксперименты с моделью faster-whisper


In [4]:
# Explicit %pip magic: does not rely on IPython automagic being enabled
%pip install faster-whisper==0.7.1 --extra-index-url https://huggingface.github.io/transformers/whl/cu118

Looking in indexes: https://pypi.org/simple, https://huggingface.github.io/transformers/whl/cu118


In [3]:
# %pip (rather than !pip) installs into the environment of the running kernel
%pip install faster-whisper==0.7.1 --extra-index-url https://huggingface.github.io/transformers/whl/cu118
%pip install tqdm

Looking in indexes: https://pypi.org/simple, https://huggingface.github.io/transformers/whl/cu118


In [5]:
from faster_whisper import WhisperModel

In [6]:
# Use the GPU when CUDA is available and fall back to the CPU otherwise, the
# same way the openai-whisper model is loaded earlier in the notebook.
# float16 is requested only on CUDA; on CPU int8 is used (the CPU setting
# shown in the faster-whisper README).
_fast_on_cuda = torch.cuda.is_available()
model_fast = WhisperModel(
    "small",
    device="cuda" if _fast_on_cuda else "cpu",
    compute_type="float16" if _fast_on_cuda else "int8",
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [7]:
def transcribe_video_fast(video_path):
    """Transcribe one video with ``model_fast`` (faster-whisper), showing progress.

    Segments returned by ``model_fast.transcribe`` are written one per line to
    a UTF-8 text file in ``OUTPUT_DIR``. The file takes the video's base name
    with its final extension replaced by ``.txt``. A tqdm bar tracks how many
    seconds of the media have been covered so far.

    Args:
        video_path: Path to the video file passed to ``model_fast.transcribe``.
    """
    segments, info = model_fast.transcribe(video_path, language="ru", beam_size=5)

    # The progress bar is measured in seconds of media, not in segment count
    total_duration = info.duration
    current_duration = 0.0

    # Replace only the final extension, whatever its letter case.
    # str.replace(".mp4", ".txt") rewrote every ".mp4" inside the name and left
    # names such as "clip.MP4" untouched, so the transcript ended up in a file
    # carrying a video extension.
    stem, _ext = os.path.splitext(os.path.basename(video_path))
    text_file = os.path.join(OUTPUT_DIR, stem + ".txt")
    with open(text_file, "w", encoding="utf-8") as f, tqdm(total=total_duration, unit="sec") as pbar:
        for segment in segments:
            f.write(segment.text + "\n")
            # Advance the bar up to the end time of this segment
            pbar.update(segment.end - current_duration)
            current_duration = segment.end

In [None]:
# Take the first *.mp4 entry of the video directory (None when there is none)
video_file = next(
    filter(lambda name: name.endswith(".mp4"), os.listdir(VIDEO_DIR)),
    None,
)
if video_file is None:
    print("В директории нет .mp4 файлов.")
else:
    video_path = os.path.join(VIDEO_DIR, video_file)
    transcribe_video_fast(video_path)
    print(f"Обработан файл {video_file}")

  0%|          | 0/451.8613125 [00:00<?, ?sec/s]

In [2]:
# Diagnostic cell: ask PyTorch whether a CUDA device is available
# (the bare last expression is displayed as the cell output).
import torch
torch.cuda.is_available()

True

In [None]:
import os
from tqdm import tqdm
from faster_whisper import WhisperModel

# Paths: VIDEO_DIR is scanned for .mp4 files, OUTPUT_DIR receives the .txt transcripts
VIDEO_DIR = "dataset"
OUTPUT_DIR = "transcriptions"
# Create the output directory if needed; exist_ok=True makes re-running safe
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Load the model: use the GPU when CUDA is available, otherwise fall back to
# the CPU. float16 is requested only on CUDA; on CPU int8 is used (the CPU
# setting shown in the faster-whisper README).
_cuda_available = torch.cuda.is_available()
model = WhisperModel(
    "small",
    device="cuda" if _cuda_available else "cpu",
    compute_type="float16" if _cuda_available else "int8",
)

def transcribe_video(video_path):
    """Transcribe one video with the faster-whisper ``model``, showing progress.

    Each segment's text is stripped of surrounding whitespace and written on
    its own line to a UTF-8 file in ``OUTPUT_DIR``. The file takes the video's
    base name with its final extension replaced by ``.txt``. A tqdm bar tracks
    how many seconds of the media have been covered so far.

    Note: this shares its name with the openai-whisper helper defined earlier
    in the notebook, so running this cell rebinds ``transcribe_video``.

    Args:
        video_path: Path to the video file passed to ``model.transcribe``.
    """
    segments, info = model.transcribe(video_path, language="ru", beam_size=5)

    # The progress bar is measured in seconds of media, not in segment count
    total_duration = info.duration
    current_time = 0.0

    # Replace only the final extension, whatever its letter case.
    # str.replace(".mp4", ".txt") rewrote every ".mp4" inside the name and left
    # names such as "clip.MP4" untouched, so the transcript ended up in a file
    # carrying a video extension.
    stem, _ext = os.path.splitext(os.path.basename(video_path))
    text_file = os.path.join(OUTPUT_DIR, stem + ".txt")
    with open(text_file, "w", encoding="utf-8") as f, tqdm(total=total_duration, unit="sec") as pbar:
        for segment in segments:
            f.write(segment.text.strip() + "\n")
            # Advance the bar up to the end time of this segment
            pbar.update(segment.end - current_time)
            current_time = segment.end


# Take the first *.mp4 entry of the video directory (None when there is none)
video_file = next(
    filter(lambda name: name.endswith(".mp4"), os.listdir(VIDEO_DIR)),
    None,
)

if video_file is None:
    print("Нет .mp4 файлов в директории dataset")
else:
    video_path = os.path.join(VIDEO_DIR, video_file)
    print(f"Обрабатываем: {video_path}")
    transcribe_video(video_path)
    print(f"Готово: {video_file}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Обрабатываем: dataset/video2text.mp4


  0%|          | 0/451.8613125 [00:00<?, ?sec/s]