# Применение Whisper для распознавания текста по видео


In [2]:
!pip install -q git+https://github.com/openai/whisper.git

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m65.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m126.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m96.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m57.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m6.5 MB/s[0m eta [36m0:00

In [3]:
from google.colab import files
import whisper
import os
import torch
from tqdm import tqdm
import time

In [4]:
# Paths used throughout the notebook
VIDEO_DIR = "dataset"  # directory the uploaded videos are moved into and read from
OUTPUT_DIR = "transcriptions"  # directory the transcription files are written to
os.makedirs(OUTPUT_DIR, exist_ok=True)  # create the output directory; no error if it already exists

In [5]:
# Interactive file upload through Colab; this step is fairly slow
uploaded = files.upload()

# Make sure the destination directory exists before moving anything into it
os.makedirs(VIDEO_DIR, exist_ok=True)

# Move every uploaded file into VIDEO_DIR and report its name
for filename in uploaded:
    os.rename(filename, os.path.join(VIDEO_DIR, filename))
    print(f"Загружено: {filename}")

KeyboardInterrupt: 

In [6]:
# Extra sanity check that a video file is actually present
def check_directory(directory_path):
    """Verify that ``directory_path`` is a non-empty directory holding an .mp4 file.

    The extension is compared case-insensitively and only regular files count
    (a sub-directory whose name ends in ".mp4" does not). Prints a confirmation
    message when the check succeeds.

    Args:
        directory_path: Path of the directory to inspect.

    Raises:
        FileNotFoundError: The path does not exist or is not a directory.
        ValueError: The directory is empty, or it contains no .mp4 file.
    """
    # Guard: the path must be an existing directory
    if not os.path.isdir(directory_path):
        raise FileNotFoundError(f"Директория '{directory_path}' не существует или не является директорией")

    # Guard: the directory must contain something
    entries = os.listdir(directory_path)
    if len(entries) == 0:
        raise ValueError("Директория пуста")

    # Stop at the first regular file with an .mp4 extension
    for entry in entries:
        if os.path.isfile(os.path.join(directory_path, entry)) and entry.lower().endswith(".mp4"):
            print("Проверка пройдена")
            return

    raise ValueError("Отсутствует видео для обработки")

In [9]:
# Run the directory check. Any exception it raises is printed instead of being
# re-raised, so a failed check does not stop the cells that follow.
try:
    check_directory(VIDEO_DIR)
except Exception as e:
    print(f"Ошибка: {e}")

Проверка пройдена


In [10]:
# Load the Whisper model; after several trial runs "turbo" was chosen as the best variant.
# CUDA is preferred whenever it is available because it speeds the work up considerably;
# otherwise the model is loaded on the CPU.
model = whisper.load_model(
    "turbo",
    device="cuda" if torch.cuda.is_available() else "cpu",
)

100%|█████████████████████████████████████| 1.51G/1.51G [00:26<00:00, 60.1MiB/s]


In [11]:
def format_timestamp(seconds: float) -> str:
    """Format a time offset in seconds as an SRT timestamp ``hh:mm:ss,mmm``.

    The value is rounded to the nearest millisecond before being split into
    fields, so float noise cannot drop a millisecond (1.005 -> ``,005``) and a
    value that rounds up carries into the seconds/minutes/hours fields.
    Milliseconds are always zero-padded to three digits, as SRT requires.

    Args:
        seconds: Offset from the start of the media, in seconds. Negative
            values are clamped to zero.

    Returns:
        The timestamp string, e.g. ``"01:01:01,500"`` for ``3661.5``.
    """
    # Work in whole milliseconds to avoid float truncation errors
    total_millis = max(0, int(round(seconds * 1000)))
    hours, remainder = divmod(total_millis, 3600 * 1000)
    minutes, remainder = divmod(remainder, 60 * 1000)
    whole_seconds, millis = divmod(remainder, 1000)
    return f"{hours:02}:{minutes:02}:{whole_seconds:02},{millis:03}"

In [15]:
# Legacy function: plain-text transcript only
def transcribe_video(video_path):
    """Transcribe one video with the global ``model`` and save the text to a file.

    The transcript is written to ``OUTPUT_DIR/<video base name>.txt``. The base
    name is obtained with ``os.path.splitext`` so only the final extension is
    replaced, whatever its case, and ".mp4" occurring elsewhere in the file name
    is left intact.

    Args:
        video_path: Path to the video file to transcribe.
    """
    # Speech recognition (Whisper), Russian language, temperature fixed at 0.0
    result = model.transcribe(
        video_path,
        language="ru",
        temperature=0.0
    )

    # Save the recognised text
    base_filename = os.path.splitext(os.path.basename(video_path))[0]
    text_file = os.path.join(OUTPUT_DIR, base_filename + ".txt")
    with open(text_file, "w", encoding="utf-8") as f:
        f.write(result["text"])

In [12]:
def transcribe_with_progress(video_path):
    """Transcribe one video with the global ``model`` and save TXT and SRT files.

    The whole file is transcribed first (Russian, temperature 0.0). The resulting
    segments are then written to ``OUTPUT_DIR`` as ``<video base name>.txt`` (one
    segment per line) and ``<video base name>.srt`` (numbered cues with
    timestamps). The progress bar covers only this writing phase.

    Args:
        video_path: Path to the video file to transcribe.
    """
    print(f"Обработка: {video_path}")
    result = model.transcribe(video_path, language="ru", temperature=0.0, verbose=False)

    segments = result["segments"]
    segment_count = len(segments)
    stem = os.path.splitext(os.path.basename(video_path))[0]

    # Both outputs share the video's base name and differ only in extension
    txt_path = os.path.join(OUTPUT_DIR, f"{stem}.txt")
    srt_path = os.path.join(OUTPUT_DIR, f"{stem}.srt")

    started_at = time.time()

    with open(txt_path, "w", encoding="utf-8") as txt_file, \
         open(srt_path, "w", encoding="utf-8") as srt_file, \
         tqdm(total=segment_count) as pbar:

        # Cue numbers in the SRT file start at 1
        for cue_number, segment in enumerate(segments, start=1):
            cue_start = format_timestamp(segment["start"])
            cue_end = format_timestamp(segment["end"])
            cue_text = segment["text"].strip()

            txt_file.write(f"{cue_text}\n")
            srt_file.write(f"{cue_number}\n{cue_start} --> {cue_end}\n{cue_text}\n\n")

            # Manual ETA: average time per written segment times the segments left
            seconds_per_segment = (time.time() - started_at) / cue_number
            remaining = seconds_per_segment * (segment_count - cue_number)
            pbar.update(1)
            pbar.set_postfix_str(f"ETA: {int(remaining)}s")

    print(f"\nВыполнено\nTXT: {txt_path}\nSRT: {srt_path}")


In [13]:
# Pick the first .mp4 found in VIDEO_DIR. The extension is compared
# case-insensitively so that this agrees with check_directory(), which also
# accepts names such as "CLIP.MP4".
video_file = next((f for f in os.listdir(VIDEO_DIR) if f.lower().endswith(".mp4")), None)

if video_file:
    video_path = os.path.join(VIDEO_DIR, video_file)
    transcribe_with_progress(video_path)
else:
    print("Нет видео для обработки")

Обработка: dataset/video.mp4


100%|██████████| 28723/28723 [00:17<00:00, 1682.28frames/s]
100%|██████████| 67/67 [00:00<00:00, 1506.11it/s, ETA: 0s]


Выполнено
TXT: transcriptions/video.txt
SRT: transcriptions/video.srt





In [14]:
# Quick download of every file in OUTPUT_DIR; a temporary helper for working with git
for filename in os.listdir(OUTPUT_DIR):
    filepath = os.path.join(OUTPUT_DIR, filename)
    files.download(filepath)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [16]:
# Just in case, process every .mp4 video in the directory.
# Outdated approach, kept for now for the sake of the loop.
# The extension is compared case-insensitively, consistent with check_directory().
for video_file in os.listdir(VIDEO_DIR):
    if video_file.lower().endswith(".mp4"):
        video_path = os.path.join(VIDEO_DIR, video_file)
        transcribe_video(video_path)
        print(f"Обработан файл {video_file}")

Обработан файл video2text.mp4
Обработан файл video.mp4


# Не очень удачные эксперименты с моделью faster-whisper


In [4]:
pip install faster-whisper==0.7.1 --extra-index-url https://huggingface.github.io/transformers/whl/cu118

Looking in indexes: https://pypi.org/simple, https://huggingface.github.io/transformers/whl/cu118


In [3]:
!pip install faster-whisper==0.7.1 --extra-index-url https://huggingface.github.io/transformers/whl/cu118
!pip install tqdm

Looking in indexes: https://pypi.org/simple, https://huggingface.github.io/transformers/whl/cu118


In [5]:
from faster_whisper import WhisperModel

In [6]:
# Load the faster-whisper "small" model. Fall back to the CPU when CUDA is not
# available, mirroring how the openai-whisper model is loaded earlier in this
# notebook. float16 is requested only on the GPU; int8 is used on the CPU.
_fast_device = "cuda" if torch.cuda.is_available() else "cpu"
model_fast = WhisperModel(
    "small",
    device=_fast_device,
    compute_type="float16" if _fast_device == "cuda" else "int8",
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [7]:
def transcribe_video_fast(video_path):
    """Transcribe one video with the global ``model_fast`` and save the text to a file.

    Each segment is written on its own line, stripped of surrounding whitespace,
    to ``OUTPUT_DIR/<video base name>.txt``. The base name is obtained with
    ``os.path.splitext`` so only the final extension is replaced, whatever its
    case. The progress bar advances by the amount of media time each segment
    covers.

    Args:
        video_path: Path to the video file to transcribe.
    """
    segments, info = model_fast.transcribe(video_path, language="ru", beam_size=5)

    # The progress bar is measured in seconds of media, up to the reported duration
    total_duration = info.duration
    current_duration = 0.0

    base_filename = os.path.splitext(os.path.basename(video_path))[0]
    text_file = os.path.join(OUTPUT_DIR, base_filename + ".txt")
    with open(text_file, "w", encoding="utf-8") as f, tqdm(total=total_duration, unit="sec") as pbar:
        for segment in segments:
            f.write(segment.text.strip() + "\n")
            # Advance the bar by the time elapsed since the previous segment's end
            pbar.update(segment.end - current_duration)
            current_duration = segment.end

In [None]:
# Pick the first .mp4 found in VIDEO_DIR. The extension is compared
# case-insensitively, consistent with check_directory().
video_file = next((f for f in os.listdir(VIDEO_DIR) if f.lower().endswith(".mp4")), None)
if video_file:
    video_path = os.path.join(VIDEO_DIR, video_file)
    transcribe_video_fast(video_path)
    print(f"Обработан файл {video_file}")
else:
    print("В директории нет .mp4 файлов.")

  0%|          | 0/451.8613125 [00:00<?, ?sec/s]

In [2]:
import torch
# The cell output shows whether PyTorch reports an available CUDA device
torch.cuda.is_available()

True

In [None]:
import os
from tqdm import tqdm
from faster_whisper import WhisperModel

# Paths
VIDEO_DIR = "dataset"  # directory scanned for .mp4 files further down in this cell
OUTPUT_DIR = "transcriptions"  # directory the .txt transcripts are written to
os.makedirs(OUTPUT_DIR, exist_ok=True)  # create the output directory; no error if it already exists

import torch  # imported here so the device check also works when this cell is run on its own

# Load the model on the GPU if one is available, otherwise on the CPU.
# float16 is requested only on the GPU; int8 is used on the CPU.
# NOTE: this rebinds the global name `model`, which earlier in the notebook holds
# the openai-whisper model; cells that rely on that model must not run after this one.
_device = "cuda" if torch.cuda.is_available() else "cpu"
model = WhisperModel(
    "small",
    device=_device,
    compute_type="float16" if _device == "cuda" else "int8",
)

def transcribe_video(video_path):
    """Transcribe one video with the global ``model`` and save the text to a file.

    Each segment is written on its own line, stripped of surrounding whitespace,
    to ``OUTPUT_DIR/<video base name>.txt``. The base name is obtained with
    ``os.path.splitext`` so only the final extension is replaced, whatever its
    case. The progress bar advances by the amount of media time each segment
    covers.

    NOTE: an earlier cell of this notebook defines another ``transcribe_video``
    (the openai-whisper variant); whichever definition ran last is the one in effect.

    Args:
        video_path: Path to the video file to transcribe.
    """
    segments, info = model.transcribe(video_path, language="ru", beam_size=5)

    # The progress bar is measured in seconds of media, up to the reported duration
    total_duration = info.duration
    current_time = 0.0

    base_filename = os.path.splitext(os.path.basename(video_path))[0]
    text_file = os.path.join(OUTPUT_DIR, base_filename + ".txt")
    with open(text_file, "w", encoding="utf-8") as f, tqdm(total=total_duration, unit="sec") as pbar:
        for segment in segments:
            f.write(segment.text.strip() + "\n")
            # Advance the bar by the time elapsed since the previous segment's end
            pbar.update(segment.end - current_time)
            current_time = segment.end


# Pick the first .mp4 found in VIDEO_DIR (extension compared case-insensitively)
video_file = next((f for f in os.listdir(VIDEO_DIR) if f.lower().endswith(".mp4")), None)

if video_file:
    video_path = os.path.join(VIDEO_DIR, video_file)
    print(f"Обрабатываем: {video_path}")
    transcribe_video(video_path)
    print(f"Готово: {video_file}")
else:
    # Report the directory that was actually searched rather than a hard-coded name
    print(f"Нет .mp4 файлов в директории {VIDEO_DIR}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Обрабатываем: dataset/video2text.mp4


  0%|          | 0/451.8613125 [00:00<?, ?sec/s]