In [None]:
!pip install -U openai-whisper faster-whisper tqdm pandas requests imageio-ffmpeg

Collecting openai-whisper
  Downloading openai_whisper-20250625.tar.gz (803 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/803.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m803.2/803.2 kB[0m [31m54.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting pandas
  Downloading pandas-2.3.3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (91 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.2/91.2 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
Collecting requests
  Downloading requests-2.32.5-py3-none-any.whl.metadata (4.9 kB)
Downloading pandas-2.3.3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (12.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.4/12.4 MB[0m [31m1

In [3]:
import os
import platform
from pathlib import Path

import torch
import imageio_ffmpeg

# Report the interpreter, the PyTorch build and the accelerator this kernel sees.
print('Python:', platform.python_version())
print('PyTorch:', torch.__version__)
print('CUDA available:', torch.cuda.is_available())
print('GPU name:', torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'CPU')

# Locate the ffmpeg binary that ships with imageio-ffmpeg and validate it
# BEFORE touching PATH, so a broken install never leaves PATH half-modified.
ffmpeg_path = imageio_ffmpeg.get_ffmpeg_exe()
print('ffmpeg:', ffmpeg_path)
assert Path(ffmpeg_path).exists(), 'ffmpeg не найден'

# Put the binary's directory at the front of PATH. Any earlier occurrence of the
# same directory is dropped first, so re-running this cell does not keep growing
# PATH with duplicate entries.
# NOTE(review): the recorded output shows a versioned file name
# (ffmpeg-linux-x86_64-v7.0.2), so this directory does not by itself provide a
# plain `ffmpeg` command — confirm whether any consumer needs one.
ffmpeg_dir = str(Path(ffmpeg_path).parent)
other_entries = [
    entry
    for entry in os.environ.get('PATH', '').split(os.pathsep)
    if entry != ffmpeg_dir
]
os.environ['PATH'] = os.pathsep.join([ffmpeg_dir, *other_entries])


Python: 3.12.12
PyTorch: 2.8.0+cu126
CUDA available: True
GPU name: Tesla T4
ffmpeg: /usr/local/lib/python3.12/dist-packages/imageio_ffmpeg/binaries/ffmpeg-linux-x86_64-v7.0.2


In [None]:
from pathlib import Path
import os
import pandas as pd
from tqdm.notebook import tqdm

# Paths
# DATASET_PATH is the source CSV; PROGRESS_PATH and ERRORS_PATH are the output
# CSVs for finished transcripts and for failed rows.
DATASET_PATH = Path('/content/dataset_rasti_v_it.csv')
PROGRESS_PATH = Path('data/transcripts_progress.csv')
ERRORS_PATH = Path('data/transcripts_errors.csv')

# Column names of the source CSV
COL_ID_EXAM = 'Id экзамена'
COL_ID_QUESTION = 'Id вопроса'
COL_NUM_QUESTION = '№ вопроса'
COL_TRANSCRIPT = 'Транскрибация ответа'
COL_URL = 'Ссылка на оригинальный файл записи'

# Columns of the output CSV (as requested by the user)
OUTPUT_COLUMNS = [
    COL_ID_EXAM,
    COL_ID_QUESTION,
    COL_NUM_QUESTION,
    COL_TRANSCRIPT,
    COL_URL,
]

# Transcription settings.
# DEVICE relies on `torch`, which this cell does not import itself; it is
# imported by the environment-check cell earlier in the notebook.
LANGUAGE = 'ru'
WHISPER_MODEL_NAME = 'small'  # chosen for speed; switch back to 'medium' if quality matters more
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
FP16 = (DEVICE == 'cuda')
NUM_WORKERS = os.cpu_count() or 4  # use every CPU core (passed to WhisperModel as num_workers)
THREAD_COUNT = 4  # as requested (passed to WhisperModel as cpu_threads)
# BATCH_SIZE sizes the download thread pool and the HTTP connection pool.
# NOTE(review): the original comment says an A100 can handle this, but the
# environment-check output recorded in this notebook reports a Tesla T4 — confirm.
BATCH_SIZE = 128  # as requested (original note: an A100 can handle it)
VAD_MIN_SILENCE_MS = 400  # trim silence more aggressively to speed things up

# Create the output directories if they do not exist yet
for p in [PROGRESS_PATH.parent, ERRORS_PATH.parent]:
    p.mkdir(parents=True, exist_ok=True)

print('Device:', DEVICE, 'fp16:', FP16, 'num_workers:', NUM_WORKERS, 'thread_count:', THREAD_COUNT, 'batch_size:', BATCH_SIZE, 'vad_min_silence_ms:', VAD_MIN_SILENCE_MS, 'model:', WHISPER_MODEL_NAME)


Device: cuda fp16: True


In [7]:
import pandas as pd

# Fail early with a readable message when the source CSV is not where we expect it.
assert DATASET_PATH.exists(), f'Файл не найден: {DATASET_PATH}'

# Read every column as text and turn missing cells into empty strings in one pass.
df = pd.read_csv(
    DATASET_PATH,
    sep=';',
    encoding='utf-8',
    dtype=str,
).fillna('')

# The pipeline cannot run without the identifying columns and the audio URL.
required_cols = {COL_ID_EXAM, COL_ID_QUESTION, COL_NUM_QUESTION, COL_URL}
missing = required_cols - set(df.columns)
assert not missing, f'В датасете отсутствуют столбцы: {missing}'

print('Строк в исходном датасете:', len(df))
df.head(3)


Строк в исходном датасете: 9798


Unnamed: 0,Id экзамена,Id вопроса,№ вопроса,Текст вопроса,Картинка из вопроса,Оценка экзаменатора,Транскрибация ответа,Ссылка на оригинальный файл записи
0,3373871,30625752,1,<p>Добро пожаловать на экзамен! <br>Вопрос уст...,,1,Ваше первое задание. Вам нужно начать диалог. ...,https://storage.yandexcloud.net/odin-exam-file...
1,3373871,30625753,2,<p>Вопрос устной части экзамена. Примите участ...,,2,Теперь вам нужно принять участие в диалоге. От...,https://storage.yandexcloud.net/odin-exam-file...
2,3373871,30625754,3,<p>Вопрос устной части экзамена. Начните диало...,,1,"Получите нужную вам информацию, будьте вежливы...",https://storage.yandexcloud.net/odin-exam-file...


In [8]:
import pandas as pd

# Progress file: create it with just a header on the first run; otherwise load
# it and collect the (exam id, question id, question number) keys already done.
if not PROGRESS_PATH.exists():
    pd.DataFrame(columns=OUTPUT_COLUMNS).to_csv(
        PROGRESS_PATH, sep=';', encoding='utf-8', index=False
    )
    processed_keys = set()
else:
    progress_df = pd.read_csv(PROGRESS_PATH, sep=';', encoding='utf-8', dtype=str).fillna('')
    # A column absent from the file contributes an empty sequence, which makes
    # zip() yield no keys at all.
    processed_keys = set(
        zip(*(
            progress_df.get(col, [])
            for col in (COL_ID_EXAM, COL_ID_QUESTION, COL_NUM_QUESTION)
        ))
    )

# Error file: only needs to exist with a header so later appends line up.
if not ERRORS_PATH.exists():
    pd.DataFrame(
        columns=[COL_ID_EXAM, COL_ID_QUESTION, COL_NUM_QUESTION, COL_URL, 'error']
    ).to_csv(ERRORS_PATH, sep=';', encoding='utf-8', index=False)

print('Уже обработано (по прогрессу):', len(processed_keys))


Уже обработано (по прогрессу): 0


In [None]:
from faster_whisper import WhisperModel

# Half precision only when running on CUDA with FP16 enabled; int8 otherwise.
if DEVICE == 'cuda' and FP16:
    compute_type = 'float16'
else:
    compute_type = 'int8'

# Build the faster-whisper model from the configuration-cell settings.
model = WhisperModel(
    WHISPER_MODEL_NAME,
    device=DEVICE,
    compute_type=compute_type,
    cpu_threads=THREAD_COUNT,
    num_workers=NUM_WORKERS,
)
print(f"Загружена модель (faster-whisper): {WHISPER_MODEL_NAME} на {DEVICE}, compute_type={compute_type}, cpu_threads={THREAD_COUNT}, num_workers={NUM_WORKERS}")


100%|██████████████████████████████████████| 1.42G/1.42G [00:03<00:00, 390MiB/s]


Загружена модель: medium на cuda


In [None]:
import tempfile
import requests
import os
import pandas as pd
import sys
import urllib3
import warnings

# Silence every urllib3 warning (connection-pool warnings included).
urllib3.disable_warnings()
warnings.filterwarnings('ignore', module='urllib3')

CHUNK_SIZE = 4 * 1024 * 1024  # 4 MB per streamed chunk, for faster downloads

# One shared session whose connection pool is enlarged for parallel downloads;
# the same adapter serves both plain and TLS URLs.
SESSION = requests.Session()
adapter = requests.adapters.HTTPAdapter(
    pool_connections=BATCH_SIZE * 2,
    pool_maxsize=BATCH_SIZE * 2,
    max_retries=3,
)
SESSION.mount('http://', adapter)
SESSION.mount('https://', adapter)


def download_to_temp(url: str, suffix: str = '.mp3', timeout: int = 60) -> str:
    """Stream ``url`` into a new temporary file and return that file's path.

    The file is created with ``delete=False``, so the caller owns it and must
    remove it after use. If the request or the streaming fails, the partially
    written file is removed here before the exception propagates; the caller
    never learns the path in that case and could not clean it up itself.

    Args:
        url: Address of the file to download.
        suffix: File-name suffix given to the temporary file.
        timeout: Timeout in seconds passed to the HTTP request.

    Returns:
        Path of the downloaded temporary file.

    Raises:
        Whatever the HTTP session or the file write raises (for example an
        HTTP error status surfaced by ``raise_for_status``).
    """
    temp_path = None
    try:
        with SESSION.get(url, stream=True, timeout=timeout) as r:
            r.raise_for_status()
            with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as f:
                # Record the path first so a mid-stream failure can still clean up.
                temp_path = f.name
                for chunk in r.iter_content(chunk_size=CHUNK_SIZE):
                    if chunk:
                        f.write(chunk)
    except BaseException:
        # The file handle is already closed here; drop the partial download.
        if temp_path is not None:
            try:
                os.remove(temp_path)
            except OSError:
                pass  # best effort: nothing more we can do about a stuck temp file
        raise
    return temp_path


def transcribe_path(model, audio_path: str, language: str = None) -> str:
    """Transcribe one audio file and return the recognised text as one string.

    Everything written to stdout/stderr while the model runs is discarded, to
    hide internal progress output. The previous streams are restored on exit,
    including when the model raises.

    Each segment's text is stripped before joining and empty segments are
    dropped, so the result has exactly one space between segments regardless of
    whether a segment carries its own leading or trailing whitespace.

    Args:
        model: Object exposing ``transcribe(audio_path, **options)`` that
            returns a ``(segments, info)`` pair, where every segment has a
            ``text`` attribute.
        audio_path: Path of the audio file to transcribe.
        language: Language code forwarded to the model; ``None`` is forwarded
            unchanged.

    Returns:
        The joined segment texts (an empty string when there are no segments).
    """
    import contextlib  # local import: keeps this function self-contained in its cell

    with open(os.devnull, 'w') as devnull, \
            contextlib.redirect_stdout(devnull), \
            contextlib.redirect_stderr(devnull):
        segments, _info = model.transcribe(
            audio_path,
            language=language,
            task='transcribe',
            beam_size=1,
            vad_filter=True,
            vad_parameters=dict(min_silence_duration_ms=VAD_MIN_SILENCE_MS),
            condition_on_previous_text=False,
            temperature=0.0,
        )
        # `segments` may be a lazy iterator, so consume it while output is
        # still silenced.
        pieces = [seg.text.strip() for seg in segments]

    return ' '.join(piece for piece in pieces if piece)


def append_row(row_dict: dict):
    """Append one record to the progress CSV.

    Only the OUTPUT_COLUMNS fields of ``row_dict`` are written, in that order;
    no header line is emitted because the file is opened in append mode.
    """
    single_row = pd.DataFrame([row_dict], columns=OUTPUT_COLUMNS)
    single_row.to_csv(
        PROGRESS_PATH,
        mode='a',
        header=False,
        index=False,
        sep=';',
        encoding='utf-8',
    )


def append_error(row_dict: dict, error_msg: str):
    """Append one failure record to the errors CSV.

    The record carries the identifying fields and the URL taken from
    ``row_dict`` (a missing field becomes an empty string) followed by the
    stringified ``error_msg``. No header line is written.
    """
    id_cols = [COL_ID_EXAM, COL_ID_QUESTION, COL_NUM_QUESTION, COL_URL]
    record = {col: row_dict.get(col, '') for col in id_cols}
    record['error'] = str(error_msg)

    failure = pd.DataFrame([record], columns=id_cols + ['error'])
    failure.to_csv(
        ERRORS_PATH,
        mode='a',
        header=False,
        index=False,
        sep=';',
        encoding='utf-8',
    )


In [None]:
from concurrent.futures import ThreadPoolExecutor, as_completed, wait, FIRST_COMPLETED

# Counters reported at the end of the run.
processed = 0
skipped = 0
failed = 0

# Build the work list: one task per dataset row that has a URL and whose key is
# not already present in the progress file.
tasks = []
# Keys queued during THIS run. A resumed run would skip a repeated key because
# it is already in `processed_keys`; tracking queued keys makes a single run
# behave the same way instead of transcribing and appending the row twice.
queued_keys = set()
for _, row in df.iterrows():
    id_exam = str(row.get(COL_ID_EXAM, ''))
    id_question = str(row.get(COL_ID_QUESTION, ''))
    num_question = str(row.get(COL_NUM_QUESTION, ''))
    url = str(row.get(COL_URL, '') or '')
    key = (id_exam, id_question, num_question)

    if not url.strip() or key in processed_keys or key in queued_keys:
        skipped += 1
        continue

    queued_keys.add(key)
    tasks.append({
        'key': key,
        'id_exam': id_exam,
        'id_question': id_question,
        'num_question': num_question,
        'url': url,
    })

# The bar counts finished items (successes and failures alike).
t = tqdm(total=len(tasks), desc='Processing', initial=0, unit='file')

try:
    # Sliding window of downloads: at most BATCH_SIZE tasks are in flight.
    # Downloads run in worker threads; transcription runs here, one file at a
    # time, as each download completes.
    with ThreadPoolExecutor(max_workers=BATCH_SIZE) as executor:
        in_flight = {}
        tasks_iter = iter(tasks)

        # Initial fill of the window.
        for _ in range(min(BATCH_SIZE, len(tasks))):
            task = next(tasks_iter, None)
            if task is None:
                break
            fut = executor.submit(download_to_temp, task['url'])
            in_flight[fut] = task

        # While anything is in flight, handle completed downloads and top the
        # window back up with new tasks.
        while in_flight:
            done, _ = wait(list(in_flight.keys()), return_when=FIRST_COMPLETED)
            for fut in done:
                task = in_flight.pop(fut)
                temp_path = None
                try:
                    # Re-raises here if the download itself failed.
                    temp_path = fut.result()
                    text = transcribe_path(model, temp_path, language=LANGUAGE)
                    append_row({
                        COL_ID_EXAM: task['id_exam'],
                        COL_ID_QUESTION: task['id_question'],
                        COL_NUM_QUESTION: task['num_question'],
                        COL_TRANSCRIPT: text,
                        COL_URL: task['url'],
                    })
                    processed_keys.add(task['key'])
                    processed += 1
                except Exception as e:
                    # One bad file must not stop the run: log it and move on.
                    failed += 1
                    append_error({
                        COL_ID_EXAM: task['id_exam'],
                        COL_ID_QUESTION: task['id_question'],
                        COL_NUM_QUESTION: task['num_question'],
                        COL_URL: task['url'],
                    }, str(e))
                finally:
                    # Best-effort removal of the downloaded temp file.
                    if temp_path and os.path.exists(temp_path):
                        try:
                            os.remove(temp_path)
                        except OSError:
                            pass
                t.update(1)

                # Keep the window full: submit the next pending task, if any.
                next_task = next(tasks_iter, None)
                if next_task is not None:
                    nfut = executor.submit(download_to_temp, next_task['url'])
                    in_flight[nfut] = next_task
finally:
    # Close the bar even when the loop is interrupted or raises.
    t.close()

print('Готово. processed:', processed, 'skipped:', skipped, 'failed:', failed)


In [None]:
import pandas as pd

# Size of the source dataset.
n_total = len(df)

# Row counts of the two output files; a file that does not exist counts as zero.
n_done = (
    len(pd.read_csv(PROGRESS_PATH, sep=';', encoding='utf-8', dtype=str))
    if PROGRESS_PATH.exists()
    else 0
)
n_errors = (
    len(pd.read_csv(ERRORS_PATH, sep=';', encoding='utf-8', dtype=str))
    if ERRORS_PATH.exists()
    else 0
)

print(f'Всего строк в исходнике: {n_total}')
print(f'Успешно транскрибировано: {n_done}')
print(f'Ошибок: {n_errors}')
# Remaining work is clamped at zero.
print(f'Осталось: {max(n_total - n_done, 0)}')
