In [None]:
import whisperx
import gc
import torch
import os

from dotenv import load_dotenv

In [None]:
load_dotenv()
hf_token = os.getenv("HF_TOKEN")
if not hf_token:
    raise RuntimeError("HF_TOKEN não encontrado no ambiente")

In [2]:
device = "cuda"
batch_size = 8 # reduce if low on GPU mem
compute_type = "float16" # change to "int8" if low on GPU mem (may reduce accuracy)

In [3]:
diarization_dir = "diarization/"
raw_audio_dir = 'audios/raw_m4a/'
wav_audio_dir = 'audios/wav/'
transcripts_dir = 'transcripts/'

In [None]:
for arquivo in os.listdir(raw_audio_dir):

    print("#" * 50)
    print("Iniciando transcrição para:", os.path.join(raw_audio_dir, arquivo))

    # 1. Transcribe with original whisper (batched)
    model = whisperx.load_model("large-v2", "cuda", compute_type=compute_type)

    audio = whisperx.load_audio(os.path.join(raw_audio_dir, arquivo))
    result = model.transcribe(audio, batch_size=batch_size, language='pt', verbose=False)
    # print(result["segments"])  # before alignment

    txt_filename = arquivo.replace('.m4a', '.txt')

    transcript_lines = ''
    for segment in result["segments"]:
        transcript_lines += segment['text'] + '\n'

    with open(os.path.join(transcripts_dir, txt_filename), 'w', encoding='utf-8') as f:
        f.write(transcript_lines)

    # 2. Align whisper output
    model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
    result_aligned = whisperx.align(result["segments"], model_a, metadata, audio, device, return_char_alignments=False)
    # print(result_aligned["segments"]) # after alignment

    diarize_model = whisperx.diarize.DiarizationPipeline(use_auth_token=hf_token, device=device)
    diarize_segments = diarize_model(audio, min_speakers=2, max_speakers=2)

    result_diarization = whisperx.assign_word_speakers(diarize_segments, result_aligned)

    # Write diarization txt
    speaker = ''
    output_lines = []

    for segment in result_diarization["segments"]:
        if segment.get('speaker'):
            if segment['speaker'] != speaker:
                output_lines.append('*' * 50)
                output_lines.append('\n')
                speaker = segment['speaker']
                output_lines.append(f"Speaker {speaker}:")
        output_lines.append(segment['text'])

    # Salva o output em um arquivo .txt
    txt_file_path = diarization_dir + arquivo.replace('.m4a', '.txt')
    with open(txt_file_path, 'w', encoding='utf-8') as f:
        for line in output_lines:
            f.write(str(line) + '\n')

    # delete model if low on GPU resources
    gc.collect(); torch.cuda.empty_cache(); del model

##################################################
Iniciando transcrição para: audios/raw_m4a/Entrevista AD Fátima.m4a


  from .autonotebook import tqdm as notebook_tqdm
  torchaudio.list_audio_backends()
  available_backends = torchaudio.list_audio_backends()


No language specified, language will be first be detected for each audio file (increases inference time).
>>Performing voice activity detection using Pyannote...


  if ismodule(module) and hasattr(module, '__file__'):
Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.5.4. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint c:\Users\lgrne\OneDrive\Documents\Codigos\whisper_transcription\whisperx-env\lib\site-packages\whisperx\assets\pytorch_model.bin`
  torchaudio.list_audio_backends()


Model was trained with pyannote.audio 0.0.1, yours is 3.3.2. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.8.0+cu129. Bad things might happen unless you revert torch to 1.x.


It can be re-enabled by calling
   >>> import torch
   >>> torch.backends.cuda.matmul.allow_tf32 = True
   >>> torch.backends.cudnn.allow_tf32 = True
See https://github.com/pyannote/pyannote-audio/issues/1370 for more details.

Fetching 1 files: 100%|██████████| 1/1 [00:00<00:00, 1001.27it/s]
  torchaudio.list_audio_backends()
  std = sequences.std(dim=-1, correction=1)


##################################################
Iniciando transcrição para: audios/raw_m4a/Entrevista AD Josué 2.m4a


Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.5.4. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint c:\Users\lgrne\OneDrive\Documents\Codigos\whisper_transcription\whisperx-env\lib\site-packages\whisperx\assets\pytorch_model.bin`


No language specified, language will be first be detected for each audio file (increases inference time).
>>Performing voice activity detection using Pyannote...
Model was trained with pyannote.audio 0.0.1, yours is 3.3.2. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.8.0+cu129. Bad things might happen unless you revert torch to 1.x.


  torchaudio.list_audio_backends()
Fetching 1 files: 100%|██████████| 1/1 [00:00<?, ?it/s]
  torchaudio.list_audio_backends()
  std = sequences.std(dim=-1, correction=1)


##################################################
Iniciando transcrição para: audios/raw_m4a/Entrevista AD Josué.m4a


Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.5.4. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint c:\Users\lgrne\OneDrive\Documents\Codigos\whisper_transcription\whisperx-env\lib\site-packages\whisperx\assets\pytorch_model.bin`


No language specified, language will be first be detected for each audio file (increases inference time).
>>Performing voice activity detection using Pyannote...
Model was trained with pyannote.audio 0.0.1, yours is 3.3.2. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.8.0+cu129. Bad things might happen unless you revert torch to 1.x.


  torchaudio.list_audio_backends()
Fetching 1 files: 100%|██████████| 1/1 [00:00<00:00, 665.02it/s]
  torchaudio.list_audio_backends()
  std = sequences.std(dim=-1, correction=1)


##################################################
Iniciando transcrição para: audios/raw_m4a/Entrevista AD Nilza.m4a


Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.5.4. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint c:\Users\lgrne\OneDrive\Documents\Codigos\whisper_transcription\whisperx-env\lib\site-packages\whisperx\assets\pytorch_model.bin`


No language specified, language will be first be detected for each audio file (increases inference time).
>>Performing voice activity detection using Pyannote...
Model was trained with pyannote.audio 0.0.1, yours is 3.3.2. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.8.0+cu129. Bad things might happen unless you revert torch to 1.x.


  torchaudio.list_audio_backends()
Fetching 1 files: 100%|██████████| 1/1 [00:00<?, ?it/s]
  torchaudio.list_audio_backends()
  std = sequences.std(dim=-1, correction=1)
