# Installing WhisperX

The command below will install the Python packages needed to use WhisperX Speech To Text (STT) package

In [1]:
# ! pip install git+https://github.com/m-bain/whisperx.git

In [2]:
import os
import torch
import whisperx

In [3]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

Using device: cuda


## Transcribe

The following will take a few minutes to transcribe.

In [4]:
language = "ca"
model_size = "large-v3"  # "large-v3" recommended for production
batch_size = 16          # reduce if low on GPU mem
compute_type = "float16" # change to "int8" if low on GPU mem (may reduce accuracy)

In [5]:
model = whisperx.load_model(model_size, device, language=language, compute_type=compute_type)

Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.2.0.post0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../../../../veu5/jadrian/.cache/torch/whisperx-vad-segmentation.bin`


Model was trained with pyannote.audio 0.0.1, yours is 3.1.1. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.0.0+cu118. Bad things might happen unless you revert torch to 1.x.


In [6]:
# audio_path = "../ccma/data/6255130.mp4"
audio_path = "../ccma/data2/original_3.mp4"
audio = whisperx.load_audio(audio_path)
result = model.transcribe(audio, batch_size=batch_size, language=language)

## Audio segments (before alignment)

In [19]:
max_segments = 30  # max number of segments to print
for segment in (result["segments"][:max_segments//2] + result["segments"][-max_segments//2:]):
    print(f"{segment['start']:7.2f},{segment['end']:7.2f}, \"{segment['text'].strip()}\"")

  11.15,  15.47, "Jo em dic Sara Montes."
  15.49,  26.64, "Tu t'has de posar més cap aquí, Marta."
  26.66,  27.40, "Aquí?"
  27.46,  27.60, "Sí."
  27.62,  31.18, "Jo t'aviso... Aquí no m'entres."
  31.20,  33.00, "I si tu la mires amb ella, esteu perfectes."
  33.02,  33.28, "Sara Montes."
  33.30,  33.34, "Sí."
  33.36,  39.14, "A veure, nosaltres, una parella normal i corrent, un dia li va començar a fer mal l'esquena..."
  41.70,  44.88, "Ja està."
  44.90,  45.20, "Perdona, eh?"
  45.24,  45.58, "Ah, no, no, digue'm."
  45.60,  47.68, "Jo no sé, o sigui, començo una mica com al principi o no?"
  47.72,  53.84, "És que no sé... Sí, sí, sí, jo et demà preguntava, una mica, perquè estem aquí, què va passar."
  53.90,  66.59, "Doncs nosaltres, a veure, portàvem vuit anys junts, volíem tenir fills, justament acabàvem de donar fins i tot les arres per al pis que ens havien de comprar, i un dia li va començar a fer mal l'esquena."
3363.33,3365.52, "Clar, perquè és que quan estàs així t

## 2. Align transcription

In [8]:
align_language = result["language"]
model_a, metadata = whisperx.load_align_model(language_code=align_language, device=device)
result = whisperx.align(result["segments"], model_a, metadata, audio, device, return_char_alignments=False)
result["language"] = align_language

## Audio segments (after alignment)

In [16]:
max_segments = 30  # max number of segments to print
for segment in (result["segments"][:max_segments//2] + result["segments"][-max_segments//2:]):
    print(f"{segment['start']:7.2f},{segment['end']:7.2f}, \"{segment['text'].strip()}\"")

  11.15,  15.47, "Jo em dic Sara Montes."
  15.49,  26.64, "Tu t'has de posar més cap aquí, Marta."
  26.66,  27.40, "Aquí?"
  27.46,  27.60, "Sí."
  27.62,  31.18, "Jo t'aviso... Aquí no m'entres."
  31.20,  33.00, "I si tu la mires amb ella, esteu perfectes."
  33.02,  33.28, "Sara Montes."
  33.30,  33.34, "Sí."
  33.36,  39.14, "A veure, nosaltres, una parella normal i corrent, un dia li va començar a fer mal l'esquena..."
  41.70,  44.88, "Ja està."
  44.90,  45.20, "Perdona, eh?"
  45.24,  45.58, "Ah, no, no, digue'm."
  45.60,  47.68, "Jo no sé, o sigui, començo una mica com al principi o no?"
  47.72,  53.84, "És que no sé... Sí, sí, sí, jo et demà preguntava, una mica, perquè estem aquí, què va passar."
  53.90,  66.59, "Doncs nosaltres, a veure, portàvem vuit anys junts, volíem tenir fills, justament acabàvem de donar fins i tot les arres per al pis que ens havien de comprar, i un dia li va començar a fer mal l'esquena."
3363.33,3365.52, "Clar, perquè és que quan estàs així t

## 3. Assign speaker labels

In [10]:
diarize_model = whisperx.DiarizationPipeline(device=device)

# add min/max number of speakers if known
# diarize_segments = diarize_model(audio, min_speakers=1, max_speakers=3)
diarize_segments = diarize_model(audio)

result = whisperx.assign_word_speakers(diarize_segments, result)

In [18]:
# segments are now assigned speaker IDs
max_segments = 30  # max number of segments to print
for segment in result["segments"][:max_segments]:
    print(f"{segment['speaker']}, {segment['start']:7.2f},{segment['end']:7.2f}, \"{segment['text'].strip()}\"")

SPEAKER_04,   11.15,  15.47, "Jo em dic Sara Montes."
SPEAKER_00,   15.49,  26.64, "Tu t'has de posar més cap aquí, Marta."
SPEAKER_00,   26.66,  27.40, "Aquí?"
SPEAKER_00,   27.46,  27.60, "Sí."
SPEAKER_00,   27.62,  31.18, "Jo t'aviso... Aquí no m'entres."
SPEAKER_06,   31.20,  33.00, "I si tu la mires amb ella, esteu perfectes."
SPEAKER_06,   33.02,  33.28, "Sara Montes."
SPEAKER_06,   33.30,  33.34, "Sí."
SPEAKER_06,   33.36,  39.14, "A veure, nosaltres, una parella normal i corrent, un dia li va començar a fer mal l'esquena..."
SPEAKER_06,   41.70,  44.88, "Ja està."
SPEAKER_06,   44.90,  45.20, "Perdona, eh?"
SPEAKER_06,   45.24,  45.58, "Ah, no, no, digue'm."
SPEAKER_06,   45.60,  47.68, "Jo no sé, o sigui, començo una mica com al principi o no?"
SPEAKER_04,   47.72,  53.84, "És que no sé... Sí, sí, sí, jo et demà preguntava, una mica, perquè estem aquí, què va passar."
SPEAKER_06,   53.90,  66.59, "Doncs nosaltres, a veure, portàvem vuit anys junts, volíem tenir fills, justamen

## Audio words

In [12]:
def print_words(max_words: int = 20) -> None:
    nwords = 0
    for segment in result["segments"]:
        for word in segment["words"]:
            print(f"{word['start']:7.2f},{word['end']:7.2f}, \"{word['word'].strip()}\"")
            nwords += 1
            if nwords == max_words:
                return

print_words()

  11.15,  12.59, "Jo"
  14.33,  15.13, "em"
  15.15,  15.23, "dic"
  15.25,  15.33, "Sara"
  15.35,  15.47, "Montes."
  15.49,  15.59, "Tu"
  15.61,  15.81, "t'has"
  15.83,  15.91, "de"
  15.93,  23.00, "posar"
  23.14,  23.72, "més"
  23.74,  24.96, "cap"
  24.98,  26.10, "aquí,"
  26.30,  26.64, "Marta."
  26.66,  27.40, "Aquí?"
  27.46,  27.60, "Sí."
  27.62,  27.70, "Jo"
  27.76,  30.62, "t'aviso..."
  30.64,  30.90, "Aquí"
  30.92,  30.96, "no"
  30.98,  31.18, "m'entres."


## Save subtitles in VTT format

In [13]:
output_format = "vtt"  # "txt", "vtt", "srt", "tsv", "json", or "all"
output_dir = ""
writer = whisperx.utils.get_writer(output_format, output_dir)
writer_args = {"highlight_words": False, "max_line_count": None, "max_line_width": None}
result["language"] = align_language
writer(result, audio_path, writer_args)

In [14]:
def save_words(result, output_dir, audio_path) -> None:
    audio_basename = os.path.basename(audio_path)
    audio_basename = os.path.splitext(audio_basename)[0]
    output_path = os.path.join(output_dir, audio_basename + ".csv")
    with open(output_path, "w") as f:
        print("start,end,word", file=f)
        prev_end = 0
        for segment in result["segments"]:
            for word in segment["words"]:
                if 'start' in word:
                    print(f"{word['start']},{word['end']},\"{word['word'].strip()}\"", file=f)
                    prev_end = word['end']
                else:
                    print(f"{prev_end},{prev_end},\"{word['word'].strip()}\"")
                    print(f"{prev_end},{prev_end},\"{word['word'].strip()}\"", file=f)

    return output_path

timestamp_path = save_words(result, output_dir, audio_path)
!head $timestamp_path

101.472,101.472,"33"
437.57,437.57,"5"
445.417,445.417,"3"
930.769,930.769,"40"
931.169,931.169,"50"
1160.968,1160.968,"13"
1409.01,1409.01,"100%"
1647.098,1647.098,"45"
2231.826,2231.826,"4"
2241.311,2241.311,"8"
2241.651,2241.651,"4"
2243.212,2243.212,"4"
2289.087,2289.087,"7"
2291.888,2291.888,"6"
2315.719,2315.719,"62,"
2315.719,2315.719,"63,"
2329.844,2329.844,"7"
2557.433,2557.433,"5"
2675.307,2675.307,"33"
2998.765,2998.765,"7"
3225.14,3225.14,"49"
start,end,word
11.152,12.593,"Jo"
14.333,15.134,"em"
15.154,15.234,"dic"
15.254,15.334,"Sara"
15.354,15.474,"Montes."
15.494,15.594,"Tu"
15.614,15.814,"t'has"
15.834,15.914,"de"
15.934,22.997,"posar"
