# Installing Whisper

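The commands below install the Python package and the system dependency (ffmpeg) needed to use the Whisper speech-to-text (STT) package. They are commented out; uncomment and run them once if Whisper is not installed yet.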

In [1]:
# ! pip install git+https://github.com/openai/whisper.git
# ! sudo apt install ffmpeg

In [2]:
import os
import torch
import whisper

In [3]:
# Prefer the GPU when PyTorch reports CUDA as available; otherwise use the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
print(f"Using device: {DEVICE}")

Using device: cuda


## Running inference

Loading the model and transcribing the audio in the cells below will take a few minutes.

In [4]:
language = "Catalan"
model_size = "large-v3"  # "large-v3" recommended for production
# Load the model explicitly on the device selected (and printed) earlier, so the
# reported device and the one actually used cannot drift apart.
model = whisper.load_model(model_size, device=DEVICE)

In [5]:
# Keyword arguments forwarded to model.transcribe().
transcribe_options = {
    "task": "transcribe",
    "language": language,
    "beam_size": 5,
    "best_of": 5,
    "word_timestamps": True,  # later cells read segment["words"]
}

In [6]:
# Alternative input kept for reference:
# audio_path = "../ccma/data/6255130.mp4"
audio_path = "../ccma/data2/original_3.mp4"
# The result is indexed by "text" and "segments" in the cells that follow.
transcription = model.transcribe(
    audio_path,
    **transcribe_options,
)

In [7]:
# Bare expression: the notebook displays the value stored under "text".
transcription["text"]

" I en primer el teu nom i cognoms. Doncs jo em dic Sara Montes. Tu t'has de posar més cap aquí, Marta. Aquí? Sí. Jo t'aviso... Aquí no m'entres. Vale. I si tu la mires amb ella, esteu perfectes. Sara Montes. Sí. Bé, a veure, nosaltres... Bé, una parella normal i corrent, no? Un dia li va començar a fer mal l'esquena... Perdona, eh? Ah, no, no, digue'm. Jo no sé, o sigui, començo una mica com al principi o no? Sí, sí, sí. T'he de preguntar, una mica, perquè estem aquí, què va passar. Doncs nosaltres, a veure, portàvem vuit anys junts, volíem tenir fills, justament acabàvem de donar filles, fins i tot les arres per al pis que ens havien de comprar, i un dia li va començar a fer mal l'esquena. En principi, una cosa normal i corrent, com li pot passar a qualsevol persona, però va anar a més, a més, a més, a més, cada dia li feia més mal, fins que un dia vam anar a l'hospital perquè ens diguessin una mica, oi, què passa, no?, que és un anunval, jo, què és? I ja ha arribat aquest moment, qu

## Audio segments

In [8]:
max_segments = 10  # upper bound on how many segments are printed
# One line per segment: start, end, and the text without surrounding whitespace.
for index, segment in enumerate(transcription["segments"]):
    if index >= max_segments:
        break
    print(f"{segment['start']:7.2f},{segment['end']:7.2f}, \"{segment['text'].strip()}\"")

  11.80,  14.24, "I en primer el teu nom i cognoms."
  14.74,  16.30, "Doncs jo em dic Sara Montes."
  17.76,  19.50, "Tu t'has de posar més cap aquí, Marta."
  20.10,  20.50, "Aquí?"
  20.66,  20.96, "Sí."
  21.14,  23.46, "Jo t'aviso... Aquí no m'entres."
  23.70,  23.90, "Vale."
  24.16,  27.66, "I si tu la mires amb ella, esteu perfectes."
  28.04,  29.02, "Sara Montes."
  29.10,  29.44, "Sí."


## Audio words

In [9]:
def print_words(max_words: int = 20, result: "dict | None" = None) -> None:
    """Print the first ``max_words`` word-level timestamps of a transcription.

    Each line shows the word's start, its end (both formatted as ``7.2f``) and
    the word itself, stripped of surrounding whitespace and wrapped in double
    quotes.

    Args:
        max_words: Maximum number of words to print. Zero or a negative value
            prints nothing.
        result: Mapping with a "segments" list whose items each hold a "words"
            list of dicts with "start", "end" and "word" keys. When omitted,
            the notebook-level ``transcription`` variable is used.
    """
    if result is None:
        # Backward compatible default: fall back to the notebook's global result.
        result = transcription
    nwords = 0
    for segment in result["segments"]:
        for word in segment["words"]:
            # Check the limit before printing so max_words <= 0 prints nothing
            # (an equality check after printing would never fire in that case).
            if nwords >= max_words:
                return
            print(f"{word['start']:7.2f},{word['end']:7.2f}, \"{word['word'].strip()}\"")
            nwords += 1

# Show the first words using the function's default limit (max_words=20).
print_words()

  11.80,  12.20, "I"
  12.20,  12.38, "en"
  12.38,  12.56, "primer"
  12.56,  12.82, "el"
  12.82,  12.94, "teu"
  12.94,  13.16, "nom"
  13.16,  13.78, "i"
  13.78,  14.24, "cognoms."
  14.74,  15.14, "Doncs"
  15.14,  15.34, "jo"
  15.34,  15.46, "em"
  15.46,  15.56, "dic"
  15.56,  15.76, "Sara"
  15.76,  16.30, "Montes."
  17.76,  18.16, "Tu"
  18.16,  18.30, "t"
  18.30,  18.44, "'has"
  18.44,  18.46, "de"
  18.46,  18.74, "posar"
  18.74,  18.88, "més"


## Save subtitles in VTT format

In [10]:
output_format = "vtt"  # "txt", "vtt", "srt", "tsv", "json", or "all"
output_dir = ""
# Options passed as keyword arguments to the writer call below.
writer_args = {
    "highlight_words": False,
    "max_line_count": None,
    "max_line_width": None,
    "max_words_per_line": None,
}
writer = whisper.utils.get_writer(output_format, output_dir)
writer(transcription, audio_path, **writer_args)

In [11]:
def save_words(transcription, output_dir, audio_path) -> str:
    """Write every word-level timestamp of a transcription to a CSV file.

    The file is named after the audio file (extension replaced by ``.csv``) and
    placed in ``output_dir``. It holds a ``start,end,word`` header followed by
    one row per word; the word is stripped of surrounding whitespace and
    wrapped in double quotes.

    Args:
        transcription: Mapping with a "segments" list; each segment must hold a
            "words" list of dicts with "start", "end" and "word" keys.
        output_dir: Directory that receives the CSV file.
        audio_path: Path of the transcribed audio; only its base name is used.

    Returns:
        The path of the CSV file that was written.
    """
    audio_basename = os.path.splitext(os.path.basename(audio_path))[0]
    output_path = os.path.join(output_dir, audio_basename + ".csv")
    # Explicit UTF-8: the words contain non-ASCII characters and the platform's
    # default encoding is not guaranteed to be able to represent them.
    with open(output_path, "w", encoding="utf-8") as f:
        print("start,end,word", file=f)
        for segment in transcription["segments"]:
            for word in segment["words"]:
                # Double any embedded quote so the quoted field stays valid CSV.
                text = word["word"].strip().replace('"', '""')
                print(f"{word['start']},{word['end']},\"{text}\"", file=f)

    return output_path

# Write the word-level CSV, then preview its first lines with the shell's `head`.
timestamp_path = save_words(transcription, output_dir, audio_path)
!head $timestamp_path

start,end,word
11.799999999999997,12.2,"I"
12.2,12.38,"en"
12.38,12.56,"primer"
12.56,12.82,"el"
12.82,12.94,"teu"
12.94,13.16,"nom"
13.16,13.78,"i"
13.78,14.24,"cognoms."
14.74,15.14,"Doncs"
