# Transcription using Whisper in Google Colab

This notebook needs 2 folders in the root of your Google Drive in order to work:
- lecture_transcriptions
- lecture_audio_files

Put your audio files in `lecture_audio_files`; the finished transcriptions will be written to `lecture_transcriptions`.

Because Google Colab sessions time out for free users, the code checks whether a transcription with the same name already exists before transcribing and skips the file if it does. That way you can always press ***Runtime -> Run All*** without a worry.

In [1]:
from google.colab import files
from datetime import timedelta
from glob import glob
from pathlib import Path
import tqdm

from google.colab import drive
# Mount Google Drive at /content/drive. The cells below read and write files
# under the relative path drive/MyDrive/...
# NOTE(review): those relative paths assume the working directory is /content — confirm.
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
def save_transcription(whisper_result, transcription_name,
                       output_dir="drive/MyDrive/lecture_transcriptions"):
  """Write a Whisper result to `<transcription_name>.txt` and `<transcription_name>.csv`.

  The text file holds the full transcript. The CSV is `;`-separated, starts
  with a `Speech;Timestamp` header and has one row per segment: the segment
  text (leading whitespace removed) and its `start - end` span formatted
  with `timedelta`. Fields containing `;`, `"` or a newline are quoted so
  the file always parses back into exactly two columns.

  Args:
    whisper_result: Mapping with a "text" string and a "segments" list whose
      items provide "text", "start" and "end" (start/end are passed to
      `timedelta(seconds=...)`).
    transcription_name: File name without extension, shared by both outputs.
    output_dir: Existing directory that receives the files. Defaults to the
      Google Drive folder used by this notebook.

  Raises:
    OSError: If `output_dir` does not exist or cannot be written to.
    KeyError: If an expected key is missing from `whisper_result`.
  """
  import csv  # local import keeps this cell self-contained

  target_dir = Path(output_dir)

  # Write the plain transcript first: the CSV is created last, so a run that
  # is interrupted between the two writes never leaves a CSV without its .txt.
  with open(target_dir / f"{transcription_name}.txt", "w", encoding="utf-8") as f:
    f.write(whisper_result["text"])

  # newline="" is what the csv module expects; the explicit "\n" terminator
  # keeps the line endings the file has always had.
  with open(target_dir / f"{transcription_name}.csv", "w", encoding="utf-8", newline="") as f:
    writer = csv.writer(f, delimiter=";", lineterminator="\n")
    writer.writerow(["Speech", "Timestamp"])

    for segment in whisper_result["segments"]:
      start = timedelta(seconds=segment["start"])
      end = timedelta(seconds=segment["end"])
      writer.writerow([segment["text"].lstrip(), f"{start} - {end}"])

## Install Whisper from GitHub

In [None]:
! pip install git+https://github.com/openai/whisper.git -q

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.3/63.3 MB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m67.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m145.0/145.0 KB[0m [31m15.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for openai-whisper (pyproject.toml) ... [?25l[?25hdone
  Building wheel for lit (setup.py) ... [?25l[?25hdone


## Load Whisper speech recognition model
This notebook uses the *large-v2* model, which is recommended for best results.

Processing 45 minutes of audio takes around 8 minutes.

In [None]:
import torch
import whisper

# Load the large-v2 checkpoint straight onto the GPU when one is available,
# otherwise onto the CPU, then display where the model ended up.
model = whisper.load_model(
    "large-v2",
    device="cuda" if torch.cuda.is_available() else "cpu",
)
model.device

## Check that we are using GPU

You should see the output `device(type='cuda', index=0)` below. If you don't, you may be on a CPU-only Colab instance which will run a lot slower. Go to `Runtime->Change Runtime Type` to fix this.

In [None]:
# Display the device that holds the model's weights (a CUDA device is expected on a GPU runtime).
model.device

## Transcribe the audio files

In [None]:
# glob() returns matches in arbitrary order; sort them so every run processes
# the audio files in the same, predictable sequence.
audio_files = sorted(glob("drive/MyDrive/lecture_audio_files/*"))

In [None]:
# Base name (no directory, no extension) of each audio file, in the same order as audio_files.
transcription_files = [Path(path).stem for path in audio_files]

In [None]:
# Suffix appended to every transcription file name (e.g. "<stem>_hrv.csv").
# NOTE(review): this value is not passed to Whisper — the transcription cell
# hard-codes language="hr" in its transcribe() call; keep the two in sync.
language = "hrv"

In [None]:
# Transcribe every audio file that does not have a CSV transcription yet, so
# the cell can be re-run after an interrupted session without redoing work.
for audio_file, transcription_file in zip(audio_files, transcription_files):
  output_name = f"{transcription_file}_{language}"
  existing_csv = Path(f"drive/MyDrive/lecture_transcriptions/{output_name}.csv")

  if not existing_csv.exists():
    print(f"Audio file: {audio_file}")
    print(f"Transcription file: {output_name}")
    # An optional task="translate" argument was left disabled here.
    result = model.transcribe(audio_file, verbose=True, language="hr")
    save_transcription(result, output_name)
  else:
    print(f"Skipping {output_name}")

In [None]:
# Offer every CSV transcription stored on Drive as a browser download.
transcription_csvs = glob("drive/MyDrive/lecture_transcriptions/*.csv")
for transcription_csv in transcription_csvs:
  files.download(transcription_csv)