In [1]:
!pip install -q whisper-timestamped onnxruntime torchaudio

In [2]:
# Mount Google Drive at /content/drive so the recordings stored under MyDrive
# are reachable from this runtime.
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import os, re, datetime, torch
import whisper_timestamped as whisper

Importing the dtw module. When using in academic works please cite:
  T. Giorgino. Computing and Visualizing Dynamic Time Warping Alignments in R: The dtw Package.
  J. Stat. Soft., doi:10.18637/jss.v031.i07.



In [4]:
# Notebook options -- edit these before running the cells below.

# Folder that is scanned for recordings.
recording_path = "/content/drive/MyDrive/Recording/input"
# File name of the merged transcript.
output_file = "transcript.txt"
# Possible sizes are tiny, base, small, medium and large.
# There are also English-only models such as tiny.en and medium.en.
whisper_model = "tiny" # "tiny" is only for quick testing; switch to "medium" for usable quality

In [5]:
class TimeScribe:
    """Transcribe every recording in a folder into one time-ordered transcript.

    Each audio file in ``recording_folder`` is transcribed with
    whisper-timestamped. The segments of all files are merged, sorted by their
    start time and written to ``output_name`` inside the same folder, one line
    per segment in the form ``[H:MM:SS] speaker: text``.

    The speaker label is the first capture group of ``name_pattern`` matched
    against the file name; when the pattern does not match, the file name
    itself is used as the label.
    """

    # Lower-case file extensions that are treated as audio recordings.
    audio_extensions = (".wav", ".flac")

    def __init__(self, recording_folder: str,
                 output_name: str = "transcript.txt",
                 model_size: str = "medium"):
        """Pick a device, remember the paths and load the Whisper model.

        Args:
            recording_folder: Folder containing the recordings; the transcript
                is written into this folder as well.
            output_name: File name of the transcript inside ``recording_folder``.
            model_size: Model name handed to ``whisper.load_model``.
        """
        if torch.cuda.is_available():
            self.device = "cuda"
            print("Using GPU")
        else:
            self.device = "cpu"
            print("Using CPU")

        self.recording_folder = recording_folder
        self.output_name = output_name
        self.model = whisper.load_model(model_size, device=self.device)
        # Captures the text between a "-" and the following "_" in a file name.
        self.name_pattern = r"-(\w+)_"

    def transcribe_folder(self) -> None:
        """Transcribe all audio files in the folder and write the transcript.

        Files whose extension is not in ``audio_extensions`` (compared
        case-insensitively) are reported with a "Skipping ..." message and
        otherwise ignored. The transcript file is overwritten if it exists.
        """
        transcript = []

        # os.listdir() returns entries in arbitrary order. Sorting makes the
        # processing order reproducible and, because the sort below is stable,
        # also fixes the order of segments that share the same start time.
        for file in sorted(os.listdir(self.recording_folder)):
            if not file.lower().endswith(self.audio_extensions):
                print(f"Skipping {file}")
                continue

            # The speaker label depends only on the file name, so resolve it
            # once per file rather than once per segment.
            name_match = re.search(self.name_pattern, file)
            speaker = name_match.group(1) if name_match else file

            audio = whisper.load_audio(os.path.join(self.recording_folder, file))
            result = whisper.transcribe(self.model, audio, language="en", vad=True)

            for segment in result["segments"]:
                start = float(segment["start"])
                # Human-readable timestamp rounded to whole seconds, e.g. 0:01:10.
                timestamp = str(datetime.timedelta(seconds=round(start)))
                # Strip surrounding whitespace so the output line has exactly
                # one space after "speaker:".
                transcript.append((start, speaker, segment["text"].strip(), timestamp))

        # Interleave the segments of all recordings chronologically.
        transcript.sort(key=lambda entry: entry[0])

        output_path = os.path.join(self.recording_folder, self.output_name)
        # Explicit UTF-8 so non-ASCII text does not depend on the locale default.
        with open(output_path, "w", encoding="utf-8") as f:
            for _, speaker, text, timestamp in transcript:
                f.write(f"[{timestamp}] {speaker}: {text}\n")

In [6]:
# Build the transcriber from the notebook options, then process the whole folder.
timescribe = TimeScribe(
    recording_folder=recording_path,
    output_name=output_file,
    model_size=whisper_model,
)
timescribe.transcribe_folder()

Using GPU


100%|██████████████████████████████████████| 72.1M/72.1M [00:00<00:00, 103MiB/s]
100%|██████████| 109973/109973 [00:45<00:00, 2443.23frames/s]
100%|██████████| 95584/95584 [00:44<00:00, 2129.68frames/s]
100%|██████████| 7483/7483 [00:01<00:00, 4260.62frames/s]


Skipping info.txt


100%|██████████| 58302/58302 [00:19<00:00, 3031.87frames/s]
100%|██████████| 6474/6474 [00:02<00:00, 2348.43frames/s]
100%|██████████| 7945/7945 [00:02<00:00, 3494.76frames/s]
100%|██████████| 29972/29972 [00:11<00:00, 2525.87frames/s]
 99%|█████████▊| 21592/21902 [00:06<00:00, 3295.16frames/s]
100%|██████████| 3019/3019 [00:01<00:00, 2208.92frames/s]
100%|██████████| 10811/10811 [00:03<00:00, 2739.87frames/s]

Skipping raw.dat



