<a href="https://colab.research.google.com/github/ollisulopuisto/hindenburg-helpers/blob/master/Litterointi_%E2%80%93_Hindenburg_w_Faster_Whisper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Jaksomedian litterointi Hindenburgia varten: Faster-Whisper v 2

Luotu 6.3.2024. Pohjana Whisper for Podcasters v0.2 Oct 2, 2022 by [@UncannyRobot](https://twitter.com/UncannyRobot)



In [None]:
# @title Alusta litterointiympäristö
# One-time setup for the runtime: install the system/Python packages the
# transcription cell relies on, then mount Google Drive.
from IPython.display import clear_output
# System cuBLAS 11 library (presumably needed by CTranslate2 for GPU inference — confirm).
! apt install libcublas11
# CTranslate2 is pinned to an exact version.
! pip install CTranslate2==4.5.0
# pip install --force-reinstall ctranslate2==3.24.0
# ! pip install git+https://github.com/federicotorrielli/BetterWhisperX

# Installs the `whisper-ctranslate2` command invoked by the transcription cell.
# NOTE(review): -U pulls the latest release, so this dependency is not pinned.
! pip install -U whisper-ctranslate2
from google.colab import drive
# Mount Drive so the /content/drive/MyDrive/... folders configured below are reachable.
drive.mount('/content/drive')
# clear_output()
print('Valmista on, etiäpäin.')


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
libcublas11 is already the newest version (11.7.4.6~11.5.1-1ubuntu1).
0 upgraded, 0 newly installed, 0 to remove and 35 not upgraded.
Mounted at /content/drive
Valmista on, etiäpäin.


In [None]:
# @title Muokkaa tähän oikeat kansiot.
# Edit these two paths so they point at your own Google Drive folders.
input_dir = '/content/drive/MyDrive/whisper/input' # Drive folder where you upload the files to be processed
output_dir = '/content/drive/MyDrive/whisper/output/' # Drive folder where the .nhsx file containing the transcriptions is saved

In [None]:
# @title Litteroi
# This is a Python script that reads all specified audio files in the input directory,
# uses Whisper to transcribe them into JSON files,
# then transforms the JSON timestamps into .nhsx XML format
# and inserts them in the .nhsx file

import os
import re
import json
from xml.etree import ElementTree

def hms_to_seconds(time_str):
    """Convert an ``HH:MM:SS[,mmm]`` timestamp to seconds.

    Accepts the SRT form (``00:01:02,500``), the WebVTT form with a dot
    (``00:01:02.500``) and timestamps without a fractional part
    (``00:01:02``). The fractional part is interpreted as milliseconds.

    Args:
        time_str (str): Timestamp with exactly three ``:``-separated fields.

    Returns:
        float: The timestamp expressed in seconds.

    Raises:
        ValueError: If the string does not have three ``:``-separated fields
            or a field is not numeric.
    """
    hours, minutes, seconds_part = time_str.strip().split(':')
    # Normalise the WebVTT '.' separator to the SRT ',' so one partition handles both.
    seconds, _, milliseconds = seconds_part.replace('.', ',').partition(',')
    whole_seconds = int(hours) * 3600 + int(minutes) * 60 + int(seconds)
    fraction = float(milliseconds) / 1000 if milliseconds else 0.0
    return whole_seconds + fraction

def generate_srt_files(input_dir, output_dir):
    transcripts_dir = os.path.join(output_dir, 'transcripts')
    os.makedirs(transcripts_dir, exist_ok=True)

    audio_extensions = ('.wav', '.aiff', '.flac', '.m4a', '.mp4', '.mp3') # Added new extensions here

    # Recursive walk through all subdirectories
    for dirpath, dirnames, filenames in os.walk(input_dir):
        for filename in filenames:
            if filename.lower().endswith(audio_extensions): # Modified to check against tuple of extensions
                full_path = os.path.join(dirpath, filename)
                # Keep the filename's case. Just replace the audio extension with .json in lower case
                output_path = os.path.join(transcripts_dir, re.sub(r'\.(' + '|'.join(audio_extensions[0:]) + ')$', '.json', filename, flags=re.IGNORECASE))

                # Check if transcription already exists in .nhsx file
                if not is_file_already_transcribed(full_path, input_dir):
                    # If not transcribed, check if the JSON file exists
                    if not os.path.isfile(output_path):
                        print(f"Litteroidaan tiedostoa {full_path}")
                        !whisper-ctranslate2 "{full_path}" --batched True --compute_type auto --word_timestamps True --max_line_width 33 --max_line_count 2  --vad_filter True --model turbo --language fi --output_dir "{transcripts_dir}"
                    else:
                        print(f"Ohitetaan tiedosto '{output_path}', koska se on jo olemassa.")
                else:
                    print(f"Ohitetaan tiedosto '{full_path}', koska se on jo litteroitu.")

def _transcript_name_for(audio_name):
    """Return the transcript JSON file name that belongs to an audio file name.

    The extension is compared case-insensitively and only the final suffix is
    swapped, so ``Olli.WAV`` and ``take.wav.wav`` both map correctly. Names
    without a supported audio extension are returned unchanged.
    """
    stem, ext = os.path.splitext(audio_name)
    # Same extension set that generate_srt_files transcribes (including .mp3).
    if ext.lower() in ('.wav', '.aiff', '.flac', '.m4a', '.mp4', '.mp3'):
        return stem + '.json'
    return audio_name


def _append_transcription(file_elem, transcript):
    """Append a ``Transcription/p`` subtree with one ``w`` element per transcribed word.

    Each ``w`` element gets ``s`` (word start), ``l`` (end minus start) and a
    fixed ``sp`` value of ``'UU'``; its text is the stripped word.
    """
    transcription_elem = ElementTree.SubElement(file_elem, 'Transcription')
    p_elem = ElementTree.SubElement(transcription_elem, 'p')
    for segment in transcript['segments']:
        for word in segment['words']:
            start = word['start']
            end = word['end']
            word_elem = ElementTree.SubElement(p_elem, 'w')
            word_elem.set('l', str(end - start))
            word_elem.set('s', str(start))
            word_elem.set('sp', 'UU')
            word_elem.text = word['word'].strip()


def process_files(input_dir, output_dir):
    """Transcribe the audio in ``input_dir`` and embed the results in its .nhsx files.

    First runs ``generate_srt_files`` so that JSON transcripts exist in
    ``<output_dir>/transcripts``. Then, for every ``.nhsx`` file directly inside
    ``input_dir``, each ``File`` element under an ``AudioPool`` element that has
    a matching transcript and no ``Transcription`` child yet receives one. If
    anything was added, the modified project is written to ``output_dir`` as
    ``<name> litteroitu.nhsx``; the file in ``input_dir`` is not modified.

    Args:
        input_dir (str): Directory holding the .nhsx files (and the audio).
        output_dir (str): Directory for transcripts and the updated .nhsx files.
    """
    generate_srt_files(input_dir, output_dir)
    transcripts_dir = os.path.join(output_dir, 'transcripts')

    for filename in os.listdir(input_dir):
        if not filename.lower().endswith(".nhsx"):
            continue

        # Let the XML parser read the file so the encoding is taken from the
        # document itself rather than from the platform's locale.
        nhsx_tree = ElementTree.parse(os.path.join(input_dir, filename))
        # splitext instead of str.replace so upper-case '.NHSX' is handled too.
        output_filename = os.path.splitext(filename)[0] + ' litteroitu.nhsx'
        added_any = False

        for session_elem in nhsx_tree.getroot():
            if 'AudioPool' not in session_elem.tag:
                continue
            for audio_pool_elem in session_elem:
                if 'File' not in audio_pool_elem.tag:
                    continue

                file_elem_name = audio_pool_elem.get('Name')
                if not file_elem_name:
                    print(f"File element without a 'Name' attribute in {filename}. Skipping...")
                    continue

                srt_filename = _transcript_name_for(file_elem_name)
                transcript_path = os.path.join(transcripts_dir, srt_filename)
                if not os.path.isfile(transcript_path):
                    print(f"No transcript found for '{srt_filename}'. Skipping...")
                    continue

                if audio_pool_elem.find('Transcription') is not None:
                    print(f"Tiedosto '{srt_filename}' on jo litteroitu, ei lisätä sitä.")
                    continue

                try:
                    with open(transcript_path, 'r', encoding='utf-8') as f:
                        srt_data = json.load(f)
                except FileNotFoundError:
                    # The transcript vanished between the check and the read.
                    print(f"Tiedostoa ei löydy: '{srt_filename}'. Ohitetaan…")
                    continue

                _append_transcription(audio_pool_elem, srt_data)
                added_any = True
                print(f"Lisätään tekstitys '{srt_filename}' tiedostoon '{output_filename}'")

        # Write once per project, and only when something was actually added.
        if added_any:
            nhsx_tree.write(
                os.path.join(output_dir, output_filename),
                encoding='utf-8',
                xml_declaration=True,
            )

def is_file_already_transcribed(wav_file_path, input_dir):
    """Check whether an audio file already has a transcription in its .nhsx file.

    Looks in ``input_dir`` for a project file named after the audio file (final
    extension replaced by ``.nhsx``) and reports whether that project contains,
    under an ``AudioPool`` element, a ``File`` element whose ``Name`` equals the
    audio file name and which has a ``Transcription`` child.

    The parameter is called ``wav_file_path`` for historical reasons; any audio
    file path is accepted.

    NOTE(review): the project file is looked up by the audio file's own base
    name (``Olli.wav`` -> ``Olli.nhsx``). Confirm this matches how projects are
    actually laid out in ``input_dir``.

    Args:
        wav_file_path (str): Path to the audio file.
        input_dir (str): Directory containing the .nhsx files.

    Returns:
        bool: True if a matching ``File`` element with a ``Transcription``
        child exists, False otherwise (including when no project file exists).
    """
    filename = os.path.basename(wav_file_path)
    # Swap only the final suffix. str.replace would hit the first occurrence of
    # the extension text (and prepend '.nhsx' when there is no extension at all).
    nhsx_file_path = os.path.join(input_dir, os.path.splitext(filename)[0] + '.nhsx')

    if not os.path.isfile(nhsx_file_path):
        return False

    # Parsing from the path lets the XML parser pick the encoding from the
    # document instead of relying on the platform default.
    root = ElementTree.parse(nhsx_file_path).getroot()
    for session_elem in root:
        if 'AudioPool' not in session_elem.tag:
            continue
        for audio_pool_elem in session_elem:
            if (
                'File' in audio_pool_elem.tag
                and audio_pool_elem.get('Name') == filename
                and audio_pool_elem.find('Transcription') is not None
            ):
                return True
    return False

# Run the whole pipeline using the input_dir/output_dir values set in the configuration cell.
process_files(input_dir, output_dir)
print("\nValmista tuli.")

Litteroidaan tiedostoa /content/drive/MyDrive/whisper/input/vst s10e06 - 2025-06-18 Files/Olli.wav
model.bin:   0% 0.00/1.62G [00:00<?, ?B/s]
config.json: 100% 2.26k/2.26k [00:00<00:00, 13.9MB/s]

preprocessor_config.json: 100% 340/340 [00:00<00:00, 2.71MB/s]

tokenizer.json:   0% 0.00/2.71M [00:00<?, ?B/s][A

model.bin:   1% 21.0M/1.62G [00:00<00:11, 140MB/s]
tokenizer.json: 100% 2.71M/2.71M [00:00<00:00, 17.7MB/s]


vocabulary.json: 100% 1.07M/1.07M [00:00<00:00, 7.29MB/s]
model.bin: 100% 1.62G/1.62G [00:07<00:00, 204MB/s]
Detected language 'Finnish' with probability 1.000000
[00:45.380 --> 00:46.780]  Ihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihihi