# Transcribe Audio

In this notebook, we load audio files and transcribe them with OpenWillis.

### Option 1: Process a single file

Select a single audio file as input and specify the output paths (one JSON file and one text file).

In [3]:
import os 

In [8]:
from pathlib import Path
import os

def read_huggingface_token(token_file: str) -> str | None:
    """
    Reads the Hugging Face token from a specified file path.
    """
    try:
        # Create a Path object from the given file path
        path = Path(os.path.expanduser(token_file))
        
        # Check if the file exists before trying to read it
        if path.exists():
            return path.read_text().strip()
        else:
            print(f"Error: The token file does not exist at '{path}'.")
            return None
    except Exception as e:
        print(f"An error occurred while reading the token file: {e}")
        return None



In [9]:
# Location of the saved Hugging Face token file (adjust if yours lives elsewhere)
token_path = "~/.cache/huggingface/token"

# hf_token is read later by the transcription functions
hf_token = read_huggingface_token(token_path)

# Report whether a non-empty token was obtained
print(
    "Hugging Face token successfully loaded."
    if hf_token
    else "Failed to load Hugging Face token."
)

Hugging Face token successfully loaded.


In [10]:
"""
Transcribe audio.wav
"""

import openwillis.transcribe as owt
import json

def transcribe_audio(audio_file, output_json, output_text):
    """
    Transcribe one audio file with OpenWillis and save both transcripts.

    Args:
        audio_file: Path of the audio file to transcribe.
        output_json: Destination path for the JSON transcript.
        output_text: Destination path for the plain-text transcript.

    Returns:
        None. Any exception raised along the way is caught and reported
        with a printed message instead of being propagated.

    Note:
        The Hugging Face token is taken from the notebook-level ``hf_token``
        variable, so the token-loading cell must be run first.
    """
    try:
        # Create missing output folders *before* the slow transcription so
        # a bad destination is detected early and the result is never lost
        # because its folder does not exist.
        for destination in (output_json, output_text):
            parent = os.path.dirname(destination)
            if parent:  # a bare file name has no folder to create
                os.makedirs(parent, exist_ok=True)

        # The call returns a pair: (transcript_json, transcript_text)
        transcript_json, transcript_text = owt.speech_transcription_whisper(
            filepath=audio_file,
            model="large-v2",      # Best model, takes longer
            compute_type="int16",  # Default for CPU
            device_type="cpu",     # Use CPU
            batch_size=16,         # Default batch size
            hf_token=hf_token,     # Token loaded earlier in the notebook
            language="",           # Auto-detection if not specified
            min_speakers=1,        # Speaker diarization possible
            max_speakers=1,
        )

        # Detailed transcript; ensure_ascii=False keeps non-ASCII text readable
        with open(output_json, 'w', encoding='utf-8') as f:
            json.dump(transcript_json, f, ensure_ascii=False, indent=2)

        # Plain-text transcript (simple string)
        with open(output_text, 'w', encoding='utf-8') as f:
            f.write(transcript_text)

        print(" Transcription saved:")
        print(f"  JSON: {output_json}")
        print(f"  Text: {output_text}")

    except Exception as e:
        # Best-effort: report the failure and let the notebook continue
        print(f"Error transcribing {audio_file}: {e}")

# Paths for the single-file example (replace with your own files)
# Input: audio file from the previous step
audio_file = "output/RAVDESS/audio_only/01-01-01-01-01-01-01.wav"
# Output: detailed JSON transcript
json_output = "output/RAVDESS/transcripts/01-01-01-01-01-01-01.json"
# Output: simple text transcript
text_output = "output/RAVDESS/transcripts/01-01-01-01-01-01-01.txt"



In [None]:
# Run the single-file transcription with the paths defined above
transcribe_audio(
    audio_file=audio_file,
    output_json=json_output,
    output_text=text_output,
)

### Option 2: Process all files in a folder

Select a folder with audio files as input and specify your output folders (one for the JSON transcripts and one for the text transcripts).

In [None]:
"""
Transcribe all audio files in a directory
"""

import openwillis.transcribe as owt
import json
import os

def transcribe_audio(audio_file, output_json, output_text):
    """
    Transcribe a single audio file with OpenWillis and save both transcripts.

    Args:
        audio_file: Path of the audio file to transcribe.
        output_json: Destination path for the JSON transcript.
        output_text: Destination path for the plain-text transcript.

    Returns:
        None. Any exception raised along the way is caught and reported
        with a printed message instead of being propagated.

    Note:
        The Hugging Face token is taken from the notebook-level ``hf_token``
        variable, so the token-loading cell must be run first.
    """
    try:
        # Make sure both destination folders exist before the slow
        # transcription starts, so the function also works when it is
        # called directly with paths whose folders have not been created.
        for destination in (output_json, output_text):
            parent = os.path.dirname(destination)
            if parent:  # a bare file name has no folder to create
                os.makedirs(parent, exist_ok=True)

        # The call returns a pair: (transcript_json, transcript_text)
        transcript_json, transcript_text = owt.speech_transcription_whisper(
            filepath=audio_file,
            model="large-v2",      # Best model, takes longer
            compute_type="int16",  # Default for CPU
            device_type="cpu",     # Use CPU
            batch_size=16,         # Default batch size
            hf_token=hf_token,     # Token loaded earlier in the notebook
            language="en",         # Specifying recommended, else Auto-detection
            min_speakers=1,        # Speaker diarization possible
            max_speakers=1,
        )

        # JSON transcript; ensure_ascii=False keeps non-ASCII text readable
        with open(output_json, 'w', encoding='utf-8') as f:
            json.dump(transcript_json, f, ensure_ascii=False, indent=2)

        # Plain-text transcript (simple string)
        with open(output_text, 'w', encoding='utf-8') as f:
            f.write(transcript_text)

        print(" Transcription saved:")
        print(f"  JSON: {output_json}")
        print(f"  Text: {output_text}")

    except Exception as e:
        # Best-effort: report the failure so a batch run can move on
        print(f"Error transcribing {audio_file}: {e}")

def process_directory(input_dir, json_output_dir, text_output_dir):
    """
    Transcribe every WAV file found directly inside a directory.

    For each ``<name>.wav`` the transcripts are written to
    ``<json_output_dir>/<name>_transcript.json`` and
    ``<text_output_dir>/<name>_transcript.txt``. A file is skipped when
    both of its transcripts already exist.

    Args:
        input_dir: Folder containing the audio files (not searched
            recursively).
        json_output_dir: Folder for the JSON transcripts; created if missing.
        text_output_dir: Folder for the text transcripts; created if missing.

    Returns:
        None.
    """
    # Create output directories if they don't exist
    os.makedirs(json_output_dir, exist_ok=True)
    os.makedirs(text_output_dir, exist_ok=True)

    # os.listdir returns entries in arbitrary order; sorting makes the
    # processing order (and the printed log) reproducible between runs.
    for file in sorted(os.listdir(input_dir)):
        audio_file = os.path.join(input_dir, file)

        # Only process WAV files: match the extension case-insensitively
        # (".WAV" is common) and ignore directories that end in ".wav".
        if not file.lower().endswith(".wav") or not os.path.isfile(audio_file):
            continue

        # Build the output paths from the file name without its extension
        base_name = os.path.splitext(file)[0]
        json_output = os.path.join(json_output_dir, f"{base_name}_transcript.json")
        text_output = os.path.join(text_output_dir, f"{base_name}_transcript.txt")

        # Skip only when BOTH transcripts exist, so a half-finished earlier
        # run is redone
        if os.path.exists(json_output) and os.path.exists(text_output):
            print(f"Skipping {file}: transcription already exists")
            continue

        # Process the audio file
        print(f"Processing {file}...")
        transcribe_audio(audio_file, json_output, text_output)

# Folders for the batch run
input_folder = "output/RAVDESS/audio_only" # Replace with your folder
json_output_folder = "output/RAVDESS/transcripts/JSON_files"
text_output_folder = "output/RAVDESS/transcripts/txt_files"

# Transcribe every audio file in the input folder, then confirm completion
process_directory(
    input_dir=input_folder,
    json_output_dir=json_output_folder,
    text_output_dir=text_output_folder,
)
print("All files processed!")

Processing Example_Video_audio.wav...


INFO:pytorch_lightning.utilities.migration.utils:Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.5.1.post0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../../../../../../opt/anaconda3/envs/openwillis_3.1/lib/python3.10/site-packages/whisperx/assets/pytorch_model.bin`


No language specified, language will be first be detected for each audio file (increases inference time).
Model was trained with pyannote.audio 0.0.1, yours is 3.3.2. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.5.1+cu124. Bad things might happen unless you revert torch to 1.x.
 Transcription saved:
  JSON: /mnt/nfs/data/code/openwillis/workshop_10092025/Example_JSON_Folder/Example_Video_audio_transcript.json
  Text: /mnt/nfs/data/code/openwillis/workshop_10092025/Example_Text_Folder/Example_Video_audio_transcript.txt
Processing Example_Audio.wav...


INFO:pytorch_lightning.utilities.migration.utils:Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.5.1.post0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../../../../../../opt/anaconda3/envs/openwillis_3.1/lib/python3.10/site-packages/whisperx/assets/pytorch_model.bin`


No language specified, language will be first be detected for each audio file (increases inference time).
Model was trained with pyannote.audio 0.0.1, yours is 3.3.2. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.5.1+cu124. Bad things might happen unless you revert torch to 1.x.
 Transcription saved:
  JSON: /mnt/nfs/data/code/openwillis/workshop_10092025/Example_JSON_Folder/Example_Audio_transcript.json
  Text: /mnt/nfs/data/code/openwillis/workshop_10092025/Example_Text_Folder/Example_Audio_transcript.txt
All files processed!
