# Speech-to-Text Conversion Script

This script uses OpenAI's Whisper model for speech-to-text transcription. It takes the input audio path and the output text path as command-line arguments.

In [None]:
# Import necessary libraries
import torch
import torchaudio
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
import argparse
    

In [None]:
# Main execution block
if __name__ == "__main__":
    # Command-line interface: where to read the audio from and where to
    # store the transcript.
    parser = argparse.ArgumentParser(
        description="Transcribe an audio file with OpenAI's Whisper model."
    )
    parser.add_argument(
        "--audio_path", type=str, default="",
        help="Path of the audio file to transcribe.",
    )
    parser.add_argument(
        "--output_path", type=str, default="",
        help="Path of the text file to write the transcript to "
             "(if omitted, the transcript is only printed).",
    )
    args = parser.parse_args()

    # Fail fast: without an input file there is nothing to do, and loading
    # the model first would only delay the inevitable error.
    if not args.audio_path:
        parser.error("--audio_path is required")

    # Use the GPU with half precision when available, otherwise CPU/float32.
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

    # Model and processor setup
    model_id = "openai/whisper-large-v3"
    model = AutoModelForSpeechSeq2Seq.from_pretrained(
        model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
    )
    model.to(device)

    processor = AutoProcessor.from_pretrained(model_id)

    # Pipeline setup for automatic speech recognition
    pipe = pipeline(
        "automatic-speech-recognition",
        model=model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        max_new_tokens=256,
        chunk_length_s=30,
        batch_size=4,
        return_timestamps=True,
        torch_dtype=torch_dtype,
        device=device,
    )

    # Load the audio. torchaudio returns a (channels, frames) tensor together
    # with the file's native sample rate.
    waveform, sample_rate = torchaudio.load(args.audio_path)

    # The pipeline accepts single-channel audio only: average the channels
    # (a no-op for mono input) instead of squeezing, which would leave
    # stereo input two-dimensional.
    waveform = waveform.mean(dim=0)

    # Resample to the rate the feature extractor was configured for; feeding
    # audio at a different rate silently degrades the transcription.
    target_rate = processor.feature_extractor.sampling_rate
    if sample_rate != target_rate:
        waveform = torchaudio.functional.resample(waveform, sample_rate, target_rate)

    # Pass the sample rate explicitly so the pipeline does not have to assume it.
    result = pipe({"raw": waveform.numpy(), "sampling_rate": target_rate})
    text = result["text"].strip()

    # Persist the transcript; UTF-8 keeps non-ASCII transcriptions intact
    # regardless of the platform's default encoding.
    if args.output_path:
        with open(args.output_path, "w", encoding="utf-8") as f:
            f.write(text)

    # Print the transcribed text
    print(text)
    