In [3]:
# Redirect the Hugging Face cache (HF_HOME) to the data volume. This cell is
# meant to run before the cell that imports `transformers`, so the setting is
# already in the environment when that library is loaded.
import os

os.environ.update(HF_HOME="/data/iivanova-23/cache/")

In [7]:
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from scipy.io import wavfile
import os
import json

# Auxiliary Function to Update JSON File
def update_json_file(json_file_path, new_data):
    """Merge ``new_data`` into the JSON object stored at ``json_file_path``.

    The file is created when it does not exist. Keys already present in the
    file are overwritten by the values in ``new_data``.

    Args:
        json_file_path: Path of the JSON file to create or update.
        new_data: Mapping of keys to JSON-serialisable values to merge in.

    Raises:
        TypeError: If the merged data cannot be serialised to JSON. The
            existing file is left untouched in that case.

    Note:
        If the existing file is not valid JSON, or its top-level value is not
        a JSON object, there is nothing to merge into: a ``RuntimeWarning`` is
        issued and the file is rewritten with ``new_data`` only.
    """
    import warnings  # local import: only needed on the recovery paths below

    existing_data = {}
    if os.path.exists(json_file_path):
        with open(json_file_path, "r") as file:
            try:
                loaded = json.load(file)
            except json.JSONDecodeError:
                # Best-effort recovery (same outcome as before), but no longer silent.
                warnings.warn(
                    f"{json_file_path} is not valid JSON; its content will be replaced.",
                    RuntimeWarning,
                    stacklevel=2,
                )
            else:
                if isinstance(loaded, dict):
                    existing_data = loaded
                else:
                    # A list/scalar has no .update(); merging is impossible.
                    warnings.warn(
                        f"{json_file_path} does not contain a JSON object; "
                        "its content will be replaced.",
                        RuntimeWarning,
                        stacklevel=2,
                    )

    existing_data.update(new_data)

    # Serialise BEFORE opening for writing: opening with "w" truncates the file,
    # so a serialisation error after that point would destroy the existing data.
    payload = json.dumps(existing_data, indent=4)

    with open(json_file_path, "w") as file:
        file.write(payload)


def run_asr_pipeline(transcription_model_id, audio_files, transcriptions_path, batch_size=15):
    """Transcribe audio files with a Hugging Face speech-to-text model.

    Files are processed in groups of ``batch_size``. After each group the
    transcriptions obtained so far are merged into the JSON file at
    ``transcriptions_path`` (via ``update_json_file``), so an interrupted run
    keeps the work already done.

    Audio is handed to the pipeline as file paths rather than as raw sample
    arrays. Per the Transformers ASR pipeline documentation, path inputs are
    decoded with ffmpeg at the model's expected sampling rate, which means
    non-WAV inputs such as FLAC work and the file's own sample rate is
    respected. This requires ffmpeg to be installed on the system.

    Args:
        transcription_model_id: Model identifier passed to ``from_pretrained``.
        audio_files: Iterable of audio file paths.
        transcriptions_path: JSON file that receives the transcriptions.
        batch_size: Number of files per group; also forwarded to the pipeline
            as its inference batch size.

    Returns:
        dict mapping each file's base name to its transcribed text, covering
        ALL processed files (not only the last group). Empty input returns
        ``{}`` without loading the model or touching ``transcriptions_path``.
    """
    audio_files = [os.fspath(path) for path in audio_files]
    if not audio_files:
        # Nothing to do: avoid loading a multi-GB model for no work.
        return {}

    use_cuda = torch.cuda.is_available()
    device = "cuda" if use_cuda else "cpu"
    torch_dtype = torch.float16 if use_cuda else torch.float32

    model = AutoModelForSpeechSeq2Seq.from_pretrained(
        transcription_model_id, torch_dtype=torch_dtype, use_safetensors=True
    ).to(device)

    processor = AutoProcessor.from_pretrained(transcription_model_id)

    # NOTE(review): max_new_tokens=16 looks low for 16-second chunks and may
    # truncate transcripts — confirm this limit is intended.
    asr_pipe = pipeline(
        "automatic-speech-recognition",
        model=model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        max_new_tokens=16,
        chunk_length_s=16,
        batch_size=batch_size,
        torch_dtype=torch_dtype,
        device=device,
        generate_kwargs={"language": "english"},
    )

    all_transcriptions = {}

    # Process files in batches
    for batch_start in range(0, len(audio_files), batch_size):
        batch_files = audio_files[batch_start : batch_start + batch_size]

        # A list input yields one result dict per file, in the same order.
        results = asr_pipe(batch_files)
        batch_transcriptions = {
            os.path.basename(file_name): result["text"]
            for file_name, result in zip(batch_files, results)
        }

        # Checkpoint after every batch so partial progress survives a crash.
        update_json_file(transcriptions_path, batch_transcriptions)
        all_transcriptions.update(batch_transcriptions)

    return all_transcriptions


def join_meta_transcriptions(transcriptions, meta_data_path_txt, output_path):
    """Join transcriptions with per-file metadata and save the result as JSON.

    The metadata file is read as ';'-separated text whose first non-blank
    line is a header. Each following line becomes one record; records are
    matched to ``transcriptions`` through their ``file`` field. Metadata rows
    without a transcription are left out of the result.

    Args:
        transcriptions: Mapping of file name to transcribed text.
        meta_data_path_txt: Path of the ';'-separated metadata file.
        output_path: Path of the JSON file that receives the joined records.

    Returns:
        list of dicts with the keys ``file``, ``speaker``, ``label`` and
        ``transcription`` (``speaker``/``label`` are ``None`` when the
        metadata has no such column), in metadata order.

    Raises:
        ValueError: If the metadata file contains no non-blank lines.
        KeyError: If there are data rows but the header has no ``file`` column.
    """
    with open(meta_data_path_txt, "r") as file:
        lines = [line.strip().split(";") for line in file if line.strip()]

    if not lines:
        raise ValueError("Metadata file is empty or incorrectly formatted.")

    header, *rows = lines

    # Same exception type as the former bare item["file"] lookup, but with a
    # message that tells the user what is actually wrong with the file.
    if rows and "file" not in header:
        raise KeyError(
            f"Metadata header {header!r} has no 'file' column "
            "(expected ';'-separated columns)."
        )

    joined_data = []
    for row in rows:
        item = dict(zip(header, row))
        # zip() truncates short rows, so a malformed row may lack the 'file'
        # field; such a row cannot match any transcription and is skipped
        # instead of aborting the whole join.
        file_name = item.get("file")
        if file_name is None or file_name not in transcriptions:
            continue
        joined_data.append(
            {
                "file": file_name,
                "speaker": item.get("speaker"),
                "label": item.get("label"),
                "transcription": transcriptions[file_name],
            }
        )

    with open(output_path, "w") as file:
        json.dump(joined_data, file, indent=4)

    return joined_data






In [None]:
# Transcribe two clips from the ASVspoof2021 DF eval set with Whisper large-v3
# and join the transcriptions with the trial metadata.
transcription_model_id = "openai/whisper-large-v3"
file_1 = "/data/amathur-23/DADA/ASVspoof2021_DF_eval/flac/DF_E_2059712.flac"
file_2 = "/data/amathur-23/DADA/ASVspoof2021_DF_eval/flac/DF_E_2434414.flac"
audio_files = [file_1, file_2]
# NOTE(review): this path is under /data/iivanov/ whereas the other locations
# written by this notebook are under /data/iivanova-23/ — possibly a typo;
# confirm the directory exists before running.
transcription_path_asv = "/data/iivanov/ASVspoof2021_DF_eval_transcriptions.json"
# NOTE(review): join_meta_transcriptions splits each metadata line on ';' and
# expects a header row with a "file" column whose values equal the
# transcription keys (file base names, i.e. including ".flac") — confirm that
# trial_metadata.txt actually has this layout.
meta_data_path_asv = "/data/amathur-23/DADA/ASVspoof2021_DF_eval/keys/DF/CM/trial_metadata.txt"
asv_spoof = "/data/iivanova-23/ASVspoof2021_DF_eval.json"  # output: joined metadata + transcription records

# NOTE: the output recorded below for this cell is a CUDA OutOfMemoryError.
# According to that message, other processes held most of GPU 0 (20.27 GiB and
# 7.84 GiB of 31.73 GiB), leaving ~26 MiB free; re-run when the GPU is free.
transcriptions = run_asr_pipeline(transcription_model_id, audio_files, transcription_path_asv)
# Last expression of the cell, so the joined records are also shown as cell output.
join_meta_transcriptions(transcriptions, meta_data_path_asv, asv_spoof)

OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB. GPU 0 has a total capacity of 31.73 GiB of which 25.88 MiB is free. Process 3183329 has 518.00 MiB memory in use. Process 733671 has 7.84 GiB memory in use. Process 591311 has 308.00 MiB memory in use. Process 603656 has 308.00 MiB memory in use. Process 4193517 has 20.27 GiB memory in use. Including non-PyTorch memory, this process has 2.48 GiB memory in use. Of the allocated memory 2.02 GiB is allocated by PyTorch, and 170.20 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

: 