In [3]:
import os
os.environ['HF_HOME'] = '/data/iivanova-23/cache/'

In [1]:
import kaggle
kaggle.api.authenticate()

In [None]:
import os
import zipfile

!kaggle datasets download -d mohammedabdeldayem/avsspoof-2021 -p /data/iivanova-23/data/

# Unzip the dataset

with zipfile.ZipFile('avsspoof-2021.zip', 'r') as zip_ref:
    zip_ref.extractall('/data/iivanova-23/data/ASVspoof2021_DF_eval')

In [1]:
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from scipy.io import wavfile
import os
import json

# Auxiliary Function to Update JSON File
def update_json_file(json_file_path, new_data):
    """Safely updates a JSON file with new data."""
    existing_data = {}
    if os.path.exists(json_file_path):
        with open(json_file_path, "r") as file:
            try:
                existing_data = json.load(file)
            except json.JSONDecodeError:
                pass

    existing_data.update(new_data)

    with open(json_file_path, "w") as file:
        json.dump(existing_data, file, indent=4)


def run_asr_pipeline(transcription_model_id, audio_files, transcriptions_path, batch_size=15):
    """Runs an Automatic Speech Recognition (ASR) pipeline."""
    device = "cuda" if torch.cuda.is_available() else "cpu"
    torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

    model = AutoModelForSpeechSeq2Seq.from_pretrained(
        transcription_model_id, torch_dtype=torch_dtype, use_safetensors=True
    ).to(device)

    processor = AutoProcessor.from_pretrained(transcription_model_id)

    asr_pipe = pipeline(
        "automatic-speech-recognition",
        model=model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        max_new_tokens=16,
        chunk_length_s=16,
        batch_size=batch_size,
        torch_dtype=torch_dtype,
        device=device,
        generate_kwargs={"language": "english"},
    )

    # Process files in batches
    for batch_start in range(0, len(audio_files), batch_size):
        batch_files = audio_files[batch_start : batch_start + batch_size]
        transcriptions = {}

        for file_name in batch_files:
            rate, audio_array = wavfile.read(file_name)
            transcription = asr_pipe(audio_array)
            transcriptions[os.path.basename(file_name)] = transcription["text"]

        update_json_file(transcriptions_path, transcriptions)

    return transcriptions


def join_meta_transcriptions(transcriptions, meta_data_path_txt, output_path):
    """Joins transcriptions with metadata from a .txt file."""
    with open(meta_data_path_txt, "r") as file:
        lines = [line.strip().split(";") for line in file if line.strip()]

    if not lines:
        raise ValueError("Metadata file is empty or incorrectly formatted.")

    header, *rows = lines
    metadata = [dict(zip(header, row)) for row in rows]

    joined_data = [
        {
            "file": item["file"],
            "speaker": item.get("speaker"),
            "label": item.get("label"),
            "transcription": transcriptions.get(item["file"], "")
        }
        for item in metadata if item["file"] in transcriptions
    ]

    with open(output_path, "w") as file:
        json.dump(joined_data, file, indent=4)

    return joined_data






  from .autonotebook import tqdm as notebook_tqdm


In [4]:
import os
os.environ['HF_HOME'] = '/data/iivanova-23/cache/'

In [None]:
transcription_model_id = "openai/whisper-large-v3"
file_1 = "/data/iivanova-23/data/asvspoof2021/ASVspoof2021_DF_eval_part00/ASVspoof2021_DF_eval/flac/DF_E_2076475.flac"
file_2 = "/data/iivanova-23/data/asvspoof2021/ASVspoof2021_DF_eval_part00/ASVspoof2021_DF_eval/flac/DF_E_2182986.flac"
audio_files = [file_1, file_2]
transcription_path_asv = "/data/iivanova-23/data/asvspoof2021/transcriptions/ASVspoof2021_DF_eval_transcriptions.json"
meta_data_path_asv = "/data/iivanova-23/data/asvspoof2021/DF-keys-full/keys/DF/CM/trial_metadata.txt"
asv_spoof = "/data/iivanova-23//ASVspoof2021_DF_eval.json"

transcriptions = run_asr_pipeline(transcription_model_id, audio_files, transcription_path_asv)
join_meta_transcriptions(transcriptions, meta_data_path_asv, asv_spoof)

In [21]:
#model

import os
import csv
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
import torch

# Model configuration
model_id = "openai/whisper-large-v3"
device = "cuda" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if device == "cuda" else torch.float32

# Load model and processor
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
)
model.to(device)

processor = AutoProcessor.from_pretrained(model_id)

pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    torch_dtype=torch_dtype,
    device=device
)

Device set to use cuda


In [20]:

# Meta data
meta_data_path_asv = "/data/iivanova-23/data/asvspoof2021/DF-keys-full/keys/DF/CM/trial_metadata.txt"
metadata = {}
with open(meta_data_path_asv, "r") as f:
    for line in f:
        parts = line.strip().split()
        if len(parts) >= 6:
            file_name = parts[1]
            file_type = parts[5]
            metadata[file_name] = file_type

In [17]:
folders = [
    "/data/iivanova-23/data/asvspoof2021/ASVspoof2021_DF_eval_part00",
    # "ASVspoof2021_DF_eval_part01",
    # "ASVspoof2021_DF_eval_part02",
]
output_csv = "/data/iivanova-23/data/asvspoof2021/transcriptions/transcriptions.csv"

In [23]:
from datasets import Dataset, Audio
audio_files = []
for folder in folders:
    flac_dir = os.path.join(folder, "ASVspoof2021_DF_eval/flac")
    if os.path.exists(flac_dir):
        for file_name in os.listdir(flac_dir):
            if file_name.endswith(".flac"):
                file_path = os.path.join(flac_dir, file_name)
                base_name = file_name.replace(".flac", "")
                file_type = metadata.get(base_name, "unknown")
                audio_files.append({"path": file_path, "file_name": file_name, "type": file_type})


dataset = Dataset.from_list(audio_files)
dataset = dataset.cast_column("path", Audio(sampling_rate=16000))

with open(output_csv, "w", newline="", encoding="utf-8") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["file_name", "type", "transcription"])

    for sample in dataset:
        print(sample["path"]['array'])
        try:
            transcription = pipe(sample["path"]["array"])["text"]
        except Exception as e:
            print(f"Error transcribing {sample['path']}: {e}")
            transcription = ""

        # Write to CSV
        writer.writerow([sample["file_name"], sample["type"], transcription])

print(f"Transcriptions saved to {output_csv}")


[ 0.00000000e+00  3.05175781e-05  3.05175781e-05 ...  0.00000000e+00
 -6.10351562e-05 -6.10351562e-05]




[-3.05175781e-05 -3.05175781e-05 -3.05175781e-05 ...  0.00000000e+00
  0.00000000e+00  0.00000000e+00]
[-2.44140625e-04 -3.05175781e-04 -2.74658203e-04 ... -6.10351562e-05
  0.00000000e+00  0.00000000e+00]


LibsndfileError: Error : unknown error in flac decoder.