In [None]:
import os
import numpy as np
import librosa
import concurrent.futures
import pandas as pd

# Force a 2-D feature matrix to a fixed number of columns
def pad_or_truncate(feature, max_frames):
    """Return ``feature`` with exactly ``max_frames`` columns along axis 1.

    Wider inputs are cut to their first ``max_frames`` columns; narrower
    inputs are right-padded with zeros. An input that already has the
    requested width is returned unchanged.
    """
    n_frames = feature.shape[1]

    if n_frames > max_frames:
        return feature[:, :max_frames]

    if n_frames < max_frames:
        missing = max_frames - n_frames
        # Pad only at the end of axis 1; axis 0 is left untouched.
        return np.pad(feature, [(0, 0), (0, missing)], mode='constant')

    return feature

# Extract MFCC features
def extract_mfcc_40(audio_path, n_mfcc=40, duration=3, max_frames=100):
    """Load an audio file and return a fixed-width MFCC matrix.

    Parameters
    ----------
    audio_path : path of the audio file handed to ``librosa.load``.
    n_mfcc : number of MFCC coefficients to compute.
    duration : maximum number of seconds read from the file.
    max_frames : number of columns the result is padded or truncated to.

    Returns
    -------
    The MFCC array after ``pad_or_truncate`` has fixed its second axis to
    ``max_frames`` columns.
    """
    # sr=None asks librosa to keep the file's own sampling rate.
    y, sr = librosa.load(audio_path, duration=duration, sr=None)
    # Use the rate reported by the loader rather than a hard-coded 16000, so
    # the MFCC computation matches the signal that was actually loaded.
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc, n_fft=2048, hop_length=512)
    return pad_or_truncate(mfcc, max_frames)

# Load protocol file and categorize by bonafide or system ID
def load_protocol_file_combined(protocol_file, categories):
    """Parse a protocol file into ``{file_name: (label, system_id)}``.

    Every non-blank line is split on whitespace. The second field is used as
    the file name and the last field decides the label: 1 when it equals
    "bonafide", 0 otherwise. The system ID is the first entry of
    ``categories`` that appears as a whole field on the line; when none does,
    it falls back to "bonafide" for label 1 and "unknown" for label 0.

    Blank lines are skipped. A non-blank line with fewer than two fields
    still raises ``IndexError``.
    """
    labels = {}
    with open(protocol_file, 'r') as file:
        for line in file:
            parts = line.split()
            if not parts:
                # Tolerate empty lines (e.g. a trailing newline at end of file).
                continue
            file_name = parts[1]
            label = 1 if parts[-1] == "bonafide" else 0
            # Compare against whole fields, not the raw line, so a category
            # code embedded in a speaker or file name is not mistaken for
            # the system ID.
            fields = set(parts)
            fallback = "bonafide" if label == 1 else "unknown"
            system_id = next((cat for cat in categories if cat in fields), fallback)
            labels[file_name] = (label, system_id)
    return labels

# Turn one protocol entry into a feature record
def process_file(file_name, label, system_id, audio_dir):
    """Extract features for ``<audio_dir>/<file_name>.flac`` if it exists.

    Returns the tuple ``(file_name, label, system_id, features)`` where
    ``features`` is the flattened output of ``extract_mfcc_40`` as a plain
    list, or ``None`` when the audio file is not present on disk.
    """
    flac_path = os.path.join(audio_dir, file_name + '.flac')
    if not os.path.exists(flac_path):
        return None

    mfcc = extract_mfcc_40(flac_path)
    flat_features = mfcc.flatten().tolist()
    return file_name, label, system_id, flat_features

# Fan the per-file work out over a thread pool
def load_audio_data_parallel(labels, audio_dir):
    """Run ``process_file`` for every entry of ``labels`` in a thread pool.

    ``labels`` maps a file name to a ``(label, system_id)`` pair. Results are
    collected in completion order, and falsy results (files that
    ``process_file`` skipped) are dropped.
    """
    with concurrent.futures.ThreadPoolExecutor() as pool:
        pending = [
            pool.submit(process_file, name, meta[0], meta[1], audio_dir)
            for name, meta in labels.items()
        ]
        finished = concurrent.futures.as_completed(pending)
        records = [outcome for outcome in (job.result() for job in finished) if outcome]
    return records

# Define dataset paths
categories = ['A01', 'A02', 'A03', 'A04', 'A05', 'A06', 'A07', 'A08', 'A09', 'A10',
              'A11', 'A12', 'A13', 'A14', 'A15', 'A16','A17', 'A18', 'A19']
protocol_files = {
    "train": "/kaggle/input/asvpoof-2019-dataset/LA/LA/ASVspoof2019_LA_cm_protocols/ASVspoof2019.LA.cm.train.trn.txt",
    "dev": "/kaggle/input/asvpoof-2019-dataset/LA/LA/ASVspoof2019_LA_cm_protocols/ASVspoof2019.LA.cm.dev.trl.txt",
    "eval": "/kaggle/input/asvpoof-2019-dataset/LA/LA/ASVspoof2019_LA_cm_protocols/ASVspoof2019.LA.cm.eval.trl.txt"
}
audio_dirs = {
    "train": "/kaggle/input/asvpoof-2019-dataset/LA/LA/ASVspoof2019_LA_train/flac",
    "dev": "/kaggle/input/asvpoof-2019-dataset/LA/LA/ASVspoof2019_LA_dev/flac",
    "eval": "/kaggle/input/asvpoof-2019-dataset/LA/LA/ASVspoof2019_LA_eval/flac"
}

# Merge the labels of every protocol file into one lookup table
all_labels = {}
for key in protocol_files:
    all_labels.update(load_protocol_file_combined(protocol_files[key], categories))

# Extract features: each directory is probed with the full label table, and
# only the files that actually exist in that directory produce a record.
all_data = []
for key in audio_dirs:
    all_data.extend(load_audio_data_parallel(all_labels, audio_dirs[key]))

# Convert to DataFrame.
# The column names must follow the order of the tuples produced by
# process_file: (file_name, label, system_id, features). With any other order
# the "label" and "system_id" filters below compare against the wrong values
# and every CSV is written empty.
df = pd.DataFrame(all_data, columns=["file_name", "label", "system_id", "features"])

# Save bonafide and each system ID separately
df[df["label"] == 1].to_csv("bonafide.csv", index=False)
for system_id in categories:
    df[df["system_id"] == system_id].to_csv(f"{system_id}.csv", index=False)


In [None]:
# Scratch cell: prints a fixed string and uses nothing defined in the cell above.
print("hello_world")