In [None]:
import os
import numpy as np
import librosa
import concurrent.futures

# Helper functions for MFCC feature extraction
import numpy as np
import librosa

# Helper that pads or truncates a feature matrix to a fixed number of frames
def pad_or_truncate(feature, max_frames):
    """
    Force a 2-D feature matrix to have exactly ``max_frames`` columns.

    Args:
        feature: 2-D array whose second axis is the one being resized.
        max_frames: Target length of the second axis.

    Returns:
        ``feature`` itself when it already has ``max_frames`` columns, a
        slice of its first ``max_frames`` columns when it is longer, or a
        copy right-padded with zeros when it is shorter.
    """
    n_frames = feature.shape[1]

    # Too long: keep only the leading columns.
    if n_frames > max_frames:
        return feature[:, :max_frames]

    # Too short: append zero-valued columns on the right.
    if n_frames < max_frames:
        missing = max_frames - n_frames
        return np.pad(feature, ((0, 0), (0, missing)), mode='constant')

    # Already the requested width.
    return feature

# MFCC extraction function
def extract_mfcc_40(audio_path, n_mfcc=40, duration=3, max_frames=100):
    """
    Extract a fixed-width MFCC matrix from an audio file.

    At most the first ``duration`` seconds of the file are loaded at the
    file's native sampling rate (``sr=None``). MFCCs are computed with a
    2048-sample FFT and a hop of 512 samples, then padded with zeros or
    truncated along the frame axis so that every file yields the same width.

    Args:
        audio_path: Path of the audio file to read.
        n_mfcc: Number of MFCC coefficients to compute.
        duration: Maximum number of seconds of audio to load.
        max_frames: Number of frames (columns) in the returned matrix.

    Returns:
        The MFCC matrix after ``pad_or_truncate``, i.e. with exactly
        ``max_frames`` columns.
    """
    y, sr = librosa.load(audio_path, duration=duration, sr=None)

    # Pass the rate actually returned by librosa.load. The signal is not
    # resampled (sr=None), so a hard-coded 16000 would build the mel
    # filterbank for the wrong rate whenever a file is not sampled at 16 kHz.
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc, n_fft=2048, hop_length=512)

    # Pad or truncate so that all files share the same number of frames.
    return pad_or_truncate(mfcc, max_frames)

def load_protocol_file(protocol_file):
    """
    Read a protocol file into a ``{file_name: label}`` dictionary.

    Each non-blank line is split on whitespace. The second field is used as
    the file name and the last field as the class: ``"bonafide"`` maps to
    label 1, anything else to label 0. If a file name occurs more than once,
    the last occurrence wins.

    Args:
        protocol_file: Path of the whitespace-separated protocol text file.

    Returns:
        dict mapping file name (str) to label (int, 1 or 0).

    Raises:
        IndexError: If a non-blank line has fewer than two fields.
    """
    labels = {}
    # Stream the file instead of materialising every line with readlines().
    with open(protocol_file, 'r') as file:
        for line in file:
            parts = line.split()
            # Blank lines (e.g. a trailing newline) carry no entry; skipping
            # them avoids an IndexError on parts[1].
            if not parts:
                continue
            labels[parts[1]] = 1 if parts[-1] == "bonafide" else 0
    return labels


def load_protocol_file_com(protocol_file, categories=('A01', 'A02', 'A03', 'A04', 'A05', 'A06', 'A07', 'A08', 'A09', 'A10', 'A11', 'A12', 'A13', 'A14', 'A15', 'A16', 'A17', 'A18', 'A19')):
    """
    Read a protocol file into ``{file_name: label}``, keeping selected entries.

    A line is kept when its last field is ``"bonafide"`` or when any string in
    ``categories`` occurs anywhere in the line (a plain substring test on the
    whole line, not on a specific field). Kept lines map the second field
    (file name) to 1 for ``"bonafide"`` and to 0 otherwise.

    Args:
        protocol_file: Path of the whitespace-separated protocol text file.
        categories: Iterable of category strings to keep. The default is an
            immutable tuple so it cannot be altered between calls.

    Returns:
        dict mapping file name (str) to label (int, 1 or 0).

    Raises:
        IndexError: If a non-blank line has fewer than two fields.
    """
    labels = {}
    with open(protocol_file, 'r') as file:
        for line in file:
            parts = line.split()
            # Blank lines carry no entry; skipping them avoids an IndexError.
            if not parts:
                continue

            is_bonafide = parts[-1] == "bonafide"
            # Keep bonafide entries unconditionally, and other entries only
            # if the line mentions one of the requested categories.
            if is_bonafide or any(category in line for category in categories):
                labels[parts[1]] = 1 if is_bonafide else 0

    return labels
    
# Define the process_file function
def process_file(file_name, label, audio_dir, feature_extractor):
    """
    Run ``feature_extractor`` on ``<audio_dir>/<file_name>.flac``.

    Args:
        file_name: Base name of the audio file, without the ``.flac`` suffix.
        label: Label returned unchanged alongside the features.
        audio_dir: Directory containing the ``.flac`` files.
        feature_extractor: Callable that takes the audio path and returns features.

    Returns:
        ``(features, label)`` when the file exists, otherwise ``(None, None)``.
    """
    flac_path = os.path.join(audio_dir, file_name + '.flac')

    # Missing audio is reported as an empty result rather than an error.
    if not os.path.exists(flac_path):
        return None, None

    return feature_extractor(flac_path), label

# Define the load_audio_data_parallel function
def load_audio_data_parallel(labels, audio_dir, feature_extractor):
    """
    Extract features for every labelled file using a thread pool.

    One ``process_file`` task is submitted per ``labels`` entry. Entries for
    which ``process_file`` returns no features are skipped. Results are
    gathered in the iteration order of ``labels``, so the output order is the
    same on every run.

    Args:
        labels: Mapping of file name (without extension) to label.
        audio_dir: Directory containing the audio files.
        feature_extractor: Callable that takes an audio path and returns features.

    Returns:
        Tuple ``(X, y)``. ``X`` is an object-dtype array built from the
        collected features and ``y`` is the array of matching labels. Both
        are empty arrays when no features were extracted.
    """
    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = [
            executor.submit(process_file, file_name, label, audio_dir, feature_extractor)
            for file_name, label in labels.items()
        ]
        # Read results in submission order rather than completion order so
        # that sample order does not depend on thread scheduling.
        results = [future.result() for future in futures]

    X = []
    y = []
    for features, label in results:
        if features is not None:
            X.append(features)
            y.append(label)

    # Nothing extracted: return empty arrays instead of letting np.vstack
    # raise "need at least one array to concatenate".
    if not X:
        return np.array([], dtype=object), np.array([])

    X = np.array(X, dtype=object)
    y = np.array(y)

    # If numpy produced a flat 1-D array, stack the entries as rows.
    if X.ndim == 1:
        X = np.vstack(X)

    return X, y

# Dataset locations and the spoof categories to keep, defined once instead of
# being repeated in every call. 'A17' is not in the category list.
LA_ROOT = '/kaggle/input/asvpoof-2019-dataset/LA/LA'
PROTOCOL_PREFIX = LA_ROOT + '/ASVspoof2019_LA_cm_protocols/ASVspoof2019.LA.cm.'
SPOOF_CATEGORIES = ['A01', 'A02', 'A03', 'A04', 'A05', 'A06', 'A07', 'A08', 'A09', 'A10', 'A11', 'A12', 'A13', 'A14', 'A15', 'A16', 'A18', 'A19']

# Map each file name to its label (1 = bonafide, 0 = spoof) for every split.
train_labels_wo_swf = load_protocol_file_com(PROTOCOL_PREFIX + 'train.trn.txt', categories=SPOOF_CATEGORIES)
dev_labels_wo_swf = load_protocol_file_com(PROTOCOL_PREFIX + 'dev.trl.txt', categories=SPOOF_CATEGORIES)
eval_labels_wo_swf = load_protocol_file_com(PROTOCOL_PREFIX + 'eval.trl.txt', categories=SPOOF_CATEGORIES)

# Extract MFCC features (X) and the matching 1/0 labels (y) for every split.
X_train_mfcc_wo_swf, y_train_mfcc_wo_swf = load_audio_data_parallel(
    train_labels_wo_swf, LA_ROOT + '/ASVspoof2019_LA_train/flac', extract_mfcc_40)
X_dev_mfcc_wo_swf, y_dev_mfcc_wo_swf = load_audio_data_parallel(
    dev_labels_wo_swf, LA_ROOT + '/ASVspoof2019_LA_dev/flac', extract_mfcc_40)
X_eval_mfcc_wo_swf, y_eval_mfcc_wo_swf = load_audio_data_parallel(
    eval_labels_wo_swf, LA_ROOT + '/ASVspoof2019_LA_eval/flac', extract_mfcc_40)

import pickle
def save_data(file_name, data):
    """Pickle ``data`` into ``file_name``, opened in binary write mode."""
    with open(file_name, mode='wb') as out_file:
        pickle.dump(data, out_file)

# Pickle the feature (X) and label (y) arrays of each split into the current
# working directory.
# NOTE(review): the file names end in '_13' although extract_mfcc_40 defaults
# to n_mfcc=40 - confirm the intended suffix.
for _pkl_name, _array in (
    ('X_train_mfcc_wo_wf_13.pkl', X_train_mfcc_wo_swf),
    ('y_train_mfcc_wo_wf_13.pkl', y_train_mfcc_wo_swf),
    ('X_dev_mfcc_wo_wf_13.pkl', X_dev_mfcc_wo_swf),
    ('y_dev_mfcc_wo_wf_13.pkl', y_dev_mfcc_wo_swf),
    ('X_eval_mfcc_wo_wf_13.pkl', X_eval_mfcc_wo_swf),
    ('y_eval_mfcc_wo_wf_13.pkl', y_eval_mfcc_wo_swf),
):
    save_data(_pkl_name, _array)

In [None]:
# Protocol location and the spoof categories to keep, defined once instead of
# being repeated in every call. 'A17' is not in the category list.
PROTOCOL_PREFIX = '/kaggle/input/asvpoof-2019-dataset/LA/LA/ASVspoof2019_LA_cm_protocols/ASVspoof2019.LA.cm.'
SPOOF_CATEGORIES = ['A01', 'A02', 'A03', 'A04', 'A05', 'A06', 'A07', 'A08', 'A09', 'A10', 'A11', 'A12', 'A13', 'A14', 'A15', 'A16', 'A18', 'A19']

# Map each file name to its label (1 = bonafide, 0 = spoof) for every split.
train_labels = load_protocol_file_com(PROTOCOL_PREFIX + 'train.trn.txt', categories=SPOOF_CATEGORIES)
dev_labels = load_protocol_file_com(PROTOCOL_PREFIX + 'dev.trl.txt', categories=SPOOF_CATEGORIES)
eval_labels = load_protocol_file_com(PROTOCOL_PREFIX + 'eval.trl.txt', categories=SPOOF_CATEGORIES)

In [None]:
# Extract MFCC features (X) and the matching 1/0 labels (y) for every split.
# The dataset root is named once instead of being repeated in each call.
LA_ROOT = '/kaggle/input/asvpoof-2019-dataset/LA/LA'

X_train_mfcc, y_train_mfcc = load_audio_data_parallel(
    train_labels, LA_ROOT + '/ASVspoof2019_LA_train/flac', extract_mfcc_40)
X_dev_mfcc, y_dev_mfcc = load_audio_data_parallel(
    dev_labels, LA_ROOT + '/ASVspoof2019_LA_dev/flac', extract_mfcc_40)
X_eval_mfcc, y_eval_mfcc = load_audio_data_parallel(
    eval_labels, LA_ROOT + '/ASVspoof2019_LA_eval/flac', extract_mfcc_40)

In [None]:
import pickle
def save_data(file_name, data):
    """Pickle ``data`` into ``file_name``, opened in binary write mode."""
    with open(file_name, mode='wb') as out_file:
        pickle.dump(data, out_file)

# Save the feature (X) and label (y) arrays of each split - ..._wo_(filter_type)
save_data('X_train_mfcc_wo_wf_40.pkl', X_train_mfcc)
save_data('y_train_mfcc_wo_wf_40.pkl', y_train_mfcc)
save_data('X_dev_mfcc_wo_wf_40.pkl', X_dev_mfcc)
save_data('y_dev_mfcc_wo_wf_40.pkl', y_dev_mfcc)
save_data('X_eval_mfcc_wo_wf_40.pkl', X_eval_mfcc)
# Fixed: the original referenced the undefined name `y_eval_mfcc_` (NameError).
save_data('y_eval_mfcc_wo_wf_40.pkl', y_eval_mfcc)