In [3]:
import os
import numpy as np
import librosa
import tensorflow as tf
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical

In [55]:
# Define paths and parameters
DATASET_PATH = "/Users/zihuiouyang/Downloads/LA/ASVspoof2019_LA_train/flac"
LABEL_FILE_PATH = "/Users/zihuiouyang/Downloads/LA/ASVspoof2019_LA_cm_protocols/ASVspoof2019.LA.cm.train.trn.txt"
NUM_CLASSES = 2  # Number of classes (bonafide and spoof)
SAMPLE_RATE = 16000  # Sample rate of your audio files
DURATION = 5  # Duration of audio clips in seconds
N_MELS = 128  # Number of Mel frequency bins

In [29]:
import fnmatch
import os

In [30]:
a = []
for filename in os.listdir('/Users/zihuiouyang/Downloads/cv-corpus-15.0-delta-2023-09-08/en/clips'):
    if fnmatch.fnmatch(filename, '*.flac'):
        a.append(filename)

In [32]:
import random
a1 = random.sample(a,22800)

In [34]:
labels = {}
for i in range(len(a1)):
    file_name = a1[i]
    label = 1
    labels[file_name] = label

In [35]:
labels

{'common_voice_en_38050461.flac': 1,
 'common_voice_en_38320068.flac': 1,
 'common_voice_en_38153365.flac': 1,
 'common_voice_en_38164709.flac': 1,
 'common_voice_en_38311226.flac': 1,
 'common_voice_en_38111813.flac': 1,
 'common_voice_en_38126948.flac': 1,
 'common_voice_en_38035524.flac': 1,
 'common_voice_en_38461130.flac': 1,
 'common_voice_en_38164707.flac': 1,
 'common_voice_en_38050444.flac': 1,
 'common_voice_en_38267103.flac': 1,
 'common_voice_en_38291679.flac': 1,
 'common_voice_en_38308150.flac': 1,
 'common_voice_en_38356661.flac': 1,
 'common_voice_en_38231849.flac': 1,
 'common_voice_en_38237658.flac': 1,
 'common_voice_en_38450948.flac': 1,
 'common_voice_en_38421029.flac': 1,
 'common_voice_en_38270701.flac': 1,
 'common_voice_en_38315397.flac': 1,
 'common_voice_en_38420851.flac': 1,
 'common_voice_en_38052609.flac': 1,
 'common_voice_en_38081170.flac': 1,
 'common_voice_en_38341421.flac': 1,
 'common_voice_en_38251564.flac': 1,
 'common_voice_en_38235167.flac': 1,
 

In [36]:
X = []
y = []

max_time_steps = 109  # Define the maximum time steps for your model

for file_name, label in labels.items():
    file_path = os.path.join("/Users/zihuiouyang/Downloads/cv-corpus-15.0-delta-2023-09-08/en/clips", file_name)

    # Load audio file using librosa
    audio, _ = librosa.load(file_path, sr=SAMPLE_RATE, duration=DURATION)

    # Extract Mel spectrogram using librosa
    mel_spectrogram = librosa.feature.melspectrogram(y=audio, sr=SAMPLE_RATE, n_mels=N_MELS)
    mel_spectrogram = librosa.power_to_db(mel_spectrogram, ref=np.max)

    # Ensure all spectrograms have the same width (time steps)
    if mel_spectrogram.shape[1] < max_time_steps:
        mel_spectrogram = np.pad(mel_spectrogram, ((0, 0), (0, max_time_steps - mel_spectrogram.shape[1])), mode='constant')
    else:
        mel_spectrogram = mel_spectrogram[:, :max_time_steps]

    X.append(mel_spectrogram)
    y.append(label)

In [50]:
labels1 = {}
with open(LABEL_FILE_PATH, 'r') as label_file:
    lines = label_file.readlines()

for line in lines:
    parts = line.strip().split()
    file_name = parts[1]
    if parts[-1] == "bonafide":
        continue
    label = 0
    labels1[file_name] = label

In [56]:
max_time_steps = 109  # Define the maximum time steps for your model

for file_name, label in labels1.items():
    file_path = os.path.join(DATASET_PATH, file_name + ".flac")

    # Load audio file using librosa
    audio, _ = librosa.load(file_path, sr=SAMPLE_RATE, duration=DURATION)

    # Extract Mel spectrogram using librosa
    mel_spectrogram = librosa.feature.melspectrogram(y=audio, sr=SAMPLE_RATE, n_mels=N_MELS)
    mel_spectrogram = librosa.power_to_db(mel_spectrogram, ref=np.max)

    # Ensure all spectrograms have the same width (time steps)
    if mel_spectrogram.shape[1] < max_time_steps:
        mel_spectrogram = np.pad(mel_spectrogram, ((0, 0), (0, max_time_steps - mel_spectrogram.shape[1])), mode='constant')
    else:
        mel_spectrogram = mel_spectrogram[:, :max_time_steps]

    X.append(mel_spectrogram)
    y.append(label)

In [57]:
X = np.array(X)
y = np.array(y)

In [59]:
y_encoded = to_categorical(y, NUM_CLASSES)

In [60]:
split_index = int(0.8 * len(X))

In [61]:
split_index

36480

In [63]:
b = []
for i in range(45600):
    b.append(i)

In [64]:
b1 = random.sample(b,36480)

In [68]:
mask=np.full(len(b),False,dtype=bool)
mask[b1]=True

In [69]:
X_train, X_val = X[mask], X[~mask]
y_train, y_val = y_encoded[mask], y_encoded[~mask]

In [73]:
input_shape = (N_MELS, X_train.shape[2], 1)  # Input shape for CNN (height, width, channels)
model_input = Input(shape=input_shape)

In [74]:
x = Conv2D(32, kernel_size=(3, 3), activation='relu')(model_input)
x = MaxPooling2D(pool_size=(2, 2))(x)
x = Conv2D(64, kernel_size=(3, 3), activation='relu')(x)
x = MaxPooling2D(pool_size=(2, 2))(x)
x = Flatten()(x)
x = Dense(128, activation='relu')(x)
x = Dropout(0.5)(x)
model_output = Dense(NUM_CLASSES, activation='softmax')(x)

In [75]:
model = Model(inputs=model_input, outputs=model_output)

In [76]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [77]:
model.fit(X_train, y_train, batch_size=32, epochs=5, validation_data=(X_val, y_val))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x2b7d955e0>

In [78]:
model.save("/Users/zihuiouyang/Documents/audio_classifier.h5")

  saving_api.save_model(


In [79]:
TEST_DATASET_PATH = "./TestEvaluation"
MODEL_PATH = "/Users/zihuiouyang/Documents/audio_classifier.h5"  # Replace with the actual path to your saved model
SAMPLE_RATE = 16000
DURATION = 5
N_MELS = 128
MAX_TIME_STEPS = 109

In [81]:
from tensorflow.keras.models import load_model
model = load_model(MODEL_PATH)

In [82]:
X_test = []

test_files = os.listdir(TEST_DATASET_PATH)
for file_name in test_files:
    file_path = os.path.join(TEST_DATASET_PATH, file_name)

    # Load audio file using librosa
    audio, _ = librosa.load(file_path, sr=SAMPLE_RATE, duration=DURATION)

    # Extract Mel spectrogram using librosa
    mel_spectrogram = librosa.feature.melspectrogram(y=audio, sr=SAMPLE_RATE, n_mels=N_MELS)
    mel_spectrogram = librosa.power_to_db(mel_spectrogram, ref=np.max)

    # Ensure all spectrograms have the same width (time steps)
    if mel_spectrogram.shape[1] < MAX_TIME_STEPS:
        mel_spectrogram = np.pad(mel_spectrogram, ((0, 0), (0, MAX_TIME_STEPS - mel_spectrogram.shape[1])), mode='constant')
    else:
        mel_spectrogram = mel_spectrogram[:, :MAX_TIME_STEPS]

    X_test.append(mel_spectrogram)

# Convert list to numpy array
X_test = np.array(X_test)

# Predict using the loaded model
y_pred = model.predict(X_test)

# Convert probabilities to predicted classes
y_pred_classes = np.argmax(y_pred, axis=1)

y_pred



array([[1.00000e+00, 0.00000e+00],
       [1.00000e+00, 1.85649e-28],
       [1.00000e+00, 0.00000e+00],
       [1.00000e+00, 0.00000e+00],
       [1.00000e+00, 0.00000e+00],
       [1.00000e+00, 0.00000e+00],
       [1.00000e+00, 0.00000e+00],
       [1.00000e+00, 0.00000e+00],
       [1.00000e+00, 0.00000e+00],
       [1.00000e+00, 0.00000e+00]], dtype=float32)