# Classification between real and deepfake audio
### We are using FoR-2sec dataset for training and testing 
#### Loading the Data:
- Load the dataset given in the repository
- Convert the audio files to respective MFCC to further use in models as an array form

In [14]:
import os
import librosa
import numpy as np

# Set dataset path
dataset_path = r"E:\for-2sec\for-2seconds\training" # Here You must change the path to wherever you saved the datafile

# Define MFCC parameters
n_mfcc = 13
sample_rate = 16000
hop_length = 320  # Adjusted for 2 sec audio
n_fft = 1024  # Typical FFT window size

# Prepare storage lists
X = []
y = []

# Class mapping
label_map = {"real": 0, "fake": 1}

# Load MFCCs from dataset
for label in ["real", "fake"]:
    folder_path = os.path.join(dataset_path, label)

    for file_name in os.listdir(folder_path):
        file_path = os.path.join(folder_path, file_name)

        # Load audio file
        y_audio, sr = librosa.load(file_path, sr=sample_rate)

        # Extract MFCC with controlled hop length
        mfccs = librosa.feature.mfcc(y=y_audio, sr=sr, n_mfcc=n_mfcc, hop_length=hop_length, n_fft=n_fft)
        mfccs = mfccs.T  # Shape: (time_steps, n_mfcc)

        # Append to dataset lists
        X.append(mfccs)
        y.append(label_map[label])

# Convert to NumPy arrays
X = np.array(X)  # Shape: (num_samples, 100, 13)
y = np.array(y)

from sklearn.utils import shuffle

# Shuffle dataset (X = MFCC features, y = Labels)
X, y = shuffle(X, y, random_state=42)
X, y = shuffle(X, y, random_state=42)
X, y = shuffle(X, y, random_state=42)

# Save to disk
# np.save("mfcc_features.npy", X)
# np.save("labels.npy", y)

# Print shapes
print(f"X shape: {X.shape}")  # Expected: (num_samples, 100, 13)
print(f"y shape: {y.shape}")


X shape: (13956, 101, 13)
y shape: (13956,)


### Model 1: CNN + LSTM

In [16]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, LSTM, Dense, Dropout

model = Sequential([
    Conv1D(64, kernel_size=3, activation='relu', input_shape=(X.shape[1], X.shape[2])),
    MaxPooling1D(pool_size=2),
    LSTM(128, return_sequences=True),
    LSTM(64),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')  # Binary classification
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

from sklearn.model_selection import train_test_split

# Split into training (80%) and testing (20%) sets
X_train1, X_test1, y_train1, y_test1 = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training data shape: {X_train1.shape}, Labels: {y_train1.shape}")
print(f"Testing data shape: {X_test1.shape}, Labels: {y_test1.shape}")

# Train the model
history = model.fit(X_train1, y_train1, epochs=20, batch_size=32)
print('Training done successfully')

# Evaluate on test data
loss, accuracy = model.evaluate(X_test1, y_test1)
print(f"Test Loss: {loss:.4f}")
print(f"Test Accuracy: {accuracy:.4%}")

model.save("deepfake_audio_detector_v3.keras")
print("Model saved successfully!")


Training data shape: (11164, 101, 13), Labels: (11164,)
Testing data shape: (2792, 101, 13), Labels: (2792,)
Epoch 1/20
[1m349/349[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 25ms/step - accuracy: 0.7003 - loss: 0.5568
Epoch 2/20
[1m349/349[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 24ms/step - accuracy: 0.9184 - loss: 0.2297
Epoch 3/20
[1m349/349[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 26ms/step - accuracy: 0.9427 - loss: 0.1612
Epoch 4/20
[1m349/349[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 25ms/step - accuracy: 0.9611 - loss: 0.1059
Epoch 5/20
[1m349/349[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 24ms/step - accuracy: 0.9776 - loss: 0.0710
Epoch 6/20
[1m349/349[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 26ms/step - accuracy: 0.9755 - loss: 0.0760
Epoch 7/20
[1m349/349[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 27ms/step - accuracy: 0.9849 - loss: 0.0434
Epoch 8/20
[1m349/349[0m [32m━━━━━━━━━━━

### Model 2: QSVM

In [17]:
X_flat = X.reshape(X.shape[0], -1)  # (samples, time_steps * n_mfcc)

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_flat)


from sklearn.model_selection import train_test_split

X_train2, X_test2, y_train2, y_test2 = train_test_split(
    X_flat, y, test_size=0.2, random_state=42, stratify=y
)

from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Create QSVM with polynomial kernel of degree 2 (quadratic)
qsvm = SVC(kernel='poly', degree=2)

# Train
qsvm.fit(X_train2, y_train2)

# Predict
y_pred2 = qsvm.predict(X_test2)

# Evaluate
print("Accuracy:", accuracy_score(y_test2, y_pred2)*100,'%')
print("\nClassification Report:\n", classification_report(y_test2, y_pred2))

Accuracy: 82.41404011461319 %

Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.79      0.82      1396
           1       0.80      0.86      0.83      1396

    accuracy                           0.82      2792
   macro avg       0.83      0.82      0.82      2792
weighted avg       0.83      0.82      0.82      2792



### Model 3: Deepsonar

In [18]:
import tensorflow as tf

# One-hot encode labels
num_classes = len(np.unique(y))
y3 = tf.keras.utils.to_categorical(y, num_classes)

# Train-test split
X_train3, X_test3, y_train3, y_test3 = train_test_split(X, y3, test_size=0.2, random_state=42)

# Expand dims to add channel for CNN: (samples, time_steps, n_mfcc, 1)
X_train3 = X_train3[..., np.newaxis]
X_test3 = X_test3[..., np.newaxis]

# print(X_train.shape)


# DeepSonar-Inspired Architecture
model = tf.keras.Sequential([
    tf.keras.layers.Conv2D(32, (3, 3), activation='relu', padding='same', input_shape=X_train3.shape[1:]),
    tf.keras.layers.MaxPooling2D((2, 2)),
    tf.keras.layers.Conv2D(64, (3, 3), activation='relu', padding='same'),
    tf.keras.layers.MaxPooling2D((2, 2)),

    tf.keras.layers.Reshape((-1, 64)),  # (time_steps/4, 64) to feed LSTM
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=False)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(num_classes, activation='softmax')
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

# Train
history = model.fit(X_train3, y_train3, epochs=10, batch_size=32)

# Evaluate
loss, acc = model.evaluate(X_test3, y_test3)
print(f"\n✅ Test Accuracy: {acc * 100:.2f}%")

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m349/349[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 25ms/step - accuracy: 0.6812 - loss: 0.5729
Epoch 2/10
[1m349/349[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 25ms/step - accuracy: 0.8839 - loss: 0.2994
Epoch 3/10
[1m349/349[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 26ms/step - accuracy: 0.9055 - loss: 0.2432
Epoch 4/10
[1m349/349[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 26ms/step - accuracy: 0.9339 - loss: 0.1774
Epoch 5/10
[1m349/349[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 26ms/step - accuracy: 0.9401 - loss: 0.1623
Epoch 6/10
[1m349/349[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 26ms/step - accuracy: 0.9533 - loss: 0.1207
Epoch 7/10
[1m349/349[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 26ms/step - accuracy: 0.9629 - loss: 0.1008
Epoch 8/10
[1m349/349[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 26ms/step - accuracy: 0.9706 - loss: 0.0805
Epoch 9/10
[1m349/349[0m [32

### Bonus Model: CNN training with MFCC plots

In [19]:
import matplotlib.pyplot as plt
import librosa.display

# Output folder to save plots
output_dir = r"E:\for-2sec\images1"

# Loop through all samples and save as images
for i in range(len(X)):
    mfcc = X[i].T  # Transpose for (n_mfcc, time_steps), suitable for plotting

    # Create plot
    plt.figure(figsize=(2, 2))
    librosa.display.specshow(mfcc, cmap="viridis", x_axis='time')
    plt.axis('off')


    # Save figure
    save_path = os.path.join(output_dir, f"mfcc_{i}.png")
    plt.savefig(save_path, bbox_inches='tight', pad_inches=0)
    plt.close()

print(f"All MFCC images saved to: {output_dir}")

All MFCC images saved to: E:\for-2sec\images1


In [21]:
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from sklearn.model_selection import train_test_split

# Paths
image_folder = r"E:\for-2sec\images1"
# labels_path = "labels.npy"
img_height, img_width = 128, 128

# Load images
X4 = []
for i in range(11547):
    img_path = os.path.join(image_folder, f"mfcc_{i}.png")
    img = load_img(img_path, target_size=(img_height, img_width))
    img_array = img_to_array(img) / 255.0  # Normalize
    X4.append(img_array)

X4 = np.array(X4)

# Load labels
y4 = y [:len(X4)] #Was facing an error for different sizes of X4 and y4
num_classes = len(np.unique(y4))


# One-hot encode after shuffling
y4 = tf.keras.utils.to_categorical(y4, num_classes)

# Split into training and testing
X_train4, X_test4, y_train4, y_test4 = train_test_split(X4, y4, test_size=0.2, random_state=42)

# CNN Model
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(img_height, img_width, 3)),
    tf.keras.layers.Conv2D(32, (3, 3), activation='relu'),
    tf.keras.layers.MaxPooling2D(2, 2),
    tf.keras.layers.Conv2D(64, (3, 3), activation='relu'),
    tf.keras.layers.MaxPooling2D(2, 2),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(num_classes, activation='softmax')
])

# Compile
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train
history = model.fit(X_train4, y_train4, epochs=5, batch_size=32)

# Evaluate
test_loss, test_accuracy = model.evaluate(X_test4, y_test4)
print("Accuracy:", test_accuracy*100,'%')

Epoch 1/5
[1m289/289[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 169ms/step - accuracy: 0.6236 - loss: 0.6689
Epoch 2/5
[1m289/289[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 160ms/step - accuracy: 0.7676 - loss: 0.5154
Epoch 3/5
[1m289/289[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 160ms/step - accuracy: 0.7935 - loss: 0.4653
Epoch 4/5
[1m289/289[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 161ms/step - accuracy: 0.8153 - loss: 0.4194
Epoch 5/5
[1m289/289[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 163ms/step - accuracy: 0.8344 - loss: 0.3851
[1m73/73[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 56ms/step - accuracy: 0.8378 - loss: 0.3553
Accuracy: 83.46320390701294 %
