In [1]:
#Install Dependencies
# ! pip install datasets librosa scikit-learn tensorflow joblib


In [2]:
#Import Libraries
import os
import librosa
import numpy as np
import pandas as pd
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, roc_auc_score, roc_curve
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression, Perceptron
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical
import joblib


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from datasets import load_dataset
# Load the real dataset from Hugging Face
ds = load_dataset("CSALT/deepfake_detection_dataset_urdu")
audio_data = ds['train']

# Check one sample to understand structure (contains audio array, sampling_rate, and path)
print("Sample example structure:")
print(audio_data[0])

Repo card metadata block was not found. Setting CardData to empty.


Sample example structure:
{'audio': {'path': 'C:\\Users\\hp\\.cache\\huggingface\\hub\\datasets--CSALT--deepfake_detection_dataset_urdu\\snapshots\\eb8f16623108324867e4424a646937409c64b82b\\Bonafide\\Speaker_01\\Part 1\\10.wav', 'array': array([ 0.00000000e+00,  0.00000000e+00, -3.05175781e-05, ...,
        7.32421875e-04,  1.22070312e-03,  1.28173828e-03]), 'sampling_rate': 16000}}


In [4]:
# Extract MFCC features from audio
def extract_mfcc(audio_data):
    y = audio_data['array']
    sr = audio_data['sampling_rate']
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
    return np.mean(mfccs.T, axis=0)

# Extract features and actual labels from dataset
features = []
labels = []

for i, example in enumerate(audio_data):
    try:
        mfcc = extract_mfcc(example['audio'])
        features.append(mfcc)

        # ✅ Now using real labels from the file path
        file_path = example['audio']['path'].lower()

        if 'bonafide' in file_path:
            label = 'bonafide'
        elif 'deepfake' in file_path:
            label = 'deepfake'
        else:
            print(f"Unknown label in path: {file_path}")
            continue  # Skip this file if label not found

        labels.append(label)

    except Exception as e:
        print(f"Error processing example {i}: {e}")

# Convert to NumPy arrays
X = np.array(features)
y = np.array(labels)


In [5]:
# Encode Labels and Train-Test Split
le = LabelEncoder()
y_encoded = le.fit_transform(y)  # bonafide -> 0, deepfake -> 1
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
joblib.dump(scaler, "scaler5.joblib")  # Save scaler

['scaler5.joblib']

In [6]:
# Train SVM
svm_model = SVC(probability=True)
svm_model.fit(X_train, y_train)
y_pred_svm = svm_model.predict(X_test)
print("SVM Report:")
print(classification_report(y_test, y_pred_svm))
print("ROC AUC:", roc_auc_score(y_test, svm_model.predict_proba(X_test)[:, 1]))
joblib.dump(svm_model, "svm_model5.joblib")

SVM Report:
              precision    recall  f1-score   support

           0       0.98      0.97      0.97       681
           1       0.97      0.98      0.97       678

    accuracy                           0.97      1359
   macro avg       0.97      0.97      0.97      1359
weighted avg       0.97      0.97      0.97      1359

ROC AUC: 0.9936779592738424


['svm_model5.joblib']

In [7]:
# Train Logistic Regression
lr_model = LogisticRegression()
lr_model.fit(X_train, y_train)
y_pred_lr = lr_model.predict(X_test)
print("Logistic Regression Report:")
print(classification_report(y_test, y_pred_lr))
print("ROC AUC:", roc_auc_score(y_test, lr_model.predict_proba(X_test)[:, 1]))
joblib.dump(lr_model, "logistic_model5.joblib")


Logistic Regression Report:
              precision    recall  f1-score   support

           0       0.81      0.78      0.80       681
           1       0.79      0.82      0.80       678

    accuracy                           0.80      1359
   macro avg       0.80      0.80      0.80      1359
weighted avg       0.80      0.80      0.80      1359

ROC AUC: 0.8828549027761534


['logistic_model5.joblib']

In [8]:
# Train Single-Layer Perceptron
perceptron = Perceptron()
perceptron.fit(X_train, y_train)
y_pred_perc = perceptron.predict(X_test)
print("Perceptron Report:")
print(classification_report(y_test, y_pred_perc))

# Note: Perceptron does not support probability prediction directly
# so we skip AUC or use workaround if needed

Perceptron Report:
              precision    recall  f1-score   support

           0       0.79      0.63      0.70       681
           1       0.69      0.84      0.76       678

    accuracy                           0.73      1359
   macro avg       0.74      0.73      0.73      1359
weighted avg       0.74      0.73      0.73      1359



In [9]:
# Train Deep Neural Network
dnn_model = Sequential([
    Dense(64, input_shape=(X_train.shape[1],), activation='relu'),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])
dnn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
dnn_model.fit(X_train, y_train, epochs=20, batch_size=32, validation_data=(X_test, y_test))
y_pred_dnn = dnn_model.predict(X_test).ravel()
y_pred_dnn_labels = (y_pred_dnn > 0.5).astype(int)
print("DNN Report:")
print(classification_report(y_test, y_pred_dnn_labels))
print("ROC AUC:", roc_auc_score(y_test, y_pred_dnn))
dnn_model.save("dnn_model5.h5")

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
DNN Report:
              precision    recall  f1-score   support

           0       0.98      0.98      0.98       681
           1       0.98      0.98      0.98       678

    accuracy                           0.98      1359
   macro avg       0.98      0.98      0.98      1359
weighted avg       0.98      0.98      0.98      1359

ROC AUC: 0.9948496701449803


In [10]:
# Summary Table
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def evaluate(model_name, y_true, y_pred, y_proba=None):
    return {
        "Model": model_name,
        "Accuracy": accuracy_score(y_true, y_pred),
        "Precision": precision_score(y_true, y_pred),
        "Recall": recall_score(y_true, y_pred),
        "F1": f1_score(y_true, y_pred),
        "AUC": roc_auc_score(y_true, y_proba) if y_proba is not None else "N/A"
    }

results = [
    evaluate("SVM", y_test, y_pred_svm, svm_model.predict_proba(X_test)[:, 1]),
    evaluate("Logistic Regression", y_test, y_pred_lr, lr_model.predict_proba(X_test)[:, 1]),
    evaluate("Perceptron", y_test, y_pred_perc),
    evaluate("DNN", y_test, y_pred_dnn_labels, y_pred_dnn)
]

summary_df = pd.DataFrame(results)
print(summary_df)
summary_df.to_csv("model_summary.csv", index=False)  # Save summary to CSV


                 Model  Accuracy  Precision    Recall        F1       AUC
0                  SVM  0.974246   0.966618  0.982301  0.974396  0.993678
1  Logistic Regression  0.799853   0.788352  0.818584  0.803184  0.882855
2           Perceptron  0.731420   0.690158  0.837758  0.756829       N/A
3                  DNN  0.983076   0.983752  0.982301  0.983026   0.99485
