In [2]:
import random
import logging
import tensorflow as tf
from tensorflow import keras
from keras import layers, models, Model
from sklearn.metrics import f1_score, recall_score, accuracy_score
import numpy as np
from torch.utils.data import Dataset, DataLoader
import torch
import pickle
import os
import librosa
from sklearn.model_selection import train_test_split
from transformers import AutoFeatureExtractor, Wav2Vec2FeatureExtractor, TFWav2Vec2Model, Wav2Vec2Processor

# Only log error messages
tf.get_logger().setLevel(logging.ERROR)
# Set random seed
tf.keras.utils.set_random_seed(42)

### "Basic configuration"

In [7]:
# Maximum duration of the input audio file we feed to our Wav2Vec 2.0 model.
MAX_DURATION = 5  # Change to 30 seconds

# Sampling rate is the number of samples of audio recorded every second
SAMPLING_RATE = 16000
BATCH_SIZE = 8  # Batch-size for training and evaluating our model.
NUM_CLASSES = 2  # Number of classes our dataset will have (2 in our case).
HIDDEN_DIM = 768  # Dimension of our model output (768 in case of Wav2Vec 2.0 - Base).
MAX_SEQ_LENGTH = MAX_DURATION * SAMPLING_RATE  # Maximum length of the input audio file.

# Wav2Vec 2.0 results in an output frequency with a stride of about 20ms.
MAX_FRAMES = 249 # Adjust for 30 seconds
MAX_EPOCHS = 10  # Maximum number of training epochs.
SEED = 42
MODEL_CHECKPOINT = "facebook/pav2e-base"  # Name of pretrained model from Hugging Face Model Hu

FAKE = "KAGGLE/AUDIO/FAKE"
REAL = "KAGGLE/AUDIO/REAL"

feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/pav2e-base")  #retrieve feature extractor

### "Load files"

In [8]:
def load_audio_files_and_labels(fake_folder, real_folder):
    audio_data = []
    labels = []

    # Load fake audio files
    for filename in os.listdir(fake_folder):
        file_path = os.path.join(fake_folder, filename)
        audio, sr = librosa.load(file_path, sr=16000, duration=MAX_DURATION)  # Load 30 seconds
        audio_data.append(audio)
        labels.append(1)

    # Load real audio files
    for filename in os.listdir(real_folder):
        file_path = os.path.join(real_folder, filename)
        audio, sr = librosa.load(file_path, sr=16000, duration=MAX_DURATION)  # Load 30 seconds
        audio_data.append(audio)
        labels.append(0)

    return audio_data, np.array(labels)

def extract_features(audio_data):
    features = []
    for audio in audio_data:
        inputs = feature_extractor(audio, sampling_rate=16000, return_tensors="np", padding=True, truncation=True, max_length=MAX_SEQ_LENGTH)
        features.append(inputs.input_values)
    return np.concatenate(features, axis=0)

In [9]:
audio_data, labels = load_audio_files_and_labels(FAKE, REAL)
features = extract_features(audio_data)
print(features.shape)

(64, 80000)


### "Create training and testing batch"

In [10]:
X_tr, X_te, y_tr, y_te = train_test_split(features, labels, test_size=0.2, stratify=labels, random_state=SEED)

### "Create custom Model using TFWav2Vec2"

In [11]:

def mean_pool(hidden_states, feature_lengths, batch_size):
    attenion_mask = tf.sequence_mask(
        feature_lengths, maxlen=MAX_FRAMES, dtype=tf.dtypes.int64
    )
    padding_mask = tf.cast(
        tf.reverse(tf.cumsum(tf.reverse(attenion_mask, [-1]), -1), [-1]),
        dtype=tf.dtypes.bool,
    )
    hidden_states = tf.where(
        tf.broadcast_to(
            tf.expand_dims(~padding_mask, -1), (batch_size, MAX_FRAMES, HIDDEN_DIM)
        ),
        0.0,
        hidden_states,
    )
    pooled_state = tf.math.reduce_sum(hidden_states, axis=1) / tf.reshape(
        tf.math.reduce_sum(tf.cast(padding_mask, dtype=tf.dtypes.float32), axis=1),
        [-1, 1],
    )
    return pooled_state

class Wav2Vec2_Model(layers.Layer):

    def __init__(self, model_checkpoint, num_classes):
        super().__init__()
        self.wav2vec2 = TFWav2Vec2Model.from_pretrained(
            model_checkpoint, apply_spec_augment=False, from_pt=True
        )
        self.pooling = layers.GlobalAveragePooling1D()
        self.intermediate_layer_dropout = layers.Dropout(0.5)
        self.final_layer = layers.Dense(num_classes, activation="softmax")

    def call(self, inputs):
        hidden_states = self.wav2vec2(inputs["input_values"])[0]
        batch_size = tf.shape(hidden_states)[0]

        if tf.is_tensor(inputs["attention_mask"]):
            audio_lengths = tf.cumsum(inputs["attention_mask"], -1)[:, -1]
            feature_lengths = self.wav2vec2.wav2vec2._get_feat_extract_output_lengths(
                audio_lengths
            )
            pooled_state = mean_pool(hidden_states, feature_lengths, batch_size)
        else:
            pooled_state = self.pooling(hidden_states)

        intermediate_state = self.intermediate_layer_dropout(pooled_state)
        final_state = self.final_layer(intermediate_state)

        return final_state

# Rebuild the model to apply the mixed precision policy
def build_model():
    inputs = {
        "input_values": tf.keras.Input(shape=(MAX_SEQ_LENGTH,), dtype="float32"),
        "attention_mask": tf.keras.Input(shape=(MAX_SEQ_LENGTH,), dtype="int32"),
    }
    wav2vec2_model = Wav2Vec2_Model(MODEL_CHECKPOINT, NUM_CLASSES)(
        inputs
    )
    model = tf.keras.Model(inputs, wav2vec2_model)
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)
    optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)
    model.compile(loss=loss, optimizer=optimizer, metrics=["accuracy"])
    return model

model = build_model()


TFWav2Vec2Model has backpropagation operations that are NOT supported on CPU. If you wish to train/fine-tune this model, you need a GPU or a TPU
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFWav2Vec2Model: ['project_q.weight', 'project_q.bias', 'quantizer.weight_proj.weight', 'project_hid.weight', 'quantizer.codevectors', 'project_hid.bias', 'quantizer.weight_proj.bias']
- This IS expected if you are initializing TFWav2Vec2Model from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFWav2Vec2Model from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFWav2Vec2Model were initialized from the PyTorch model.
If your task is similar to the task the model of the che

In [12]:
inputs = tf.data.Dataset.from_tensor_slices(({"input_values": X_tr, "attention_mask": np.ones_like(X_tr)}, y_tr)).batch(BATCH_SIZE)
val_data = tf.data.Dataset.from_tensor_slices(({"input_values": X_te, "attention_mask": np.ones_like(X_te)}, y_te)).batch(BATCH_SIZE)

# Training
history = model.fit(
    inputs,
    validation_data=val_data,
    batch_size=BATCH_SIZE,
    epochs=MAX_EPOCHS
)

Epoch 1/10
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 2s/step - accuracy: 0.5688 - loss: 0.6786 - val_accuracy: 0.8462 - val_loss: 0.5331
Epoch 2/10
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 1s/step - accuracy: 0.6345 - loss: 0.6361 - val_accuracy: 0.8462 - val_loss: 0.5299
Epoch 3/10
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 1s/step - accuracy: 0.7009 - loss: 0.6239 - val_accuracy: 0.8462 - val_loss: 0.5269
Epoch 4/10
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 1s/step - accuracy: 0.8719 - loss: 0.5341 - val_accuracy: 0.8462 - val_loss: 0.5239
Epoch 5/10
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 1s/step - accuracy: 0.6161 - loss: 0.6370 - val_accuracy: 0.8462 - val_loss: 0.5209
Epoch 6/10
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 1s/step - accuracy: 0.6183 - loss: 0.6442 - val_accuracy: 0.8462 - val_loss: 0.5179
Epoch 7/10
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m

### "Measure model"

In [13]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

# Get training accuracy list
training_accuracy = history.history['accuracy']
print("Training Accuracy List: ", training_accuracy)


# Get predictions
y_pred_probs = model.predict(
    {"input_values": X_te, "attention_mask": np.ones_like(X_te)},
    batch_size=BATCH_SIZE
)

# Convert predicted probabilities to class labels
y_pred = np.argmax(y_pred_probs, axis=1)

# Calculate accuracy
accuracy = accuracy_score(y_te, y_pred)

# Calculate precision, recall, and F1-score
precision = precision_score(y_te, y_pred, average='binary')
recall = recall_score(y_te, y_pred, average='binary')
f1 = f1_score(y_te, y_pred, average='binary')

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-score: {f1}")

Training Accuracy List:  [0.6274510025978088, 0.7058823704719543, 0.7254902124404907, 0.8627451062202454, 0.686274528503418, 0.6274510025978088, 0.6470588445663452, 0.7058823704719543, 0.843137264251709, 0.7450980544090271]
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3s/step
Accuracy: 0.8461538461538461
Precision: 0.9090909090909091
Recall: 0.9090909090909091
F1-score: 0.9090909090909091
