**Dataset Pre-processing (ASVspoof Dataset 2019)**

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
import torchaudio as T
import torchaudio.transforms as TT
from sklearn.manifold import TSNE
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
from IPython.display import Audio

In [None]:
import pandas as pd

# Read the space-separated training protocol (it has no header row) into a
# DataFrame, then attach column names.
protocol_path = "/kaggle/input/asvpoof-2019-dataset/LA/LA/ASVspoof2019_LA_cm_protocols/ASVspoof2019.LA.cm.train.trn.txt"
protocol_columns = ['speaker_id', 'file_id', 'system_id', 'env_id', 'label']

df = pd.read_csv(protocol_path, sep=" ", header=None)
df.columns = protocol_columns

In [None]:
import torchaudio
from tqdm import tqdm

# Path to ASVspoof audio files
asv_path = '/kaggle/input/asvpoof-2019-dataset/LA/LA/ASVspoof2019_LA_train/flac/'

# Compute and store durations (in seconds) and labels for each audio file
# in the ASVspoof dataset.
#
# file_id and label are walked together, row by row, so no per-file lookup
# into `df` is needed (a boolean-mask lookup inside the loop rescans the whole
# frame for every file, i.e. quadratic work).
durations = []
for fname, label in tqdm(zip(df['file_id'], df['label']), total=len(df)):
    # Only the header metadata is read; the audio samples are not decoded.
    info = torchaudio.info(asv_path + fname + ".flac")
    duration = info.num_frames / info.sample_rate
    durations.append((fname, duration, label))

dur_df = pd.DataFrame(durations, columns=["file_id", "duration", "label"])

In [None]:
# Split the duration table by label ('bonafide' vs. everything else) and order
# each part from shortest to longest clip.
is_bonafide = dur_df.label == 'bonafide'
bonafide_df = dur_df[is_bonafide].sort_values(by='duration')
spoofed_df = dur_df[~is_bonafide].sort_values(by='duration')

In [None]:
# Count bonafide and spoofed audio files with duration between 2.9 and 4 seconds.
# A notebook cell only renders its LAST bare expression, so each count is passed
# to display() explicitly; otherwise the bonafide count would be computed and
# silently dropped.
from IPython.display import display

display(bonafide_df[(bonafide_df.duration > 2.9) & (bonafide_df.duration < 4)].count())
display(spoofed_df[(spoofed_df.duration > 2.9) & (spoofed_df.duration < 4)].count())

In [None]:
# Collect the file IDs of clips strictly longer than 3 seconds, separately for
# bonafide and spoofed audio (order follows the duration-sorted frames).
bonafide_cut = bonafide_df.loc[bonafide_df.duration > 3, 'file_id'].values
spoofed_cut = spoofed_df.loc[spoofed_df.duration > 3, 'file_id'].values

**Implement AASIST for evaluation**

In [None]:
# Clone the AASIST repository – a deep learning model for detecting spoofed speech 
# using spectro-temporal features.
# The checkout lands in ./aasist relative to the current working directory; the
# following cells change into it and import from it.

!git clone https://github.com/clovaai/aasist.git

In [None]:
# Enter the cloned repository so that the relative references used below
# (models.AASIST, config/AASIST.conf, models/weights/AASIST.pth) resolve.
%cd aasist

In [None]:
from models.AASIST import Model 

In [None]:
# Read the default AASIST configuration file and parse it with yaml.safe_load
import yaml

with open("config/AASIST.conf", "r") as config_file:
    config_text = config_file.read()

d_args = yaml.safe_load(config_text)

In [None]:
# Build AASIST from the 'model_config' section of the configuration, restore the
# pre-trained weights onto the CPU, and switch the network to evaluation mode.

model = Model(d_args=d_args['model_config'])
pretrained_state = torch.load('models/weights/AASIST.pth', map_location='cpu')
model.load_state_dict(pretrained_state)
model.eval()

In [None]:
# Retrieve the second-to-last file ID from the list of spoofed audio files longer than 3 seconds
spoofed_cut[-2]

In [None]:
# Read the third entry (index 2) of the spoofed files longer than 3 seconds;
# torchaudio.load yields the waveform tensor together with its sample rate.

spoof_example_path = '/kaggle/input/asvpoof-2019-dataset/LA/LA/ASVspoof2019_LA_train/flac/' + spoofed_cut[2] + '.flac'
waveform, sr = torchaudio.load(spoof_example_path)

In [None]:
# Display the shape of the loaded waveform tensor (channels, samples).
# This tensor is handed to the AASIST model unchanged in the next cell.

waveform.shape

In [None]:
# Score the loaded spoofed sample with AASIST. Gradients are disabled for the
# forward pass. The second element of the model's return value is indexed at
# position 0 and soft-maxed; entry 0 of the result is reported as the spoof
# probability.

with torch.no_grad():
    output = model(waveform)
    first_item_logits = output[1][0]
    prob = torch.softmax(first_item_logits, dim=-1).cpu()

print(f"Probability of spoof: {prob[0]}")

In [None]:
# Play the loaded audio waveform at the sample rate that torchaudio.load
# reported for it (`sr`), instead of a hard-coded 16000, so playback speed
# stays correct even if a file with a different rate is loaded.

Audio(waveform, rate=sr)

**Extract embeddings from raw audio using WavLM**

In [None]:
# Read the fourth-to-last entry (index -4) of the bonafide files longer than
# 3 seconds; torchaudio.load yields the waveform tensor and its sample rate.

bonafide_example_path = '/kaggle/input/asvpoof-2019-dataset/LA/LA/ASVspoof2019_LA_train/flac/' + bonafide_cut[-4] + '.flac'
waveform_real, sr = torchaudio.load(bonafide_example_path)

In [None]:
# Display the shape of the bonafide audio waveform tensor (channels, samples).
# This tensor is handed to the AASIST model unchanged in the next cell.

waveform_real.shape

In [None]:
# Score the bonafide sample with AASIST. Gradients are disabled for the forward
# pass. Entry 0 of the soft-maxed output is reported as the spoof probability.

with torch.no_grad():
    output_real = model(waveform_real)
    first_item_logits_real = output_real[1][0]
    prob_real = torch.softmax(first_item_logits_real, dim=-1).cpu()

print(f"Probability of spoof: {prob_real[0]}")

In [None]:
# Play the bonafide waveform at the sample rate returned when it was loaded
# (`sr`), rather than a hard-coded 16000.
Audio(waveform_real, rate=sr)

**Apply k-NN search to find similar embeddings to be used for the Optimal Transport**

In [None]:
# Leave the AASIST checkout and return to its parent directory before cloning knn-vc.
%cd ..

In [None]:
# Clone the kNN-VC repository – a k-nearest neighbors-based voice conversion system.
# Its modules (matcher, knnvc_utils, hubconf, prematch_dataset) are imported in the next cell.

!git clone https://github.com/bshall/knn-vc.git

In [None]:
import sys

# The cloned knn-vc checkout has to be on sys.path BEFORE its modules are
# imported; appending it afterwards makes this cell fail with
# ModuleNotFoundError on a fresh kernel. The membership check keeps re-runs of
# the cell from adding duplicate entries.
KNN_VC_DIR = '/kaggle/working/knn-vc'
if KNN_VC_DIR not in sys.path:
    sys.path.append(KNN_VC_DIR)

import matcher
import knnvc_utils
import hubconf
import prematch_dataset

In [None]:
# Load the pre-trained WavLM Large model from the hub for extracting speech embeddings.
# `wavlm` is later passed to matcher.KNeighborsVC as its feature extractor argument.

wavlm = hubconf.wavlm_large()

In [None]:

# Load the pre-trained HiFi-GAN vocoder configured for WavLM embeddings for waveform synthesis.
# The call returns two values: the vocoder and its configuration. Both are later
# passed to matcher.KNeighborsVC.
hifigan, hifigan_cfg = hubconf.hifigan_wavlm(pretrained=True)

In [None]:
# Generate a weight matrix to emphasize speaker information from WavLM layer 6
# NOTE(review): neither constant is referenced by any later cell visible in this
# notebook — confirm whether they should be passed to get_features or removed.

SPEAKER_INFORMATION_LAYER = 6
SPEAKER_INFORMATION_WEIGHTS = knnvc_utils.generate_matrix_from_index(SPEAKER_INFORMATION_LAYER)

**Apply Optimal Transport (mapping generated-speech embeddings onto natural-speech embeddings)**

In [None]:
!pip install pot

In [None]:
# Perform voice conversion on spoofed audio using Optimal Transport and
# evaluate spoof probabilities with AASIST

import ot

AUDIO_DIR = '/kaggle/input/asvpoof-2019-dataset/LA/LA/ASVspoof2019_LA_train/flac/'
N_PAIRS = 100        # number of (spoofed, bonafide) file pairs to convert and score
SINKHORN_REG = 0.1   # entropic regularisation strength passed to ot.sinkhorn
k = 40               # target frames kept per source frame (top-k of the transport plan)


def spoof_probability(audio):
    """Score `audio` with the AASIST `model` and return the spoof probability.

    `audio` is passed to `model` unchanged. The second element of the model's
    return value is indexed at position 0 and soft-maxed over its last
    dimension; entry 0 of the result is what this notebook reports as the
    probability of spoof. The forward pass runs without gradient tracking.

    Returns:
        A plain Python float, so the score lists can be consumed directly by
        numpy / scipy / seaborn in the plotting cells.
    """
    with torch.no_grad():
        output = model(audio)
        prob = torch.softmax(output[1][0], dim=-1).cpu()
    return prob[0].item()


score_fake = []       # original spoofed audio
score_bf = []         # original bonafide audio
score_fake_voc = []   # spoofed audio passed through WavLM features + vocoder only
score_bf_voc = []     # bonafide audio passed through WavLM features + vocoder only
score_fake_ot = []    # spoofed audio after the OT-based embedding conversion

# The constructor arguments never change between iterations, so the wrapper is
# built once instead of once per file pair.
worker = matcher.KNeighborsVC(wavlm, hifigan, hifigan_cfg, device='cuda')

for idx in tqdm(range(N_PAIRS)):
    # Pair the (2+idx)-th spoofed file with the (2+idx)-th bonafide file counted from the end
    src_path = AUDIO_DIR + spoofed_cut[2 + idx] + '.flac'
    target_path = AUDIO_DIR + bonafide_cut[-2 - idx] + '.flac'

    # Get embeddings (one row per frame)
    src_embeddings = worker.get_features(src_path)
    tgt_embeddings = worker.get_features(target_path)

    # L2-normalise each frame so the dot product below is a cosine similarity
    src_embeddings_norm = src_embeddings / torch.norm(src_embeddings, dim=1, keepdim=True)
    tgt_embeddings_norm = tgt_embeddings / torch.norm(tgt_embeddings, dim=1, keepdim=True)

    # Cosine distance between every source frame and every target frame
    cost_matrix = 1 - torch.matmul(src_embeddings_norm, tgt_embeddings_norm.T)

    # Uniform mass on the frames of each utterance
    src_distribution = torch.ones(src_embeddings.shape[0]) / src_embeddings.shape[0]
    tgt_distribution = torch.ones(tgt_embeddings.shape[0]) / tgt_embeddings.shape[0]

    # Sinkhorn transport plan, computed on the CPU
    transport_plan = ot.sinkhorn(
        src_distribution,
        tgt_distribution,
        cost_matrix.clone().detach().cpu(),
        reg=SINKHORN_REG,
    )

    # For every source frame keep the k target frames that receive the most
    # mass, renormalise that mass to sum to 1, and replace the source frame by
    # the resulting weighted average of target frames. This is done for all
    # source frames at once instead of in a per-row Python loop.
    top_k_indices = torch.argsort(transport_plan, dim=1, descending=True)[:, :k]
    top_k_mass = torch.gather(transport_plan, 1, top_k_indices)
    top_k_weights = top_k_mass / top_k_mass.sum(dim=1, keepdim=True)
    top_k_vectors = tgt_embeddings.cpu()[top_k_indices]   # (n_src, <=k, dim)
    converted_embs = (top_k_weights.unsqueeze(-1) * top_k_vectors).sum(dim=1)

    # Add the batch dimension and vocode
    transformed_embeddings = converted_embs.unsqueeze(0).to('cuda')
    converted_audio = worker.vocode(transformed_embeddings).cpu()

    # Reference signals: the raw files and their vocoder-only reconstructions
    audio_fake, sr = torchaudio.load(src_path)
    audio_bf, sr = torchaudio.load(target_path)
    audio_fake_voc = worker.vocode(src_embeddings.unsqueeze(0)).cpu()
    audio_bf_voc = worker.vocode(tgt_embeddings.unsqueeze(0)).cpu()

    score_fake_ot.append(spoof_probability(converted_audio))
    score_fake.append(spoof_probability(audio_fake))
    score_bf.append(spoof_probability(audio_bf))
    score_fake_voc.append(spoof_probability(audio_fake_voc))
    score_bf_voc.append(spoof_probability(audio_bf_voc))

**Plot the results**

In [None]:
import seaborn as sns

In [None]:
# Box plot of the AASIST spoof probabilities, one box per audio condition
# (original, vocoder-only, and OT-converted).

score_groups = {
    'Fake': score_fake,
    'BF': score_bf,
    'Fake Vocoded': score_fake_voc,
    'BF Vocoded': score_bf_voc,
    'Fake OT': score_fake_ot,
}

plt.figure(figsize=(10, 6))
sns.boxplot(data=list(score_groups.values()))
plt.xticks(list(range(len(score_groups))), list(score_groups.keys()))
plt.title('Boxplot of Scores')
plt.show()

In [None]:
# For each audio condition, compute the mean spoof score and its 95% confidence
# interval (Student t, n-1 degrees of freedom, scaled by the standard error of
# the mean), then print one summary line per condition.

import scipy.stats as stats

arrays = [score_fake, score_bf, score_fake_voc, score_bf_voc, score_fake_ot]
names = ['Fake', 'BF', 'Fake Vocoded', 'BF Vocoded', 'Fake OT']


def _mean_with_ci(scores):
    """Return (mean, lower, upper) of the 95% t-interval around the mean of `scores`."""
    centre = np.mean(scores)
    lower_bound, upper_bound = stats.t.interval(
        0.95, df=len(scores) - 1, loc=centre, scale=stats.sem(scores)
    )
    return centre, lower_bound, upper_bound


summary_rows = [_mean_with_ci(arr) for arr in arrays]

means = [row[0] for row in summary_rows]
cis_lower = [row[1] for row in summary_rows]
cis_upper = [row[2] for row in summary_rows]

# Display
for name, (mean, lower, upper) in zip(names, summary_rows):
    print(f"{name}: mean = {mean:.4f}, 95% CI = ({lower:.4f}, {upper:.4f})")



In [None]:
# Bar chart of the mean spoof score per condition. The asymmetric error bars
# are the distances from each mean down to the lower and up to the upper CI bound.
means_arr = np.array(means)
err_below = means_arr - np.array(cis_lower)
err_above = np.array(cis_upper) - means_arr

plt.figure(figsize=(8, 6))
plt.bar(names, means, yerr=[err_below, err_above], capsize=5)
plt.ylabel('AASIST Speechfake Detection Score')
plt.title('Means with 95% Confidence Intervals')
plt.xticks(rotation=45)
plt.grid(axis='y')
plt.show()

In [None]:
# Display the shape of the converted audio waveform generated after OT-based embedding 
# transformation.
# `converted_audio` holds the result of the LAST iteration of the conversion loop above.

converted_audio.shape

**Check the AASIST Score for Spoof Probability**

In [None]:
# Score the OT-converted audio (from the last loop iteration) with AASIST.
# Gradients are disabled for the forward pass. Entry 0 of the soft-maxed output
# is reported as the spoof probability.
with torch.no_grad():
    output = model(converted_audio.cpu())
    first_item_logits = output[1][0]
    prob = torch.softmax(first_item_logits, dim=-1).cpu()

print(f"Probability of spoof: {prob[0]}")

In [None]:
# Play the OT-converted audio waveform at 16 kHz sample rate
# NOTE(review): 16000 is assumed to match the vocoder's output rate — confirm against hifigan_cfg.
Audio(converted_audio, rate=16000)