In [2]:
# Cell 1: Introduction and Imports

# Deepfake Detection Evaluation Notebook
# Using calibrated thresholds from ASVspoof2021 DF SSL_Anti-spoofing model

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys
import pickle
from pathlib import Path
import json

# Import functions from our eval_metrics.py.
# '.' is resolved against the kernel's current working directory, so this
# import only succeeds when the kernel is started from the folder that
# contains eval_metrics.py.
sys.path.append('.')
from eval_metrics import compute_metrics

# Set plotting style.
# NOTE(review): keep this order - the context and figure-size settings come
# after the style sheet, presumably so the style sheet cannot override them.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_context("notebook", font_scale=1.2)
plt.rcParams['figure.figsize'] = (12, 8)  # default figure size as (width, height)

In [3]:
# Cell 2: Define the calibrated thresholds from previous analysis

# Decision thresholds on the detector score. The cells below flag a file as
# fake when its score is strictly below the selected threshold.
CALIBRATED_THRESHOLDS = dict(
    eer_threshold=-3.5324,
    min_dcf_threshold=-1.4866,
)

# Echo the values so the notebook output records which thresholds were used.
print(
    "Using calibrated thresholds from ASVspoof2021 DF SSL_Anti-spoofing model:",
    f"EER threshold: {CALIBRATED_THRESHOLDS['eer_threshold']:.4f}",
    f"minDCF threshold: {CALIBRATED_THRESHOLDS['min_dcf_threshold']:.4f}",
    sep="\n",
)

Using calibrated thresholds from ASVspoof2021 DF SSL_Anti-spoofing model:
EER threshold: -3.5324
minDCF threshold: -1.4866


In [6]:
# Cell 3: Load Scores from File

def _scores_to_frame(scores_data):
    """Normalise unpickled score data into a DataFrame with 'file_id' and 'score' columns."""
    if isinstance(scores_data, dict):
        # Mapping of file ID -> score.
        return pd.DataFrame({
            'file_id': list(scores_data.keys()),
            'score': list(scores_data.values())
        })

    if isinstance(scores_data, pd.DataFrame):
        # Fall back to an alternate column name only when the canonical column
        # is absent, so a valid 'file_id' is never overwritten by 'filename'.
        aliases = {'file_id': 'filename', 'score': 'prediction'}
        renames = {
            alias: canonical
            for canonical, alias in aliases.items()
            if canonical not in scores_data.columns and alias in scores_data.columns
        }
        frame = scores_data.rename(columns=renames)  # returns a new frame
        missing = [col for col in ('file_id', 'score') if col not in frame.columns]
        if missing:
            raise ValueError(f"Scores DataFrame is missing required column(s): {missing}")
        return frame[['file_id', 'score']].copy()

    if isinstance(scores_data, list):
        # List of (file_id, score) pairs.
        return pd.DataFrame(scores_data, columns=['file_id', 'score'])

    raise ValueError(f"Unsupported scores data format: {type(scores_data)}")


def _labels_to_frame(labels_data):
    """Turn unpickled labels into a ('file_id', 'expected_label') frame.

    Returns None when a labels DataFrame lacks the 'file_id'/'label' columns.
    """
    if isinstance(labels_data, dict):
        return pd.DataFrame({
            'file_id': list(labels_data.keys()),
            'expected_label': list(labels_data.values())
        })

    if isinstance(labels_data, pd.DataFrame):
        if 'file_id' in labels_data.columns and 'label' in labels_data.columns:
            return labels_data[['file_id', 'label']].rename(columns={'label': 'expected_label'})
        return None

    raise ValueError(f"Unsupported labels data format: {type(labels_data)}")


def _label_value(label_type):
    """Map a label name (or 0/1) to the numeric label: 1 = bonafide, 0 = spoof."""
    key = str(label_type).strip().lower()
    if key in ['bonafide', 'genuine', 'real', '1']:
        return 1  # Bonafide
    if key in ['spoof', 'fake', 'deepfake', '0']:
        return 0  # Spoof
    raise ValueError(f"Unsupported label type: {label_type}. Use 'bonafide' or 'spoof'")


def load_scores_from_file(scores_file, labels_file=None, label_type=None, thresholds=None):
    """
    Load detection scores from pickle file and optionally load or assign labels.

    The pickle may contain a dict {file_id: score}, a DataFrame with
    'file_id'/'score' columns ('filename'/'prediction' are accepted as
    fallbacks), or a list of (file_id, score) pairs.

    Args:
        scores_file: Path to pickle file containing scores
        labels_file: Optional path to a pickled dict {file_id: label} or a
            DataFrame with 'file_id' and 'label' columns. Takes precedence
            over label_type.
        label_type: If no labels_file, assign this label to all files
            ('bonafide' or 'spoof'; the ints 1 and 0 are accepted too)
        thresholds: Optional mapping with 'eer_threshold' and
            'min_dcf_threshold'. Defaults to the notebook-level
            CALIBRATED_THRESHOLDS.

    Returns:
        DataFrame with columns file_id, score, expected_label,
        detected_as_fake_eer and detected_as_fake_dcf. A file is flagged as
        fake when its score is strictly below the threshold. expected_label
        is NaN where no label is available. On any failure the error is
        printed and an empty DataFrame is returned, so callers should check
        `.empty`.
    """
    print(f"Loading scores from {scores_file}...")

    try:
        active_thresholds = CALIBRATED_THRESHOLDS if thresholds is None else thresholds

        # SECURITY: pickle.load can execute arbitrary code; only open files
        # that come from a trusted source.
        with open(scores_file, 'rb') as f:
            scores_data = pickle.load(f)
        results_df = _scores_to_frame(scores_data)

        # Handle labels
        if labels_file:
            print(f"Loading labels from {labels_file}...")
            with open(labels_file, 'rb') as f:
                labels_data = pickle.load(f)  # same trust caveat as above

            labels_df = _labels_to_frame(labels_data)
            if labels_df is None:
                # Keep the output schema stable instead of silently omitting
                # the expected_label column.
                print("Labels file has no 'file_id'/'label' columns. Analysis will be limited.")
                results_df['expected_label'] = np.nan
            else:
                # Left merge: files without a label keep a NaN expected_label.
                results_df = pd.merge(results_df, labels_df, on='file_id', how='left')
        elif label_type is not None and str(label_type).strip():
            # `is not None` rather than truthiness, so the int 0 ("spoof")
            # is not mistaken for "no label given".
            print(f"Assigning '{label_type}' label to all files...")
            results_df['expected_label'] = _label_value(label_type)
        else:
            # No labels provided
            print("No labels provided. Analysis will be limited.")
            results_df['expected_label'] = np.nan

        # Compute detection results based on thresholds
        results_df['detected_as_fake_eer'] = results_df['score'] < active_thresholds['eer_threshold']
        results_df['detected_as_fake_dcf'] = results_df['score'] < active_thresholds['min_dcf_threshold']

        print(f"Loaded {len(results_df)} scores.")
        return results_df

    except Exception as e:
        # Deliberate best-effort: report the problem and hand back an empty
        # frame so the calling cell can skip the analysis.
        print(f"Error loading scores: {e}")
        return pd.DataFrame()

# Load scores from the pickle file.
# Three ways to run this cell:
# 1. Scores only, with one label for every file: label_type='spoof' (or 'bonafide')
# 2. Scores plus a separate labels file: labels_file='path/to/labels.pkl'
# 3. Scores without labels (limited analysis): labels_file=None, label_type=None

SCORES_FILE = "../files/sls_scores.pkl"
# LABELS_FILE = None  # Optional path to labels file
LABEL_TYPE = "bonafide"  # Or "spoof", or None if using LABELS_FILE

results_df = load_scores_from_file(SCORES_FILE, label_type=LABEL_TYPE)

# An empty frame means loading failed; otherwise preview the first rows.
if results_df.empty:
    print("No results to display. Please check the scores file path.")
else:
    display(results_df.head())
    print(f"Processed {len(results_df)} files")

Loading scores from ../files/sls_scores.pkl...
Assigning 'bonafide' label to all files...
Loaded 512 scores.


Unnamed: 0,file_id,score,expected_label,detected_as_fake_eer,detected_as_fake_dcf
0,/data/audio_files/20250102225324-1735868804.68...,-0.000184,1,False,False
1,/data/audio_files/20250104163153-1736019058.11...,-5.350148,1,True,True
2,/data/audio_files/20241219201129-1734649648.19...,-0.060007,1,False,False
3,/data/audio_files/20241231190939-1735682910.29...,-14.297694,1,True,True
4,/data/audio_files/20241222180522-1734901413.22...,-11.873585,1,True,True


Processed 512 files


In [7]:
# Cell 4: Analyze detection performance

def analyze_detection_performance(results_df, threshold_key='eer_threshold', thresholds=None):
    """
    Analyze the detection performance on the dataset.

    Args:
        results_df: DataFrame with results. Must contain the precomputed
            'detected_as_fake_eer' / 'detected_as_fake_dcf' columns and may
            contain 'expected_label' (1 = bonafide, 0 = spoof, NaN = unknown).
        threshold_key: Which threshold to use ('eer_threshold' or 'min_dcf_threshold')
        thresholds: Optional mapping the reported threshold value is read
            from. Defaults to the notebook-level CALIBRATED_THRESHOLDS. The
            value is only reported; decisions come from the precomputed
            detection columns, so this should match what was used to build them.

    Returns:
        Dictionary with performance metrics, holding plain Python ints and
        floats so it can be JSON-serialised. With labels: spoof detection
        counts/rate, bonafide acceptance counts/rate (a rate is None when
        that class has no files) and 'unlabeled_files', the number of rows
        whose label is neither 0 nor 1. Without labels: how many files were
        flagged as fake. For an empty frame: {"error": "No results to analyze"}.

    Raises:
        KeyError: if threshold_key is not in the thresholds mapping, or the
            detection column is missing from results_df.
    """
    if results_df.empty:
        return {"error": "No results to analyze"}

    active_thresholds = CALIBRATED_THRESHOLDS if thresholds is None else thresholds

    # Which detection column to use
    detection_col = 'detected_as_fake_eer' if threshold_key == 'eer_threshold' else 'detected_as_fake_dcf'

    total_files = len(results_df)
    metrics = {
        'threshold_used': active_thresholds[threshold_key],
        'threshold_type': threshold_key,
        'total_files': total_files
    }

    has_labels = ('expected_label' in results_df.columns
                  and not results_df['expected_label'].isna().all())

    if has_labels:
        labels = results_df['expected_label']
        is_spoof = labels == 0
        is_bonafide = labels == 1
        # Explicit comparisons so a missing decision counts as neither
        # "flagged" nor "accepted".
        flagged_fake = results_df[detection_col].eq(True)
        accepted_real = results_df[detection_col].eq(False)

        total_spoofed = int(is_spoof.sum())
        detected_spoofed = int((is_spoof & flagged_fake).sum())
        total_bonafide = int(is_bonafide.sum())
        accepted_bonafide = int((is_bonafide & accepted_real).sum())

        metrics.update({
            'total_spoofed': total_spoofed,
            'detected_spoofed': detected_spoofed,
            'detection_rate': detected_spoofed / total_spoofed if total_spoofed > 0 else None,
            'total_bonafide': total_bonafide,
            'accepted_bonafide': accepted_bonafide,
            'bonafide_acceptance_rate': accepted_bonafide / total_bonafide if total_bonafide > 0 else None,
            # Rows left out of both rates above (label missing or not 0/1).
            'unlabeled_files': total_files - total_spoofed - total_bonafide
        })
    else:
        # If no labels, just report detection statistics.
        # Cast numpy scalars to built-in types so the dict stays JSON-friendly.
        detected_as_fake = int(results_df[detection_col].sum())
        metrics.update({
            'detected_as_fake': detected_as_fake,
            'detected_as_fake_percentage': float(detected_as_fake / total_files * 100),
            'note': "No expected labels provided. Detection statistics only."
        })

    return metrics

# Compute the metrics at both thresholds and print them one per line.
if not results_df.empty:
    eer_metrics = analyze_detection_performance(results_df, 'eer_threshold')
    dcf_metrics = analyze_detection_performance(results_df, 'min_dcf_threshold')

    # One loop serves both reports; floats are shown with four decimals,
    # everything else with its default string form.
    for heading, report in (
        ("\nPerformance using EER threshold:", eer_metrics),
        ("\nPerformance using minDCF threshold:", dcf_metrics),
    ):
        print(heading)
        for key, value in report.items():
            if isinstance(value, float):
                print(f"{key}: {value:.4f}")
            else:
                print(f"{key}: {value}")


Performance using EER threshold:
threshold_used: -3.5324
threshold_type: eer_threshold
total_files: 512
total_spoofed: 0
detected_spoofed: 0
detection_rate: None
total_bonafide: 512
accepted_bonafide: 201
bonafide_acceptance_rate: 0.3926

Performance using minDCF threshold:
threshold_used: -1.4866
threshold_type: min_dcf_threshold
total_files: 512
total_spoofed: 0
detected_spoofed: 0
detection_rate: None
total_bonafide: 512
accepted_bonafide: 166
bonafide_acceptance_rate: 0.3242
