In [5]:
import os
import pandas as pd
import numpy as np
import cv2
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder

# Indexing and Loading the Dataset

We scan the folder structure to create a metadata DataFrame, capturing file paths, labels (real/fake), and sources. This provides a centralized view of the dataset, essential for handling large video collections. For Celeb-DF-v2, this reveals class imbalance (far more fakes than reals), which is common in deepfake datasets and can bias models toward over-detecting fakes. We also integrate the test list to separate evaluation data early, preventing data leakage.

In [14]:
base_path = 'dataset/Celeb-DF-v2'  # Replace with your actual path

# Define folders and labels
folders = {
    'real': ['Celeb-real', 'YouTube-real'],
    'fake': ['Celeb-synthesis']
}

# Collect file metadata
file_records = []
for label, subfolders in folders.items():
    for subfolder in subfolders:
        full_path = os.path.join(base_path, subfolder)
        if os.path.exists(full_path):
            for fname in os.listdir(full_path):
                if fname.endswith('.mp4'):
                    file_records.append({
                        'filename': fname,
                        'folder': subfolder,
                        'label': label,
                        'filepath': os.path.join(full_path, fname)
                    })
        else:
            print(f"Warning: Folder {full_path} not found.")

metadata = pd.DataFrame(file_records)

# Load and tag test videos
test_file = os.path.join(base_path, 'List_of_testing_videos.txt')
if os.path.exists(test_file):
    with open(test_file, 'r') as f:
        test_videos = {line.strip() for line in f if line.strip()}
    metadata['is_test'] = metadata.apply(
        lambda row: f"{row['folder']}/{row['filename']}" in test_videos, axis=1
    )
else:
    print("Warning: List_of_testing_videos.txt not found.")
    metadata['is_test'] = False

# Save metadata for reuse
metadata.to_csv('celeb_df_metadata.csv', index=False)
print(metadata.head())
print(f"Total videos: {len(metadata)}")
len(metadata[metadata['is_test'] == False])

        filename      folder label  \
0  id60_0005.mp4  Celeb-real  real   
1   id2_0004.mp4  Celeb-real  real   
2  id43_0001.mp4  Celeb-real  real   
3  id60_0002.mp4  Celeb-real  real   
4  id35_0000.mp4  Celeb-real  real   

                                       filepath  is_test  
0  dataset/Celeb-DF-v2/Celeb-real/id60_0005.mp4    False  
1   dataset/Celeb-DF-v2/Celeb-real/id2_0004.mp4    False  
2  dataset/Celeb-DF-v2/Celeb-real/id43_0001.mp4    False  
3  dataset/Celeb-DF-v2/Celeb-real/id60_0002.mp4    False  
4  dataset/Celeb-DF-v2/Celeb-real/id35_0000.mp4    False  
Total videos: 6529


6529