In [1]:
import os
import shutil
import random
import hashlib
from pydub import AudioSegment
from pydub.silence import detect_silence
import hashlib
import numpy as np

# Put Python's and NumPy's global random generators into a fixed state so the
# random draws made by later cells (random.choice / random.shuffle) start from
# the same point on every fresh-kernel run.
random.seed(42)
np.random.seed(42)

In [17]:
RAW_DATASET_PATH = "F:\\Deepfake-Audio-Detector\\datasets\\raw_final_dataset"


def count_audio_needed_for_balance(set_type, base_dir = RAW_DATASET_PATH):
    """Report how many more "fake" entries a split needs to match its "real" entries.

    Looks at ``<base_dir>/<set_type>/real`` and ``<base_dir>/<set_type>/fake``,
    prints the number of directory entries in each, and returns their difference.

    Args:
        set_type: Name of the split sub-folder (e.g. "train", "val", "test").
        base_dir: Dataset root that contains the split folders.

    Returns:
        ``real_count - fake_count`` as an int (negative when there are more fake
        entries than real ones), or ``None`` when either folder does not exist.
    """
    class_dirs = {
        label: os.path.join(base_dir, set_type, label) for label in ("real", "fake")
    }

    # Check "real" first, then "fake", and stop at the first missing folder.
    for label in ("real", "fake"):
        if not os.path.exists(class_dirs[label]):
            print(f"Not found: {class_dirs[label]}")
            return None

    # Every directory entry is counted, whatever its type or extension.
    real_total = len(os.listdir(class_dirs["real"]))
    fake_total = len(os.listdir(class_dirs["fake"]))
    deficit = real_total - fake_total

    print(f"{set_type.upper()} \\ real\t: {real_total} files")
    print(f"{set_type.upper()} \\ fake\t: {fake_total} files")
    print(f"Need {deficit} to balance {set_type.upper()} data")

    return deficit
    
# Deficit of fake clips for each split, evaluated in train -> val -> test order.
train_needed, val_needed, test_needed = [
    count_audio_needed_for_balance(split_name)
    for split_name in ("train", "val", "test")
]


TRAIN \ real	: 51448 files
TRAIN \ fake	: 20439 files
Need 31009 to balance TRAIN data
VAL \ real	: 3498 files
VAL \ fake	: 1216 files
Need 2282 to balance VAL data
TEST \ real	: 7033 files
TEST \ fake	: 2370 files
Need 4663 to balance TEST data


In [18]:
# Root folder of the DFADD dataset; its "fake" sub-folder is listed further down in this cell.
DFADD_PATH = "F:\\Deepfake-Audio-Detector\\datasets\\dfadd-dataset\\"
# File-name suffixes passed to get_audio_files() to decide what counts as audio.
AUDIO_FILE_EXTENSIONS = [".wav", ".flac", ".mp3", ".m4a"]

# NOTE(review): neither of the two constants below is referenced by any cell visible in
# this notebook. The sampling and split cells use "...\\dfadd-standardized" (hyphen),
# not this underscore path, and CURRENT_DIR is a Kaggle-style path — confirm whether
# they are leftovers.
DFADD_STD_DIR = "F:\\Deepfake-Audio-Detector\\datasets\\dfadd_standardized\\"
CURRENT_DIR = "/kaggle/working/standardized_3s"

def get_audio_files(directory, extensions):
    """Return the names of entries in ``directory`` that end with one of ``extensions``.

    Matching ignores case, so ``clip.WAV`` is found when ``".wav"`` is requested.
    This agrees with the case-insensitive filter used by ``sample_and_standardize``.

    Args:
        directory: Folder to list (not recursive).
        extensions: Iterable of suffixes such as ``[".wav", ".flac"]``.

    Returns:
        List of matching entry names (not full paths), in ``os.listdir`` order.
        An empty ``extensions`` iterable yields an empty list.
    """
    # str.endswith accepts a tuple of suffixes; normalise them once up front.
    suffixes = tuple(ext.lower() for ext in extensions)
    return [name for name in os.listdir(directory) if name.lower().endswith(suffixes)]

# List the audio files in the DFADD "fake" folder.
DFADD_FAKE_PATH = os.path.join(DFADD_PATH, "fake")
fake_files = get_audio_files(DFADD_FAKE_PATH, AUDIO_FILE_EXTENSIONS)
# These files come from the "fake" folder, so the label must say "Fake", not "Real".
print(f"DFADD Fake Samples: {len(fake_files)}")

DFADD Real Samples: 163500


In [19]:
# Step 2: Balance data utils

# Silence checker
def is_mostly_silent(
    audio_segment, threshold_db=-40, min_silence_len_ms=300, silence_ratio=0.7
):
    """Tell whether the silent share of ``audio_segment`` exceeds ``silence_ratio``.

    The silent ranges come from pydub's ``detect_silence``; their lengths are
    summed and divided by ``len(audio_segment)``.

    Args:
        audio_segment: Segment to inspect (passed straight to ``detect_silence``).
        threshold_db: Forwarded as ``silence_thresh``.
        min_silence_len_ms: Forwarded as ``min_silence_len``.
        silence_ratio: Fraction above which the segment counts as mostly silent.

    Returns:
        ``True`` when the silent fraction is strictly greater than
        ``silence_ratio``. Any exception raised along the way (including a
        zero-length segment) results in ``False``.
    """
    try:
        quiet_spans = detect_silence(
            audio_segment,
            min_silence_len=min_silence_len_ms,
            silence_thresh=threshold_db,
        )
        quiet_ms = 0
        for span_start, span_end in quiet_spans:
            quiet_ms += span_end - span_start
        quiet_fraction = quiet_ms / len(audio_segment)
        return quiet_fraction > silence_ratio
    except Exception:
        # Best effort: a segment that cannot be analysed is treated as not silent.
        return False

# Duplicate checker
def compute_audio_hash(audio_segment):
    """Return the hexadecimal MD5 digest of ``audio_segment.raw_data``.

    The digest serves as a fingerprint for spotting byte-identical segments.
    """
    digest = hashlib.md5()
    digest.update(audio_segment.raw_data)
    return digest.hexdigest()

def sample_and_standardize(
    source_dir,
    output_dir,
    target_sr=16000,
    target_duration=3.0,
    max_chunk=4.5,
    desired_count=40000,
    max_trials=500000,
    silence_threshold_db = -40,
    min_silence_len_ms = 300,
    silence_ratio=0.7
):
    """Randomly cut fixed-length, non-silent, de-duplicated WAV segments from a folder.

    Each trial picks one file at random (with replacement), converts it to mono
    at ``target_sr`` and cuts a random window of ``target_duration`` seconds.
    The window is exported to ``output_dir`` as ``<source stem>_s<index>.wav``
    unless it is mostly silent or byte-identical to a segment already exported
    by this call.

    Args:
        source_dir: Folder holding the source audio (.wav/.flac/.mp3/.m4a, any case).
        output_dir: Folder receiving the exported segments; created if missing.
        target_sr: Frame rate applied to every loaded file.
        target_duration: Length of each exported segment, in seconds.
        max_chunk: For files longer than this many seconds, a random chunk of this
            length is cut first and the final window is drawn from inside it.
        desired_count: Stop once this many segments have been exported.
        max_trials: Upper bound on the number of random draws.
        silence_threshold_db: Forwarded to ``is_mostly_silent`` as ``threshold_db``.
        min_silence_len_ms: Forwarded to ``is_mostly_silent``.
        silence_ratio: Forwarded to ``is_mostly_silent``.

    Returns:
        None. Progress, per-file errors and a final summary are printed.

    Notes:
        De-duplication only covers segments produced during this call; files that
        already exist in ``output_dir`` are not hashed.
    """
    os.makedirs(output_dir, exist_ok=True)

    SUPPORTED_EXTENSIONS = (".wav", ".flac", ".mp3", ".m4a")
    # os.listdir() returns entries in arbitrary order; sorting makes the sequence of
    # random.choice() picks depend only on the seed, not on the filesystem.
    files = sorted(
        f for f in os.listdir(source_dir) if f.lower().endswith(SUPPORTED_EXTENSIONS)
    )
    if not files:
        # random.choice() raises IndexError on an empty list, so stop here instead.
        print(f"Warning: no supported audio files found in {source_dir}")
        return

    seen_hashes = set()
    collected = 0
    trials = 0
    target_len_ms = int(target_duration * 1000)
    max_chunk_ms = int(max_chunk * 1000)
    silence_kwargs = {
        "threshold_db": silence_threshold_db,
        "min_silence_len_ms": min_silence_len_ms,
        "silence_ratio": silence_ratio,
    }

    while collected < desired_count and trials < max_trials:
        trials += 1
        file = random.choice(files)
        file_path = os.path.join(source_dir, file)

        try:
            audio = AudioSegment.from_file(file_path)
            audio = audio.set_frame_rate(target_sr).set_channels(1)
            audio_len_ms = len(audio)
            if audio_len_ms < target_len_ms:
                continue

            if audio_len_ms <= max_chunk_ms:
                # audio_len_ms >= target_len_ms is guaranteed above, so the upper bound
                # is never negative. A clip of exactly the target length is valid and
                # is taken whole (start 0) rather than being rejected.
                start_ms = random.randint(0, audio_len_ms - target_len_ms)
                segment = audio[start_ms : start_ms + target_len_ms]
            else:
                chunk_start_ms = random.randint(0, audio_len_ms - max_chunk_ms)
                chunk = audio[chunk_start_ms : chunk_start_ms + max_chunk_ms]
                if is_mostly_silent(chunk, **silence_kwargs):
                    continue
                # Only reachable when max_chunk is configured shorter than target_duration.
                if len(chunk) < target_len_ms:
                    continue

                start_in_chunk = random.randint(0, len(chunk) - target_len_ms)
                segment = chunk[start_in_chunk : start_in_chunk + target_len_ms]

            if is_mostly_silent(segment, **silence_kwargs):
                continue

            # Force the exact target length: trim any excess, pad any shortfall with silence.
            if len(segment) > target_len_ms:
                segment = segment[:target_len_ms]
            elif len(segment) < target_len_ms:
                padding_needed = target_len_ms - len(segment)
                segment = (
                    segment.set_frame_rate(target_sr)
                    .set_channels(1)
                    .append(
                        AudioSegment.silent(
                            duration=padding_needed, frame_rate=target_sr
                        ),
                        crossfade=0,
                    )
                )

            segment_hash = compute_audio_hash(segment)
            if segment_hash in seen_hashes:
                continue

            seen_hashes.add(segment_hash)
            original_file_base = os.path.splitext(file)[0]
            output_name = f"{original_file_base}_s{collected + 1}.wav"
            output_path = os.path.join(output_dir, output_name)
            segment.export(output_path, format="wav")
            collected += 1
            if collected % 100 == 0:
                print(f"✅ Collected: {collected}/{desired_count}, Trials: {trials}")
        except Exception as exc:
            # Keep sampling, but report why the file failed instead of hiding the cause.
            print(f"❌ Error with {file_path}: {exc}")

    print(f"\n🎯 Done! Collected {collected} audio segments out of {trials} trials.")
    if collected < desired_count:
        print(
            f"Warning: Could not collect desired count. Only {collected} segments were gathered."
        )


In [20]:
# Draw 40,000 standardized 3-second segments from the DFADD "fake" folder.
# That is more than the 31009 + 2282 + 4663 = 37954 clips the balance counts
# printed earlier in this notebook ask for.
sample_and_standardize(
    source_dir="F:\\Deepfake-Audio-Detector\\datasets\\dfadd-dataset\\fake",
    output_dir="F:\\Deepfake-Audio-Detector\\datasets\\dfadd-standardized",
    target_duration=3.0,
    max_chunk=4.5,
    desired_count=40000,
    max_trials=500000,
    silence_threshold_db=-40,
    min_silence_len_ms=300,
    silence_ratio=0.7
)

❌ Error with F:\Deepfake-Audio-Detector\datasets\dfadd-dataset\fake\p360_006_pflow.wav
✅ Collected: 100/40000, Trials: 101
❌ Error with F:\Deepfake-Audio-Detector\datasets\dfadd-dataset\fake\p238_069_pflow.wav
❌ Error with F:\Deepfake-Audio-Detector\datasets\dfadd-dataset\fake\p374_261_pflow.wav
❌ Error with F:\Deepfake-Audio-Detector\datasets\dfadd-dataset\fake\p310_197_pflow.wav
✅ Collected: 200/40000, Trials: 204
❌ Error with F:\Deepfake-Audio-Detector\datasets\dfadd-dataset\fake\p243_079_pflow.wav
✅ Collected: 300/40000, Trials: 305
❌ Error with F:\Deepfake-Audio-Detector\datasets\dfadd-dataset\fake\p248_220_pflow.wav
❌ Error with F:\Deepfake-Audio-Detector\datasets\dfadd-dataset\fake\p347_219_pflow.wav
❌ Error with F:\Deepfake-Audio-Detector\datasets\dfadd-dataset\fake\p306_253_pflow.wav
✅ Collected: 400/40000, Trials: 408
❌ Error with F:\Deepfake-Audio-Detector\datasets\dfadd-dataset\fake\p294_252_pflow.wav
❌ Error with F:\Deepfake-Audio-Detector\datasets\dfadd-dataset\fake\p225_

In [None]:
# Re-count how many fake clips each split is missing (None means a split folder is absent).
train_needed = count_audio_needed_for_balance("train")
val_needed = count_audio_needed_for_balance("val")
test_needed = count_audio_needed_for_balance("test")

DESTINATION_DIR = "F:\\Deepfake-Audio-Detector\\datasets\\raw_final_dataset"
SOURCE_DIR = "F:\\Deepfake-Audio-Detector\\datasets\\dfadd-standardized"

# Fail with a clear message instead of an opaque TypeError from adding None below.
if None in (train_needed, val_needed, test_needed):
    raise FileNotFoundError(
        "A real/fake folder is missing for at least one split; see the 'Not found' message above."
    )

# A split that already has more fake than real clips needs nothing. Left negative,
# the count would make the slices below select almost every file (files[:-n]).
train_needed = max(0, train_needed)
val_needed = max(0, val_needed)
test_needed = max(0, test_needed)

# os.listdir() order is arbitrary; sort first so the seeded shuffle is reproducible.
files = sorted(os.listdir(SOURCE_DIR))
random.shuffle(files)

total_needed = train_needed + val_needed + test_needed
assert total_needed <= len(files), "Not enough files to split!"

# Consecutive, non-overlapping slices of the shuffled list: train, then val, then test.
train_files = files[:train_needed]
val_files = files[train_needed : train_needed + val_needed]
test_files = files[train_needed + val_needed : total_needed]

train_output_dir = os.path.join(DESTINATION_DIR, "train", "fake")
val_output_dir = os.path.join(DESTINATION_DIR, "val", "fake")
test_output_dir = os.path.join(DESTINATION_DIR, "test", "fake")

for split_files, split_output_dir in (
    (train_files, train_output_dir),
    (val_files, val_output_dir),
    (test_files, test_output_dir),
):
    for file in split_files:
        shutil.move(os.path.join(SOURCE_DIR, file), split_output_dir)


TRAIN \ real	: 51448 files
TRAIN \ fake	: 20439 files
Need 31009 to balance TRAIN data
VAL \ real	: 3498 files
VAL \ fake	: 1216 files
Need 2282 to balance VAL data
TEST \ real	: 7033 files
TEST \ fake	: 2370 files
Need 4663 to balance TEST data


In [None]:
SOURCE_DIR = "F:\\Deepfake-Audio-Detector\\datasets\\raw_final_dataset"

# For every split, count the fake clips in "fake" plus any .wav files lying
# directly in the split folder, and compare the total with the real clips.
for set_type in ("train", "test", "val"):
    split_root = os.path.join(SOURCE_DIR, set_type)
    extra_file = sum(1 for entry in os.listdir(split_root) if entry.endswith('.wav'))
    fake_file = len(os.listdir(os.path.join(split_root, "fake")))
    real_file = len(os.listdir(os.path.join(split_root, "real")))
    n_fake_file = extra_file + fake_file
    shortfall = real_file - n_fake_file
    print(f"\n{set_type.upper()} have {real_file} real_file and {n_fake_file} fake_file")
    print(f"Need {shortfall} to balance")


TRAIN have 51448 real_file and 51448 fake_file
Need 0 to balance

TEST have 7033 real_file and 7033 fake_file
Need 0 to balance

VAL have 3498 real_file and 3498 fake_file
Need 0 to balance


In [None]:
SOURCE_DIR = "F:\\Deepfake-Audio-Detector\\datasets\\raw_final_dataset"

# Relocate .wav files lying directly in each split folder into that split's "fake" sub-folder.
for set_type in ["train", "test", "val"]:
    source_dir = os.path.join(SOURCE_DIR, set_type)
    destination_path = os.path.join(source_dir, "fake")

    # Listing first keeps the original failure mode when the split folder itself is missing.
    file_names = [f for f in os.listdir(source_dir)
                  if f.endswith('.wav') and os.path.isfile(os.path.join(source_dir, f))]

    # If "fake" did not exist, shutil.move() would rename the first clip to a file
    # called "fake" and later moves would overwrite it. Make sure the folder exists.
    os.makedirs(destination_path, exist_ok=True)

    for file_name in file_names:
        file_path = os.path.join(source_dir, file_name)
        shutil.move(file_path, destination_path)

    print(f"Done move file in {set_type}")

Done move file in {set_type}
Done move file in {set_type}
Done move file in {set_type}


In [5]:
SOURCE_DIR = "F:\\Deepfake-Audio-Detector\\datasets\\raw_final_dataset"

# Print the number of entries in "real" and "fake" for every split.
for set_type in ("train", "test", "val"):
    # Listed in the order fake, then real.
    fake_file, real_file = (
        len(os.listdir(os.path.join(SOURCE_DIR, set_type, label)))
        for label in ("fake", "real")
    )
    summary_line = f"\n{set_type.upper()} have {real_file} real_file and {fake_file} fake_file"
    print(summary_line)



TRAIN have 51448 real_file and 51448 fake_file

TEST have 7033 real_file and 7033 fake_file

VAL have 3498 real_file and 3498 fake_file
