In [5]:
import os
from pydub import AudioSegment, silence
from tqdm import tqdm
import random
import numpy as np

# Seed Python's and NumPy's global RNGs up front; the random segment offsets
# in this notebook are drawn with random.randint.
random.seed(42)  
np.random.seed(42)

In [2]:
def standardize_audio_duration_custom(
    input_path,
    output_dir,
    target_duration=3.0,
    max_chunk=4.5,
    target_sr=16000
):
    """Turn one audio file into fixed-length mono WAV clip(s).

    The file is loaded, resampled to ``target_sr`` and downmixed to one
    channel, then handled according to its length:

    * shorter than ``target_duration``: silence is appended until it is
      ``target_duration`` long; saved as ``<name>_part1.wav``.
    * at least ``target_duration`` but shorter than ``max_chunk``: one
      randomly positioned ``target_duration`` window; saved as
      ``<name>_part1.wav``.
    * ``max_chunk`` or longer: the audio is cut into consecutive
      ``max_chunk`` blocks and one random ``target_duration`` window is taken
      from each; saved as ``<name>_part1.wav``, ``<name>_part2.wav``, ...
      Audio left over after the last whole block is not used.

    Window positions come from the global ``random`` module.

    Args:
        input_path: Path of the audio file to read.
        output_dir: Directory that receives the clips; created if missing.
        target_duration: Length of every exported clip, in seconds.
        max_chunk: Block length, in seconds, used to split long files.
        target_sr: Sample rate of the exported clips, in Hz.

    Returns:
        None. Any exception is caught and reported with ``print``, so one bad
        file does not stop a batch run.
    """
    try:
        # Without this, a missing output_dir makes every export fail and the
        # broad except below reduces it to a printed message per file.
        os.makedirs(output_dir, exist_ok=True)

        audio = AudioSegment.from_file(input_path)
        audio = audio.set_frame_rate(target_sr).set_channels(1)

        audio_len_ms = len(audio)
        target_len_ms = int(target_duration * 1000)
        max_chunk_ms = int(max_chunk * 1000)

        base_name = os.path.splitext(os.path.basename(input_path))[0]

        if audio_len_ms < target_len_ms:
            # Case 1: shorter than the target -> pad the end with silence.
            padded = audio + AudioSegment.silent(duration=target_len_ms - audio_len_ms)
            _export_segment(padded, output_dir, base_name, 1, "padded")

        elif audio_len_ms < max_chunk_ms:
            # Case 2: target <= length < max_chunk -> one random window.
            start = random.randint(0, audio_len_ms - target_len_ms)
            segment = audio[start : start + target_len_ms]
            _export_segment(segment, output_dir, base_name, 1, "random short")

        else:
            # Case 3: length >= max_chunk -> split into max_chunk blocks and
            # take one random window from each block.
            num_chunks = audio_len_ms // max_chunk_ms

            for i in range(num_chunks):
                chunk_start = i * max_chunk_ms
                chunk = audio[chunk_start : chunk_start + max_chunk_ms]
                if len(chunk) < target_len_ms:
                    # Blocks are whole max_chunk slices, so this should only
                    # trigger when max_chunk is smaller than target_duration.
                    continue

                max_start = len(chunk) - target_len_ms
                start = random.randint(0, max_start)
                segment = chunk[start : start + target_len_ms]
                _export_segment(segment, output_dir, base_name, i + 1, "chunk random")

    except Exception as e:
        print(f"Error processing {input_path}: {e}")


def _export_segment(segment, output_dir, base_name, part_index, kind):
    """Write ``segment`` to ``<base_name>_part<part_index>.wav`` and log it."""
    output_filename = f"{base_name}_part{part_index}.wav"
    segment.export(os.path.join(output_dir, output_filename), format="wav")
    print(f"Saved {kind}: {output_filename}")

def process_directory(input_dir, output_dir):
    """Standardize every WAV file found directly inside ``input_dir``.

    Args:
        input_dir: Folder whose ``.wav`` files (any letter case) are processed.
            Sub-folders are not descended into.
        output_dir: Folder that receives the clips; created if missing.

    Returns:
        None.
    """
    os.makedirs(output_dir, exist_ok=True)

    # os.listdir returns names in arbitrary order. Sorting pins the order in
    # which files consume the seeded random offsets, so a re-run reproduces
    # the same clips. The extension test ignores case, like the counting
    # helpers in this notebook.
    audio_files = sorted(
        f for f in os.listdir(input_dir) if f.lower().endswith(".wav")
    )
    print(f"Found {len(audio_files)} audio files to process.")

    for audio_file in tqdm(audio_files, desc="Processing files"):
        input_path = os.path.join(input_dir, audio_file)
        standardize_audio_duration_custom(input_path, output_dir)

In [None]:
# Root of the source dataset and root of the duration-standardized copy.
INPUT_DIR = "F:\\Deepfake-Audio-Detector\\datasets\\for-dataset"
OUTPUT_DIR = "F:\\Deepfake-Audio-Detector\\datasets\\raw_final_dataset"

# Visit every <split>/<label> folder and mirror it under OUTPUT_DIR.
for set_type, label in [
    (split, category)
    for split in ("train", "test", "val")
    for category in ("real", "fake")
]:
    input_set_dir = os.path.join(INPUT_DIR, set_type, label)
    output_set_dir = os.path.join(OUTPUT_DIR, set_type, label)
    process_directory(input_set_dir, output_set_dir)

In [4]:
def count_audio_files(root_dir, extensions=(".wav",)):
    """Print the number of audio files in each ``<split>/<label>`` folder.

    The splits ``train``, ``test``, ``val`` and the labels ``real``, ``fake``
    are visited in that order. A folder that does not exist is reported and
    skipped.

    Args:
        root_dir: Dataset root that contains the split folders.
        extensions: Tuple of lowercase file suffixes to count; file names are
            lowercased before the comparison.

    Returns:
        None. Results are printed.
    """
    split_label_pairs = [
        (split, category)
        for split in ("train", "test", "val")
        for category in ("real", "fake")
    ]

    for split, category in split_label_pairs:
        folder_path = os.path.join(root_dir, split, category)

        if os.path.exists(folder_path):
            matching = [
                name
                for name in os.listdir(folder_path)
                if name.lower().endswith(extensions)
            ]
            print(f"{split.upper()} / {category:<5}: {len(matching)} files")
        else:
            print(f"❌ Not found: {folder_path}")


# Count the clips per split/label in the standardized dataset folder
# (same path as OUTPUT_DIR in the standardization cell).
DATASET_DIR = "F:\\Deepfake-Audio-Detector\\datasets\\raw_final_dataset"
count_audio_files(DATASET_DIR)


TRAIN / real : 52143 files
TRAIN / fake : 20453 files
TEST / real : 7211 files
TEST / fake : 2372 files
VAL / real : 3579 files
VAL / fake : 1216 files


In [None]:
def detect_silence_ratio(file_path, silence_thresh=-40, min_silence_len=300):
    """Return the fraction of a clip's duration that is detected as silence.

    The file is loaded, downmixed to one channel, and scanned with
    ``pydub.silence.detect_silence``. The lengths of the returned silent
    ranges are summed and divided by the clip length.

    Args:
        file_path: Path of the audio file to analyse.
        silence_thresh: Passed through to ``detect_silence`` as
            ``silence_thresh``.
        min_silence_len: Passed through to ``detect_silence`` as
            ``min_silence_len``.

    Returns:
        Silent duration divided by total duration. A zero-length clip returns
        ``1.0``: it carries no signal, and dividing by its length would raise
        ``ZeroDivisionError``.
    """
    audio = AudioSegment.from_file(file_path).set_channels(1)

    total_len = len(audio)
    if total_len == 0:
        # Nothing to scan; treat the empty clip as entirely silent.
        return 1.0

    silences = silence.detect_silence(
        audio, min_silence_len=min_silence_len, silence_thresh=silence_thresh
    )
    total_silence = sum((end - start) for start, end in silences)
    return total_silence / total_len


def count_silent_files(
    root_dir,
    threshold=0.7,
    silence_thresh=-40,
    min_silence_len=300,
    log_file="5_silent_files_log.txt",
):
    """Report and log the mostly-silent WAV files of each split/label folder.

    For every ``<split>/<label>`` folder under ``root_dir`` (splits ``train``,
    ``test``, ``val``; labels ``real``, ``fake``) each ``.wav`` file is scored
    with ``detect_silence_ratio``. Files whose ratio is at least ``threshold``
    are counted and their paths collected. One summary row is printed per
    folder, and all collected paths are written to ``log_file``, one per line.

    Args:
        root_dir: Dataset root that contains the split folders.
        threshold: Minimum silence ratio (inclusive) for a file to be flagged.
        silence_thresh: Forwarded to ``detect_silence_ratio``.
        min_silence_len: Forwarded to ``detect_silence_ratio``.
        log_file: Path of the text file that receives the flagged paths; it is
            overwritten.

    Returns:
        None. A file that fails to be analysed is reported with ``print`` and
        not flagged; a missing folder is reported and skipped.
    """
    # The comparison below is inclusive, so the header says ">=".
    print(f"{'SET':<6} / {'LABEL':<5} : TOTAL  | SILENT (>= {threshold * 100:.0f}%)")
    print("-" * 45)

    silent_file_paths = []

    for set_type in ["train", "test", "val"]:
        for label in ["real", "fake"]:
            folder_path = os.path.join(root_dir, set_type, label)
            if not os.path.exists(folder_path):
                print(f"❌ Not found: {folder_path}")
                continue

            # Sorted so the log has a stable order; os.listdir order is arbitrary.
            files = sorted(
                f for f in os.listdir(folder_path) if f.lower().endswith(".wav")
            )
            total = len(files)
            silent_count = 0

            for f in files:
                file_path = os.path.join(folder_path, f)
                try:
                    ratio = detect_silence_ratio(file_path, silence_thresh, min_silence_len)
                    if ratio >= threshold:
                        silent_count += 1
                        silent_file_paths.append(file_path)
                except Exception as e:
                    # Best effort: one unreadable file must not abort the scan.
                    print(f"Error in {file_path}: {e}")

            print(f"{set_type.upper():<6} / {label:<5} : {total:<6} | {silent_count}")

    with open(log_file, "w", encoding="utf-8") as f:
        for path in silent_file_paths:
            f.write(path + "\n")

    print(f"\n📝 Silent file paths saved to: {log_file}")

# Scan the standardized dataset with the default threshold; flagged paths go to
# the default log file, "5_silent_files_log.txt".
DATASET_DIR = "F:\\Deepfake-Audio-Detector\\datasets\\raw_final_dataset"
count_silent_files(DATASET_DIR)

SET    / LABEL : TOTAL  | SILENT (> 70%)
---------------------------------------------
TRAIN  / real  : 52143  | 695
TRAIN  / fake  : 20453  | 14
TEST   / real  : 7211   | 178
TEST   / fake  : 2372   | 2
VAL    / real  : 3579   | 81
VAL    / fake  : 1216   | 0

📝 Silent file paths saved to: silent_files_log.txt


In [None]:
def delete_files_from_list(txt_path):
    """Delete every file whose path is listed in a text file.

    The list is read as UTF-8 with one path per line; surrounding whitespace
    is stripped and blank lines are ignored. Paths that do not exist and
    deletions that raise are reported and counted as errors. A summary is
    printed at the end.

    Args:
        txt_path: Path of the text file holding the paths to delete. If it
            does not exist, a message is printed and nothing else happens.

    Returns:
        None.
    """
    if not os.path.exists(txt_path):
        print(f"❌ File not found: {txt_path}")
        return

    with open(txt_path, "r", encoding="utf-8") as listing:
        stripped_lines = (raw_line.strip() for raw_line in listing)
        targets = [entry for entry in stripped_lines if entry]

    removed_count = 0
    problem_count = 0

    for target in targets:
        if not os.path.exists(target):
            print(f"⚠️ File not found: {target}")
            problem_count += 1
            continue

        try:
            os.remove(target)
        except Exception as exc:
            print(f"❌ Error deleting {target}: {exc}")
            problem_count += 1
        else:
            removed_count += 1

    print(f"\n✅ Deleted: {removed_count} file(s)")
    if problem_count:
        print(f"⚠️ Errors or missing files: {problem_count}")

# Irreversible: removes from disk every path listed in the log that
# count_silent_files writes by default.
delete_files_from_list("5_silent_files_log.txt")



✅ Deleted: 970 file(s)
