In [2]:
import os
import librosa
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
def analyze_durations(dataset_path, class_name):
    """Collect the duration of every readable audio file in one dataset folder.

    Args:
        dataset_path: Root directory of the dataset.
        class_name: Path, relative to ``dataset_path``, of the folder to scan
            (the notebook passes values such as ``train\\real``).

    Returns:
        list: One duration per file that could be read, as returned by
        ``librosa.get_duration`` (seconds). Files that fail to load are
        reported on stdout and left out of the list.
    """
    durations = []
    input_dir = os.path.join(dataset_path, class_name)
    for file in tqdm(
        os.listdir(input_dir), desc=f"Analyzing {dataset_path}\\{class_name} durations"
    ):
        audio_path = os.path.join(input_dir, file)
        try:
            # `path=` is the current keyword; `filename=` is a deprecated alias
            # that librosa announces it will remove in version 1.0.
            duration = librosa.get_duration(path=audio_path)
        except Exception as exc:
            # Catch Exception (not a bare except) so Ctrl-C / kernel interrupt
            # still stops this long-running scan, and report why the file failed.
            print(f"Error reading {file}: {exc}")
            continue
        durations.append(duration)
    return durations


In [4]:
# Histogram settings used when building the duration-distribution plots.
# All three values are in seconds and feed the np.arange() call that builds the bin edges.
HISTOGRAM_BIN_SIZE = 0.5  # width of each histogram bin
MIN_PLOT_DURATION_SEC = 0.0  # left edge of the first bin; original note said "Change this to 'a'" - meaning unclear, TODO confirm
MAX_PLOT_DURATION_SEC = 30.0  # right edge of the last bin

In [None]:
# FoR dataset: plot and save one duration histogram per (split, class) folder.
FOR_PATH = "F:\\Deepfake-Audio-Detector\\datasets\\for-dataset"
DES_PATH = "F:\\Deepfake-Audio-Detector\\scripts\\4_normalize_for_dataset_graphs"
set_types = ["train", "test", "val"]

# np.arange excludes its stop value, so one extra bin width is added to make
# MAX_PLOT_DURATION_SEC itself the final bin edge.
bins = np.arange(
    MIN_PLOT_DURATION_SEC,
    MAX_PLOT_DURATION_SEC + HISTOGRAM_BIN_SIZE,
    HISTOGRAM_BIN_SIZE,
)

# Create the output folder up front: otherwise savefig would fail only after
# the (slow) duration scan of the first folder has already finished.
os.makedirs(DES_PATH, exist_ok=True)

for set_type in set_types:
    for class_name in ["real", "fake"]:
        durations = analyze_durations(FOR_PATH, os.path.join(set_type, class_name))

        # Explicit figure/axes instead of the implicit pyplot "current figure".
        fig, ax = plt.subplots(figsize=(20, 6))
        sns.histplot(durations, bins=bins, kde=True, ax=ax)
        ax.set_title(f"Duration Distribution for {set_type} - {class_name}")
        ax.set_xlabel("Duration (seconds)")
        ax.set_ylabel("Frequency")
        ax.grid()

        output_file = os.path.join(
            DES_PATH, f"{set_type}_{class_name}_duration_distribution.png"
        )
        fig.savefig(output_file)
        # Close each figure so memory does not accumulate across iterations.
        plt.close(fig)

Analyzing F:\Deepfake-Audio-Detector\datasets\for-dataset\train\real durations:   0%|          | 0/50130 [00:00<?, ?it/s]

	This alias will be removed in version 1.0.
  duration = librosa.get_duration(filename=audio_path)
Analyzing F:\Deepfake-Audio-Detector\datasets\for-dataset\train\real durations: 100%|██████████| 50130/50130 [08:26<00:00, 98.88it/s] 
	This alias will be removed in version 1.0.
  duration = librosa.get_duration(filename=audio_path)
Analyzing F:\Deepfake-Audio-Detector\datasets\for-dataset\train\fake durations: 100%|██████████| 32976/32976 [05:42<00:00, 96.33it/s] 
	This alias will be removed in version 1.0.
  duration = librosa.get_duration(filename=audio_path)
Analyzing F:\Deepfake-Audio-Detector\datasets\for-dataset\test\real durations: 100%|██████████| 6768/6768 [01:07<00:00, 100.52it/s]
	This alias will be removed in version 1.0.
  duration = librosa.get_duration(filename=audio_path)
Analyzing F:\Deepfake-Audio-Detector\datasets\for-dataset\test\fake durations: 100%|██████████| 2778/2778 [00:27<00:00, 99.64it/s] 
	This alias will be removed in version 1.0.
  duration = librosa.get_d

In [7]:
def filter_and_delete_short_files(dataset_path, min_duration):
    """Permanently delete audio files in a folder that are shorter than a threshold.

    WARNING: this is destructive. Files are removed from disk with
    ``os.remove`` and cannot be recovered by re-running the notebook.

    Args:
        dataset_path: Directory whose direct entries are inspected
            (sub-directories are not descended into).
        min_duration: Files whose duration, as reported by
            ``librosa.get_duration`` (seconds), is strictly less than this
            value are deleted.

    Returns:
        int: Number of files deleted. The same number is also printed.
    """
    count_deleted = 0
    for file in os.listdir(dataset_path):
        file_path = os.path.join(dataset_path, file)

        # Only regular files can be audio clips; skip sub-directories instead
        # of handing them to the audio reader.
        if not os.path.isfile(file_path):
            continue

        try:
            # `path=` is the current keyword; `filename=` is a deprecated alias
            # that librosa announces it will remove in version 1.0.
            duration = librosa.get_duration(path=file_path)
            if duration < min_duration:
                os.remove(file_path)
                count_deleted += 1
        except Exception as e:
            # Best effort: report the unreadable/undeletable file and keep going.
            print(f"Error processing {file_path}: {e}")
    print(f"Deleted {count_deleted} files shorter than {min_duration} seconds in {dataset_path}")
    return count_deleted
    
# Clips shorter than this many seconds are removed from every split/class folder.
MIN_DURATION = 1.5

base_dir = "F:\\Deepfake-Audio-Detector\\datasets\\for-dataset"
set_types = ["train", "test", "val"]

# Resolve every (split, class) folder first, then prune them one by one.
# NOTE: filter_and_delete_short_files deletes files from disk.
target_folders = [
    (set_type, class_name, os.path.join(base_dir, set_type, class_name))
    for set_type in set_types
    for class_name in ["real", "fake"]
]

for set_type, class_name, class_folder in target_folders:
    filter_and_delete_short_files(class_folder, MIN_DURATION)
    # Count what is left in the folder after pruning.
    remaining_count = len(os.listdir(class_folder))
    print(f"Number of files available in {set_type}\\{class_name}: {remaining_count}")

	This alias will be removed in version 1.0.
  duration = librosa.get_duration(filename=file_path)


Deleted 908 files shorter than 1.5 seconds in F:\Deepfake-Audio-Detector\datasets\for-dataset\train\real
Number of files available in train\real: 49222
Deleted 12527 files shorter than 1.5 seconds in F:\Deepfake-Audio-Detector\datasets\for-dataset\train\fake
Number of files available in train\fake: 20449
Deleted 28 files shorter than 1.5 seconds in F:\Deepfake-Audio-Detector\datasets\for-dataset\test\real
Number of files available in test\real: 6740
Deleted 406 files shorter than 1.5 seconds in F:\Deepfake-Audio-Detector\datasets\for-dataset\test\fake
Number of files available in test\fake: 2372
Deleted 20 files shorter than 1.5 seconds in F:\Deepfake-Audio-Detector\datasets\for-dataset\val\real
Number of files available in val\real: 3354
Deleted 183 files shorter than 1.5 seconds in F:\Deepfake-Audio-Detector\datasets\for-dataset\val\fake
Number of files available in val\fake: 1216
