In [3]:
import os
import librosa
import numpy as np

# Root folder holding one subfolder of .wav snippets per dataset split.
# Later cells reuse `base_dir` and `subfolders`, so both stay module-level.
base_dir = "raw_data"

# Dataset splits to summarize
subfolders = ["test", "train", "valid"]

# Report duration statistics (in seconds) for each split in turn
for subfolder in subfolders:
    # Folder holding the audio snippets of the current split
    audio_dir = os.path.join(base_dir, subfolder)

    # Duration of every .wav file found directly inside this split's folder
    durations = []

    for filename in os.listdir(audio_dir):
        if filename.endswith(".wav"):
            filepath = os.path.join(audio_dir, filename)
            # sr=None keeps the file's native sampling rate instead of
            # resampling to librosa's default rate.
            # NOTE(review): decoding the whole signal only to measure its
            # length is costly; newer librosa releases can read the duration
            # from the file path directly - confirm the installed version
            # before switching.
            y, sr = librosa.load(filepath, sr=None)

            duration = librosa.get_duration(y=y, sr=sr)
            durations.append(duration)

    # An empty split would make np.mean/np.median/np.std return nan (with a
    # RuntimeWarning) and np.min/np.max raise ValueError, which would abort
    # the remaining splits. Report the situation explicitly and move on.
    if not durations:
        print(f"Descriptive statistics for {subfolder}:")
        print("No .wav files found; statistics skipped")
        print()  # Keep the blank separator line between splits
        continue

    # Descriptive statistics; np.std is the population standard deviation
    # (NumPy's default ddof=0)
    mean_duration = np.mean(durations)
    median_duration = np.median(durations)
    std_duration = np.std(durations)
    min_duration = np.min(durations)
    max_duration = np.max(durations)

    # Print the descriptive statistics for the current split
    print(f"Descriptive statistics for {subfolder}:")
    print(f"Mean duration: {mean_duration:.2f} seconds")
    print(f"Median duration: {median_duration:.2f} seconds")
    print(f"Standard deviation: {std_duration:.2f} seconds")
    print(f"Minimum duration: {min_duration:.2f} seconds")
    print(f"Maximum duration: {max_duration:.2f} seconds")
    print()  # Blank line separating the splits in the output


Descriptive statistics for test:
Mean duration: 3.89 seconds
Median duration: 3.26 seconds
Standard deviation: 2.17 seconds
Minimum duration: 0.92 seconds
Maximum duration: 12.70 seconds

Descriptive statistics for train:
Mean duration: 4.03 seconds
Median duration: 3.43 seconds
Standard deviation: 3.35 seconds
Minimum duration: 0.52 seconds
Maximum duration: 39.60 seconds

Descriptive statistics for valid:
Mean duration: 3.90 seconds
Median duration: 3.56 seconds
Standard deviation: 2.19 seconds
Minimum duration: 0.72 seconds
Maximum duration: 14.34 seconds



In [4]:
# Tally of .wav files per split, keyed by subfolder name
file_counts = {}

for subfolder in subfolders:
    # Folder holding the audio snippets of the current split
    audio_dir = os.path.join(base_dir, subfolder)

    # Booleans sum as 0/1, so this counts the entries whose name ends in ".wav"
    file_count = sum(name.endswith(".wav") for name in os.listdir(audio_dir))
    file_counts[subfolder] = file_count

# Grand total across all splits
total_files = 0
for count in file_counts.values():
    total_files += count

# Report the per-split tallies followed by the grand total
print("Number of files in each subfolder:")
for subfolder, count in file_counts.items():
    print(f"{subfolder}: {count} files")
print(f"Total number of files: {total_files} files")


Number of files in each subfolder:
test: 164 files
train: 770 files
valid: 162 files
Total number of files: 1096 files
