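Este notebook toma los audios de `database/audio/dev/`, los segmenta y procesa, y guarda los espectrogramas finales en `database/spect/` (sin reducción de ruido) y `database/spect2/` (con reducción de ruido).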

In [None]:
import os, sys
import random

import librosa as lbrs
import noisereduce as nr
import numpy as np
import pandas as pd
from PIL import Image


sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
from utils import util

In [None]:
def load_spectrograms(df, source_dir, spectrogram_dir, output_csv_path, test_audios_dir=None, noise_reduce=False,
                    sr=32000, segment_sec=5.0, threshold_factor=0.5, mels=224, hoplen=512, nfft=2048,
                    max_test_audios=10):
    '''Cuts each audio file into fixed-length segments, turns the loud-enough segments into
    spectrogram images, and writes a CSV mapping every generated image to its class ID.

    For every row of ``df`` the audio is loaded through ``util.lbrs_loading``, an RMS threshold is
    obtained from ``util.get_rmsThreshold``, and the signal is walked in non-overlapping windows of
    ``segment_sec`` seconds (a trailing partial window is dropped). Segments whose mean RMS is below
    the threshold are skipped and counted. Files that fail to load are reported and skipped.

    Args:
        df (pd.DataFrame): DataFrame containing at least 'filename' and 'class_id' columns.
        source_dir (str): Directory where the original audio (.ogg) files are located.
        spectrogram_dir (str): Directory to save generated spectrogram .png files.
        output_csv_path (str): Path to save the output CSV mapping 'filename' to 'class_id'.
        test_audios_dir (str, optional): Directory to save the first segmented audio samples for
            inspection. When None, no test audio is saved. Defaults to None.
        noise_reduce (bool, optional): Whether to apply noise reduction to each kept segment. Defaults to False.
        sr (int, optional): Target sampling rate for audio loading. Defaults to 32000.
        segment_sec (float, optional): Duration (in seconds) of each extracted segment. Defaults to 5.0.
        threshold_factor (float, optional): Forwarded to ``util.get_rmsThreshold`` as ``thresh_factor``
            to set the segment inclusion threshold. Defaults to 0.5.
        mels (int, optional): Number of mel bands for the spectrogram. Defaults to 224.
        hoplen (int, optional): Hop length for the spectrogram and the RMS framing. Defaults to 512.
        nfft (int, optional): FFT window size for the spectrogram. Defaults to 2048.
        max_test_audios (int, optional): Maximum number of segments handed to
            ``util.save_test_audios``. Defaults to 10.

    Returns:
        None. Results are written to ``spectrogram_dir`` and ``output_csv_path``.

    Raises:
        ValueError: If ``sr * segment_sec`` yields a non-positive segment length.
    '''

    rms_frame_len = 2048  # frame size shared by the threshold and the per-segment RMS

    samples_per_segment = int(sr * segment_sec)
    if samples_per_segment <= 0:
        raise ValueError(
            f"segment_sec={segment_sec} at sr={sr} gives a non-positive segment length."
        )

    rows = []
    saved_test_audios = 0  # Counter for test audio samples
    low_rms = 0  # Counter for segments with low RMS

    for _, row in df.iterrows():
        filename = row['filename']
        class_id = row['class_id']
        audio_path = os.path.join(source_dir, filename)

        try:
            y, srate = util.lbrs_loading(audio_path, sr=sr, mono=True)
        except Exception as exc:
            # Best-effort: one unreadable file must not abort the whole batch, but the reason
            # is reported and KeyboardInterrupt/SystemExit still propagate.
            print(f"Error loading audio file {filename} from {source_dir}. Skipping. ({exc})")
            continue

        # Honour the caller-supplied factor (it used to be hard-coded to 0.5 here).
        threshold = util.get_rmsThreshold(y, frame_len=rms_frame_len, hop_len=hoplen,
                                          thresh_factor=threshold_factor)

        for start in range(0, len(y) - samples_per_segment + 1, samples_per_segment):
            segment = y[start:start + samples_per_segment]

            # Same framing as the threshold so both RMS values are comparable.
            seg_rms = np.mean(
                lbrs.feature.rms(y=segment, frame_length=rms_frame_len, hop_length=hoplen)[0]
            )
            if seg_rms < threshold:
                low_rms += 1
                continue

            if noise_reduce:
                segment = util.reduce_noise_seg(segment, sr=srate, filename=filename, class_id=class_id)

            img, spec_path, spec_name = util.get_spec_image(segment, sr=srate, mels=mels, hoplen=hoplen, nfft=nfft,
                                                            filename=filename, start=start, spectrogram_dir=spectrogram_dir)
            Image.fromarray(img).save(spec_path)

            # Save only the first `max_test_audios` kept segments, and only when a target
            # directory was given. Whether the helper limits itself is not visible from this
            # notebook, so the cap is enforced here.
            if test_audios_dir is not None and saved_test_audios < max_test_audios:
                util.save_test_audios(segment, srate, test_audios_dir, filename, start)
                saved_test_audios += 1

            rows.append({'filename': spec_name, 'class_id': class_id})

    print(f"Total segments removed due to low RMS: {low_rms}")
    # Explicit columns keep the CSV header (and readability via pd.read_csv) even with no rows.
    pd.DataFrame(rows, columns=['filename', 'class_id']).to_csv(output_csv_path, index=False)

In [3]:
# Path configuration: every location used by this notebook lives under ../database
def _db_path(*parts):
    """Join the given components onto the ../database root."""
    return os.path.join('..', 'database', *parts)


# Inputs: the dev audio folder and the metadata table listing the recordings
audios_dir = _db_path('audio', 'dev')
final_data = pd.read_csv(_db_path('meta', 'final_data.csv'))

# Outputs: one spectrogram folder per run (plain / noise-reduced)
spect_dir = _db_path('spect')
spect2_dir = _db_path('spect2')

# Outputs: one spectrogram-to-class CSV per run
output_csv = _db_path('meta', 'final_spects.csv')
output_csv2 = _db_path('meta', 'final_spects2.csv')

# Outputs: sample audio segments kept for manual inspection
test_audios_dir = _db_path('test_audios')
test_audios2_dir = _db_path('test_audios2')


def _audio_exists(name):
    """Tell whether the named recording is actually present in audios_dir."""
    return os.path.exists(os.path.join(audios_dir, name))


# Keep only the rows whose file exists in dev/, so later loading does not hit missing files
is_present = final_data['filename'].apply(_audio_exists)
dev_df = final_data[is_present].reset_index(drop=True)

In [4]:
# Reset both output directories before regenerating the plain (no noise reduction) set
for out_dir in (spect_dir, test_audios_dir):
    util.clean_dir(out_dir)

print("Generating spectrograms from audio segments...")
load_spectrograms(
    df=dev_df,
    source_dir=audios_dir,
    spectrogram_dir=spect_dir,
    output_csv_path=output_csv,
    test_audios_dir=test_audios_dir,
)

Resetting ..\database\spect directory...
Resetting ..\database\test_audios directory...
Generating spectrograms from audio segments...


In [5]:
# Reset both output directories before regenerating the noise-reduced set
for out_dir in (spect2_dir, test_audios2_dir):
    util.clean_dir(out_dir)

print("Generating spectrograms from audio segments...")
load_spectrograms(
    df=dev_df,
    source_dir=audios_dir,
    spectrogram_dir=spect2_dir,
    output_csv_path=output_csv2,
    test_audios_dir=test_audios2_dir,
    noise_reduce=True,
)

Resetting ..\database\spect2 directory...
Resetting ..\database\test_audios2 directory...
Generating spectrograms from audio segments...


Acá verifico los tamaños de una muestra aleatoria de los espectrogramas generados.

In [4]:
# Spot-check: print the pixel size of a random sample of the generated spectrograms.
amount = 20        # how many images to inspect
SAMPLE_SEED = 42   # fixed seed so a fresh Restart & Run All shows the same sample

# List all PNG files in spect_dir (sorted, because os.listdir order is arbitrary)
spect_files = sorted(f for f in os.listdir(spect_dir) if f.endswith('.png'))

# Draw up to `amount` random images (fewer if the folder holds fewer)
random_files = random.Random(SAMPLE_SEED).sample(spect_files, min(amount, len(spect_files)))

for fname in random_files:
    img_path = os.path.join(spect_dir, fname)
    # The context manager closes the underlying file handle after reading the size
    with Image.open(img_path) as img:
        print(f"{fname}: {img.size}")

XC507275_1920000.png: (313, 224)
XC394358_640000.png: (313, 224)
XC38756_480000.png: (313, 224)
XC361122_160000.png: (313, 224)
XC287701_3680000.png: (313, 224)
XC379150_480000.png: (313, 224)
XC52001_320000.png: (313, 224)
XC112713_1440000.png: (313, 224)
XC32499_0.png: (313, 224)
XC366345_2880000.png: (313, 224)
XC144347_0.png: (313, 224)
XC14581_160000.png: (313, 224)
XC590623_1920000.png: (313, 224)
XC211770_320000.png: (313, 224)
XC615549_800000.png: (313, 224)
XC139495_3040000.png: (313, 224)
XC504476_640000.png: (313, 224)
XC586408_480000.png: (313, 224)
XC215722_0.png: (313, 224)
XC368365_1600000.png: (313, 224)
