Este notebook toma los audios de audio/dev/, los procesa y guarda los espectrogramas finales en spect/ (y en spect2/ la versión con reducción de ruido).

In [None]:
import os, sys
import pandas as pd
import librosa as lbrs
import numpy as np
import noisereduce as nr
from PIL import Image


sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
from utils import util

In [None]:
def _segment_to_image(segment, sr, mels, hoplen, nfft):
    '''Turns one audio segment into an 8-bit grayscale mel-spectrogram array.

    The mel power spectrogram is converted to dB relative to its own peak and
    then min-max scaled to the 0-255 range.

    Args:
        segment (np.ndarray): Mono audio samples of the segment.
        sr (int): Sampling rate of the segment.
        mels (int): Number of mel bands.
        hoplen (int): Hop length for the spectrogram.
        nfft (int): FFT window size for the spectrogram.

    Returns:
        np.ndarray: uint8 array with one row per mel band and one column per frame.
    '''
    spec = lbrs.feature.melspectrogram(
        y=segment,
        sr=sr,
        n_mels=mels,
        hop_length=hoplen,
        n_fft=nfft
    )
    spec_db = lbrs.power_to_db(spec, ref=np.max)

    lowest = spec_db.min()
    span = spec_db.max() - lowest
    if span > 0:
        norm_spec = (spec_db - lowest) / span
    else:
        # A constant spectrogram (e.g. digital silence) would divide by zero and
        # fill the image with NaN-derived garbage; emit a black image instead.
        norm_spec = np.zeros_like(spec_db)
    return (norm_spec * 255).astype(np.uint8)


def load_spectrograms(df, source_dir, spectrogram_dir, output_csv_path, test_audios_dir=None, noise_reduce=False,
                    sr=32000, segment_sec=5.0, threshold_factor=0.5, mels=224, hoplen=512, nfft=2048):
    '''Gets the audio files from the source directory, processes them to create spectrograms, and saves
    the spectrograms as images in the specified directory. It also creates a CSV file denoting all the
    final spectrogram samples and their corresponding class IDs after generating new samples from segments.

    Each recording is cut into consecutive, non-overlapping segments of `segment_sec` seconds; trailing
    audio shorter than one full segment is dropped. A segment is kept only when its mean RMS is at least
    `threshold_factor` times the mean RMS of the whole recording. Files that fail to load are reported
    on stdout and skipped.

    Args:
        df (pd.DataFrame): DataFrame containing at least 'filename' and 'class_id' columns.
        source_dir (str): Directory where the original audio (.ogg) files are located.
        spectrogram_dir (str): Directory to save generated spectrogram .png files. Created if missing.
        output_csv_path (str): Path to save the output CSV mapping 'filename' to 'class_id'.
        test_audios_dir (str, optional): Directory to save first 10 segmented audio samples as .wav files for inspection. Defaults to None.
        noise_reduce (bool, optional): Whether to apply (non-stationary) noise reduction to each kept segment. Defaults to False.
        sr (int, optional): Target sampling rate for audio loading. Defaults to 32000.
        segment_sec (float, optional): Duration (in seconds) of each extracted segment. Defaults to 5.0.
        threshold_factor (float, optional): Factor to multiply global RMS mean to set segment inclusion threshold. Defaults to 0.5.
        mels (int, optional): Number of mel bands for the spectrogram. Defaults to 224.
        hoplen (int, optional): Hop length for the spectrogram. Defaults to 512.
        nfft (int, optional): FFT window size for the spectrogram. Defaults to 2048.

    Returns:
        None. Results are written to `spectrogram_dir`, `output_csv_path` and, optionally, `test_audios_dir`.
    '''

    max_test_audios = 10  # How many segments are exported as .wav for listening checks
    samples_per_segment = int(sr * segment_sec)
    rows = []
    saved_test_audios = 0  # Counter for test audio samples

    os.makedirs(spectrogram_dir, exist_ok=True)

    # Resolve the optional .wav export once instead of on every segment.
    sf = None
    if test_audios_dir is not None:
        import soundfile as sf
        os.makedirs(test_audios_dir, exist_ok=True)

    for _, row in df.iterrows():
        filename = row['filename']
        class_id = row['class_id']
        audio_path = os.path.join(source_dir, filename)

        try:
            y, srate = lbrs.load(audio_path, sr=sr, mono=True)
        except Exception as e:
            # `Exception` (not a bare except) so Ctrl-C can still stop a long run.
            print(f"Error loading audio file {filename} from {source_dir}: {e}. Skipping.")
            continue

        stem = os.path.splitext(filename)[0]

        # Loudness reference for the whole recording; quieter segments are discarded.
        rms = lbrs.feature.rms(y=y, frame_length=2048, hop_length=hoplen)[0]
        threshold = threshold_factor * np.mean(rms)

        for start in range(0, len(y) - samples_per_segment + 1, samples_per_segment):
            segment = y[start:start + samples_per_segment]
            seg_rms = np.mean(lbrs.feature.rms(y=segment)[0])
            if seg_rms < threshold:
                continue

            if noise_reduce:
                try:
                    segment = nr.reduce_noise(y=segment, sr=srate, stationary=False)
                except RuntimeWarning as e:
                    # Only reached when warnings are escalated to errors (e.g.
                    # warnings.simplefilter('error')); the un-denoised segment is used then.
                    print(f"RuntimeWarning while reducing noise for {filename} from {class_id}: {e}")

            # Create spectrogram
            img = _segment_to_image(segment, sr, mels, hoplen, nfft)

            # The start sample offset keeps names unique within one recording.
            spec_filename = f"{stem}_{start}.png"
            spec_path = os.path.join(spectrogram_dir, spec_filename)
            Image.fromarray(img).save(spec_path)

            # Save first 10 segmented audios as test samples
            if sf is not None and saved_test_audios < max_test_audios:
                test_audio_filename = f"{stem}_{start}_test.wav"
                test_audio_path = os.path.join(test_audios_dir, test_audio_filename)
                sf.write(test_audio_path, segment, sr)
                saved_test_audios += 1

            rows.append({'filename': spec_filename, 'class_id': class_id})

    # Explicit columns keep the CSV header even when no segment was produced.
    pd.DataFrame(rows, columns=['filename', 'class_id']).to_csv(output_csv_path, index=False)

In [None]:
# Paths: every input and output of this notebook lives under ../database
database_dir = os.path.join('..', 'database')
meta_dir = os.path.join(database_dir, 'meta')

# Source recordings and the metadata table that lists them
audios_dir = os.path.join(database_dir, 'audio', 'dev')
final_data = pd.read_csv(os.path.join(meta_dir, 'final_data.csv'))

# Spectrogram image folders, one per processing variant
spect_dir = os.path.join(database_dir, 'spect')
spect2_dir = os.path.join(database_dir, 'spect2')

# CSV indexes of the generated spectrograms, one per variant
output_csv = os.path.join(meta_dir, 'final_spects.csv')
output_csv2 = os.path.join(meta_dir, 'final_spects2.csv')

# Folders for the audio segments exported for manual listening
test_audios_dir = os.path.join(database_dir, 'test_audios')
test_audios2_dir = os.path.join(database_dir, 'test_audios2')

# Keep only the rows whose audio file is present in dev/, so loading does not hit errors
has_audio = final_data['filename'].apply(
    lambda name: os.path.exists(os.path.join(audios_dir, name))
)
dev_df = final_data[has_audio].reset_index(drop=True)

In [None]:
# Reset both output folders before regenerating
# (presumably clean_dir empties/creates them - confirm in utils.util)
for out_dir in (spect_dir, test_audios_dir):
    util.clean_dir(out_dir)

print("Generating spectrograms from audio segments...")
# Baseline run: default parameters, no noise reduction
load_spectrograms(
    df=dev_df,
    source_dir=audios_dir,
    spectrogram_dir=spect_dir,
    output_csv_path=output_csv,
    test_audios_dir=test_audios_dir,
)

In [None]:
# Reset the output folders of the noise-reduced variant before regenerating
# (presumably clean_dir empties/creates them - confirm in utils.util)
for out_dir in (spect2_dir, test_audios2_dir):
    util.clean_dir(out_dir)

print("Generating spectrograms from audio segments...")
# Second run: same recordings, with noise reduction applied to every kept segment
load_spectrograms(
    df=dev_df,
    source_dir=audios_dir,
    spectrogram_dir=spect2_dir,
    output_csv_path=output_csv2,
    test_audios_dir=test_audios2_dir,
    noise_reduce=True,
)

Acá reviso los tamaños de los espectrogramas generados.

In [5]:
import random

amount = 50  # Maximum number of spectrograms to inspect

# List all PNG files in spect_dir
spect_files = [f for f in os.listdir(spect_dir) if f.endswith('.png')]

# Draw up to `amount` random images (fewer when the folder holds less than that)
random_files = random.sample(spect_files, min(amount, len(spect_files)))

for fname in random_files:
    img_path = os.path.join(spect_dir, fname)
    # Image.open is lazy and keeps the file handle open; the context manager
    # releases it right after the size is read.
    with Image.open(img_path) as img:
        # PIL reports size as (width, height)
        print(f"{fname}: {img.size}")

XC556170_320000.png: (313, 224)
XC475609_640000.png: (313, 224)
XC391056_320000.png: (313, 224)
XC603277_640000.png: (313, 224)
XC396871_320000.png: (313, 224)
XC558884_2400000.png: (313, 224)
XC390971_2880000.png: (313, 224)
XC340516_160000.png: (313, 224)
XC139495_1920000.png: (313, 224)
XC453998_320000.png: (313, 224)
XC51046_480000.png: (313, 224)
XC48760_320000.png: (313, 224)
XC583621_480000.png: (313, 224)
XC609188_800000.png: (313, 224)
XC287701_5600000.png: (313, 224)
XC32484_0.png: (313, 224)
XC32469_960000.png: (313, 224)
XC112712_960000.png: (313, 224)
XC470185_320000.png: (313, 224)
XC288940_960000.png: (313, 224)
XC364127_160000.png: (313, 224)
XC51698_160000.png: (313, 224)
XC48730_320000.png: (313, 224)
XC112733_480000.png: (313, 224)
XC441550_1120000.png: (313, 224)
XC288940_1760000.png: (313, 224)
XC267908_1280000.png: (313, 224)
XC439642_320000.png: (313, 224)
XC287701_320000.png: (313, 224)
XC154387_960000.png: (313, 224)
XC615550_800000.png: (313, 224)
XC1710_16000