Este notebook toma los audios de wav/, los procesa y guarda los espectrogramas finales en spect/.

In [1]:
import os, sys
import random
import warnings

import librosa as lbrs
import numpy as np
import pandas as pd
from PIL import Image

sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
from utils import util

In [2]:
def load_spectrograms(df, source_dir, spectrogram_dir, output_csv_path, sr=32000, segment_sec=5.0,
                      threshold_factor=0.5, n_mels=224, n_fft=2048, hop_length=512):
    '''Cuts each audio file into fixed-length segments, saves one mel-spectrogram PNG per
    sufficiently loud segment, and writes a CSV mapping every saved PNG to its class ID.

    Each file is split into consecutive, non-overlapping segments of `segment_sec` seconds
    (a trailing remainder shorter than one segment is dropped). A segment is kept only if its
    mean RMS is at least `threshold_factor` times the mean RMS of the whole file. Kept segments
    are converted to a dB-scaled mel spectrogram, min-max normalised to 0..255 and saved as an
    8-bit grayscale image named `<original name without extension>_<start sample>.png`.

    The image has `n_mels` rows and one column per STFT frame, so it is NOT square with the
    defaults: a 5 s segment at 32 kHz gives a 313 x 224 (width x height) image. No vertical flip
    is applied, so array row 0 (the first mel band) is the top row of the image.

    Args:
        df (pd.DataFrame): DataFrame containing at least 'filename' and 'class_id' columns.
        source_dir (str): Directory where the original audio files are located.
        spectrogram_dir (str): Directory to save generated spectrogram .png files. Created if missing.
        output_csv_path (str): Path to save the output CSV mapping 'filename' to 'class_id'.
            Its parent directory is created if missing.
        sr (int, optional): Target sampling rate for audio loading. Defaults to 32000.
        segment_sec (float, optional): Duration (in seconds) of each extracted segment. Defaults to 5.0.
        threshold_factor (float, optional): Factor to multiply global RMS mean to set segment inclusion threshold. Defaults to 0.5.
        n_mels (int, optional): Number of mel bands, i.e. the image height. Defaults to 224.
        n_fft (int, optional): FFT window size for the mel spectrogram. Defaults to 2048.
        hop_length (int, optional): Hop between STFT frames for the mel spectrogram. Defaults to 512.

    Raises:
        ValueError: If `sr * segment_sec` does not yield at least one sample per segment.
    '''

    samples_per_segment = int(sr * segment_sec)
    if samples_per_segment <= 0:
        raise ValueError(
            f"sr * segment_sec must give at least one sample per segment, got {samples_per_segment}"
        )

    # Make sure both output locations exist so a fresh checkout does not fail on the first save.
    if spectrogram_dir:
        os.makedirs(spectrogram_dir, exist_ok=True)
    csv_dir = os.path.dirname(output_csv_path)
    if csv_dir:
        os.makedirs(csv_dir, exist_ok=True)

    rows = []

    for _, row in df.iterrows():
        filename = row['filename']
        class_id = row['class_id']
        audio_path = os.path.join(source_dir, filename)

        try:
            y, _ = lbrs.load(audio_path, sr=sr, mono=True)
        except Exception as exc:
            # Best effort: an unreadable file is skipped, but reported instead of vanishing silently.
            # `Exception` (not a bare except) lets KeyboardInterrupt stop a long run.
            warnings.warn(f"Skipping '{audio_path}': could not load audio ({exc})")
            continue

        # Files shorter than one segment produce no samples; skip the RMS work entirely.
        if len(y) < samples_per_segment:
            continue

        rms = lbrs.feature.rms(y=y, frame_length=2048, hop_length=512)[0]
        threshold = threshold_factor * np.mean(rms)
        stem = os.path.splitext(filename)[0]

        for start in range(0, len(y) - samples_per_segment + 1, samples_per_segment):
            segment = y[start:start + samples_per_segment]
            seg_rms = np.mean(lbrs.feature.rms(y=segment)[0])
            if seg_rms < threshold:
                continue

            spec = lbrs.feature.melspectrogram(
                y=segment,
                sr=sr,
                n_mels=n_mels,
                hop_length=hop_length,
                n_fft=n_fft
            )
            spec_db = lbrs.power_to_db(spec, ref=np.max)

            # Min-max normalise to [0, 1]. A constant spectrogram (e.g. digital silence) has zero
            # range; map it to all zeros instead of dividing by zero and casting NaN to uint8.
            db_min = spec_db.min()
            db_range = spec_db.max() - db_min
            if db_range > 0:
                norm_spec = (spec_db - db_min) / db_range
            else:
                norm_spec = np.zeros_like(spec_db)
            img = (norm_spec * 255).astype(np.uint8)

            spec_filename = f"{stem}_{start}.png"
            spec_path = os.path.join(spectrogram_dir, spec_filename)
            # 'filename' may contain sub-directories; mirror them under spectrogram_dir.
            spec_parent = os.path.dirname(spec_path)
            if spec_parent:
                os.makedirs(spec_parent, exist_ok=True)
            Image.fromarray(img).save(spec_path)

            rows.append({'filename': spec_filename, 'class_id': class_id})

    # Explicit columns keep the header even when no segment was kept, so the CSV stays readable.
    pd.DataFrame(rows, columns=['filename', 'class_id']).to_csv(output_csv_path, index=False)

In [3]:
# Input/output locations, relative to the parent of the current working directory
audios_dir = os.path.join('..', 'database', 'audio', 'dev')
spect_dir = os.path.join('..', 'database', 'spect')
output_csv = os.path.join('..', 'database', 'meta', 'final_spects.csv')
final_data = pd.read_csv(os.path.join('..', 'database', 'meta', 'final_data.csv'))

# Keep only the rows whose audio file actually exists in dev/, to avoid load errors later
dev_df = final_data.loc[
    final_data['filename'].map(lambda name: os.path.exists(os.path.join(audios_dir, name)))
].reset_index(drop=True)

In [4]:
# Reset the output directory first (presumably util.clean_dir empties it - confirm in utils)
print("Resetting spectrogram directory...")
util.clean_dir(spect_dir)

# Segment every dev audio file, save the spectrogram PNGs and write the filename -> class_id CSV
print("Generating spectrograms from audio segments...")
load_spectrograms(
    df=dev_df,
    source_dir=audios_dir,
    spectrogram_dir=spect_dir,
    output_csv_path=output_csv,
)

Resetting spectrogram directory...
Generating spectrograms from audio segments...


Acá verifico los tamaños de las imágenes generadas.

In [5]:
# Upper bound on how many spectrograms to inspect
amount = 50

# List all PNG files in spect_dir
spect_files = [f for f in os.listdir(spect_dir) if f.endswith('.png')]

# Draw up to `amount` random images (all of them if fewer exist)
random_files = random.sample(spect_files, min(amount, len(spect_files)))

for fname in random_files:
    img_path = os.path.join(spect_dir, fname)
    # The context manager closes the file handle as soon as the size has been read,
    # instead of leaving one open handle per inspected image.
    with Image.open(img_path) as img:
        # PIL reports size as (width, height)
        print(f"{fname}: {img.size}")

XC591189_640000.png: (313, 224)
XC355657_320000.png: (313, 224)
XC584552_0.png: (313, 224)
XC299042_480000.png: (313, 224)
XC1710_320000.png: (313, 224)
XC510944_640000.png: (313, 224)
XC244497_2880000.png: (313, 224)
XC540445_1120000.png: (313, 224)
XC506163_800000.png: (313, 224)
XC592835_160000.png: (313, 224)
XC29073_800000.png: (313, 224)
XC394535_0.png: (313, 224)
XC503247_1120000.png: (313, 224)
XC415207_0.png: (313, 224)
XC358288_160000.png: (313, 224)
XC143491_1440000.png: (313, 224)
XC537786_1120000.png: (313, 224)
XC204203_960000.png: (313, 224)
XC390938_960000.png: (313, 224)
XC391504_640000.png: (313, 224)
XC444441_160000.png: (313, 224)
XC51010_0.png: (313, 224)
XC288940_1280000.png: (313, 224)
XC390834_960000.png: (313, 224)
XC510944_320000.png: (313, 224)
XC508606_1440000.png: (313, 224)
XC512739_480000.png: (313, 224)
XC546406_800000.png: (313, 224)
XC299042_640000.png: (313, 224)
XC351272_480000.png: (313, 224)
XC506234_1120000.png: (313, 224)
XC355657_1920000.png: (3