Este notebook toma los audios en wav/, los procesa y guarda los espectrogramas finales en spect/.

In [None]:
import os, sys
import pandas as pd
import librosa as lbrs
import numpy as np
import noisereduce as nr
import soundfile as sf
from PIL import Image


sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
from utils import util

In [None]:
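# OLD! Legacy implementation, superseded by the load_spectrograms defined in the next cell.
# Kept commented out for reference only; nothing in this cell is executed.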
# def load_spectrograms(df, source_dir, spectrogram_dir, output_csv_path, test_audios_dir=None, noise_reduce=True,
#                     sr=32000, segment_sec=5.0, threshold_factor=0.5, mels=224, hoplen=512, nfft=2048):
#     '''Gets the audio files from the source directory, processes them to create spectrograms, and saves 
#     the spectrograms as images in the specified directory. It also creates a CSV file denoting all the 
#     final spectrogram samples and their corresponding class IDs after generating new samples from segments.

#     Args:
#         df (pd.DataFrame): DataFrame containing at least 'filename' and 'class_id' columns.
#         source_dir (str): Directory where the original audio (.ogg) files are located.
#         spectrogram_dir (str): Directory to save generated spectrogram .png files.
#         output_csv_path (str): Path to save the output CSV mapping 'filename' to 'class_id'.
#         test_audios_dir (str, optional): Directory to save first 10 segmented audio samples as .wav files for inspection. Defaults to None.
#         noise_reduce (bool, optional): Whether to apply noise reduction to the audio files. Defaults to False.
#         sr (int, optional): Target sampling rate for audio loading. Defaults to 32000.
#         segment_sec (float, optional): Duration (in seconds) of each extracted segment. Defaults to 5.0.
#         threshold_factor (float, optional): Factor to multiply global RMS mean to set segment inclusion threshold. Defaults to 0.5.
#         mels (int, optional): Number of mel bands for the spectrogram. Defaults to 224.
#         hoplen (int, optional): Hop length for the spectrogram. Defaults to 512.
#         nfft (int, optional): FFT window size for the spectrogram. Defaults to 2048.
#     '''

    # samples_per_segment = int(sr * segment_sec)
    # rows = []
    # saved_test_audios = 0  # Counter for test audio samples
    # low_rms = 0  # Counter for segments with low RMS

    # for _, row in df.iterrows():
    #     filename = row['filename']
    #     class_id = row['class_id']
    #     audio_path = os.path.join(source_dir, filename)

    #     try:
    #         y, srate = util.lbrs_loading(audio_path, sr=sr, mono=True)
    #     except:
    #         print(f"Error loading audio file {filename} from {source_dir}. Skipping.")
    #         continue

    #     threshold = util.get_rmsThreshold(y, frame_len=2048, hop_len=hoplen, thresh_factor=0.75)
    #     segments_per_audio = 0  # Counter for segments per audio file

    #     for start in range(0, len(y) - samples_per_segment + 1, samples_per_segment):
    #         if segments_per_audio >= 5:  # Limit to max 5 segments per audio
    #             break
                
    #         segment = y[start:start + samples_per_segment]

    #         seg_rms = np.mean(lbrs.feature.rms(y=segment)[0])
    #         if seg_rms < threshold:
    #             low_rms += 1
    #             continue

#             if noise_reduce:
#                 segment = util.reduce_noise_seg(segment, srate=srate, filename=filename, class_id=class_id)

#             img, spec_path, spec_name = util.get_spec_image(segment, sr=srate, mels=mels, hoplen=hoplen, nfft=nfft,
#                                                             filename=filename, start=start, spectrogram_dir=spectrogram_dir)
#             Image.fromarray(img).save(spec_path)

#             # Save first 10 segmented audios as test samples
#             util.save_test_audios(segment, sr, test_audios_dir, filename, start, saved_test_audios)
#             saved_test_audios += 1

#             rows.append({'filename': spec_name, 'class_id': class_id})
#             segments_per_audio += 1

#     print(f"Total segments removed due to low RMS: {low_rms}")
#     pd.DataFrame(rows).to_csv(output_csv_path, index=False)

In [None]:
def load_spectrograms(segments_df, segments_dir, spectrogram_dir, output_csv_path,
                    test_audios_dir=None, sr=32000, mels=224, hoplen=512, nfft=2048,
                    cap_per_class=100, segment_sec=5.0, threshold_factor=0.75, noise_reduce=True):
    '''
    Creates spectrogram images from audio files with a per-class cap.

    All audio files are loaded first. Then the audios are visited window by window:
    window 0 of every audio, then window 1 of every audio, and so on, so that no single
    recording can fill a class quota on its own. A window is skipped when its mean RMS
    is below the per-audio threshold returned by ``util.get_rmsThreshold``. An audio
    leaves the rotation once its class reached ``cap_per_class`` or it has no complete
    window left.

    Side effects: creates ``spectrogram_dir`` (and ``test_audios_dir`` when given),
    writes one .png per accepted window, optionally writes the first 10 accepted
    windows as .wav files, writes the CSV at ``output_csv_path`` and prints progress.

    Args:
        segments_df (pd.DataFrame): DataFrame with 'filename' and 'class_id' for audio files.
        segments_dir (str): Directory containing the audio files listed in ``segments_df``.
        spectrogram_dir (str): Directory to save generated spectrogram .png files.
        output_csv_path (str): Path to save the output CSV mapping filename to class_id.
        test_audios_dir (str, optional): Directory where the first 10 accepted segments
            are written as .wav for manual inspection. Defaults to None (nothing written).
        sr (int): Target sampling rate. Defaults to 32000.
        mels (int): Number of mel bands. Defaults to 224.
        hoplen (int): Hop length for spectrogram. Defaults to 512.
        nfft (int): FFT window size. Defaults to 2048.
        cap_per_class (int): Maximum spectrograms per class_id. Defaults to 100.
        segment_sec (float): Duration of each segment in seconds. Defaults to 5.0.
        threshold_factor (float): RMS threshold factor. Defaults to 0.75.
        noise_reduce (bool): Whether to apply noise reduction. Defaults to True.

    Returns:
        pd.DataFrame: One row per saved spectrogram with columns 'filename' and
        'class_id'. The columns are present even when no spectrogram was produced.
    '''
    # Fixed schema so an empty run still yields a CSV with a header that pd.read_csv can parse.
    output_columns = ['filename', 'class_id']
    max_test_audios = 10   # how many accepted segments are dumped as .wav for inspection
    verbose_windows = 4    # only the first windows are logged in detail to limit output

    def create_spectrogram_from_segment(audio_segment, class_id, original_filename, segment_index, sr_actual):
        '''
        Converts one audio segment to a spectrogram image and saves it.

        Returns:
            tuple or None: (spec_name, segment_filename, processed_audio) on success,
            None if any step raised (the error is printed, not propagated).
        '''
        try:
            # splitext keeps dots that belong to the name itself ("a.b.wav" -> "a.b"),
            # whereas split('.')[0] would collapse distinct files onto the same stem.
            stem = os.path.splitext(original_filename)[0]
            segment_filename = f"{stem}_seg{segment_index:03d}.wav"

            # Apply noise reduction if requested
            if noise_reduce:
                audio_segment = util.reduce_noise_seg(audio_segment, sr=sr_actual,
                                                    filename=original_filename, class_id=class_id)

            # Create spectrogram image; the output path and name are decided by the util helper
            img, spec_path, spec_name = util.get_spec_image(audio_segment, sr=sr_actual, mels=mels,
                                                            hoplen=hoplen, nfft=nfft,
                                                            filename=segment_filename, start=0,
                                                            spectrogram_dir=spectrogram_dir)
            Image.fromarray(img).save(spec_path)

            return spec_name, segment_filename, audio_segment

        except Exception as e:
            print(f"Error creating spectrogram for segment {segment_index} of {original_filename}: {e}")
            return None

    # Initialize output locations
    os.makedirs(spectrogram_dir, exist_ok=True)
    if test_audios_dir:
        # Without this, writing the inspection .wav files fails when the folder is missing.
        os.makedirs(test_audios_dir, exist_ok=True)

    samples_per_segment = int(sr * segment_sec)

    # Load all audio files and calculate thresholds
    active_audios = []
    class_counts = {}
    spectrogram_records = []
    saved_test_audios = 0

    print("Loading audio files and calculating thresholds...")
    for _, row in segments_df.iterrows():
        filename = row['filename']
        class_id = row['class_id']
        audio_path = os.path.join(segments_dir, filename)

        try:
            y, srate = util.lbrs_loading(audio_path, sr=sr, mono=True)
            threshold = util.get_rmsThreshold(y, frame_len=2048, hop_len=hoplen, thresh_factor=threshold_factor)
            max_segments = len(y) // samples_per_segment
        except Exception as e:
            print(f"Error loading {filename}: {e}")
            continue

        # Audios shorter than one full window contribute nothing and are not tracked.
        if max_segments > 0:
            active_audios.append({
                'audio_data': y,
                'class_id': class_id,
                'filename': filename,
                'max_segments': max_segments,
                'threshold': threshold,
                'sr': srate
            })
            class_counts.setdefault(class_id, 0)

    print(f"Loaded {len(active_audios)} audio files successfully.")
    print(f"Creating spectrograms with cap of {cap_per_class} per class...")

    # Cycle through time windows; every pass visits the same window index in each active audio.
    # Terminates because segment_index grows each pass and every audio has finite max_segments.
    segment_index = 0

    while active_audios:
        verbose = segment_index < verbose_windows
        if verbose:
            print(f"Processing segment window {segment_index} ({segment_index * segment_sec}s - {(segment_index + 1) * segment_sec}s)")

        audios_to_remove = []

        for i, audio_info in enumerate(active_audios):
            class_id = audio_info['class_id']

            # Drop the audio when its class is full or it has no complete window left.
            if class_counts[class_id] >= cap_per_class or segment_index >= audio_info['max_segments']:
                audios_to_remove.append(i)
                continue

            # Extract segment
            start_sample = segment_index * samples_per_segment
            segment = audio_info['audio_data'][start_sample:start_sample + samples_per_segment]

            # Skip quiet windows: mean RMS below this audio's threshold
            seg_rms = np.mean(lbrs.feature.rms(y=segment)[0])
            if seg_rms < audio_info['threshold']:
                continue

            result = create_spectrogram_from_segment(
                segment, class_id, audio_info['filename'], segment_index, audio_info['sr']
            )
            if result is None:
                continue
            spec_name, segment_filename, processed_audio = result

            # Save test audio if requested (the processed, possibly noise-reduced, segment)
            if test_audios_dir and saved_test_audios < max_test_audios:
                temp_segment_path = os.path.join(test_audios_dir, segment_filename)
                sf.write(temp_segment_path, processed_audio, audio_info['sr'])
                saved_test_audios += 1

            spectrogram_records.append({'filename': spec_name, 'class_id': class_id})
            class_counts[class_id] += 1
            if verbose:
                print(f"Created spectrogram for {audio_info['filename']} seg{segment_index} (Class {class_id}: {class_counts[class_id]}/{cap_per_class})")

        # Indices were collected in ascending order; pop from the end so earlier ones stay valid.
        for i in reversed(audios_to_remove):
            removed_audio = active_audios.pop(i)
            r_class = removed_audio['class_id']
            reason = f"cap reached for {r_class}" if class_counts[r_class] >= cap_per_class else "audio exhausted"
            print(f"Popped {removed_audio['filename']} from processing ({reason})")

        segment_index += 1

    # Save the final CSV
    final_df = pd.DataFrame(spectrogram_records, columns=output_columns)
    final_df.to_csv(output_csv_path, index=False)

    print("\nSpectrogram generation complete!")
    print(f"Total spectrograms created: {len(final_df)}")
    print("Spectrograms per class:")
    for class_id, count in class_counts.items():
        print(f"  Class {class_id}: {count}")
    print(f"Output saved to: {output_csv_path}")

    return final_df

In [4]:
# Define paths. Everything hangs off ../database, relative to the notebook's working directory.
database_dir = os.path.join('..', 'database')
meta_dir = os.path.join(database_dir, 'meta')

# Input: dev audio files and the metadata table describing them
audios_dir = os.path.join(database_dir, 'audio', 'dev')
dev_data = pd.read_csv(os.path.join(meta_dir, 'dev_data.csv'))

# Output: spectrogram image folders
spect_dir = os.path.join(database_dir, 'spect')
spect2_dir = os.path.join(database_dir, 'spect2')

# Output: CSV files listing the generated spectrograms
output_csv = os.path.join(meta_dir, 'final_spects.csv')
output_csv2 = os.path.join(meta_dir, 'final_spects2.csv')

# Output: sample segments saved as audio for manual inspection
test_audios_dir = os.path.join(database_dir, 'test_audios')
test_audios2_dir = os.path.join(database_dir, 'test_audios2')

# Intended to hold only the rows whose audio is in dev/ (to avoid load errors).
# NOTE(review): no filtering is applied here; dev_df is the same object as dev_data.
dev_df = dev_data

In [5]:
# Reset both output folders before generating.
# NOTE(review): util.clean_dir presumably empties/recreates the folder - confirm in utils.
for output_dir in (spect_dir, test_audios_dir):
    util.clean_dir(output_dir)

print("Generating spectrograms from audio segments...")
load_spectrograms(
    segments_df=dev_df,
    segments_dir=audios_dir,
    spectrogram_dir=spect_dir,
    output_csv_path=output_csv,
    test_audios_dir=test_audios_dir,
    noise_reduce=True,
)

Resetting ..\database\spect directory...
Resetting ..\database\test_audios directory...
Generating spectrograms from audio segments...
Total segments removed due to low RMS: 419


Acá veo los tamaños de las imágenes generadas

In [None]:
import random
amount = 20  # number of spectrograms to inspect

# List all PNG files in spect_dir
spect_files = [f for f in os.listdir(spect_dir) if f.endswith('.png')]

# Pick up to `amount` random images (fewer if the folder holds less than that)
random_files = random.sample(spect_files, min(amount, len(spect_files)))

for fname in random_files:
    img_path = os.path.join(spect_dir, fname)
    # Context manager closes the file handle right away instead of leaving it
    # open until garbage collection.
    with Image.open(img_path) as img:
        print(f"{fname}: {img.size}")  # PIL reports size as (width, height)

XC616305_160000.png: (313, 224)
XC214240_0.png: (313, 224)
XC390971_320000.png: (313, 224)
XC48747_160000.png: (313, 224)
XC288823_960000.png: (313, 224)
XC394268_0.png: (313, 224)
XC396786_320000.png: (313, 224)
XC173759_640000.png: (313, 224)
XC112712_0.png: (313, 224)
XC283328_160000.png: (313, 224)
XC503247_480000.png: (313, 224)
XC476264_0.png: (313, 224)
XC492619_320000.png: (313, 224)
XC48762_1120000.png: (313, 224)
XC51046_320000.png: (313, 224)
XC139503_160000.png: (313, 224)
XC204203_640000.png: (313, 224)
XC361108_800000.png: (313, 224)
XC32322_0.png: (313, 224)
XC51744_1280000.png: (313, 224)
