In [18]:
# Imports
import os
import random
import logging


import numpy as np
import pandas as pd
from tqdm import tqdm
from multiprocessing import Pool
import multiprocessing

# Data processing and scientific computing
from scipy.io import wavfile
from scipy.signal import butter, lfilter
from scipy.spatial.distance import cosine

# Audio processing
import librosa
import soundfile as sf

# Visualization
import matplotlib.pyplot as plt
import plotly.express as px

# Set up logging
logging.basicConfig(level=logging.INFO)

### Data Cleansing
    - Remove low-quality or noisy recordings based on metadata if available, or by manually inspecting a few samples.
	    - It's a good idea to manually inspect some of the longer clips, especially outliers.
	    - - Listen for: a) Consistency of the bird call throughout the clip b) Presence of long periods of silence c) Sudden changes in background noise or environment
		- If you find issues, you might consider trimming these clips to the most relevant sections before segmenting.
		- - Definitely inspect a sample of low-rated files.
		- Listen for: a) Clarity of the bird call b) Signal-to-noise ratio (how clear the bird call is compared to background noise) c) Presence of distortions or artifacts
		- Consider setting a threshold for the quality rating, below which you might exclude files from your dataset.
		- Determining if a file is too "noisy":

		- This can be subjective, but here are some approaches: a) Signal-to-Noise Ratio (SNR): Calculate the SNR for each file. Files below a certain threshold could be considered too noisy. b) Spectral analysis: Look at the spectrogram. A very noisy file will have a lot of energy spread across all frequencies. c) Perceptual evaluation: Listen to a sample and rate them yourself. This can help you calibrate your automatic methods.

In [19]:
data = pd.read_csv('ohio_bird_recordings_metadata.csv')

In [20]:
data['quality'].value_counts()

quality
A           267
B           261
C           147
D            48
no score     16
E             6
Name: count, dtype: int64

In [21]:
data = data[~((data['common_name'] == 'Identity unknown') & (data['species'] == 'mystery'))]

In [22]:

low_quality_files = data[(data['quality'] == 'E') | (data['quality'] == 'no score')]
display(low_quality_files)

Unnamed: 0,id,genus,species,common_name,latitude,longitude,type,date,time,length,quality,remarks,sex,stage,also,file_name,local_file
177,678953,Vireo,bellii,Bell's Vireo,39.2095,-84.7821,song,2015-06-02,17:40,0:03,E,Originally recorded as a video and uploaded to...,male,adult,,Vireo_bellii_Whitewater_Township_near__Harriso...,Original Recordings\Vireo_bellii_Whitewater_To...
192,376462,Baeolophus,bicolor,Tufted Titmouse,40.942,-81.5236,call,2017-06-21,07:00,0:15,no score,Odd call from a Tufted Titmouse,,,Corvus brachyrhynchos,Baeolophus_bicolor_Ohio_near__Akron_Summit_Cou...,Original Recordings\Baeolophus_bicolor_Ohio_ne...
221,475639,Tachycineta,bicolor,Tree Swallow,41.9682,-82.5305,call,2019-05-17,5:29,0:02,no score,,,,,Tachycineta_bicolor_Pelee_near__Leamington_Ess...,Original Recordings\Tachycineta_bicolor_Pelee_...
358,17172,Spizella,passerina,Chipping Sparrow,41.93338,-83.54994,song,2007-07-16,?,0:28,E,Two birds singing,,,Zenaida macroura,Spizella_passerina_Michigan_Monroe_County_1717...,Original Recordings\Spizella_passerina_Michiga...
359,554398,Spizella,pusilla,Field Sparrow,39.8888,-82.7978,song,2020-05-03,15:30,0:20,no score,"Habitat was mostly open field / wetlands, with...",uncertain,uncertain,,Spizella_pusilla_Madison_Township_near__Canal_...,Original Recordings\Spizella_pusilla_Madison_T...
389,17143,Melospiza,melodia,Song Sparrow,41.93338,-83.54994,song,2007-07-18,?,0:34,E,Traffic noise in background.,,,"Quiscalus quiscula, Passer domesticus",Melospiza_melodia_Michigan_Monroe_County_17143...,Original Recordings\Melospiza_melodia_Michigan...
553,374227,Geothlypis,trichas,Common Yellowthroat,41.1895,-81.5781,song,2017-06-07,07:30,0:30,E,,,,"Melanerpes carolinus, Contopus virens, Baeolop...",Geothlypis_trichas_Ohio_near__Peninsula_Summit...,Original Recordings\Geothlypis_trichas_Ohio_ne...
676,560021,Sonus,naturalis,Soundscape,41.433,-81.418,song,2020-05-20,14:45,0:28,no score,,uncertain,uncertain,,Sonus_naturalis_Chagrin_Falls_Township_near__M...,Original Recordings\Sonus_naturalis_Chagrin_Fa...


### Convert mp3 to wav

In [26]:

def convert_mp3_to_wav(mp3_path, wav_path):
    """
    Convert an MP3 file to WAV format using librosa and soundfile.
    
    Args:
    mp3_path (str): Path to the input MP3 file
    wav_path (str): Path to save the output WAV file
    
    Returns:
    str: Path to the created WAV file
    """
    # Load the mp3 file
    audio, sr = librosa.load(mp3_path, sr=None, mono=False)
    
    # Save as wav
    sf.write(wav_path, audio.T, sr)
    
    return wav_path

def batch_convert_to_wav(data, input_dir, output_dir):
    """
    Convert all MP3 files in the dataset to WAV format.
    
    Args:
    data (pd.DataFrame): DataFrame containing file information
    input_dir (str): Directory containing the input MP3 files
    output_dir (str): Directory to save the output WAV files
    
    Returns:
    pd.DataFrame: Updated DataFrame with new file paths
    """
    os.makedirs(output_dir, exist_ok=True)
    
    new_data = data.copy()
    for index, row in new_data.iterrows():
        mp3_path = os.path.join(input_dir, row['file_name'])
        wav_filename = os.path.splitext(row['file_name'])[0] + '.wav'
        wav_path = os.path.join(output_dir, wav_filename)
        
        convert_mp3_to_wav(mp3_path, wav_path)
        new_data.at[index, 'file_name'] = wav_filename
    
    return new_data

In [25]:
data['file_name'].value_counts()

file_name
Branta_canadensis_Whitewater_Township_near__Harrison_Hamilton_County_Ohio_726750.mp3    1
Molothrus_ater_Pelee_near__Leamington_Essex_County_Ontario_476558.mp3                   1
Agelaius_phoeniceus_Ross_Township_near__Hamilton_Butler_County_Ohio_533370.mp3          1
Agelaius_phoeniceus_Crosby_Township_near__Harrison_Hamilton_County_Ohio_482675.mp3      1
Agelaius_phoeniceus_Case_Farm_Gates_Mills_Ohio_98722.mp3                                1
                                                                                       ..
Progne_subis_Magee_Marsh_Ohio_164751.mp3                                                1
Progne_subis_Magee_Marsh_Ohio_164748.mp3                                                1
Progne_subis_Whitewater_Township_near__Cleves_Hamilton_County_Ohio_817716.mp3           1
Hirundo_rustica_Green_Township_near__Cincinnati_Hamilton_County_Ohio_833706.mp3         1
Sonus_naturalis_Maumee_Bay_State_Park_Lucas_County_Ohio_821133.mp3                      1


In [27]:
original_dir = 'Original Recordings'
converted_dir = 'Converted Recordings'

# Convert the MP3 files to WAV
print("Converting MP3 files to WAV...")
converted_data = batch_convert_to_wav(data, original_dir, converted_dir)

# Print summary
print(f"Conversion complete. {len(converted_data)} files converted.")
print(f"WAV files saved in: {converted_dir}")


Converting MP3 files to WAV...


  audio, sr = librosa.load(mp3_path, sr=None, mono=False)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


FileNotFoundError: [Errno 2] No such file or directory: 'Original Recordings\\Colaptes_auratus_Miami_Township_near__North_Bend_Hamilton_County_Ohio_713588.mp3'

### Listen to the low quality and no score graded files.
1 - bad
2 - fine
3 - fine
4 - 

"""
Audio Cleaning Functions

These functions collectively clean an audio file by:
1. Calculating its signal-to-noise ratio (SNR) and filtering out audio that is too noisy.
2. Detecting and trimming long silences from the audio.
3. Checking for spectral spread, which is an indicator of unwanted noise or anomalies.

Main function:
- `clean_audio`: Uses `is_too_noisy`, `has_long_silence`, and `check_spectral_spread` to decide if an audio file is suitable for further processing.
"""

- **Feature Extraction with Librosa**:
    - Extract features like **Mel-spectrograms** and **MFCCs** from each audio file. These features are effective for audio classification tasks.
    - Store these features as images (for CNN input) or numerical arrays (for models like Random Forest or RNNs).

    - **Audio Standardization**:
    - Convert all files to a consistent format (e.g., 16-bit WAV, mono-channel, and a sampling rate like 16 kHz).
- **Clip Standardization**:
    - Trim or pad each audio clip to a standard duration (e.g., 5 seconds), so all inputs have the same shape.

In [10]:
def calculate_snr(audio):
    """Calculate the signal-to-noise ratio of an audio clip."""
    signal = np.mean(audio**2)
    noise = np.mean((audio - np.mean(audio))**2)
    snr = 10 * np.log10(signal / noise)
    return snr

def is_too_noisy(audio, sr, threshold=-20):
    """Check if audio is too noisy based on its SNR."""
    snr = calculate_snr(audio)
    return snr < threshold

def has_long_silence(audio, sr, silence_threshold=-60, min_silence_duration=1.0):
    """Detects long silences within the audio clip."""
    intervals = librosa.effects.split(audio, top_db=-silence_threshold)
    if len(intervals) > 1:
        silence_durations = np.diff(intervals.ravel())[1::2] / sr
        return np.any(silence_durations >= min_silence_duration)
    return False

def check_spectral_spread(audio, sr, threshold=0.8):
    """Check if the spectral spread exceeds the specified threshold."""
    spec = np.abs(librosa.stft(audio))
    spectral_spread = np.sum(spec > np.mean(spec)) / spec.size
    return spectral_spread > threshold

def clean_audio(audio, sr, file_path, shared_discarded_files):
    """Cleans an audio file by removing noise, silence, and checking for spectral spread."""
    # Get file name for logging
    file_name = os.path.basename(file_path)

    # Check noise level
    if is_too_noisy(audio, sr):
        shared_discarded_files.append({'file_path': file_path, 'reason': 'too_noisy', 'snr': calculate_snr(audio)})
        return None
    
    # Check for long silences
    if has_long_silence(audio, sr):
        audio = librosa.effects.trim(audio, top_db=20)[0]
    
    # Check spectral spread
    if check_spectral_spread(audio, sr):
        shared_discarded_files.append({'file_path': file_path, 'reason': 'bad_spectral_spread'})
        return None
    
    return audio

In [11]:
# Checking for duplicates or near-duplicates

def get_audio_fingerprint(audio, sr):
    mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=13)
    return np.mean(mfccs, axis=1)

def are_near_duplicates(audio1, sr1, audio2, sr2, threshold=0.95):
    if len(audio1) == 0 or len(audio2) == 0:
        raise ValueError("One or both audio files are empty")
    
    if sr1 != sr2:
        print(f"Warning: Sample rates differ ({sr1} vs {sr2}). Resampling may be necessary.")
    
    fp1 = get_audio_fingerprint(audio1, sr1)
    fp2 = get_audio_fingerprint(audio2, sr2)
    
    if len(fp1) != len(fp2):
        raise ValueError("Fingerprints have different lengths")
    
    similarity = 1 - cosine(fp1, fp2)
    return similarity > threshold


In [13]:
def process_audio(file_path, shared_duplicates, shared_discarded_files, target_length=5, overlap=0.5, target_sr=44100):
    try:
        # Load and clean the audio
        audio, sr = librosa.load(file_path, sr=None)
        audio = clean_audio(audio, sr, file_path, shared_discarded_files)
        if audio is None:
            return None, None
        
        # Check for duplicates before processing
        for existing_audio, existing_sr, existing_path in shared_duplicates:
            if are_near_duplicates(audio, sr, existing_audio, existing_sr):
                shared_discarded_files.append((file_path, 'duplicate'))
                return None, None

        # Resample if necessary
        if sr != target_sr:
            audio = librosa.resample(audio, sr, target_sr)
            sr = target_sr
            
        # Convert target_length to samples
        target_samples = sr * target_length
        
        # If audio is shorter than target length, pad with zeros
        if len(audio) < target_samples:
            audio = librosa.util.fix_length(audio, target_samples)
        
        # If audio is longer than target length, segment with overlap
        else:
            segments = []
            for start in range(0, len(audio), int(target_samples * (1 - overlap))):
                segment = audio[start:start + target_samples]
                if len(segment) == target_samples:
                    segments.append(segment)
                elif len(segment) > 0:  # Handle the last segment if it's shorter
                    segment = librosa.util.fix_length(segment, target_samples)
                    segments.append(segment)
            audio = np.array(segments)
        
        # Store the fingerprint of the processed audio to check against future files
        shared_duplicates.append((audio, sr, file_path))

        return audio, sr
    except Exception as e:
        logging.error(f"Error processing {file_path}: {str(e)}")
        shared_discarded_files.append((file_path, f'error: {str(e)}'))
        return None, None

In [14]:
def process_file(args):
    row, audio_dir, output_dir, shared_duplicates, shared_discarded_files = args
    file_path = os.path.join(audio_dir, row['file_name'])
    if not os.path.exists(file_path):
        logging.warning(f"File not found: {file_path}")
        return None
    
    processed_audio, sr = process_audio(file_path, shared_duplicates, shared_discarded_files)
    if processed_audio is None:
        return None
    
    processed_data = []
    if processed_audio.ndim == 2:
        for i, segment in enumerate(processed_audio):
            new_row = row.copy()
            base_filename = f"{os.path.splitext(row['file_name'])[0]}_segment_{i}"
            new_row['processed_file'] = f"{base_filename}.wav"
            wavfile.write(os.path.join(output_dir, new_row['processed_file']), sr, segment)
            processed_data.append(new_row)
    else:
        base_filename = f"{os.path.splitext(row['file_name'])[0]}_processed"
        row['processed_file'] = f"{base_filename}.wav"
        wavfile.write(os.path.join(output_dir, row['processed_file']), sr, processed_audio)
        processed_data.append(row)
    
    return processed_data

In [15]:
def process_dataset(converted_data, audio_dir, output_dir, n_processes=4):
    manager = multiprocessing.Manager()
    shared_duplicates = manager.list()
    shared_discarded_files = manager.list()

    with Pool(n_processes) as p:
        results = list(tqdm(p.imap(
            process_file, 
            [(row, audio_dir, output_dir, shared_duplicates, shared_discarded_files) 
             for _, row in converted_data.iterrows()]), 
            total=len(converted_data)))
    
    processed_data = [item for sublist in results if sublist is not None for item in sublist]

    # Save discarded files to a DataFrame and export as CSV
    discard_log_df = pd.DataFrame(list(shared_discarded_files), columns=['file_path', 'reason'])
    discard_log_df.to_csv('discarded_audio_log.csv', index=False)

    return pd.DataFrame(processed_data)

In [None]:
# Main execution
processed_dir = 'Processed Recordings'
processed_data = process_dataset(converted_data, converted_dir, processed_dir)
print('Audio Processing Complete')

# Print completion message and count files in Processed Recordings directory
processed_file_count = len([f for f in os.listdir(processed_dir) if f.endswith('.wav')])
print(f"\nAudio processing is complete. There are now {processed_file_count} files in the '{processed_dir}' directory.")

# Output discarded files
discard_log_df = pd.read_csv('discarded_audio_log.csv')
if not discard_log_df.empty:
    print("\nThe following files were discarded:")
    for _, row in discard_log_df.iterrows():
        print(f"{row['file_path']}: {row['reason']}")
else:
    print("\nNo files were discarded.")

In [None]:
print(processed_data.info())
processed_data.head()

### Data Augmentation
### Augment the processed audio files.
- Pitch Shift
- Time Stretch
- add_noise
- change_speed
- apply_filter
- add_background
- time_shift
- augment_audio
- 

In [None]:

def generate_wind_sound(duration, sr):
    t = np.linspace(0, duration, int(sr * duration), False)
    wind = np.random.normal(0, 0.1, int(sr * duration))
    wind_filtered = np.convolve(wind, np.ones(1000)/1000, mode='same')
    return wind_filtered / np.max(np.abs(wind_filtered))

def generate_leaf_rustle(duration, sr):
    t = np.linspace(0, duration, int(sr * duration), False)
    leaf = np.random.normal(0, 0.1, int(sr * duration))
    envelope = np.exp(-t * 10) * np.sin(2 * np.pi * 2 * t)**2
    return leaf * envelope / np.max(np.abs(leaf * envelope))

def generate_water_sound(duration, sr):
    t = np.linspace(0, duration, int(sr * duration), False)
    water = np.random.normal(0, 0.1, int(sr * duration))
    water_filtered = np.convolve(water, np.ones(500)/500, mode='same')
    ripple = np.sin(2 * np.pi * 2 * t) * np.exp(-t * 2)
    return (water_filtered + ripple) / np.max(np.abs(water_filtered + ripple))

def mix_nature_sounds(duration, sr):
    wind = generate_wind_sound(duration, sr)
    leaf = generate_leaf_rustle(duration, sr)
    water = generate_water_sound(duration, sr)
    
    mix = wind * 0.7 + leaf * 0.2 + water * 0.1
    return mix / np.max(np.abs(mix))

# Generate a 5-second mix of nature-like sounds
sr = 44100
duration = 5
nature_background = mix_nature_sounds(duration, sr)

In [None]:

def pitch_shift(audio, sr, n_steps):
    return librosa.effects.pitch_shift(audio, sr=sr, n_steps=n_steps)

def time_stretch(audio, rate):
    return librosa.effects.time_stretch(audio, rate=rate)

def add_noise(audio, noise_factor):
    noise = np.random.randn(len(audio))
    augmented_audio = audio + noise_factor * noise
    return np.clip(augmented_audio, -1, 1)

def change_speed(audio, speed_factor):
    return librosa.effects.time_stretch(audio, rate=1/speed_factor)

def apply_filter(audio, sr, filter_type='lowpass', cutoff=1000):
    nyquist = 0.5 * sr
    normal_cutoff = cutoff / nyquist
    b, a = butter(4, normal_cutoff, btype=filter_type, analog=False)
    return lfilter(b, a, audio)

def add_background(audio, background, ratio=0.1):
    if len(background) > len(audio):
        start = np.random.randint(0, len(background) - len(audio))
        background = background[start:start+len(audio)]
    else:
        background = np.pad(background, (0, max(0, len(audio) - len(background))))
    return audio + ratio * background

def time_shift(audio, shift_max, roll_prob=0.5):
    shift = np.random.randint(-shift_max, shift_max)
    if random.random() < roll_prob:
        return np.roll(audio, shift)
    else:
        if shift > 0:
            return np.pad(audio, (shift, 0))[:len(audio)]
        else:
            return np.pad(audio, (0, -shift))[:-shift]

def augment_audio(audio, sr):
    augmentations = [
        (pitch_shift, {'n_steps': random.uniform(-2, 2)}),
        (time_stretch, {'rate': random.uniform(0.8, 1.2)}),
        (add_noise, {'noise_factor': random.uniform(0.001, 0.015)}),
        (change_speed, {'speed_factor': random.uniform(0.9, 1.1)}),
        (apply_filter, {'filter_type': random.choice(['lowpass', 'highpass']),
                        'cutoff': random.uniform(1000, 4000)}),
        (time_shift, {'shift_max': int(sr * 0.5)})
    ]
    
    # Randomly select 2-4 augmentations
    num_augments = random.randint(2, 4)
    selected_augments = random.sample(augmentations, num_augments)
    
    applied_augmentations = []
    
    # Apply selected augmentations
    for augment_func, params in selected_augments:
        audio = augment_func(audio, sr, **params)
        applied_augmentations.append(f"{augment_func.__name__}:{','.join(f'{k}={v}' for k, v in params.items())}")
    
    # Add synthesized nature background
    if random.random() < 0.5:
        nature_background = mix_nature_sounds(len(audio) / sr, sr)
        ratio = random.uniform(0.1, 0.3)
        audio = add_background(audio, nature_background, ratio=ratio)
        applied_augmentations.append(f"add_background:ratio={ratio:.2f}")
    
    return audio, applied_augmentations



In [None]:

def augment_and_save(input_file, output_dir, num_augmentations=3):
    try:
        # Load the audio file
        audio, sr = librosa.load(input_file, sr=None)
        
        augmented_files = []
        all_applied_augmentations = []
        
        for i in range(num_augmentations):
            # Apply augmentation
            augmented_audio, applied_augmentations = augment_audio(audio, sr)
            
            # Generate new filename
            base_name = os.path.basename(input_file)
            name, ext = os.path.splitext(base_name)
            new_name = f"{name}_aug_{i+1}{ext}"
            output_path = os.path.join(output_dir, new_name)
            
            # Save augmented audio
            sf.write(output_path, augmented_audio, sr)
            
            augmented_files.append(output_path)
            all_applied_augmentations.append(';'.join(applied_augmentations))
        
        return augmented_files, all_applied_augmentations
    
    except Exception as e:
        print(f"Error processing {input_file}: {str(e)}")
        return [], []


def process_dataframe(df, input_dir, output_dir, num_augmentations=3):
    new_rows = []
    
    for _, row in tqdm(df.iterrows(), total=len(df), desc="Processing files"):
        input_file = os.path.join(input_dir, row['file_path'])
        augmented_files, augmentations = augment_and_save(input_file, output_dir, num_augmentations)
        
        for aug_file, aug_details in zip(augmented_files, augmentations):
            new_row = row.copy()
            new_row['file_path'] = os.path.relpath(aug_file, output_dir)
            new_row['augmentations'] = aug_details
            new_rows.append(new_row)
    
    augmented_df = pd.concat([df, pd.DataFrame(new_rows)], ignore_index=True)
    return augmented_df

# Usage
input_dir = 'Processed Recordings'
output_dir = 'Augmented Recordings'
num_augmentations = 3


# Process the dataframe
augmented_data = process_dataframe(processed_data, input_dir, output_dir, num_augmentations)

print(f"Augmentation complete. {len(augmented_data) - len(processed_data)} new samples created.")

### Extract features from the processed and augmented audio files.

In [None]:
# Drop original recordings from dataset for feature extraction
def filter_dataset(df, keep_dirs=['Processed Recordings', 'Augmented Recordings']):
    # Convert keep_dirs to a set for faster lookup
    keep_dirs_set = set(keep_dirs)
    
    # Function to check if a file path is in one of the keep_dirs
    def is_keep_dir(file_path):
        return any(dir_name in file_path for dir_name in keep_dirs_set)
    
    # Filter the dataframe
    filtered_df = df[df['file_path'].apply(lambda x: is_keep_dir(x))]
    
    # Reset the index of the filtered dataframe
    filtered_df = filtered_df.reset_index(drop=True)
    
    # Print some information about the filtering process
    print(f"Original dataset size: {len(df)}")
    print(f"Filtered dataset size: {len(filtered_df)}")
    print(f"Removed {len(df) - len(filtered_df)} entries")
    
    return filtered_df

# Usage
filtered_data = filter_dataset(augmented_data)

In [None]:

def extract_features(audio, sr):
    # Mel-spectrogram
    mel_spec = librosa.feature.melspectrogram(y=audio, sr=sr, n_mels=128)
    mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
    
    # MFCCs
    mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=13)
    
    # Spectral Centroid
    spectral_centroids = librosa.feature.spectral_centroid(y=audio, sr=sr)[0]
    
    # Chroma Features
    chroma = librosa.feature.chroma_stft(y=audio, sr=sr)
    
    # Zero Crossing Rate
    zero_crossing_rate = librosa.feature.zero_crossing_rate(audio)[0]
    
    # Spectral Rolloff
    spectral_rolloff = librosa.feature.spectral_rolloff(y=audio, sr=sr)[0]
    
    return {
        'mel_spectrogram_db': mel_spec_db,
        'mfccs': mfccs,
        'spectral_centroids': spectral_centroids,
        'chroma': chroma,
        'zero_crossing_rate': zero_crossing_rate,
        'spectral_rolloff': spectral_rolloff
    }

def summarize_feature(feature):
    if feature.ndim == 1:
        return [np.mean(feature), np.std(feature), np.max(feature)]
    elif feature.ndim == 2:
        return np.hstack([
            np.mean(feature, axis=1),
            np.std(feature, axis=1),
            np.max(feature, axis=1)
        ])
    
def save_mel_spectrogram(mel_spec, output_dir, base_filename, sr):
    plt.figure(figsize=(10, 4))
    librosa.display.specshow(mel_spec, sr=sr, x_axis='time', y_axis='mel')
    plt.colorbar(format='%+2.0f dB')
    plt.title('Mel-spectrogram')
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, f"{base_filename}_mel_spectrogram.png"))
    plt.close()
    

In [None]:
def process_audio_file(file_path, output_dir):
    try:
        audio, _ = librosa.load(file_path, sr=sr)
        features = extract_features(audio, sr)
        
        # Summarize features
        feature_summary = {}
        for key, value in features.items():
            if key != 'mel_spectrogram':
                feature_summary[f"{key}_summary"] = summarize_feature(value)
        
        # Create feature vector
        feature_vector = np.hstack([
            feature_summary.get('mfccs_summary', np.array([])),
            feature_summary.get('spectral_centroids_summary', np.array([])),
            feature_summary.get('chroma_summary', np.array([])),
            feature_summary.get('zero_crossing_rate_summary', np.array([])),
            feature_summary.get('spectral_rolloff_summary', np.array([]))
            ])
        
        # Save mel-spectrogram as image
        base_filename = os.path.splitext(os.path.basename(file_path))[0]
        save_mel_spectrogram(features['mel_spectrogram'], output_dir, base_filename)
        
        return feature_vector, features
    except Exception as e:
        print(f"Error processing {file_path}: {str(e)}")
        return None, None

# Main processing loop
def process_audio_files(filtered_data, output_dir):
    feature_data = []
    
    for _, row in tqdm(filtered_data.iterrows(), total=len(filtered_data)):
        file_path = row['file_name']
        feature_vector, full_features = process_audio_file(file_path, output_dir)
        
        if feature_vector is not None and full_features is not None:
            feature_dict = {
                'file_name': file_path,
                'feature_vector': feature_vector,
            }
            
            # Add full feature arrays
            for key, value in full_features.items():
                feature_dict[f"{key}_full"] = value
            
            feature_data.append(feature_dict)
    
    return feature_data

# Main execution
output_dir = 'mel-spectrograms'
final_data = process_audio_files(filtered_data, output_dir)

# Convert to DataFrame
df = pd.DataFrame([{'file_name': item['file_name'], 'feature_vector': item['feature_vector']} for item in final_data])

# Save DataFrame to CSV
df.to_csv('final_data.csv', index=False)

print("Processing complete. Summary data saved to 'final_data.csv'.")