In [1]:
# Imports
import os
import random
import logging


import numpy as np
import pandas as pd
from tqdm import tqdm
from multiprocessing import Pool
import multiprocessing

# Data processing and scientific computing
from scipy.io import wavfile
from scipy.signal import butter, lfilter
from scipy.spatial.distance import cosine

# Audio processing
import librosa
import soundfile as sf

# Visualization
import matplotlib.pyplot as plt
import plotly.express as px

# Set up logging
logging.basicConfig(level=logging.INFO)

### Data Cleansing
    - Remove low-quality or noisy recordings based on metadata if available, or by manually inspecting a few samples.
	    - It's a good idea to manually inspect some of the longer clips, especially outliers.
	    - - Listen for: a) Consistency of the bird call throughout the clip b) Presence of long periods of silence c) Sudden changes in background noise or environment
		- If you find issues, you might consider trimming these clips to the most relevant sections before segmenting.
		- - Definitely inspect a sample of low-rated files.
		- Listen for: a) Clarity of the bird call b) Signal-to-noise ratio (how clear the bird call is compared to background noise) c) Presence of distortions or artifacts
		- Consider setting a threshold for the quality rating, below which you might exclude files from your dataset.
		- Determining if a file is too "noisy":

		- This can be subjective, but here are some approaches: a) Signal-to-Noise Ratio (SNR): Calculate the SNR for each file. Files below a certain threshold could be considered too noisy. b) Spectral analysis: Look at the spectrogram. A very noisy file will have a lot of energy spread across all frequencies. c) Perceptual evaluation: Listen to a sample and rate them yourself. This can help you calibrate your automatic methods.

In [2]:
data = pd.read_csv('preprocessed_data.csv')

In [3]:
data.columns

Index(['genus', 'species', 'latitude', 'longitude', 'type', 'quality',
       'file_name', 'simplified_type', 'season', 'time_of_day',
       'length_seconds'],
      dtype='object')

### The metadata contains a column with a 'quality' rating. I'll manually examine those entries.

In [4]:
data['quality'].value_counts()

quality
A           274
B           262
C           116
D            32
E             4
no score      4
Name: count, dtype: int64

In [5]:

low_quality_files = data[(data['quality'] == 'E') | (data['quality'] == 'no score')]
display(low_quality_files)

Unnamed: 0,genus,species,latitude,longitude,type,quality,file_name,simplified_type,season,time_of_day,length_seconds
182,Vireo,bellii,39.2095,-84.7821,song,E,Vireo_bellii_Whitewater_Township_near__Harriso...,Song,Summer,Evening,3
197,Baeolophus,bicolor,40.942,-81.5236,call,no score,Baeolophus_bicolor_Ohio_near__Akron_Summit_Cou...,Call,Summer,Morning,15
227,Tachycineta,bicolor,41.9682,-82.5305,call,no score,Tachycineta_bicolor_Pelee_near__Leamington_Ess...,Call,Spring,Morning,2
364,Spizella,passerina,41.93338,-83.54994,song,E,Spizella_passerina_Michigan_Monroe_County_1717...,Song,Summer,Unknown,28
365,Spizella,pusilla,39.8888,-82.7978,song,no score,Spizella_pusilla_Madison_Township_near__Canal_...,Song,Spring,Afternoon,20
395,Melospiza,melodia,41.93338,-83.54994,song,E,Melospiza_melodia_Michigan_Monroe_County_17143...,Song,Summer,Unknown,34
563,Geothlypis,trichas,41.1895,-81.5781,song,E,Geothlypis_trichas_Ohio_near__Peninsula_Summit...,Song,Summer,Morning,30
687,Sonus,naturalis,41.433,-81.418,song,no score,Sonus_naturalis_Chagrin_Falls_Township_near__M...,Song,Spring,Afternoon,28


### Convert mp3 to wav

In [6]:
def convert_mp3_to_wav(mp3_path, wav_path):
    """
    Convert an MP3 file to WAV format using librosa and soundfile.
    
    Args:
    mp3_path (str): Path to the input MP3 file
    wav_path (str): Path to save the output WAV file
    
    Returns:
    str: Path to the created WAV file
    
    Raises:
    FileNotFoundError: If the input MP3 file is not found
    """
    if not os.path.exists(mp3_path):
        raise FileNotFoundError(f"MP3 file not found: {mp3_path}")
    
    # Load the mp3 file
    audio, sr = librosa.load(mp3_path, sr=None, mono=False)
    
    # Save as wav
    sf.write(wav_path, audio.T, sr)
    
    return wav_path

def batch_convert_to_wav(data, input_dir, output_dir):
    """
    Convert all MP3 files in the dataset to WAV format.
    
    Args:
    data (pd.DataFrame): DataFrame containing file information
    input_dir (str): Directory containing the input MP3 files
    output_dir (str): Directory to save the output WAV files
    
    Returns:
    tuple: (pd.DataFrame, list) Updated DataFrame with new file paths and list of files not found
    """
    os.makedirs(output_dir, exist_ok=True)
    
    new_data = data.copy()
    files_not_found = []
    
    for index, row in new_data.iterrows():
        mp3_path = os.path.join(input_dir, row['file_name'])
        wav_filename = os.path.splitext(row['file_name'])[0] + '.wav'
        wav_path = os.path.join(output_dir, wav_filename)
        
        try:
            convert_mp3_to_wav(mp3_path, wav_path)
            new_data.at[index, 'file_name'] = wav_filename
        except FileNotFoundError:
            files_not_found.append(row['file_name'])
            new_data = new_data.drop(index)
    
    return new_data.reset_index(drop=True), files_not_found

In [7]:
original_dir = 'Original Recordings'
converted_dir = 'Converted Recordings'

# Convert the MP3 files to WAV
print("Converting MP3 files to WAV...")
converted_data, missing_files = batch_convert_to_wav(data, original_dir, converted_dir)

# Print summary
print(f"Conversion complete. {len(converted_data)} files converted.")
print(f"WAV files saved in: {converted_dir}")
print(f"Number of files not found: {len(missing_files)}")

# If you want to examine the list of missing files
print("List of files not found:")
print(missing_files)

Converting MP3 files to WAV...
Conversion complete. 691 files converted.
WAV files saved in: Converted Recordings
Number of files not found: 1
List of files not found:
['Colaptes_auratus_Miami_Township_near__North_Bend_Hamilton_County_Ohio_713588.mp3']


"""
Audio Cleaning Functions

These functions collectively clean an audio file by:
1. Calculating its signal-to-noise ratio (SNR) and filtering out audio that is too noisy.
2. Detecting and trimming long silences from the audio.
3. Checking for spectral spread, which is an indicator of unwanted noise or anomalies.

Main function:
- `clean_audio`: Uses `is_too_noisy`, `has_long_silence`, and `check_spectral_spread` to decide if an audio file is suitable for further processing.
"""

- **Feature Extraction with Librosa**:
    - Extract features like **Mel-spectrograms** and **MFCCs** from each audio file. These features are effective for audio classification tasks.
    - Store these features as images (for CNN input) or numerical arrays (for models like Random Forest or RNNs).

    - **Audio Standardization**:
    - Convert all files to a consistent format (e.g., 16-bit WAV, mono-channel, and a sampling rate like 16 kHz).
- **Clip Standardization**:
    - Trim or pad each audio clip to a standard duration (e.g., 5 seconds), so all inputs have the same shape.

### Start with resampling so every file has the same sample rate

In [8]:

def resample_audio(file_path, target_sr=44100):
    try:
        # Load the audio file
        audio, sr = librosa.load(file_path, sr=None)
        
        # Resample if necessary
        if sr != target_sr:
            audio = librosa.resample(y=audio, orig_sr=sr, target_sr=target_sr)
            
            # Overwrite the original file
            sf.write(file_path, audio, target_sr)
            
            return True  # Indicate that resampling was performed
        else:
            return False  # Indicate that no resampling was needed
    except Exception as e:
        logging.error(f"Error processing {file_path}: {str(e)}")
        return None

def resample_all_files(directory='Converted Recordings', target_sr=44100):
    total_files = 0
    resampled_files = 0
    error_files = 0
    
    # Iterate through all files in the directory
    for filename in os.listdir(directory):
        if filename.endswith('.wav'):  # Assuming we're only processing .wav files
            total_files += 1
            file_path = os.path.join(directory, filename)
            result = resample_audio(file_path, target_sr)
            if result is True:
                resampled_files += 1
            elif result is None:
                error_files += 1
    
    print(f"Resampling process complete.")
    print(f"Total files checked: {total_files}")
    print(f"Files resampled: {resampled_files}")
    if error_files > 0:
        print(f"Files with errors: {error_files}")

# Usage
logging.basicConfig(level=logging.ERROR)  # Only log errors
resample_all_files('Converted Recordings')

Resampling process complete.
Total files checked: 690
Files resampled: 311


In [9]:
def calculate_snr(audio):
    """Calculate the signal-to-noise ratio of an audio clip."""
    signal = np.mean(audio**2)
    noise = np.mean((audio - np.mean(audio))**2)
    snr = 10 * np.log10(signal / noise)
    return snr

def is_too_noisy(audio, sr, threshold=-20):
    """Check if audio is too noisy based on its SNR."""
    snr = calculate_snr(audio)
    return snr < threshold

def has_long_silence(audio, sr, silence_threshold=-60, min_silence_duration=1.0):
    """Detects long silences within the audio clip."""
    intervals = librosa.effects.split(audio, top_db=-silence_threshold)
    if len(intervals) > 1:
        silence_durations = np.diff(intervals.ravel())[1::2] / sr
        return np.any(silence_durations >= min_silence_duration)
    return False

def check_spectral_spread(audio, sr, threshold=0.8):
    """Check if the spectral spread exceeds the specified threshold."""
    spec = np.abs(librosa.stft(audio))
    spectral_spread = np.sum(spec > np.mean(spec)) / spec.size
    return spectral_spread > threshold

def clean_audio(audio, sr, file_path, shared_discarded_files):
    """Cleans an audio file by removing noise, silence, and checking for spectral spread."""
    # Get file name for logging
    file_name = os.path.basename(file_path)

    # Check noise level
    if is_too_noisy(audio, sr):
        shared_discarded_files.append({'file_path': file_path, 'reason': 'too_noisy', 'snr': calculate_snr(audio)})
        return None
    
    # Check for long silences
    if has_long_silence(audio, sr):
        audio = librosa.effects.trim(audio, top_db=20)[0]
    
    # Check spectral spread
    if check_spectral_spread(audio, sr):
        shared_discarded_files.append({'file_path': file_path, 'reason': 'bad_spectral_spread'})
        return None
    
    return audio

### Testing the functions on a single file

In [10]:
converted_data.columns

Index(['genus', 'species', 'latitude', 'longitude', 'type', 'quality',
       'file_name', 'simplified_type', 'season', 'time_of_day',
       'length_seconds'],
      dtype='object')

In [11]:
# Define sample rate
sr = 44100

# Choose a random file
random_file = random.choice(converted_data['file_name'].tolist())
file_path = os.path.join('Converted Recordings', random_file)

# Load the audio file
audio, _ = librosa.load(file_path, sr=sr)

# Test calculate_snr function
snr = calculate_snr(audio)
print(f"calculate_snr completed. SNR: {snr}")

# Test is_too_noisy function
too_noisy = is_too_noisy(audio, sr)
print(f"is_too_noisy completed. Result: {too_noisy}")

# Test has_long_silence function
long_silence = has_long_silence(audio, sr)
print(f"has_long_silence completed. Result: {long_silence}")

# Test check_spectral_spread function
spectral_spread = check_spectral_spread(audio, sr)
print(f"check_spectral_spread completed. Result: {spectral_spread}")

# Test clean_audio function
shared_discarded_files = []
cleaned_audio = clean_audio(audio, sr, file_path, shared_discarded_files)
print(f"clean_audio completed. Cleaned audio returned: {'Yes' if cleaned_audio is not None else 'No'}")
if shared_discarded_files:
    print(f"File discarded. Reason: {shared_discarded_files[0]['reason']}")

print(f"\nTested file: {file_path}")

calculate_snr completed. SNR: -2.5885968923944347e-07
is_too_noisy completed. Result: False
has_long_silence completed. Result: False
check_spectral_spread completed. Result: False
clean_audio completed. Cleaned audio returned: Yes

Tested file: Converted Recordings\Antigone_canadensis_Troy_Township_near__Delaware_Delaware_County_Ohio_625913.wav


### Test duplication functions two files

In [12]:
def get_audio_fingerprint(audio, sr):
    n_fft = min(2048, len(audio))
    mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=13, n_fft=n_fft)
    return np.mean(mfccs, axis=1)

def are_near_duplicates(audio1, sr1, audio2, sr2, threshold=0.99):
    if len(audio1) == 0 or len(audio2) == 0:
        raise ValueError("One or both audio files are empty")
    
    if sr1 != sr2:
        print(f"Warning: Sample rates differ ({sr1} vs {sr2}). Resampling may be necessary.")
    
    fp1 = get_audio_fingerprint(audio1, sr1)
    fp2 = get_audio_fingerprint(audio2, sr2)
    
    if len(fp1) != len(fp2):
        raise ValueError("Fingerprints have different lengths")
    
    similarity = 1 - cosine(fp1, fp2)
    return similarity > threshold

def check_audio_duplicates(file_path, shared_duplicates, shared_discarded_files):
    try:
        # Load the audio
        audio, sr = librosa.load(file_path, sr=None)
        
        # Check for duplicates
        for existing_audio, existing_sr, existing_path in shared_duplicates:
            if are_near_duplicates(audio, sr, existing_audio, existing_sr):
                shared_discarded_files.append((file_path, 'duplicate'))
                return True  # It's a duplicate

        # If not a duplicate, store the audio and path
        shared_duplicates.append((audio, sr, file_path))
        return False  # It's not a duplicate
    except Exception as e:
        logging.error(f"Error checking duplicates for {file_path}: {str(e)}")
        shared_discarded_files.append((file_path, f'error: {str(e)}'))
        return None  # Error occurred

def check_and_remove_duplicates(directory='Converted Recordings'):
    shared_duplicates = []
    shared_discarded_files = []
    duplicate_files = []
    total_files = 0

    # Check each file in the directory for duplicates
    for filename in os.listdir(directory):
        if filename.endswith('.wav'):  # Assuming we're only processing .wav files
            total_files += 1
            file_path = os.path.join(directory, filename)
            is_duplicate = check_audio_duplicates(file_path, shared_duplicates, shared_discarded_files)
            if is_duplicate:
                duplicate_files.append(file_path)

    # Log the results
    print(f"Total files checked: {total_files}")
    print(f"Duplicate files found: {len(duplicate_files)}")

    return duplicate_files, shared_discarded_files

In [13]:
### Check duplicate checking functions on two files

# Define sample rate
sr = 44100

# Choose two random files
random_files = random.sample(converted_data['file_name'].tolist(), 2)
file_paths = [os.path.join('Converted Recordings', file) for file in random_files]

# Load the audio files
audio1, _ = librosa.load(file_paths[0], sr=sr)
audio2, _ = librosa.load(file_paths[1], sr=sr)

# Test get_audio_fingerprint function
print("Testing get_audio_fingerprint function:")
fingerprint1 = get_audio_fingerprint(audio1, sr)
print(f"get_audio_fingerprint completed for file 1.")
print(f"Fingerprint shape: {fingerprint1.shape}")
print(f"Fingerprint: {fingerprint1}")

fingerprint2 = get_audio_fingerprint(audio2, sr)
print(f"get_audio_fingerprint completed for file 2.")
print(f"Fingerprint shape: {fingerprint2.shape}")
print(f"Fingerprint: {fingerprint2}")

# Test are_near_duplicates function with different thresholds
print("\nTesting are_near_duplicates function:")
thresholds = [0.95, 0.99, 0.999]
for threshold in thresholds:
    try:
        similarity = 1 - cosine(fingerprint1, fingerprint2)
        are_duplicates = similarity > threshold
        print(f"Threshold: {threshold}")
        print(f"Similarity score: {similarity}")
        print(f"Result: The two files are {'near duplicates' if are_duplicates else 'not near duplicates'}.")
    except ValueError as e:
        print(f"Error occurred: {str(e)}")

print(f"\nTested files:")
print(f"File 1: {file_paths[0]}")
print(f"File 2: {file_paths[1]}")

Testing get_audio_fingerprint function:
get_audio_fingerprint completed for file 1.
Fingerprint shape: (13,)
Fingerprint: [-494.45636    104.198814   -51.348476    74.571915    -5.350827
   15.922553     5.5589075   27.072725     4.6986046   17.892286
    3.1821306    7.2147174   11.225792 ]
get_audio_fingerprint completed for file 2.
Fingerprint shape: (13,)
Fingerprint: [-4.9401114e+02  5.4011196e+01 -9.4060448e+01  7.3398542e+00
 -2.4808609e+01 -4.1304646e+01  1.9690827e+00 -7.2921305e+00
 -1.8904139e+01 -1.2211078e+01 -7.5644488e+00 -3.7751703e+00
  4.4985771e-01]

Testing are_near_duplicates function:
Threshold: 0.95
Similarity score: 0.9704359288487716
Result: The two files are near duplicates.
Threshold: 0.99
Similarity score: 0.9704359288487716
Result: The two files are not near duplicates.
Threshold: 0.999
Similarity score: 0.9704359288487716
Result: The two files are not near duplicates.

Tested files:
File 1: Converted Recordings\Troglodytes_aedon_Pelee_near__Leamington_Esse

In [14]:
# # Apply duplicate checking to all files
# logging.basicConfig(level=logging.INFO)
# duplicates, discarded_files = check_and_remove_duplicates('Converted Recordings')

# print(f"Duplicate files found: {len(duplicates)}")
# print("List of duplicate files:")
# for file in duplicates:
#     print(file)

# print(f"\nTotal discarded files: {len(discarded_files)}")
# print("List of discarded files and reasons:")
# for file, reason in discarded_files:
#     print(f"{file}: {reason}")

In [20]:
def process_audio(file_path, shared_discarded_files, target_length=5, overlap=0.5, target_sr=44100):
    print(f"Processing file: {file_path}")  # For process_audio
    try:
        # Load and clean the audio
        audio, sr = librosa.load(file_path, sr=None)
        audio = clean_audio(audio, sr, file_path, shared_discarded_files)
        if audio is None:
            return None, None
        
        # Check if audio is shorter than 4410 samples (100ms at 44.1kHz)
        if len(audio) < 4410:
            shared_discarded_files.append((file_path, 'too_short'))
            return None, None
            
        # Convert target_length to samples
        target_samples = int(sr * target_length)
        
        # If audio is shorter than target length, pad with zeros
        if len(audio) < target_samples:
            audio = np.pad(audio, (0, target_samples - len(audio)))
        
        # If audio is longer than target length, segment with overlap
        else:
            segments = []
            for start in range(0, len(audio), int(target_samples * (1 - overlap))):
                segment = audio[start:start + target_samples]
                if len(segment) == target_samples:
                    segments.append(segment)
                elif len(segment) > 0:
                    segment = np.pad(segment, (0, target_samples - len(segment)))
                    segments.append(segment)
            audio = np.array(segments)

        return audio, sr
    except Exception as e:
        logging.error(f"Error processing {file_path}: {str(e)}")
        shared_discarded_files.append((file_path, f'error: {str(e)}'))
        return None, None


In [21]:
def process_file(args):
    row, audio_dir, output_dir, shared_discarded_files = args
    file_path = os.path.join(audio_dir, row['file_name'])
    print(f"Row: {row['file_name']}")  # For process_file
    if not os.path.exists(file_path):
        logging.warning(f"File not found: {file_path}")
        return None
    
    processed_audio, sr = process_audio(file_path, shared_discarded_files)
    if processed_audio is None:
        return None
    
    processed_data = []
    if processed_audio.ndim == 2:
        for i, segment in enumerate(processed_audio):
            new_row = row.copy()
            base_filename = f"{os.path.splitext(row['file_name'])[0]}_segment_{i}"
            new_row['processed_file'] = f"{base_filename}.wav"
            wavfile.write(os.path.join(output_dir, new_row['processed_file']), sr, segment)
            processed_data.append(new_row)
    else:
        base_filename = f"{os.path.splitext(row['file_name'])[0]}_processed"
        row['processed_file'] = f"{base_filename}.wav"
        wavfile.write(os.path.join(output_dir, row['processed_file']), sr, processed_audio)
        processed_data.append(row)
    
    return processed_data


In [22]:
def process_dataset(converted_data, audio_dir, output_dir):
    manager = multiprocessing.Manager()
    shared_discarded_files = manager.list()
    
    print(f"Audio directory: {audio_dir}, Output directory: {output_dir}")
    results = []
    
    for _, row in converted_data.iterrows():
        results.append(process_file((row, audio_dir, output_dir, shared_discarded_files)))
    
    processed_data = [item for sublist in results if sublist is not None for item in sublist]

    # Save discarded files to a DataFrame and export as CSV
    discard_log_df = pd.DataFrame(list(shared_discarded_files), columns=['file_path', 'reason'])
    discard_log_df.to_csv('discarded_audio_log.csv', index=False)

    return pd.DataFrame(processed_data)

In [18]:
from multiprocessing import Manager

## Testing on a single file
# Select a random file
random_file = random.choice(converted_data['file_name'])
file_path = os.path.join('Converted Recordings', random_file)

print(f"Testing audio processing on file: {random_file}")

# Create a dummy row for testing
test_row = pd.Series({'file_name': random_file})

# Create necessary directories
output_dir = 'Test_Processed_Recordings'
os.makedirs(output_dir, exist_ok=True)

# Create shared lists for multiprocessing simulation
manager = Manager()
shared_duplicates = manager.list()
shared_discarded_files = manager.list()

# Process the audio file
processed_audio, sr = process_audio(file_path, shared_discarded_files)

if processed_audio is not None:
    print(f"Audio processed successfully.")
    print(f"Processed audio shape: {processed_audio.shape}")
    print(f"Sample rate: {sr}")

    # Simulate the process_file function
    args = (test_row, 'Converted Recordings', output_dir, shared_duplicates, shared_discarded_files)
    processed_data = process_file(args)

    if processed_data is not None:
        print("\nProcessed data:")
        for item in processed_data:
            print(f"Processed file: {item['processed_file']}")
            print(f"Number of segments: {len(processed_data)}")

        # Verify the output files
        for item in processed_data:
            output_file = os.path.join(output_dir, item['processed_file'])
            if os.path.exists(output_file):
                print(f"Output file created: {output_file}")
                # Load and print some information about the output file
                audio, sr = librosa.load(output_file, sr=None)
                print(f"Output audio duration: {librosa.get_duration(y=audio, sr=sr):.2f} seconds")
            else:
                print(f"Error: Output file not created: {output_file}")
    else:
        print("Error: process_file returned None")
else:
    print("Error: Audio processing failed")

# Check for any discarded files
if shared_discarded_files:
    print("\nDiscarded files:")
    for file_path, reason in shared_discarded_files:
        print(f"{file_path}: {reason}")
else:
    print("\nNo files were discarded.")

print("\nAudio processing test complete.")

Testing audio processing on file: Vermivora_cyanoptera_Irwin_Prairie_SNP_Ohio_418083.wav


ERROR:root:Error processing Converted Recordings\Vermivora_cyanoptera_Irwin_Prairie_SNP_Ohio_418083.wav: '<' not supported between instances of 'int' and 'list'


Processing file: Converted Recordings\Vermivora_cyanoptera_Irwin_Prairie_SNP_Ohio_418083.wav
Audio processed successfully.
Processed audio shape: (220500,)
Sample rate: 44100
Row: Vermivora_cyanoptera_Irwin_Prairie_SNP_Ohio_418083.wav
Processing file: Converted Recordings\Vermivora_cyanoptera_Irwin_Prairie_SNP_Ohio_418083.wav
Error: process_file returned None

No files were discarded.

Audio processing test complete.


In [23]:
# Main execution
converted_dir = 'Converted Recordings'
processed_dir = 'Processed Recordings'

# Ensure output directory exists
if not os.path.exists(processed_dir):
    os.makedirs(processed_dir)

processed_data = process_dataset(converted_data, converted_dir, processed_dir)
print('Audio Processing Complete')

# Print completion message and count files in Processed Recordings directory
processed_file_count = len([f for f in os.listdir(processed_dir) if f.endswith('.wav')])
print(f"\nAudio processing is complete. There are now {processed_file_count} files in the '{processed_dir}' directory.")

# Output discarded files
discard_log_df = pd.read_csv('discarded_audio_log.csv')
if not discard_log_df.empty:
    print("\nThe following files were discarded:")
    for _, row in discard_log_df.iterrows():
        print(f"{row['file_path']}: {row['reason']}")
else:
    print("\nNo files were discarded.")

Audio directory: Converted Recordings, Output directory: Processed Recordings
Row: Branta_canadensis_Whitewater_Township_near__Harrison_Hamilton_County_Ohio_726750.wav
Processing file: Converted Recordings\Branta_canadensis_Whitewater_Township_near__Harrison_Hamilton_County_Ohio_726750.wav
Row: Branta_canadensis_Lawrence_Woods_SNP_418000.wav
Processing file: Converted Recordings\Branta_canadensis_Lawrence_Woods_SNP_418000.wav
Row: Branta_canadensis_Miami_Whitewater_Forest_Park_wetlands_Crosby_Township_near__Harrison_Hamilton_County_Ohio_691528.wav
Processing file: Converted Recordings\Branta_canadensis_Miami_Whitewater_Forest_Park_wetlands_Crosby_Township_near__Harrison_Hamilton_County_Ohio_691528.wav
Row: Cygnus_buccinator_Killdeer_Plains_Wildlife_Management_Area_Wyandot_County_Ohio_713788.wav
Processing file: Converted Recordings\Cygnus_buccinator_Killdeer_Plains_Wildlife_Management_Area_Wyandot_County_Ohio_713788.wav
Row: Aix_sponsa_Magee_Marsh_-_boardwalk_Lucas_County_Ohio_815809.w

In [34]:
print(processed_data.info())
processed_data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 0 entries
Empty DataFrame
None


### Data Augmentation
### Augment the processed audio files.
- Pitch Shift
- Time Stretch
- add_noise
- change_speed
- apply_filter
- add_background
- time_shift
- augment_audio
- 

In [1]:

def generate_wind_sound(duration, sr):
    t = np.linspace(0, duration, int(sr * duration), False)
    wind = np.random.normal(0, 0.1, int(sr * duration))
    wind_filtered = np.convolve(wind, np.ones(1000)/1000, mode='same')
    return wind_filtered / np.max(np.abs(wind_filtered))

def generate_leaf_rustle(duration, sr):
    t = np.linspace(0, duration, int(sr * duration), False)
    leaf = np.random.normal(0, 0.1, int(sr * duration))
    envelope = np.exp(-t * 10) * np.sin(2 * np.pi * 2 * t)**2
    return leaf * envelope / np.max(np.abs(leaf * envelope))

def generate_water_sound(duration, sr):
    t = np.linspace(0, duration, int(sr * duration), False)
    water = np.random.normal(0, 0.1, int(sr * duration))
    water_filtered = np.convolve(water, np.ones(500)/500, mode='same')
    ripple = np.sin(2 * np.pi * 2 * t) * np.exp(-t * 2)
    return (water_filtered + ripple) / np.max(np.abs(water_filtered + ripple))

def mix_nature_sounds(duration, sr):
    wind = generate_wind_sound(duration, sr)
    leaf = generate_leaf_rustle(duration, sr)
    water = generate_water_sound(duration, sr)
    
    mix = wind * 0.7 + leaf * 0.2 + water * 0.1
    return mix / np.max(np.abs(mix))

# Generate a 5-second mix of nature-like sounds
sr = 44100
duration = 5
nature_background = mix_nature_sounds(duration, sr)

NameError: name 'np' is not defined

In [36]:

def pitch_shift(audio, sr, n_steps):
    return librosa.effects.pitch_shift(audio, sr=sr, n_steps=n_steps)

def time_stretch(audio, rate):
    return librosa.effects.time_stretch(audio, rate=rate)

def add_noise(audio, noise_factor):
    noise = np.random.randn(len(audio))
    augmented_audio = audio + noise_factor * noise
    return np.clip(augmented_audio, -1, 1)

def change_speed(audio, speed_factor):
    return librosa.effects.time_stretch(audio, rate=1/speed_factor)

def apply_filter(audio, sr, filter_type='lowpass', cutoff=1000):
    nyquist = 0.5 * sr
    normal_cutoff = cutoff / nyquist
    b, a = butter(4, normal_cutoff, btype=filter_type, analog=False)
    return lfilter(b, a, audio)

def add_background(audio, background, ratio=0.1):
    if len(background) > len(audio):
        start = np.random.randint(0, len(background) - len(audio))
        background = background[start:start+len(audio)]
    else:
        background = np.pad(background, (0, max(0, len(audio) - len(background))))
    return audio + ratio * background

def time_shift(audio, shift_max, roll_prob=0.5):
    shift = np.random.randint(-shift_max, shift_max)
    if random.random() < roll_prob:
        return np.roll(audio, shift)
    else:
        if shift > 0:
            return np.pad(audio, (shift, 0))[:len(audio)]
        else:
            return np.pad(audio, (0, -shift))[:-shift]

def augment_audio(audio, sr):
    augmentations = [
        (pitch_shift, {'n_steps': random.uniform(-2, 2)}),
        (time_stretch, {'rate': random.uniform(0.8, 1.2)}),
        (add_noise, {'noise_factor': random.uniform(0.001, 0.015)}),
        (change_speed, {'speed_factor': random.uniform(0.9, 1.1)}),
        (apply_filter, {'filter_type': random.choice(['lowpass', 'highpass']),
                        'cutoff': random.uniform(1000, 4000)}),
        (time_shift, {'shift_max': int(sr * 0.5)})
    ]
    
    # Randomly select 2-4 augmentations
    num_augments = random.randint(2, 4)
    selected_augments = random.sample(augmentations, num_augments)
    
    applied_augmentations = []
    
    # Apply selected augmentations
    for augment_func, params in selected_augments:
        audio = augment_func(audio, sr, **params)
        applied_augmentations.append(f"{augment_func.__name__}:{','.join(f'{k}={v}' for k, v in params.items())}")
    
    # Add synthesized nature background
    if random.random() < 0.5:
        nature_background = mix_nature_sounds(len(audio) / sr, sr)
        ratio = random.uniform(0.1, 0.3)
        audio = add_background(audio, nature_background, ratio=ratio)
        applied_augmentations.append(f"add_background:ratio={ratio:.2f}")
    
    return audio, applied_augmentations



In [37]:

def augment_and_save(input_file, output_dir, num_augmentations=3):
    try:
        # Load the audio file
        audio, sr = librosa.load(input_file, sr=None)
        
        augmented_files = []
        all_applied_augmentations = []
        
        for i in range(num_augmentations):
            # Apply augmentation
            augmented_audio, applied_augmentations = augment_audio(audio, sr)
            
            # Generate new filename
            base_name = os.path.basename(input_file)
            name, ext = os.path.splitext(base_name)
            new_name = f"{name}_aug_{i+1}{ext}"
            output_path = os.path.join(output_dir, new_name)
            
            # Save augmented audio
            sf.write(output_path, augmented_audio, sr)
            
            augmented_files.append(output_path)
            all_applied_augmentations.append(';'.join(applied_augmentations))
        
        return augmented_files, all_applied_augmentations
    
    except Exception as e:
        print(f"Error processing {input_file}: {str(e)}")
        return [], []


def process_dataframe(df, input_dir, output_dir, num_augmentations=3):
    new_rows = []
    
    for _, row in tqdm(df.iterrows(), total=len(df), desc="Processing files"):
        input_file = os.path.join(input_dir, row['file_path'])
        augmented_files, augmentations = augment_and_save(input_file, output_dir, num_augmentations)
        
        for aug_file, aug_details in zip(augmented_files, augmentations):
            new_row = row.copy()
            new_row['file_path'] = os.path.relpath(aug_file, output_dir)
            new_row['augmentations'] = aug_details
            new_rows.append(new_row)
    
    augmented_df = pd.concat([df, pd.DataFrame(new_rows)], ignore_index=True)
    return augmented_df


Processing files: 0it [00:00, ?it/s]

Augmentation complete. 0 new samples created.





In [3]:
# Test Augmentation functions

# Select a random file
random_file = random.choice(processed_data['file_name'])
file_path = os.path.join('Processed Recordings', random_file)

# Load the audio file
audio, sr = librosa.load(file_path, sr=None)

# List of augmentation functions to test
augmentation_functions = [
    (pitch_shift, {'n_steps': 2}),
    (time_stretch, {'rate': 1.2}),
    (add_noise, {'noise_factor': 0.01}),
    (change_speed, {'speed_factor': 1.1}),
    (apply_filter, {'filter_type': 'lowpass', 'cutoff': 2000}),
    (time_shift, {'shift_max': int(sr * 0.5)}),
    (mix_nature_sounds, {'duration': len(audio) / sr})
]

print(f"Testing augmentations on file: {random_file}")

# Apply each augmentation function and save the result
for i, (func, params) in enumerate(augmentation_functions):
    if func.__name__ == 'mix_nature_sounds':
        # For mix_nature_sounds, we need to handle it differently
        background = func(**params)
        augmented = add_background(audio, background, ratio=0.2)
    else:
        # For other functions, apply them directly
        augmented = func(audio, sr, **params)
    
    # Generate output filename
    base_name = os.path.splitext(random_file)[0]
    output_file = f"{base_name}_aug_{func.__name__}.wav"
    output_path = os.path.join('Augmented Recordings', output_file)
    
    # Save the augmented audio
    sf.write(output_path, augmented, sr)
    
    print(f"Applied {func.__name__}, saved as {output_file}")

print("Augmentation test complete.")

NameError: name 'processed_data' is not defined

In [None]:

# Apply augmentation to every file
input_dir = 'Processed Recordings'
output_dir = 'Augmented Recordings'
num_augmentations = 3


# Process the dataframe
augmented_data = process_dataframe(processed_data, input_dir, output_dir, num_augmentations)

print(f"Augmentation complete. {len(augmented_data) - len(processed_data)} new samples created.")

### Extract features from the processed and augmented audio files.

In [38]:
# Drop original recordings from dataset for feature extraction
def filter_dataset(df, keep_dirs=['Processed Recordings', 'Augmented Recordings']):
    # Convert keep_dirs to a set for faster lookup
    keep_dirs_set = set(keep_dirs)
    
    # Function to check if a file path is in one of the keep_dirs
    def is_keep_dir(file_path):
        return any(dir_name in file_path for dir_name in keep_dirs_set)
    
    # Filter the dataframe
    filtered_df = df[df['file_path'].apply(lambda x: is_keep_dir(x))]
    
    # Reset the index of the filtered dataframe
    filtered_df = filtered_df.reset_index(drop=True)
    
    # Print some information about the filtering process
    print(f"Original dataset size: {len(df)}")
    print(f"Filtered dataset size: {len(filtered_df)}")
    print(f"Removed {len(df) - len(filtered_df)} entries")
    
    return filtered_df

# Usage
filtered_data = filter_dataset(augmented_data)

KeyError: 'file_path'

In [None]:

def extract_features(audio, sr):
    # Mel-spectrogram
    mel_spec = librosa.feature.melspectrogram(y=audio, sr=sr, n_mels=128)
    mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
    
    # MFCCs
    mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=13)
    
    # Spectral Centroid
    spectral_centroids = librosa.feature.spectral_centroid(y=audio, sr=sr)[0]
    
    # Chroma Features
    chroma = librosa.feature.chroma_stft(y=audio, sr=sr)
    
    # Zero Crossing Rate
    zero_crossing_rate = librosa.feature.zero_crossing_rate(audio)[0]
    
    # Spectral Rolloff
    spectral_rolloff = librosa.feature.spectral_rolloff(y=audio, sr=sr)[0]
    
    return {
        'mel_spectrogram_db': mel_spec_db,
        'mfccs': mfccs,
        'spectral_centroids': spectral_centroids,
        'chroma': chroma,
        'zero_crossing_rate': zero_crossing_rate,
        'spectral_rolloff': spectral_rolloff
    }

def summarize_feature(feature):
    if feature.ndim == 1:
        return [np.mean(feature), np.std(feature), np.max(feature)]
    elif feature.ndim == 2:
        return np.hstack([
            np.mean(feature, axis=1),
            np.std(feature, axis=1),
            np.max(feature, axis=1)
        ])
    
def save_mel_spectrogram(mel_spec, output_dir, base_filename, sr):
    plt.figure(figsize=(10, 4))
    librosa.display.specshow(mel_spec, sr=sr, x_axis='time', y_axis='mel')
    plt.colorbar(format='%+2.0f dB')
    plt.title('Mel-spectrogram')
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, f"{base_filename}_mel_spectrogram.png"))
    plt.close()
    

In [None]:
def process_audio_file(file_path, output_dir):
    try:
        audio, _ = librosa.load(file_path, sr=sr)
        features = extract_features(audio, sr)
        
        # Summarize features
        feature_summary = {}
        for key, value in features.items():
            if key != 'mel_spectrogram':
                feature_summary[f"{key}_summary"] = summarize_feature(value)
        
        # Create feature vector
        feature_vector = np.hstack([
            feature_summary.get('mfccs_summary', np.array([])),
            feature_summary.get('spectral_centroids_summary', np.array([])),
            feature_summary.get('chroma_summary', np.array([])),
            feature_summary.get('zero_crossing_rate_summary', np.array([])),
            feature_summary.get('spectral_rolloff_summary', np.array([]))
            ])
        
        # Save mel-spectrogram as image
        base_filename = os.path.splitext(os.path.basename(file_path))[0]
        save_mel_spectrogram(features['mel_spectrogram'], output_dir, base_filename)
        
        return feature_vector, features
    except Exception as e:
        print(f"Error processing {file_path}: {str(e)}")
        return None, None

# Main processing loop
def process_audio_files(filtered_data, output_dir):
    feature_data = []
    
    for _, row in tqdm(filtered_data.iterrows(), total=len(filtered_data)):
        file_path = row['file_name']
        feature_vector, full_features = process_audio_file(file_path, output_dir)
        
        if feature_vector is not None and full_features is not None:
            feature_dict = {
                'file_name': file_path,
                'feature_vector': feature_vector,
            }
            
            # Add full feature arrays
            for key, value in full_features.items():
                feature_dict[f"{key}_full"] = value
            
            feature_data.append(feature_dict)
    
    return feature_data


In [4]:
# Select a random file
random_file = random.choice(filtered_data['file_name'])
file_path = os.path.join('Processed Recordings', random_file)

# Load the audio file
audio, sr = librosa.load(file_path, sr=None)

print(f"Testing feature extraction on file: {random_file}")

# Extract features
features = extract_features(audio, sr)

# Print a summary of each feature
for feature_name, feature_data in features.items():
    if feature_name == 'mel_spectrogram_db':
        print(f"{feature_name} shape: {feature_data.shape}")
    else:
        print(f"{feature_name} shape: {feature_data.shape}, mean: {np.mean(feature_data):.4f}, std: {np.std(feature_data):.4f}")

# Summarize features
feature_summary = {}
for key, value in features.items():
    if key != 'mel_spectrogram_db':
        feature_summary[f"{key}_summary"] = summarize_feature(value)

# Print summary of summarized features
print("\nSummarized Features:")
for key, value in feature_summary.items():
    print(f"{key} shape: {value.shape}, mean: {np.mean(value):.4f}, std: {np.std(value):.4f}")

# Create feature vector
feature_vector = np.hstack([
    feature_summary.get('mfccs_summary', np.array([])),
    feature_summary.get('spectral_centroids_summary', np.array([])),
    feature_summary.get('chroma_summary', np.array([])),
    feature_summary.get('zero_crossing_rate_summary', np.array([])),
    feature_summary.get('spectral_rolloff_summary', np.array([]))
])

print(f"\nFinal feature vector shape: {feature_vector.shape}")

# Save mel-spectrogram as image
output_dir = 'test_mel_spectrograms'
os.makedirs(output_dir, exist_ok=True)
base_filename = os.path.splitext(os.path.basename(file_path))[0]
save_mel_spectrogram(features['mel_spectrogram_db'], output_dir, base_filename, sr)

print(f"Mel-spectrogram saved as: {base_filename}_mel_spectrogram.png in {output_dir}")

print("Feature extraction test complete.")

NameError: name 'filtered_data' is not defined

In [5]:

# Apply feature extraction to every file
output_dir = 'mel-spectrograms'
final_data = process_audio_files(filtered_data, output_dir)

# Convert to DataFrame
df = pd.DataFrame([{'file_name': item['file_name'], 'feature_vector': item['feature_vector']} for item in final_data])




NameError: name 'process_audio_files' is not defined

In [None]:
df.info()

In [None]:
df = df.drop(columns=["quality"])

In [None]:
# Save DataFrame to CSV
df.to_csv('final_data.csv', index=False)

print("Processing complete. Summary data saved to 'final_data.csv'.")