In [1]:
from datasets import load_dataset
import soundfile as sf
import matplotlib.pyplot as plt
import numpy as np
import librosa
from IPython.display import Audio
import torch


In [2]:
# Fetch the 'HSN' configuration of the BirdSet collection.
# NOTE: trust_remote_code=True lets the dataset repository's own loading
# script execute locally, so only use it with a source you trust.
dataset = load_dataset(
    path='DBD-research-group/BirdSet',
    name='HSN',
    trust_remote_code=True,
)


In [3]:
# Preprocess audio: bring every audio file to the same duration and the same sample rate
def preprocess_audio(example, target_sr=22000, duration_s=5, verbose=False):
    """Load one recording and turn it into a mono clip of fixed rate and length.

    Designed to be passed to ``Dataset.map``. With the default arguments the
    result is 22,000 Hz audio of exactly 5 seconds (110,000 samples).

    Args:
        example: Mapping with the path of the audio file under ``'filepath'``.
        target_sr: Sampling rate in Hz to resample the recording to.
        duration_s: Length of the output clip in seconds. Longer recordings
            are truncated; shorter ones are padded at the end by
            ``librosa.util.fix_length``.
        verbose: When True, print the duration and sample count of the
            processed clip (debugging aid; off by default so that mapping a
            whole dataset does not flood the cell output).

    Returns:
        The same ``example`` with two keys added: ``'processed_audio'`` (the
        fixed-length waveform) and ``'sampling_rate'`` (``target_sr``).

    Raises:
        ValueError: If ``target_sr`` or ``duration_s`` is not positive.
    """
    if target_sr <= 0:
        raise ValueError(f"target_sr must be positive, got {target_sr}")
    if duration_s <= 0:
        raise ValueError(f"duration_s must be positive, got {duration_s}")

    # Keep the file's native rate on load (sr=None) and down-mix to mono here,
    # so no separate to_mono step is needed afterwards.
    waveform, native_sr = librosa.load(example['filepath'], sr=None, mono=True)

    # Resample to the common target rate
    waveform = librosa.resample(waveform, orig_sr=native_sr, target_sr=target_sr)

    # Number of samples that make up the requested duration
    target_samples = int(round(target_sr * duration_s))

    # Trim or pad the audio to exactly `target_samples` samples
    clip = librosa.util.fix_length(waveform, size=target_samples)

    if verbose:
        print(f"Processed audio duration: {len(clip) / target_sr} seconds")
        print(f"Number of samples: {len(clip)}")

    # Add the processed audio directly to the example
    example['processed_audio'] = clip
    example['sampling_rate'] = target_sr

    return example

# Apply the preprocessing function to a 10-example sample of the training split
preprocessed = dataset['train'].select(range(10)).map(preprocess_audio)

# Fetch the first row once instead of indexing `preprocessed[0]` per field:
# each index access builds the whole row again. With the default output
# format the stored waveform comes back as a plain Python list, so convert it
# to a float32 array once for playback and export.
first_example = preprocessed[0]
y_trimmed = np.asarray(first_example['processed_audio'], dtype=np.float32)
sampling_rate = first_example['sampling_rate']

# Play the processed audio sample inline
display(Audio(y_trimmed, rate=sampling_rate))

# Export the processed audio to a WAV file for further testing
output_path = 'processed_audio.wav'
sf.write(output_path, y_trimmed, samplerate=sampling_rate)
print(f"Audio file '{output_path}' saved successfully.")

Audio file 'processed_audio.wav' saved successfully.
