In [1]:
# Mount Google Drive so that paths under "/content/drive/My Drive/..." (the
# dataset archive and the .npz output folder used below) are reachable from
# this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Imports
import pandas as pd
import numpy as np
import librosa
import os
import json
from joblib import Parallel, delayed
import math

In [7]:
# Get data into colab
ZIP_PATH = "/content/drive/My Drive/Summer 2025/ESC-50 Water Project/audio/ESC-50-master.zip"
DESTINATION_DIR = "/content/"

!unzip -q "{ZIP_PATH}" -d "{DESTINATION_DIR}"

In [8]:
# Configuration constants

# Unzipped ESC-50 dataset on the local Colab disk
ESC50_DIR = "/content/ESC-50-master/"
# Folder holding the audio clips referenced by the metadata 'filename' column
AUDIO_DIR = os.path.join(ESC50_DIR, "audio")
# Metadata table describing every clip
CSV_PATH = os.path.join(ESC50_DIR, "meta", "esc50.csv")

# Google Drive folder that receives the generated .npz feature files
OUTPUT_DIR_NPZ = "/content/drive/My Drive/Summer 2025/Python Practice/audio/npz_files"

# Sampling rate (Hz) handed to librosa.load when each clip is read
SAMPLE_RATE = 22050

In [9]:
# Load the ESC-50 metadata table with pandas (one row per audio file; the
# cells below rely on its 'filename' and 'category' columns)
df = pd.read_csv(CSV_PATH)

In [23]:
# Trial 1

'''
Will only be focusing on water and non-water categories, with sub categories
within water
- 0: non_water
- 1: pouring_water
- 2: toilet_flush
- 3: water_drops
'''

# Target sound categories -> integer labels. Categories that are not listed
# here receive label 0 (non_water) from the labelling function.
label_mapping = dict(pouring_water=1, toilet_flush=2, water_drops=3)

In [24]:
# Labelling function used to create the Trial 1 'target_label' column
def assign_label1(category, mapping=None):
    """
    Looks up the integer label for a category name. If a category is not in
    the mapping, it's considered 'non_water' and gets the label 0.

    Args:
        category (str): The category name.
        mapping (dict, optional): Category-name -> label dictionary to use.
            When omitted, the notebook-level `label_mapping` is read at call
            time. A later cell rebinds `label_mapping` for Trial 2, so pass
            the Trial 1 dictionary explicitly if this function may run after
            that cell; otherwise unmapped categories would share label 0
            with a real Trial 2 class.

    Returns:
        int: The corresponding label, or 0 when the category is not mapped.
    """
    if mapping is None:
        # Backward-compatible default: whatever the global currently holds
        mapping = label_mapping
    return mapping.get(category, 0)

In [25]:
# Label every row's category and store the result as the 'target_label' column
df['target_label'] = df['category'].map(assign_label1)

# Quick sanity check: show the first 5 rows
print("DataFrame with new 'target_label' column:")
print(df.head())

# Show how many rows ended up in each class
print("Totals:")
print(df['target_label'].value_counts())

DataFrame with new 'target_label' column:
            filename  fold  target        category  esc10  src_file take  \
0   1-100032-A-0.wav     1       0             dog   True    100032    A   
1  1-100038-A-14.wav     1      14  chirping_birds  False    100038    A   
2  1-100210-A-36.wav     1      36  vacuum_cleaner  False    100210    A   
3  1-100210-B-36.wav     1      36  vacuum_cleaner  False    100210    B   
4  1-101296-A-19.wav     1      19    thunderstorm  False    101296    A   

   target_label  
0             0  
1             0  
2             0  
3             0  
4             0  
Totals:
target_label
0    1880
1      40
3      40
2      40
Name: count, dtype: int64


In [27]:
# Trial 2

'''
This time will only be focusing on natural sounds, but with all sub categories
within natural sounds
- 0: Rain
- 1: Sea waves
- 2: Crackling fire
- 3: Crickets
- 4: Chirping birds
- 5: Water drops
- 6: Wind
- 7: Pouring water
- 8: Toilet flush
- 9: Thunderstorm
'''

# Each category's label is its position in this ordered tuple (0..9)
label_mapping = {
    name: idx
    for idx, name in enumerate((
        'rain', 'sea_waves', 'crackling_fire', 'crickets', 'chirping_birds',
        'water_drops', 'wind', 'pouring_water', 'toilet_flush', 'thunderstorm',
    ))
}

In [28]:
# Labelling function used to create the Trial 2 'target_label' column
def assign_label2(category, mapping=None):
    """
    Looks up the integer label for a category name. If a category is not in
    the mapping, it gets the label 10, i.e. it is not one of the natural-sound
    classes labelled 0-9.

    Args:
        category (str): The category name.
        mapping (dict, optional): Category-name -> label dictionary to use.
            When omitted, the notebook-level `label_mapping` is read at call
            time, so the result depends on which mapping cell ran last; pass
            the Trial 2 dictionary explicitly to avoid that dependence.

    Returns:
        int: The corresponding label, or 10 when the category is not mapped.
    """
    if mapping is None:
        # Backward-compatible default: whatever the global currently holds
        mapping = label_mapping
    return mapping.get(category, 10)

In [29]:
# Label every row's category and store the result as the 'target_label' column
df['target_label'] = df['category'].map(assign_label2)

# Quick sanity check: show the first 5 rows
print("DataFrame with new 'target_label' column:")
print(df.head())

# Show how many rows ended up in each class
print("Totals:")
print(df['target_label'].value_counts())

DataFrame with new 'target_label' column:
            filename  fold  target        category  esc10  src_file take  \
0   1-100032-A-0.wav     1       0             dog   True    100032    A   
1  1-100038-A-14.wav     1      14  chirping_birds  False    100038    A   
2  1-100210-A-36.wav     1      36  vacuum_cleaner  False    100210    A   
3  1-100210-B-36.wav     1      36  vacuum_cleaner  False    100210    B   
4  1-101296-A-19.wav     1      19    thunderstorm  False    101296    A   

   target_label  
0            10  
1             4  
2            10  
3            10  
4             9  
Totals:
target_label
10    1600
4       40
9       40
7       40
5       40
6       40
2       40
0       40
8       40
1       40
3       40
Name: count, dtype: int64


In [26]:
# Separate the 4 Trial 1 categories.
# The Trial 1 labels are derived straight from 'category' instead of being read
# from df['target_label']: that column is overwritten with the Trial 2 labels
# by a cell placed above this one, and with those labels 0 means 'rain' (only
# 40 rows), so sampling 120 rows from it would raise a ValueError.
trial1_labels = (
    df['category']
    .map({'pouring_water': 1, 'toilet_flush': 2, 'water_drops': 3})
    .fillna(0)      # every other category is 'non_water'
    .astype(int)
)
df_trial1 = df.assign(target_label=trial1_labels)

df_no_water = df_trial1[df_trial1['target_label'] == 0]
df_pouring = df_trial1[df_trial1['target_label'] == 1]
df_flush = df_trial1[df_trial1['target_label'] == 2]
df_drops = df_trial1[df_trial1['target_label'] == 3]

# Randomly sample the 'non_water' class to match 'water' class (3 combined);
# with 40 clips per water class this is 120
non_water = len(df_pouring) + len(df_flush) + len(df_drops)
df_no_water_balanced = df_no_water.sample(n=non_water, random_state=42)

# Combine the balanced 'non_water' samples with all the water samples
df_balanced = pd.concat([df_no_water_balanced, df_pouring, df_flush, df_drops])


# Print counts of each class to see the totals
print("Balanced DataFrame Totals:")
print(df_balanced['target_label'].value_counts())

Balanced DataFrame Totals:
target_label
0    120
1     40
2     40
3     40
Name: count, dtype: int64


In [30]:
# Separate the 10 natural-sound classes by their Trial 2 label (0-9)
df_rain = df.loc[df['target_label'].eq(0)]
df_waves = df.loc[df['target_label'].eq(1)]
df_fire = df.loc[df['target_label'].eq(2)]
df_crickets = df.loc[df['target_label'].eq(3)]
df_birds = df.loc[df['target_label'].eq(4)]
df_drops = df.loc[df['target_label'].eq(5)]
df_wind = df.loc[df['target_label'].eq(6)]
df_water = df.loc[df['target_label'].eq(7)]
df_flush = df.loc[df['target_label'].eq(8)]
df_storm = df.loc[df['target_label'].eq(9)]

# Stack the ten per-class frames, in label order, into one dataset; rows
# with label 10 (everything else) are left out
natural_frames = [
    df_rain, df_waves, df_fire, df_crickets, df_birds,
    df_drops, df_wind, df_water, df_flush, df_storm,
]
df_balanced = pd.concat(natural_frames)

# Show how many rows each class contributes
print("Balanced DataFrame Totals:")
print(df_balanced['target_label'].value_counts())

Balanced DataFrame Totals:
target_label
0    40
1    40
2    40
3    40
4    40
5    40
6    40
7    40
8    40
9    40
Name: count, dtype: int64


In [31]:
def process_audio_file(file_path, label, num_mfcc, n_fft, hop_length, segment_duration_s):
    """
    Loads one audio file (resampled to SAMPLE_RATE), extracts its MFCCs, and
    slices them into consecutive, non-overlapping segments of a fixed duration.

    Frames left over after the last full segment are discarded.

    Args:
        file_path (str): Path to the audio file
        label (int): Label for the audio file
        num_mfcc (int): Number of MFCCs to extract
        n_fft (int): FFT size
        hop_length (int): Hop length for the FFT; must be positive
        segment_duration_s (float): Duration of each segment in seconds;
            must be positive

    Returns:
        mfcc_segments (list): List of MFCC segments, each transposed so that
            time is the first dimension
        labels_for_segments (list): `label` repeated once per segment
        Both lists are empty when the file could not be loaded or processed.

    Raises:
        ValueError: If hop_length or segment_duration_s is not positive.
    """
    # Reject bad parameters before touching the file. Inside the try block
    # below they would be swallowed and misreported as a per-file audio error
    # for every single clip.
    if hop_length <= 0:
        raise ValueError(f"hop_length must be positive, got {hop_length}")
    if segment_duration_s <= 0:
        raise ValueError(f"segment_duration_s must be positive, got {segment_duration_s}")

    try:
        # Load audio file
        signal, sr = librosa.load(file_path, sr=SAMPLE_RATE)

        # Extract MFCCs for the whole audio
        mfcc_full = librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=num_mfcc, n_fft=n_fft, hop_length=hop_length)

        # Number of MFCC frames that make up one segment (rounded up)
        samples_per_segment = int(segment_duration_s * sr)
        frames_per_segment = math.ceil(samples_per_segment / hop_length)

        # Number of whole segments that fit in the clip
        total_frames = mfcc_full.shape[1]
        num_segments = total_frames // frames_per_segment

        # Cut the MFCC matrix into its segments; transpose each one so time
        # is the first dimension
        mfcc_segments = [
            mfcc_full[:, i * frames_per_segment:(i + 1) * frames_per_segment].T
            for i in range(num_segments)
        ]
        labels_for_segments = [label] * num_segments

        return mfcc_segments, labels_for_segments
    # got a bad audio file: report it and contribute no segments (best effort)
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return [], []

In [33]:
def save_features_to_npz1(dataframe, num_mfcc=32, n_fft=512, overlap=0.5, segment_duration_s=1.0,
                          class_names=None, filename_suffix="natural"):
    """
    Manages the extraction of MFCCs from all audio files and saves to a .npz file.
    Uses parallel processing to speed up the process.

    The archive holds three arrays: 'mfcc' (all segments), 'labels' (one label
    per segment) and 'mapping' (the class names).

    Args:
        dataframe (pd.DataFrame): DataFrame with a 'filename' column (file
            names inside AUDIO_DIR) and a 'target_label' column
        num_mfcc (int): Number of MFCCs to extract
        n_fft (int): FFT size
        overlap (float): Fraction of overlap between consecutive FFT windows;
            the hop length is n_fft * (1 - overlap), so it must be below 1
        segment_duration_s (float): Duration of each segment in seconds
        class_names (list of str, optional): Names stored under 'mapping'.
            Defaults to the ten natural-sound classes. For the 4-class water
            experiment pass ['no_water', 'pouring_water', 'toilet_flush',
            'water_drops'].
        filename_suffix (str): Last part of the output file name. Defaults to
            "natural"; the 4-class water experiment used "new".

    Raises:
        ValueError: If n_fft and overlap give a hop length below 1.
    """
    if class_names is None:
        class_names = ['rain', 'sea_waves', 'crackling_fire', 'crickets', 'chirping_birds',
                       'water_drops', 'wind', 'pouring_water', 'toilet_flush', 'thunderstorm']

    # Calculate hop_length using n_fft and the overlap fraction
    hop_length = int(n_fft * (1 - overlap))
    if hop_length < 1:
        # Otherwise every file would fail inside the workers and an empty
        # dataset would be written
        raise ValueError(
            f"n_fft={n_fft} with overlap={overlap} gives hop_length={hop_length}; it must be >= 1"
        )
    overlap_percent = int(overlap * 100)

    # Calculate the duration of each segment in milliseconds for the filename
    duration_ms = int(segment_duration_s * 1000)

    # Create a filename based on the parameters
    npz_filename = (f"esc50_mfcc_n{num_mfcc}_fft{n_fft}_ovlp{overlap_percent}"
                    f"_seg{duration_ms}ms_{filename_suffix}.npz")

    # Make sure the destination exists before doing the expensive work
    os.makedirs(OUTPUT_DIR_NPZ, exist_ok=True)
    output_path = os.path.join(OUTPUT_DIR_NPZ, npz_filename)
    print(f"Starting job for: {npz_filename}")

    # Prepare "jobs" for parallel processing: one (file_path, label) tuple per row
    jobs = [
        (os.path.join(AUDIO_DIR, filename), label)
        for filename, label in zip(dataframe['filename'], dataframe['target_label'])
    ]

    # Run Parallel Processing
    results = Parallel(n_jobs=-1, verbose=1)(
        delayed(process_audio_file)(
            file_path, label, num_mfcc, n_fft, hop_length, segment_duration_s
        ) for file_path, label in jobs
    )

    # Collect Results
    # Use .extend() since each job returns a list of segments
    final_mfccs = []
    final_labels = []
    for mfcc_list, label_list in results:
        final_mfccs.extend(mfcc_list)
        final_labels.extend(label_list)

    # Save to .npz
    np.savez_compressed(output_path,
                        mfcc=final_mfccs,
                        labels=final_labels,
                        mapping=list(class_names))

    print(f"Successfully saved data to: {output_path}\n")

In [34]:
def save_features_to_npz2(dataframe, num_mfcc=32, n_fft=512, overlap=0.5, segment_duration_s=1.0,
                          class_names=None, filename_suffix="natural"):
    """
    Manages the extraction of MFCCs from all audio files and saves to a .npz file.
    Uses parallel processing to speed up the process.

    The archive holds three arrays: 'mfcc' (all segments), 'labels' (one label
    per segment) and 'mapping' (the class names).

    Args:
        dataframe (pd.DataFrame): DataFrame with a 'filename' column (file
            names inside AUDIO_DIR) and a 'target_label' column
        num_mfcc (int): Number of MFCCs to extract
        n_fft (int): FFT size
        overlap (float): Fraction of overlap between consecutive FFT windows;
            the hop length is n_fft * (1 - overlap), so it must be below 1
        segment_duration_s (float): Duration of each segment in seconds
        class_names (list of str, optional): Names stored under 'mapping'.
            Defaults to the ten natural-sound classes. For the 4-class water
            experiment pass ['no_water', 'pouring_water', 'toilet_flush',
            'water_drops'].
        filename_suffix (str): Last part of the output file name. Defaults to
            "natural"; the 4-class water experiment used "new".

    Raises:
        ValueError: If n_fft and overlap give a hop length below 1.
    """
    if class_names is None:
        class_names = ['rain', 'sea_waves', 'crackling_fire', 'crickets', 'chirping_birds',
                       'water_drops', 'wind', 'pouring_water', 'toilet_flush', 'thunderstorm']

    # Calculate hop_length using n_fft and the overlap fraction
    hop_length = int(n_fft * (1 - overlap))
    if hop_length < 1:
        # Otherwise every file would fail inside the workers and an empty
        # dataset would be written
        raise ValueError(
            f"n_fft={n_fft} with overlap={overlap} gives hop_length={hop_length}; it must be >= 1"
        )
    overlap_percent = int(overlap * 100)

    # Calculate the duration of each segment in milliseconds for the filename
    duration_ms = int(segment_duration_s * 1000)

    # Create a filename based on the parameters
    npz_filename = (f"esc50_mfcc_n{num_mfcc}_fft{n_fft}_ovlp{overlap_percent}"
                    f"_seg{duration_ms}ms_{filename_suffix}.npz")

    # Make sure the destination exists before doing the expensive work
    os.makedirs(OUTPUT_DIR_NPZ, exist_ok=True)
    output_path = os.path.join(OUTPUT_DIR_NPZ, npz_filename)
    print(f"Starting job for: {npz_filename}")

    # Prepare "jobs" for parallel processing: one (file_path, label) tuple per row
    jobs = [
        (os.path.join(AUDIO_DIR, filename), label)
        for filename, label in zip(dataframe['filename'], dataframe['target_label'])
    ]

    # Run Parallel Processing
    results = Parallel(n_jobs=-1, verbose=1)(
        delayed(process_audio_file)(
            file_path, label, num_mfcc, n_fft, hop_length, segment_duration_s
        ) for file_path, label in jobs
    )

    # Collect Results
    # Use .extend() since each job returns a list of segments
    final_mfccs = []
    final_labels = []
    for mfcc_list, label_list in results:
        final_mfccs.extend(mfcc_list)
        final_labels.extend(label_list)

    # Save to .npz
    np.savez_compressed(output_path,
                        mfcc=final_mfccs,
                        labels=final_labels,
                        mapping=list(class_names))

    print(f"Successfully saved data to: {output_path}\n")

In [None]:
# Build and save the feature file: 64 MFCCs, 2048-sample FFT window,
# 50% window overlap, 1-second segments.
# NOTE(review): df_balanced is assigned by two different cells above; this call
# expects the 10-class natural-sounds version, because save_features_to_npz2
# stores the natural-sound class names in the archive.
save_features_to_npz2(df_balanced,
                     num_mfcc=64,
                     n_fft=2048,
                     overlap=0.5,
                     segment_duration_s=1)

Starting job for: esc50_mfcc_n64_fft2048_ovlp50_seg1000ms_natural.npz


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done 174 tasks      | elapsed:    5.2s
[Parallel(n_jobs=-1)]: Done 400 out of 400 | elapsed:    8.8s finished


Successfully saved data to: /content/drive/My Drive/Summer 2025/Python Practice/audio/npz_files/esc50_mfcc_n64_fft2048_ovlp50_seg1000ms_natural.npz\n
