This notebook is designed to augment audio data by extracting both "seconds of interest" and background "noise" from audio files.

In [1]:
import os
import scipy.signal as sp
import pandas as pd
import librosa
import soundfile as sf
import numpy as np

def get_len_audio(audio, sample_rate):
    """Return the duration of ``audio`` in seconds.

    Parameters
    ----------
    audio : sequence
        Audio samples; only its length is used.
    sample_rate : int or float
        Number of samples per second.

    Returns
    -------
    float
        ``len(audio)`` divided by ``sample_rate``.
    """
    n_samples = len(audio)
    # float() keeps the division fractional whatever the sample_rate type is.
    return n_samples / float(sample_rate)

def get_relevant_window(x, df_ann):
    """Tell whether a candidate window overlaps at least one annotation.

    The previous implementation compared distances between window and
    annotation boundaries and, because of a ``>`` in its second clause,
    flagged windows that were far away from every annotation while dropping
    some windows that really contained annotated sound. A window is now
    relevant exactly when its time span intersects an annotation's span.

    Parameters
    ----------
    x : mapping or pandas.Series
        Candidate window exposing "Start Time" and "End Time".
    df_ann : pandas.DataFrame
        Annotations with "BeginTime" and "EndTime" columns, expressed in the
        same time unit as the window bounds (assumed seconds - confirm
        against the annotation file).

    Returns
    -------
    bool
        True when at least one annotation intersects the window (bounds
        inclusive), False otherwise, including when ``df_ann`` is empty.
    """
    # No annotations means nothing can be relevant (and avoids reducing an
    # empty frame).
    if len(df_ann) == 0:
        return False

    # Same inclusive intersection test that calculate_overlap applies.
    intersects = (
        (df_ann["BeginTime"] <= x["End Time"]) &
        (df_ann["EndTime"] >= x["Start Time"])
    )
    return bool(intersects.any())

In [2]:
def calculate_overlap(df_annotations, interval_start, interval_end, unique_labels):
    """Sum, per label, how much annotated time falls inside an interval.

    Parameters
    ----------
    df_annotations : pandas.DataFrame
        Annotations with 'BeginTime', 'EndTime' and 'CallType' columns.
    interval_start, interval_end : number
        Bounds of the interval, in the same unit as the annotation times.
    unique_labels : iterable
        Labels to report, in the order the results should be returned.

    Returns
    -------
    list
        One summed overlap duration per entry of ``unique_labels``; labels
        with no matching annotation contribute a sum of zero.
    """
    begins = df_annotations['BeginTime']
    ends = df_annotations['EndTime']

    # An annotation counts when it intersects the interval (bounds inclusive).
    touches_interval = (begins <= interval_end) & (ends >= interval_start)

    # Portion of each annotation lying inside the interval; the numeric flag
    # zeroes out annotations that do not intersect it at all.
    clipped_span = ends.clip(upper=interval_end) - begins.clip(lower=interval_start)
    overlap_per_annotation = pd.to_numeric(touches_interval) * clipped_span

    call_types = df_annotations['CallType']
    totals = []
    for label in unique_labels:
        totals.append(overlap_per_annotation[call_types == label].sum())
    return totals

def get_overlap_summary(df_annotations, df_intervals, unique_labels):
    """Build a per-interval table of overlap duration for every label.

    Parameters
    ----------
    df_annotations : pandas.DataFrame
        Annotations, passed unchanged to ``calculate_overlap``.
    df_intervals : pandas.DataFrame
        Intervals with 'Start Time' and 'End Time' columns.
    unique_labels : iterable
        Labels that become one column each in the result.

    Returns
    -------
    pandas.DataFrame
        Columns 'BeginTime', 'EndTime' followed by one column per label, one
        row per interval, with missing values replaced by 0.
    """
    labels = list(unique_labels)
    result_columns = ['BeginTime', 'EndTime'] + labels

    # Collect plain rows and build the frame once: appending with
    # ``.loc[len(df)]`` re-allocates on every insert (quadratic) and starts
    # from untyped empty columns.
    rows = []
    for interval_start, interval_end in zip(df_intervals['Start Time'], df_intervals['End Time']):
        label_counts = calculate_overlap(df_annotations, interval_start, interval_end, labels)
        rows.append([interval_start, interval_end] + list(label_counts))

    result_df = pd.DataFrame(rows, columns=result_columns)
    return result_df.fillna(0)



In [3]:
# Root folder of the dataset; every other path below is derived from it.
data_download_folder = "/mnt/humpbackwhales/data"
# CSV with the annotations, read into df_annotations further down.
annotations_path = f"{data_download_folder}/preprocessed/annotations_clean.csv"
# Folder holding the source recordings (loaded as .flac files).
audio_path = f"{data_download_folder}/raw/audio"
# Output folder where the extracted .wav clips are written.
extracted_calls_path = f"{data_download_folder}/preprocessed/data_augmentation"

In [4]:
increment = 1 # step, in seconds, between the starts of consecutive sliding windows
final_audio_len = 5 # length, in seconds, of each extracted window

In [5]:
# Load the annotation table; later cells read its Filename, BeginTime,
# EndTime and CallType columns.
df_annotations = pd.read_csv(annotations_path)

In [6]:
# Distinct annotated file names: one entry per recording to process.
list_annotations = df_annotations["Filename"].drop_duplicates().to_list()

In [7]:
# Paths of every clip written by this cell.
file_list = []

# Create the output folder once, up front, instead of checking the filesystem
# before every single clip; exist_ok makes re-running the cell harmless.
os.makedirs(extracted_calls_path, exist_ok=True)

for original_filename in list_annotations:
    print(f"Processing file {original_filename}")
    flac_file_path = f"{audio_path}/{original_filename.replace('.txt', '.flac')}"
    audio, sample_rate = librosa.load(flac_file_path, sr=44100)
    total_seconds = get_len_audio(audio, sample_rate)
    print(f"Total duration of {flac_file_path}: {total_seconds:.2f} seconds")

    # Number of full windows of final_audio_len seconds that fit in the
    # recording when sliding by `increment` seconds.
    n_windows = int((total_seconds - final_audio_len) // increment) + 1
    if n_windows <= 0:
        # A recording shorter than one window yields an empty window table,
        # which the per-row apply below cannot handle; skip it explicitly.
        print(f"Skipping {original_filename}: shorter than {final_audio_len} seconds")
        continue

    # Window start times and matching end times (capped at the end of the audio)
    start = [i * increment for i in range(n_windows)]
    end = [min(i + final_audio_len, total_seconds) for i in start]

    # One row per candidate window
    df = pd.DataFrame({'Start Time': start, 'End Time': end})

    # Consider only the annotations belonging to this recording
    df_annotations_subsample = df_annotations[df_annotations["Filename"] == original_filename].copy()

    # Define which windows are relevant because they contain the target sound
    df["relevant_window"] = df.apply(lambda x: get_relevant_window(x, df_annotations_subsample), axis=1)
    df_subset = df[df["relevant_window"].astype(bool)].copy()

    print("Computing calltype percentage")
    unique_labels = df_annotations_subsample["CallType"].drop_duplicates().to_list()
    # 'Unknown_1' is not exported as a call type of its own.
    if 'Unknown_1' in unique_labels:
        unique_labels.remove('Unknown_1')

    result_df = get_overlap_summary(df_annotations_subsample, df_subset, unique_labels)

    # Columns 0-1 are BeginTime/EndTime; the label columns start at position 2.
    result_df["different_labels"] = (result_df != 0).iloc[:, 2:].sum(axis=1)
    # The trailing column is "different_labels" (just added), hence 2:-1.
    result_df["interesting_seconds"] = result_df.iloc[:, 2:-1].sum(axis=1)
    # Keep only windows with 1 calltype and between 10% and 80% interesting sound
    result_df["percentage"] = (result_df["interesting_seconds"] / final_audio_len) * 100
    result_df = result_df[
        (result_df["percentage"] >= 10)
        & (result_df["percentage"] <= 80)
        & (result_df["different_labels"] == 1)
    ].copy()

    # WRITE TO STORAGE
    for calltype in unique_labels:
        subset = result_df[result_df[calltype] != 0].copy()
        for index, row in subset.iterrows():
            start_time = row["BeginTime"]
            end_time = row["EndTime"]
            percentage = row["percentage"]

            # Convert time to sample index
            start_sample = librosa.time_to_samples(start_time, sr=sample_rate)
            end_sample = librosa.time_to_samples(end_time, sr=sample_rate)

            # Extract the sample
            extracted_sample = audio[start_sample:end_sample]

            # Save the extracted sample to a new file
            filename = f"{extracted_calls_path}/{original_filename.split('.')[0]}_5secs_{round(start_time)}start_{calltype}_{round(percentage)}perc.wav"
            file_list.append(filename)
            sf.write(filename, extracted_sample, sample_rate)

Processing file 211026-133018-OS-humpback-47min-clip.flac
Total duration of /mnt/humpbackwhales/data/raw/audio/211026-133018-OS-humpback-47min-clip.flac: 2820.00 seconds
Computing calltype percentage
Processing file OS_10_03_2021_19_34_00_.flac
Total duration of /mnt/humpbackwhales/data/raw/audio/OS_10_03_2021_19_34_00_.flac: 1800.32 seconds
Computing calltype percentage
Processing file OS_10_28_2021_18_54_00_.flac
Total duration of /mnt/humpbackwhales/data/raw/audio/OS_10_28_2021_18_54_00_.flac: 1800.08 seconds
Computing calltype percentage
Processing file OS_10_28_2021_1900_HB.flac
Total duration of /mnt/humpbackwhales/data/raw/audio/OS_10_28_2021_1900_HB.flac: 237.14 seconds
Computing calltype percentage
Processing file OS_10_28_2021_19_24_00_.flac
Total duration of /mnt/humpbackwhales/data/raw/audio/OS_10_28_2021_19_24_00_.flac: 1800.08 seconds
Computing calltype percentage
Processing file OS_10_28_2021_19_55_00_.flac
Total duration of /mnt/humpbackwhales/data/raw/audio/OS_10_28_20

In [8]:
import os

def get_folder_size(folder_path):
    """Return the combined size of all files under ``folder_path``.

    The folder is walked recursively with ``os.walk`` and the sizes of all
    files found are added up; the total is then formatted by
    ``convert_bytes``.

    Parameters
    ----------
    folder_path : str
        Folder whose content is measured.

    Returns
    -------
    str
        Human-readable size as produced by ``convert_bytes``.
    """
    # Add up the size of every file found at any depth below folder_path.
    byte_count = sum(
        os.path.getsize(os.path.join(root, name))
        for root, _subdirs, names in os.walk(folder_path)
        for name in names
    )

    # Hand the raw byte count to the formatter.
    return convert_bytes(byte_count)

def convert_bytes(bytes):
    """Format a byte count as a human-readable string (B, KB, MB, GB, TB).

    The value is divided by 1024 until it drops below 1024 or the largest
    unit is reached. Values of 1024 TB or more stay expressed in TB; the
    previous loop divided once more after running out of units, so such
    values were under-reported by a factor of 1024.

    Parameters
    ----------
    bytes : int or float
        Size in bytes.

    Returns
    -------
    str
        The size with two decimals followed by its unit, e.g. "1.50 KB".
    """
    units = ['B', 'KB', 'MB', 'GB', 'TB']
    size = float(bytes)
    unit = units[0]
    for unit in units:
        # Stop as soon as the value fits the unit, and never scale past the
        # last unit available.
        if size < 1024.0 or unit == units[-1]:
            break
        size /= 1024.0
    return f"{size:.2f} {unit}"

if __name__ == "__main__":
    # Report how much disk space the extracted clips occupy.
    folder_path = extracted_calls_path
    folder_size = get_folder_size(folder_path)
    # Name the folder and its size; the earlier message quoted the size where
    # the folder name was expected.
    print(f"Total size of '{folder_path}': {folder_size}")


Total size of '838.28 MB'
