In [9]:
# %pip install quilt3[pyarrow]==5.3.1
# %pip install librosa
# %pip install azure-storage-blob
# %pip install python-dotenv

In [1]:
import os
import quilt3
import pandas as pd
import numpy as np
from dotenv import load_dotenv

import librosa
import soundfile as sf
import functools

from azure.storage.blob import BlobServiceClient

import matplotlib.pyplot as plt

import tqdm as notebook_tqdm

In [2]:
def list_folders(directory):
    """Return the names of the immediate sub-directories of ``directory``.

    Entries that are not directories are ignored. The names come back in the
    order in which ``os.scandir`` yields them.
    """
    with os.scandir(directory) as dir_entries:
        return [item.name for item in dir_entries if item.is_dir()]


def list_files(directory, extension):
    """Return the sorted names of regular files in ``directory`` that end with ``extension``.

    Args:
        directory: Path of the folder to scan (not recursive).
        extension: Suffix to match, e.g. ``".txt"``; compared with ``str.endswith``.

    Returns:
        A list of file names (not full paths) in ascending order. Sub-directories
        are skipped even if their name ends with ``extension``.
    """
    # os.listdir returns entries in arbitrary order; sorting keeps positional
    # indexing into the result (e.g. ``names[1]``) reproducible across runs.
    return sorted(
        name
        for name in os.listdir(directory)
        if name.endswith(extension) and os.path.isfile(os.path.join(directory, name))
    )

In [3]:
# Root folder under which this notebook builds all of its data paths.
# NOTE(review): presumably a mounted file share - confirm the mount point before running.
FILESHARE_PATH = "/mnt/humpbackwhales/"

# Download labeled data from Amazon S3

In [4]:
# All data used by this notebook lives under <FILESHARE_PATH>/data.
# os.path.join is used instead of string interpolation so that the trailing
# slash in FILESHARE_PATH does not end up as "//" inside every path.
data_download_folder = os.path.join(FILESHARE_PATH, "data")
# Raw inputs: annotation tables and audio recordings.
annotations_path = os.path.join(data_download_folder, "raw", "annotations")
audio_path = os.path.join(data_download_folder, "raw", "audio")
# Outputs: extracted call clips and the cleaned, combined annotation table.
extracted_calls_path = os.path.join(data_download_folder, "preprocessed")
output_clean_annotations = os.path.join(extracted_calls_path, "annotations_clean.csv")

In [13]:
# Open the public AWS S3 bucket that hosts the labeled recordings.
b = quilt3.Bucket("s3://acoustic-sandbox")

# (S3 key prefix, local destination folder) for every folder to download.
downloads = (
    # annotation files
    ("humpbacks/Emily-Vierling-Orcasound-data/Em_HW_data/Annotations/", f"{data_download_folder}/raw/annotations/"),
    # audio data
    ("humpbacks/Emily-Vierling-Orcasound-data/Em_HW_data/flac_files/", f"{data_download_folder}/raw/audio/"),
)
for s3_key, local_dir in downloads:
    b.fetch(s3_key, local_dir)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 76.8k/76.8k [00:02<00:00, 26.8kB/s]


In [18]:
# Make sure the output folder for the extracted calls exists.
# exist_ok=True keeps the cell safe to re-run and avoids the gap between
# checking for the folder and creating it.
os.makedirs(extracted_calls_path, exist_ok=True)

# Extract humpback whale vocalizations from raw audio files

In [5]:
# Collect the names of the downloaded annotation tables and audio recordings.
annotation_filenames = list_files(annotations_path, ".txt")
audio_filenames = list_files(audio_path, ".flac")

# Preview one tab-separated annotation table to see which columns it has.
sample_annotation_name = annotation_filenames[1]
print(sample_annotation_name)
df = pd.read_csv(f"{annotations_path}/{sample_annotation_name}", sep="\t")
df.head(10)

OS_10_03_2021_19_34_00_.Table.1.selections.txt


Unnamed: 0,Selection,Begin Time (s),End Time (s),Low Freq (Hz),High Freq (Hz),Call Type
0,1,1646.999571,1648.733984,628.263,1297.059,Ascending moan
1,2,1653.223452,1654.641001,749.862,1134.926,Moan
2,3,1659.862135,1660.595925,770.129,1033.594,Moan
3,4,1661.796673,1663.747887,283.732,709.329,Ascending moan
4,5,1678.344185,1680.262045,506.664,1013.327,Moan
5,6,1684.197828,1687.39982,303.998,1155.193,Descending moan
6,7,1689.378332,1689.628487,2229.32,2452.252,Chirp
7,8,1691.546347,1693.180698,709.329,1114.66,Ascending moan
8,9,1695.532161,1696.599492,162.132,466.131,Growl
9,10,1756.907157,1758.291352,729.596,993.061,Ascending moan


In [18]:
# (rows, columns) of the annotation table currently held in `df`.
df.shape

(16, 6)

In [19]:
# Names of the ".txt" annotation files found in `annotations_path`.
annotation_filenames

['211026-133018-OS-humpback-47min-clip.Table.1.selections.txt',
 'OS_10_03_2021_19_34_00_.Table.1.selections.txt',
 'OS_10_28_2021_18_54_00_.Table.1.selections.txt',
 'OS_10_28_2021_1900_HB.Table.1.selections.txt',
 'OS_10_28_2021_19_24_00_.Table.1.selections.txt',
 'OS_10_28_2021_19_55_00_.Table.1.selections.txt',
 'OS_10_28_2021_20_25_00_HB.Table.1.selections.txt']

In [20]:
# Names of the ".flac" recordings found in `audio_path`.
audio_filenames

['211026-133018-OS-humpback-47min-clip.flac',
 'OS_10_03_2021_19_34_00_.flac',
 'OS_10_28_2021_18_54_00_.flac',
 'OS_10_28_2021_1900_HB.flac',
 'OS_10_28_2021_19_24_00_.flac',
 'OS_10_28_2021_19_55_00_.flac',
 'OS_10_28_2021_20_25_00_HB.flac']

In [21]:
# Number of rows per distinct "Call Type" value in `df`.
df["Call Type"].value_counts()

Call Type
Ascending moan     4
Descending moan    4
Moan               3
Whup               2
Chirp              1
Growl              1
Piccalo            1
Name: count, dtype: int64

In [8]:
# Load the first recording with sr=None (keep the file's own sample rate)
# and report that rate.
first_audio_path = f"{audio_path}/{audio_filenames[0]}"
x, sr = librosa.load(first_audio_path, sr=None)
print(f"Original sample rate of lossless recordings (*.flac): {sr}")

Original sample rate of lossless recordings (*.flac): 44100


In [6]:
def clean_calltype(series):
    """Normalize call-type labels.

    Spaces are replaced with underscores, then a fixed set of known
    misspellings is mapped onto the intended label. Labels that are not in
    the table pass through unchanged.
    """
    # Intended label -> misspellings that should be folded into it.
    known_typos = {
        "Ascending_moan": ("Asceding_moan", "Ascening_moan"),
        "Grunt": ("Gunt",),
        "Chirp": ("Chrip",),
        "Whup": ("whup", "Whuo"),
    }
    typo_to_label = {
        typo: label
        for label, typos in known_typos.items()
        for typo in typos
    }
    underscored = series.str.replace(' ', '_')
    return underscored.replace(typo_to_label)

In [10]:
# For every recording: read its selection table, cut each annotated selection
# out of the audio and save it as <extracted_calls_path>/<call type>/<recording>_<selection>.wav.
# The cleaned per-recording tables are collected in `all_annotations`.
all_annotations = []
for audio_file_name in audio_filenames:
    print(f"Processing {audio_path}/{audio_file_name}")
    audio, sr = librosa.load(f"{audio_path}/{audio_file_name}", sr=None)  # sr=None means that we use original sample rate

    # The selection table shares the recording's name, with the audio extension
    # swapped for ".Table.1.selections.txt". splitext only strips the final
    # extension, so a ".flac" elsewhere in the name is left alone.
    recording_stem = os.path.splitext(audio_file_name)[0]
    # A separate name is used so the `df` previewed in earlier cells is not overwritten.
    file_annotations = pd.read_csv(
        f"{annotations_path}/{recording_stem}.Table.1.selections.txt", sep="\t"
    )
    # fix some misspellings of vocalization names
    file_annotations["Call Type"] = clean_calltype(file_annotations["Call Type"])
    file_annotations["Filename"] = audio_file_name

    # Create one output folder per call type up front instead of checking the
    # file system again for every single selection.
    for call_type in file_annotations["Call Type"].unique():
        os.makedirs(f"{extracted_calls_path}/{call_type}", exist_ok=True)

    # convert begin/end times (seconds) to sample indices for the whole table at once
    start_samples = librosa.time_to_samples(file_annotations["Begin Time (s)"].to_numpy(), sr=sr)
    end_samples = librosa.time_to_samples(file_annotations["End Time (s)"].to_numpy(), sr=sr)

    for selection, call_type, start_sample, end_sample in zip(
        file_annotations["Selection"], file_annotations["Call Type"], start_samples, end_samples
    ):
        # extract the selection and save it to its own file
        clip_path = f"{extracted_calls_path}/{call_type}/{recording_stem}_{selection}.wav"
        sf.write(clip_path, audio[start_sample:end_sample], sr)

    all_annotations.append(file_annotations)

Processing /mnt/humpbackwhales//data/raw/audio/211026-133018-OS-humpback-47min-clip.flac
Processing /mnt/humpbackwhales//data/raw/audio/OS_10_03_2021_19_34_00_.flac
Processing /mnt/humpbackwhales//data/raw/audio/OS_10_28_2021_18_54_00_.flac
Processing /mnt/humpbackwhales//data/raw/audio/OS_10_28_2021_1900_HB.flac
Processing /mnt/humpbackwhales//data/raw/audio/OS_10_28_2021_19_24_00_.flac
Processing /mnt/humpbackwhales//data/raw/audio/OS_10_28_2021_19_55_00_.flac
Processing /mnt/humpbackwhales//data/raw/audio/OS_10_28_2021_20_25_00_HB.flac


In [11]:
# Merge the per-recording tables into one frame and shorten the column names.
# ignore_index=True gives every row a unique label; without it each recording's
# rows restart at 0, so a label lookup such as .loc[0] returns one row per recording.
all_annotations_pd = pd.concat(all_annotations, ignore_index=True).rename(columns={
    'Begin Time (s)': 'BeginTime', 'End Time (s)': 'EndTime',
    'Low Freq (Hz)': 'LowFreq', 'High Freq (Hz)': 'HighFreq', 'Call Type': 'CallType'
})

print(f'Total records: {len(all_annotations_pd)}')

all_annotations_pd['Duration'] = all_annotations_pd.EndTime - all_annotations_pd.BeginTime  # In seconds

all_annotations_pd.head()

Total records: 1514


Unnamed: 0,Selection,BeginTime,EndTime,LowFreq,HighFreq,CallType,Filename,Duration
0,1,245.992197,247.366463,263.518,2845.995,Descending_moan,211026-133018-OS-humpback-47min-clip.flac,1.374266
1,2,253.276801,257.038976,263.518,1791.923,Descending_moan,211026-133018-OS-humpback-47min-clip.flac,3.762175
2,3,259.368406,259.836241,2898.699,3531.142,Whistle,211026-133018-OS-humpback-47min-clip.flac,0.467835
3,4,262.101362,264.849895,158.111,1844.626,Descending_moan,211026-133018-OS-humpback-47min-clip.flac,2.748532
4,5,266.721236,267.578934,1106.776,3056.809,Whistle,211026-133018-OS-humpback-47min-clip.flac,0.857698


In [12]:
# Write the combined annotation table to CSV; index=False leaves the row index out of the file.
all_annotations_pd.to_csv(output_clean_annotations, index=False)

In [18]:
# Count the ".wav" clips that were written to the "Ascending_moan" folder.
ascending_moan_dir = f"{extracted_calls_path}/Ascending_moan"
ascending_moan_clips = list_files(ascending_moan_dir, ".wav")
len(ascending_moan_clips)

# Upload vocalization files into Azure Blob storage

To upload audio files into Azure Blob Storage, create a file named `local.env` with the following two lines:

- AZURE_BLOB_CONNECTION_STRING = "to be copied from Azure portal"
- AZURE_BLOB_CONTAINER_NAME = "existing container in Blob storage where files should be uploaded, e.g. azureml"

In [105]:
# load environment variables from local.env file
load_dotenv(dotenv_path="local.env")

connection_string = os.getenv("AZURE_BLOB_CONNECTION_STRING")
container_name = os.getenv("AZURE_BLOB_CONTAINER_NAME") # Name of container where files should be uploaded

blob_service_client = BlobServiceClient.from_connection_string(connection_string)
blob_container_client = blob_service_client.get_container_client(container_name)

local_source_files_folder = data_download_folder

for root, dirs, files in os.walk(local_source_files_folder):
    for file in files:
        file_path = os.path.join(root, file)
        blob_name = os.path.relpath(file_path, local_source_files_folder).replace("\\", "/")
        blob_client = blob_container_client.get_blob_client(blob_name)
        
        with open(file_path, "rb") as data:
            blob_client.upload_blob(data)