### This file extracts features from audio files stored in directories.
1. The audio files are stored in different folders, each named after its class.
2. The functions extract a mel spectrogram and MFCCs from each audio file.
3. For alignment, the extracted features are sliced into segments of `seg size` frames (in this notebook, seg=256).
4. If the last segment is shorter than `seg size` but at least half of it, it is padded with 0s; otherwise it is discarded.

In [52]:
import os
import numpy as np
import librosa
import librosa.display
import matplotlib.pyplot as plt


In [53]:
# Show the installed NumPy version.
np.__version__

'1.24.4'

In [54]:
# Dataset location and feature-extraction settings shared by the cells below.
root_dir = "/data2/xiangrui_d2/MMC/Dataset/Watkins_full"
seg = 256  # number of rows kept in every saved segment
n_mels = 256  # passed to librosa.feature.melspectrogram
sample_rate = 8000  # passed to librosa.load as the target rate
n_fft = 512  # passed to librosa.feature.melspectrogram
hop_length = 256  # passed to librosa.feature.melspectrogram
save_dir = "/data2/xiangrui_d2/MMC/Dataset/features/watkins_full/13c_8k_256_256"

# makedirs also creates missing parent directories, and exist_ok=True removes
# the need for a separate (race-prone) os.path.exists check.
os.makedirs(save_dir, exist_ok=True)

In [55]:
# Index -> entry-name lookup; filled from os.listdir(root_dir) in the next cell.
label_name={}

In [56]:
# Number the entries found directly under root_dir in listing order and
# store them in label_name, then display the mapping.
label_name.update(enumerate(os.listdir(root_dir)))
label_name

{0: 'West Indian Manatee',
 1: 'Beluga, White Whale',
 2: 'Gray Whale',
 3: 'Common Dolphin',
 4: 'Walrus',
 5: 'Harp Seal',
 6: 'Finback Whale',
 7: 'Steller Sea Lion',
 8: "Heaviside's Dolphin",
 9: 'Killer Whale',
 10: 'Southern Right Whale',
 11: 'Hooded Seal',
 12: 'Clymene Dolphin',
 13: 'Irawaddy Dolphin',
 14: 'Narwhal',
 15: 'Striped Dolphin',
 16: 'Atlantic Spotted Dolphin',
 17: 'Northern Right Whale',
 18: 'Harbor Porpoise',
 19: "Dall's Porpoise",
 20: 'Bottlenose Dolphin',
 21: 'Gray Seal',
 22: 'Finless Porpoise',
 23: 'Minke Whale',
 24: 'sperm whale',
 25: 'Long Beaked (Pacific) Common Dolphin',
 26: 'Ross Seal',
 27: 'Harbour Seal',
 28: 'Pantropical Spotted Dolphin',
 29: 'Ringed Seal',
 30: 'Spotted Seal',
 31: 'Ribbon Seal',
 32: 'Bowhead Whale',
 33: 'New Zealand Fur Seal',
 34: 'False Killer Whale',
 35: 'Leopard Seal',
 36: 'Sea Otter',
 37: 'Juan Fernandez Fur Seal',
 38: 'White-beaked Dolphin',
 39: 'Melon Headed Whale',
 40: 'Short-Finned (Pacific) Pilot Whal

In [57]:
# keeplist = [1.0, 3.0, 4.0, 40.0, 21.0, 6.0, 2.0, 24.0, 5.0, 16.0, 7.0, 47.0, 33.0]

# print(len(keeplist))
# import csv
# class_limited = []
# for i in keeplist:
#     class_limited.extend([label_name[i]])
# print(class_limited)

In [1]:
# Folder names of the classes to keep; the cells below skip every other folder.
class_limited = [
    'sperm whale',
    'Finback Whale',
    'Humpback Whale',
    'Killer Whale',
    'Short-Finned (Pacific) Pilot Whale',
    'Long-Finned Pilot Whale',
    'Pantropical Spotted Dolphin',
    'Spinner Dolphin',
    'Common Dolphin',
    'Bottlenose Dolphin',
    'Weddell Seal',
    'Bowhead Whale',
]

In [5]:
import os
import shutil

source = '/data2/xiangrui_d2/MMC/Dataset/Watkins_full'
dest = '/data2/xiangrui_d2/MMC/Dataset/Waktins12/wav/original'

# Copy every selected class folder to dest/<class name>.
# shutil.copytree is used instead of a shell `cp -r` string because it needs no
# quoting (folder names may contain apostrophes), creates missing parent
# directories, and raises on failure instead of returning an ignored exit code.
# dirs_exist_ok=True merges into an already existing destination folder.
for i in os.listdir(source):
    if i in class_limited:
        shutil.copytree(
            os.path.join(source, i),
            os.path.join(dest, i),
            dirs_exist_ok=True,
        )

In [59]:
def slicing_and_padding_single_data(data,seg=30):
    """Cut a 2D array into consecutive pieces of exactly ``seg`` rows.

    The array is walked from the first row in steps of ``seg``. Every complete
    step yields ``data[start:start + seg, :]``. The remaining tail is kept only
    if it has at least ``seg / 2`` rows, in which case zero rows are appended
    until it is ``seg`` rows long; a shorter tail is dropped.

    Args:
        data: 2D array; slicing and padding happen along axis 0.
        seg (int, optional): Number of rows in every returned piece.
            Defaults to 30.

    Returns:
        list: Arrays of shape ``(seg, data.shape[1])`` in original order.
        Empty when ``data`` has fewer than ``seg / 2`` rows.

    Raises:
        ValueError: If ``seg`` is not positive (the slicing would never
            advance).
    """
    if seg <= 0:
        raise ValueError(f"seg must be a positive integer, got {seg!r}")

    total_rows = data.shape[0]
    segments = []

    for start in range(0, total_rows, seg):
        remaining = total_rows - start

        if remaining >= seg:
            segments.append(data[start:start + seg, :])
        elif remaining >= seg / 2:
            # Tail is at least half a segment: fill the missing rows with 0s.
            pad = np.zeros((seg - remaining, data.shape[1]))
            segments.append(np.vstack((data[start:, :], pad)))
        # Otherwise the tail is shorter than half a segment and is discarded.

    return segments


In [60]:
def batch_feature_extraction(dir,sample_rate,seg=seg,):
    """Extract fixed-length mel-spectrogram segments for each selected class.

    Walks ``dir`` recursively and keeps every folder whose name appears in the
    module-level ``class_limited`` list. For each kept folder, every file that
    does not end in ``.csv`` is loaded with librosa, converted to a dB-scaled
    mel spectrogram (using the module-level ``n_mels``, ``n_fft`` and
    ``hop_length``), transposed, and cut into ``seg``-row pieces with
    ``slicing_and_padding_single_data``. The pieces of one class and a label
    column holding the class name are written with ``np.savez`` to
    ``<save_dir>/<class name>_specs.csv`` (``np.savez`` appends ``.npz``).

    Args:
        dir: Root directory that contains the class folders.
        sample_rate: Sampling rate handed to ``librosa.load``.
        seg: Number of rows in every saved segment.

    Returns:
        None. All results are written to ``save_dir``.
    """
    # Collect the directory and name of every selected class folder.
    folder_list = []
    class_list = []
    for root, folders, _files in os.walk(dir):
        for folder in folders:
            if folder not in class_limited:
                continue
            print('Find Class folder ', folder)
            folder_list.append(os.path.join(root, folder))
            class_list.append(folder)

    print('Find Class folder in the limited list: ', len(class_list))
    print(class_list)

    for folder_dir, class_name in zip(folder_list, class_list):
        specs = []
        print('going through folder', folder_dir)

        for file in os.listdir(folder_dir):
            if file.endswith(".csv"):
                continue

            audio_dir = os.path.join(folder_dir, file)
            # A nested directory cannot be decoded as audio; skip it.
            if not os.path.isfile(audio_dir):
                continue

            # sr=sample_rate resamples on load (librosa's own default is 22050).
            y, sr = librosa.load(audio_dir, sr=sample_rate)
            if len(y) < 2048:
                print('The length of audio is less than 2048 samples ',audio_dir)
                continue

            # dB-scaled mel spectrogram, transposed so that slicing along
            # axis 0 cuts along the frame axis.
            spec = librosa.feature.melspectrogram(
                y=y, sr=sr, n_mels=n_mels, n_fft=n_fft, hop_length=hop_length
            )
            spec = librosa.power_to_db(spec)
            specs.extend(slicing_and_padding_single_data(spec.T, seg=seg))

        print('There are {} sliced and padded data samples under \n{}\n'.format(len(specs),folder_dir))

        # Sanity check: every segment must have exactly `seg` rows.
        for segment in specs:
            assert segment.shape[0] == seg, f"unexpected segment shape {segment.shape}"

        # Save the segments and a matching label column for this class.
        specs = np.array(specs)
        current_labels = np.full((specs.shape[0], 1), class_name)
        np.savez(os.path.join(save_dir,class_name+'_specs.csv'),specs,current_labels)


# Run the extraction over root_dir; the per-class archives are written to save_dir.
batch_feature_extraction(root_dir,sample_rate=sample_rate,seg=seg)

Find Class folder  Common Dolphin
Find Class folder  Finback Whale
Find Class folder  Killer Whale
Find Class folder  Bottlenose Dolphin
Find Class folder  sperm whale
Find Class folder  Pantropical Spotted Dolphin
Find Class folder  Bowhead Whale
Find Class folder  Short-Finned (Pacific) Pilot Whale
Find Class folder  Spinner Dolphin
Find Class folder  Weddell Seal
Find Class folder  Long-Finned Pilot Whale
Find Class folder  Humpback Whale
Find Class folder in the limited list:  12
['Common Dolphin', 'Finback Whale', 'Killer Whale', 'Bottlenose Dolphin', 'sperm whale', 'Pantropical Spotted Dolphin', 'Bowhead Whale', 'Short-Finned (Pacific) Pilot Whale', 'Spinner Dolphin', 'Weddell Seal', 'Long-Finned Pilot Whale', 'Humpback Whale']
going through folder /data2/xiangrui_d2/MMC/Dataset/Watkins_full/Common Dolphin
The length of audio is less than 2048 samples  /data2/xiangrui_d2/MMC/Dataset/Watkins_full/Common Dolphin/58014005.wav
The length of audio is less than 2048 samples  /data2/xia

In [61]:
def merge(dir):
    """Concatenate every per-class ``.npz`` archive in ``dir`` into one dataset.

    Each archive is expected to hold the features under ``arr_0`` and the
    matching label column under ``arr_1`` (the layout produced by calling
    ``np.savez`` with two positional arrays). Archives whose feature array is
    empty are skipped, as are files that are not ``.npz`` archives.

    Args:
        dir: Directory containing the ``.npz`` archives.

    Returns:
        tuple: ``(merged_X, merged_y)`` where ``merged_X`` stacks all feature
        arrays along axis 0 and ``merged_y`` is the flattened 1D array of the
        corresponding labels. When nothing is found, an empty
        ``(0, seg, n_mels)`` array and an empty 1D array are returned (using
        the module-level ``seg`` and ``n_mels``).
    """
    features = []
    labels = []

    for file in os.listdir(dir):
        # Only np.savez archives can be indexed by 'arr_0'/'arr_1'; anything
        # else in the folder (e.g. a saved overall.npy) is not part of the input.
        if not file.endswith('.npz'):
            continue

        # The context manager closes the archive's file handle after reading.
        with np.load(os.path.join(dir, file)) as curr_data:
            X = curr_data['arr_0']
            y = curr_data['arr_1']

        if X.shape[0] == 0:
            continue
        features.append(X)
        labels.append(y)

    if features:
        # One concatenation instead of re-copying the growing array per file.
        merged_X = np.concatenate(features, axis=0)
        # Results were historically at least float64 (the stack used to be
        # seeded with a float64 row); keep that so downstream dtypes match.
        merged_X = merged_X.astype(
            np.promote_types(merged_X.dtype, np.float64), copy=False
        )
        merged_y = np.concatenate(labels, axis=0).ravel()
    else:
        merged_X = np.zeros((0, seg, n_mels))
        merged_y = np.zeros((0,))

    print(merged_X.shape)
    print(merged_y.shape)
    return merged_X, merged_y

In [62]:
# Merge the archives found in save_dir into one feature array and one label array.
merged_X,merged_y = merge(save_dir)


(9346, 256, 256)
(9346,)


In [63]:
# np.save writes exactly one array (its third positional parameter is
# allow_pickle, not a second array), so features and labels are stored in
# separate files: overall.npy keeps the features, overall_labels.npy the labels.
np.save(f"{save_dir}/overall", merged_X)
np.save(f"{save_dir}/overall_labels", merged_y)

In [64]:
# Read back the array stored in overall.npy.
data = np.load(f"{save_dir}/overall.npy")

In [65]:
from collections import Counter

# Tally how many entries of merged_y carry each label.
yy = merged_y.tolist()
ct = Counter()
ct.update(yy)

In [66]:
# Display the per-label counts.
ct

Counter({'sperm whale': 5063,
         'Finback Whale': 1691,
         'Humpback Whale': 811,
         'Short-Finned (Pacific) Pilot Whale': 318,
         'Killer Whale': 307,
         'Spinner Dolphin': 207,
         'Bottlenose Dolphin': 195,
         'Weddell Seal': 184,
         'Pantropical Spotted Dolphin': 157,
         'Common Dolphin': 139,
         'Bowhead Whale': 139,
         'Long-Finned Pilot Whale': 135})

In [67]:
def merge_and_labels(root_dir,seg,spec_dim=240,mfcc_dim=40):
    """Load per-class ``specs.csv``/``mfccs.csv`` files and stack them with integer labels.

    Every entry of ``root_dir`` whose name is in the module-level
    ``class_limited`` list (and does not end in ``csv``) is treated as a class
    folder. Its ``specs.csv`` and ``mfccs.csv`` are read with ``np.loadtxt``,
    reshaped to 3D, and given the next free integer label, assigned in
    ``os.listdir`` order starting at 0.

    Args:
        root_dir: Directory containing the class folders.
        seg: Number of rows per sample used to restore the 3D shape.
        spec_dim (int, optional): Feature width of the spectrogram rows.
            Defaults to 240.
        mfcc_dim (int, optional): Feature width of the MFCC rows.
            Defaults to 40.

    Returns:
        tuple: ``(collected_specs, collected_mfccs, collected_y)`` with shapes
        ``(N, seg, spec_dim)``, ``(N, seg, mfcc_dim)`` and ``(N, 1)``; all are
        empty along axis 0 when no class folder is found.
    """
    labels = []
    spec_parts = []
    mfcc_parts = []
    y_parts = []

    for folder in os.listdir(root_dir):
        if folder.endswith('csv'):
            continue
        if folder not in class_limited:
            continue
        folder_dir = os.path.join(root_dir, folder)

        # The label is the number of classes accepted so far.
        counter = len(labels)

        specs = np.loadtxt(os.path.join(folder_dir,"specs.csv"), delimiter=",")
        mfccs = np.loadtxt(os.path.join(folder_dir,"mfccs.csv"), delimiter=",")

        # Restore the flattened CSV rows to 3D.
        specs = specs.reshape(-1, seg, spec_dim)
        mfccs = mfccs.reshape(-1, seg, mfcc_dim)

        spec_parts.append(specs)
        mfcc_parts.append(mfccs)
        # float dtype keeps the label array's historical float64 type.
        y_parts.append(np.full((specs.shape[0], 1), counter, dtype=float))

        labels.append((folder, counter))

    if labels:
        # Concatenate once instead of re-stacking the growing arrays per class.
        collected_specs = np.concatenate(spec_parts, axis=0)
        collected_mfccs = np.concatenate(mfcc_parts, axis=0)
        collected_y = np.concatenate(y_parts, axis=0)
    else:
        collected_specs = np.empty((0, seg, spec_dim))
        collected_mfccs = np.empty((0, seg, mfcc_dim))
        collected_y = np.empty((0, 1))

    print(labels) 
    
    print('\nThe overall mfcc data has shape of ',collected_mfccs.shape)

    print('\nThe overall spectrogram data has shape of ',collected_specs.shape)
    print('\nThe labels have shape of ',collected_y.shape)

    return collected_specs,collected_mfccs,collected_y

# collected_specs,collected_mfccs,collected_y = merge_and_labels(root_dir,seg)