In [1]:
import glob, os
import pandas as pd
import librosa
import librosa.display
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import itertools
from sklearn.preprocessing import StandardScaler

In [2]:
# Directory the recordings are read from (used by extract_audio_frames).
# NOTE(review): "orginal" looks like a misspelling of "original" — confirm it
# matches the folder name on disk before changing it.
DATA_INPUT_PATH = "../data/orginal/"
# Directory the per-recording .npz archives are written to.
DATA_OUTPUT_PATH = "../data/converted/"

In [3]:
def apply_begin_time(labels, offsets):
    """Add file-relative begin/end time columns to ``labels``.

    For every row, the offset stored in ``offsets`` under the row's
    'Begin File' value is subtracted from 'Begin Time (s)' and from
    'End Time (s)'. The results are written to the columns
    'Relative Begin Time (s)' and 'Relative End Time (s)'.

    Args:
        labels: DataFrame with the columns 'Begin Time (s)', 'End Time (s)'
            and 'Begin File'. It is modified in place.
        offsets: Mapping from a 'Begin File' value to the offset to subtract.

    Returns:
        The same ``labels`` object, with the two relative-time columns set.

    Raises:
        KeyError: If a 'Begin File' value has no entry in ``offsets``.
    """
    # One lookup per row, shared by both columns. Building a plain array
    # (instead of a row-wise DataFrame.apply) also keeps this working when
    # ``labels`` has no rows, and avoids any index alignment.
    file_offsets = np.array(
        [offsets[name] for name in labels['Begin File']], dtype=float
    )

    labels['Relative Begin Time (s)'] = labels['Begin Time (s)'] - file_offsets
    # The end time is deliberately shifted by the *begin* file's offset too,
    # exactly as the begin time is.
    labels['Relative End Time (s)'] = labels['End Time (s)'] - file_offsets

    return labels

    
def count_offsets(wavs):
    """Map each file's base name to the total duration of the files before it.

    The files are visited in the order given; the first one gets 0.0 and each
    later one gets the running sum of the durations that
    ``librosa.get_duration`` reported for its predecessors.

    Args:
        wavs: Iterable of audio file paths.

    Returns:
        dict keyed by ``os.path.basename`` of each path.
    """
    start_by_name = {}
    elapsed = 0.0
    for wav_path in wavs:
        length = librosa.get_duration(filename=wav_path)
        start_by_name[os.path.basename(wav_path)] = elapsed
        elapsed += length
    return start_by_name
        
    
def reformat_labels(labels, wavs):
    """Normalise the 22 kHz label spelling and add file-relative times.

    Rows whose 'USV TYPE' is '22-kHz call' or '22-kHz' are rewritten to
    '22kHz' in place. The offsets computed by ``count_offsets(wavs)`` are then
    applied through ``apply_begin_time``.

    Args:
        labels: DataFrame of annotations; modified in place.
        wavs: Iterable of audio file paths, forwarded to ``count_offsets``.

    Returns:
        Whatever ``apply_begin_time`` returns for the updated labels.
    """
    old_spellings = ['22-kHz call', '22-kHz']
    needs_rename = labels['USV TYPE'].isin(old_spellings)
    labels.loc[needs_rename, 'USV TYPE'] = '22kHz'

    file_offsets = count_offsets(wavs)
    return apply_begin_time(labels, file_offsets)
    

def load_labels(input_path):
    """Read the tab-separated annotation table stored under ``input_path``.

    Args:
        input_path: Prefix that is concatenated directly with the file name,
            so it has to end with a path separator.

    Returns:
        DataFrame indexed by the 'Selection' column.
    """
    labels_file = input_path + 'dlaUJ_VPAmodel_koh3_tickling_20190328.txt'
    return pd.read_csv(labels_file, delimiter="\t", index_col='Selection')


In [4]:
from types import SimpleNamespace

def freq_to_fft_bucket(freq, n_fft, sr):
    """Convert a frequency to its (fractional) FFT bin position.

    Bin ``k`` of an ``n_fft``-point FFT sits at ``k * sr / n_fft``, so the
    inverse is ``freq * n_fft / sr``: 0 maps to bin 0 and the Nyquist
    frequency ``sr / 2`` maps to the last bin, ``n_fft / 2``. (Scaling by the
    bin *count* ``n_fft // 2 + 1`` instead would place Nyquist one bin past
    the end of the spectrum.)

    Args:
        freq: Frequency, in the same unit as ``sr``.
        n_fft: FFT window length.
        sr: Sampling rate.

    Returns:
        The bin position as a float; callers truncate it to get an index.
    """
    return freq * n_fft / sr

def get_audio(path):
    """Return ``librosa.load(path, sr=None)``, loading each path only once.

    Results are memoised in the ``get_audio.cache`` dict, keyed by ``path``,
    so repeated calls for the same path return the stored object.
    """
    loaded = get_audio.cache
    if path in loaded:
        return loaded[path]
    loaded[path] = librosa.load(path, sr=None)
    return loaded[path]


# Memo table for get_audio: path -> value returned by librosa.load.
get_audio.cache = {}

def extract_audio_frames(audio_name, data, n_fft=512, hop_length=128, trim_frames=True, verbose=False, to_remove_pivot=-0.7):
    """Compute a magnitude spectrogram for one recording and label its frames.

    The recording is loaded through ``get_audio`` from ``DATA_INPUT_PATH`` and
    converted to an STFT magnitude array with the frame axis first. Every row
    of ``data`` whose 'Begin File' equals ``audio_name`` then marks the frames
    between its relative begin and end time with the row's 'USV TYPE'. Rows
    are applied in iteration order, so where ranges overlap the later row wins.

    Args:
        audio_name: File name of the recording inside ``DATA_INPUT_PATH``.
        data: DataFrame with the columns 'Begin File',
            'Relative Begin Time (s)', 'Relative End Time (s)',
            'Low Freq (Hz)', 'High Freq (Hz)' and 'USV TYPE'.
        n_fft: FFT window length passed to ``librosa.stft``.
        hop_length: Hop length passed to ``librosa.stft``.
        trim_frames: When true, each labelled range is shrunk from both ends
            while the standardised per-frame peak magnitude (taken inside the
            row's frequency band) is below ``to_remove_pivot``.
        verbose: When true, plot the band of every labelled range together
            with the trimmed boundaries.
        to_remove_pivot: Threshold, in standard deviations from the mean,
            used by the trimming step.

    Returns:
        SimpleNamespace with ``X`` (the magnitude array), ``y`` (object array
        with one entry per frame, ``None`` where no row applies) and ``meta``
        (dict describing the recording and the parameters used).
    """
    audio_data, sr = get_audio(f"{DATA_INPUT_PATH}/{audio_name}")

    def t2f(time):
        # NOTE(review): passing n_fft makes librosa subtract n_fft // 2
        # samples, which its documentation describes as a correction for a
        # non-centered STFT, while the stft call below uses librosa's default
        # centering — confirm this shift is intended.
        return librosa.core.time_to_frames(time, sr=sr, hop_length=hop_length, n_fft=n_fft)

    stft = librosa.stft(audio_data, n_fft=n_fft, hop_length=hop_length)
    stft = np.abs(np.moveaxis(stft, 0, -1))
    n_frames = stft.shape[0]

    frames_labels = np.array([None] * n_frames)
    for idx, row in data[data['Begin File'] == audio_name].iterrows():
        # WARNING: add some offset to range boundaries ???

        # Clamp to the spectrogram: a negative start would otherwise be read
        # as a Python negative index (wrapping to the end of the array), and
        # an end past the last frame would make the trimming below index
        # beyond the sliced data.
        start = max(int(t2f(row['Relative Begin Time (s)'])), 0)
        end = min(int(t2f(row['Relative End Time (s)'])), n_frames)
        if start >= end:
            # No frame falls inside this label; nothing to trim, plot or mark.
            continue

        low = int(freq_to_fft_bucket(row['Low Freq (Hz)'], n_fft=n_fft, sr=sr))
        high = int(freq_to_fft_bucket(row['High Freq (Hz)'], n_fft=n_fft, sr=sr)) + 1

        # Remember the untrimmed range; the trimming indices and the plots
        # are expressed relative to it.
        start_zero = start
        end_zero = end

        if trim_frames:
            # Loudest bin of every frame inside the label's frequency band,
            # standardised so the pivot is expressed in standard deviations.
            # A constant band gives NaN here, which compares False and
            # therefore leaves the range untrimmed.
            max_elem_in_column = np.max(stft[start:end, low:high], axis=1)
            normalized_max = (max_elem_in_column - np.mean(max_elem_in_column)) / np.std(max_elem_in_column)

            # The explicit bounds stop the scans at the edges of the range
            # instead of raising IndexError when every frame is below the
            # pivot.
            while start < end and normalized_max[start - start_zero] < to_remove_pivot:
                start += 1

            while end > start and normalized_max[end - start_zero - 1] < to_remove_pivot:
                end -= 1

            if verbose:
                # x/y are passed by keyword: recent seaborn releases no longer
                # accept them positionally.
                frame_axis = np.arange(start_zero, end_zero)
                sns.lineplot(x=frame_axis, y=max_elem_in_column, label="max_elem_in_kolumn")
                sns.lineplot(x=frame_axis, y=normalized_max, label="normalized_max")

        if verbose:
            plt.figure()
            plt.imshow(stft[start_zero:end_zero, low:high].T)
            # Red lines mark the (possibly trimmed) boundaries.
            plt.axvline(start - start_zero, color='red')
            plt.axvline(end - start_zero, color='red')
            plt.show()

        frames_labels[start:end] = row['USV TYPE']

    meta = dict(
        audio_name=audio_name,
        audio_length=audio_data.size,
        n_fft=n_fft,
        hop_length=hop_length,
        sampling_rate=sr,
        trim_frames=trim_frames,
    )

    return SimpleNamespace(
        X=stft, y=frames_labels, meta=meta
    )

In [None]:
# Convert every recording to a labelled spectrogram and store it as one .npz
# archive per file in DATA_OUTPUT_PATH.
# NOTE(review): `wavs` and `labels` are not defined in any cell visible above
# this one, so this cell fails on a fresh kernel unless they are created
# first — confirm where they are supposed to come from.
# NOTE(review): `meta` is a dict and `y` an object array, so reading these
# archives back presumably requires np.load(..., allow_pickle=True).

# np.savez does not create missing directories.
os.makedirs(DATA_OUTPUT_PATH, exist_ok=True)

for wav in wavs:
    data = extract_audio_frames(os.path.basename(wav), labels)
    np.savez(f"{DATA_OUTPUT_PATH}{data.meta['audio_name']}.npz", X=data.X, y=data.y, meta=data.meta)