In [2]:
# Feature extraction for sound analysis of birdsong
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import librosa

from scipy.io import wavfile
import scipy.signal as signal
import IPython.display as ipd

In [None]:
# Load the recording and band-limit it before feature extraction.
sound_file = 'Data/2023_03_31_7_03_30.wav'
fs, audio = wavfile.read(sound_file)

# Timestamp of every sample in seconds. Sample k sits at k / fs; linspace with
# its default endpoint=True stretched the axis by len(audio) / (len(audio) - 1).
t_audio = np.arange(len(audio)) / fs

# Bandpass filter the audio signal (Butterworth, 300 Hz to 20 kHz)
nyq = 0.5 * fs
low = 300 / nyq
# butter() needs normalised band edges strictly below 1, so keep the upper edge
# just under Nyquist when the recording is sampled at 40 kHz or less.
high = min(20000, 0.99 * nyq) / nyq
order = 5
# Transfer-function coefficients, kept because this cell has always defined them.
b, a = signal.butter(order, [low, high], btype='band')
# Second-order sections are the numerically safer form for the actual filtering.
sos = signal.butter(order, [low, high], btype='band', output='sos')

# The filter used to be designed but never applied (audio_clip was the raw audio).
# Filter forward and backward so the result has no phase shift; cast to float
# first so integer PCM samples cannot overflow inside the filter.
# NOTE(review): downstream thresholds were tuned on unfiltered audio - re-check them.
audio_clip = signal.sosfiltfilt(sos, audio.astype(np.float64), axis=0)
ipd.Audio(sound_file)

In [4]:
from scipy.fft import fft, ifft, fftfreq, fftshift
# Length of one analysis window, in samples (passed to calculate_features).
# NOTE(review): 1323 samples is 30 ms only if fs is 44.1 kHz - confirm against the recording.
window_length = 1323
# Step between the starts of consecutive analysis windows, in samples.
hop_length = 163

def calculate_features(x, window_length, hop_length, fs, num_tapers=2):
    """Compute frame-wise spectral features of a 1-D signal using two DPSS tapers.

    The signal is cut into windows of ``window_length`` samples whose starts are
    ``hop_length`` samples apart. Each window is multiplied by the first two
    DPSS tapers (half-bandwidth parameter 1.5) and transformed with an FFT;
    only the strictly positive frequency bins are kept.

    Parameters
    ----------
    x : array-like, 1-D
        Signal samples.
    window_length : int
        Window size in samples. The entropy and amplitude features skip the
        10 lowest positive-frequency bins, so the window must be long enough
        to leave more than one bin after that.
    hop_length : int
        Step between consecutive window starts, in samples.
    fs : float
        Sampling rate in Hz.
    num_tapers : int, optional
        Accepted for backward compatibility; the computation always uses
        exactly two tapers, so this value has no effect.

    Returns
    -------
    t : ndarray, shape (n_frames,)
        Start time of each frame in seconds, ``n_frames = floor(len(x) / hop_length)``.
    f : ndarray, shape (n_bins,)
        The positive FFT bin frequencies in Hz.
    sonogram : ndarray, shape (n_bins, n_frames)
        ``(|S1| + |S2|) ** 2`` for the two tapered spectra.
    goodness_of_fit : ndarray, shape (n_frames,)
        Maximum of the real cepstrum of the first tapered window.
    frequency_modulation : ndarray, shape (n_frames,)
        ``arctan(max(time_deriv) / max(freq_deriv) + 0.1)`` per frame.
    spectral_derivative : ndarray, shape (n_bins, n_frames)
        Squared combination of time and frequency derivatives.
    entropy : ndarray, shape (n_frames,)
        Spectral flatness score (see the note in the body).
    amplitude : ndarray, shape (n_frames,)
        ``-10 * sum(log(sonogram))`` over the retained bins.

    Frames that are never analysed (the tail where a full window does not fit)
    and frames whose samples are all zero keep the value 0 in every output.
    """
    x = np.asarray(x)
    tapers = signal.windows.dpss(window_length, 1.5, 2)
    size = len(x)

    f = fftshift(fftfreq(window_length, 1/fs))
    f_index = f > 0
    n_bins = int(f_index.sum())
    n_frames = int(np.floor(size / hop_length))
    skip_bins = 10  # lowest positive-frequency bins left out of entropy/amplitude
    tiny = np.finfo(float).tiny  # floor that keeps log() finite on exact zeros

    sonogram = np.zeros((n_bins, n_frames))
    freq_deriv = np.zeros((n_bins, n_frames))
    time_deriv = np.zeros((n_bins, n_frames))
    spectral_derivative = np.zeros((n_bins, n_frames))

    goodness_of_fit = np.zeros(n_frames)
    frequency_modulation = np.zeros(n_frames)
    entropy = np.zeros(n_frames)
    amplitude = np.zeros(n_frames)

    # Window start offsets; capped at n_frames so a hop longer than the window
    # can never index past the preallocated columns.
    wav_smp = np.arange(size - window_length, step=hop_length).astype(int)[:n_frames]
    t = np.arange(n_frames) * (hop_length / fs)

    for i, start in enumerate(wav_smp):
        segment = x[start:start + window_length]

        # Digital silence: every feature stays at 0. Evaluating the formulas
        # below on an all-zero window gives 0/0 and log(0), i.e. nan/inf.
        if not segment.any():
            continue

        window1 = segment * tapers[0]
        window2 = segment * tapers[1]
        spectrum1 = fft(window1)
        spectrum2 = fft(window2)

        # Real cepstrum: inverse FFT of the log *magnitude* spectrum. Taking
        # the log of the complex spectrum would mix wrapped phase into it.
        log_magnitude = np.log10(np.maximum(np.abs(spectrum1), tiny))
        real_cepstrum = fftshift(np.real(ifft(log_magnitude)))
        goodness_of_fit[i] = np.max(real_cepstrum[f_index])

        powSpect1 = fftshift(spectrum1)
        powSpect2 = fftshift(spectrum2)

        r1 = (np.abs(powSpect1) + np.abs(powSpect2))**2
        sonogram[:, i] = r1[f_index]

        # Getting time and frequency derivatives
        fR1 = np.real(powSpect1[f_index])
        fi1 = np.imag(powSpect1[f_index])
        fR2 = np.real(powSpect2[f_index])
        fi2 = np.imag(powSpect2[f_index])

        time_deriv[:, i] = -fR1*fR2 - fi1*fi2
        freq_deriv[:, i] = fi1*fR2 - fR1*fi2

        # Getting frequency modulation
        frequency_modulation[i] = np.arctan((np.max(time_deriv[:, i]) / np.max(freq_deriv[:, i])) + 0.1)

        # Solving for spectral derivatives
        cFM = np.cos(frequency_modulation[i])
        sFM = np.sin(frequency_modulation[i])
        spectral_derivative[:, i] = time_deriv[:, i] * cFM + freq_deriv[:, i] * sFM

        # Compute entropy from the mean log-power and the mean power
        log_power = np.log(np.maximum(sonogram[skip_bins:, i], tiny))
        sumLog = log_power.sum() / (n_bins - skip_bins)
        sumSon = np.sum(sonogram[skip_bins:, i]) / (n_bins - skip_bins)

        # NOTE(review): this is log(mean power - mean log-power), rescaled; it is
        # not the Wiener entropy log(geometric mean / arithmetic mean). The
        # formula is kept as-is because downstream cutoffs are tuned to it.
        entropy[i] = (np.log(sumSon - sumLog) / np.log2(n_bins - skip_bins)) - 1

        # Amplitude
        amplitude[i] = -10 * log_power.sum()

    return t, f[f_index], sonogram, goodness_of_fit, frequency_modulation, spectral_derivative**2, entropy, amplitude

# Extract the per-frame features for the clip: frame times, bin frequencies, sonogram,
# goodness of fit, frequency modulation, squared spectral derivative, entropy, amplitude.
t, f, Sxx, gof, fm, sd, ent, amp = calculate_features(audio_clip, window_length, hop_length, fs)

In [24]:
# Keep only the spectral-derivative columns (one per frame) whose entropy
# value is at least 0.4; the remaining frames are dropped.
entropy_mask = ent >= 0.4
sounds = sd[:, entropy_mask]

152419 1133483


In [None]:
import umap

# Reduce every retained frame (one row of sounds.T) to two dimensions.
# No random_state is passed, so the layout is expected to vary between runs.
reducer = umap.UMAP(
    n_components=2,
    n_neighbors=50,
    min_dist=0.5,
    metric='canberra',
    n_jobs=-1,
)
clusterable_embedding = reducer.fit_transform(sounds.T)

# Overview scatter of the whole embedding: small, faint points so that dense
# regions show up as darker areas.
plt.figure(figsize=(10,10))
plt.scatter(x=clusterable_embedding[:, 0],
            y=clusterable_embedding[:, 1],
            color='black',
            alpha=0.1,
            s=1)

In [None]:
# Make a video over the umap embedding, where a point on the graph changes color
# while the audio frame it represents is being played.
# Variables used: t, clusterable_embedding, ent
import os

# Make the graphic 30 fps
fps = 30
video_t = np.arange(0, t[-1], 1/fps)
# Render at most the first 60 s, and never more frames than video_t holds
# (indexing video_t past its end fails for recordings shorter than that).
num_frames = min(fps * 60, len(video_t))

# Row r of clusterable_embedding is the r-th frame with ent >= 0.4 (the selection
# that built `sounds`), not frame r of the recording, so look up times per row.
embedded_t = t[ent >= 0.4]
if len(embedded_t) != len(clusterable_embedding):
    raise ValueError(
        'clusterable_embedding has %d rows but %d frames have ent >= 0.4; '
        're-run the selection and embedding cells' % (len(clusterable_embedding), len(embedded_t))
    )

frame_dir = 'Data/umap_video/'
os.makedirs(frame_dir, exist_ok=True)

# Placeholders for a not-yet-implemented fading trail; filled but not read here.
idx_buffer = []
buffer_counter = []
buffer_thresh = 10

# Draw the static background once and only move the highlighted points per frame.
fig, ax = plt.subplots()
ax.scatter(clusterable_embedding[:, 0],
           clusterable_embedding[:, 1],
           color='black',
           s=1,
           alpha=0.1)
highlight = ax.scatter([], [], s=5, color='blue')
ax.set_ylim([0, 20])
ax.set_xlim([-7.5, 15])
ax.axis('off')

for i in range(num_frames):
    # Embedding rows whose frame time lies within one video frame of the current time
    idx = np.flatnonzero(np.abs(embedded_t - video_t[i]) < 1/fps)

    idx_buffer.append(idx)
    buffer_counter.append(np.zeros(len(idx)))

    highlight.set_offsets(clusterable_embedding[idx, :2])
    fig.savefig(frame_dir + str(i) + '.png')

# Release the figure so it is not kept alive (or displayed) after the loop.
plt.close(fig)


In [22]:
# Take the images from the umap_video folder and make a video
import cv2

# Frames are streamed straight into the writer instead of being collected in a
# list first, so memory use does not grow with the number of frames.
out = None
try:
    for i in range(num_frames):
        filename = 'Data/umap_video/' + str(i) + '.png'
        img = cv2.imread(filename)
        # cv2.imread signals a missing or unreadable file by returning None
        if img is None:
            raise FileNotFoundError('Could not read frame image: ' + filename)

        if out is None:
            # The first frame fixes the video size; VideoWriter expects (width, height)
            height, width = img.shape[:2]
            size = (width, height)
            # Use the frame rate the frames were rendered for so playback time
            # matches the audio timeline.
            out = cv2.VideoWriter('Data/umap_video/umap_video.avi', cv2.VideoWriter_fourcc(*'DIVX'), fps, size)

        out.write(img)
finally:
    # Save video: always finalise the file, even if a frame failed to load
    if out is not None:
        out.release()