In [1]:
import pickle
import numpy as np
import pandas as pd

import librosa

from IPython import display
import matplotlib.pyplot as plt

In [None]:
# Read the two annotation tables shipped with the training set
# (train_tp.csv -> data_tp, train_fp.csv -> data_fp).
data_tp, data_fp = map(pd.read_csv, (
    '/kaggle/input/rfcx-species-audio-detection/train_tp.csv',
    '/kaggle/input/rfcx-species-audio-detection/train_fp.csv',
))

# Let's try to load a single `flac` file and generate a Mel Spectrogram from it
* load from flac file
    * targets are all the same bitrate, convenient
* get the slice of audio annotated in `data_tp`
    * the native sampling rate is 48k - i.e. each second of the original audio corresponds to 48k numbers in the array
    * `librosa.load` resamples to 22,050 Hz unless `sr` is passed explicitly, so always index with the `sr` it returns
    * to get the t second point, start at the (sr * t)th number
* generate the mel spectrogram

In [None]:
# Parameters for the single-clip demo in the next cells.
SAMPLING_RATE = 48000  # native rate of the recordings (Hz); not referenced by the cells visible here

# Frequency band covered by the annotations. Series.max()/.min() are vectorized
# and skip NaN, unlike the Python builtins which walk the Series element by element.
# NOTE(review): 0.9 / 1.1 pull both limits inwards, i.e. they narrow the band
# instead of adding a safety margin - confirm this is intended.
F_MAX = data_tp['f_max'].max() * 0.9
F_MIN = data_tp['f_min'].min() * 1.1

window_size = 512     # STFT window length in samples (passed as n_fft)
between_window = 256  # step between consecutive windows in samples (passed as hop_length)
power = 1.5           # exponent handed to melspectrogram

In [None]:
# Show the annotation table that was read from train_tp.csv.
data_tp

In [None]:
# Demo: cut the first annotated interval out of its recording and turn it into
# a mel spectrogram.
test_file_name = '/kaggle/input/rfcx-species-audio-detection/train/' + data_tp['recording_id'][0] + '.flac'

# No `sr` is passed, so librosa resamples the file to its default 22050 Hz; the
# value printed here is therefore the working rate, not the file's native rate.
wav, sr = librosa.load(test_file_name)
print("sr =", sr)

# Annotation times are in seconds; multiplying by the working rate `sr` gives
# the matching sample indices.
t_begin, t_end = data_tp['t_min'][0], data_tp['t_max'][0]
clipped_wav = wav[int(t_begin*sr) : int(t_end*sr)]
print("t_begin = {}, t_end = {}".format(t_begin, t_end))

# The waveform has to be passed as `y=`: librosa >= 0.10 made it keyword-only
# and raises a TypeError when it is given positionally.
ms = librosa.feature.melspectrogram(
    y = clipped_wav, sr = sr, n_fft = window_size, hop_length = between_window, power = power
)

print("ms.shape = {}".format(ms.shape))
# ms.shape = (n_mels, 1 + len(clipped_wav) // between_window)

In [None]:
# Render an inline audio player for the clipped segment at the rate it was loaded with.
display.display(display.Audio(clipped_wav, rate=sr))

In [None]:
# `ms` has shape (n_mels, n_frames). Both images are drawn transposed, so rows
# are time frames and columns are mel bands.

# Figure 0: scale the whole spectrogram by its single global maximum.
plt.figure(0)
ms_norm = ms/ms.max()
plt.imshow(ms_norm.T)
plt.title("Normalized Spectrogram")
plt.xlabel("mel band")
plt.ylabel("time frame");

# Figure 1: scale every time window by its own maximum. Time windows lie along
# axis 1, so the per-window maximum must be taken over the mel axis (axis=0);
# taking it over axis=1 would normalize each mel band across time instead.
plt.figure(1)
ms_norm_per_window = ms/ms.max(axis=0, keepdims=True)
plt.imshow(ms_norm_per_window.T)
plt.title("Normalized Spectrum for Each Time Window")
plt.xlabel("mel band")
plt.ylabel("time frame");

## How long are the clips?

In [None]:
# Distribution of annotation lengths (t_max - t_min, in seconds). Labels and a
# title are set so the figure can be read without the surrounding cells.
plt.hist(data_tp['t_max'] - data_tp['t_min'], bins=50)
plt.xlabel('annotation length (s)')
plt.ylabel('number of annotations')
plt.title('Length of annotated clips');

# Process Each Recording and Save as Mel Spectrogram

In [None]:
# Settings used by the batch conversion below; they replace the values that
# were used for the single-clip demo earlier in the notebook.
SAMPLING_RATE = 48000  # native rate of the recordings (Hz)

# Series.max()/.min() are vectorized and skip NaN, unlike the Python builtins.
# NOTE(review): 0.8 / 1.2 narrow the band relative to the annotated extremes -
# confirm this is intended.
F_MAX = data_tp['f_max'].max() * 0.8
F_MIN = data_tp['f_min'].min() * 1.2
HEIGHT, WIDTH = (224, 512) # I consistently see people reshaping to this size in notebooks; no idea why
clip_length = 3.5 # always try to get a clip of this long in seconds, centered around the given interval

window_size = 1024    # STFT window length in samples (passed as n_fft)
between_window = 512  # step between consecutive windows in samples (passed as hop_length)
power = 1             # exponent handed to melspectrogram

In [None]:
# `scipy.ndimage.interpolation` is a deprecated alias namespace; `zoom` is
# exported directly from `scipy.ndimage`.
from scipy.ndimage import zoom

def to_ms(file_in, t_begin, t_end, plot=False):
    """Convert one annotated interval into a fixed-size mel spectrogram.

    A window of ``clip_length`` seconds centred on the annotation is cut from
    the recording, converted to a mel spectrogram restricted to
    ``[F_MIN, F_MAX]``, min-max normalized and resized to ``(HEIGHT, WIDTH)``.

    Args:
        file_in: path of the audio file to read.
        t_begin: start of the annotated interval, in seconds.
        t_end: end of the annotated interval, in seconds.
        plot: when True, also draw the result with ``plt.imshow``.

    Returns:
        2-D array of shape ``(HEIGHT, WIDTH)`` with values in [0, 1]. The
        spectrogram is transposed before resizing, so rows follow time and
        columns follow mel bands.
    """
    # No `sr` is given, so librosa resamples to its default rate; every index
    # below is computed from the rate that is actually returned.
    wav, sr = librosa.load(file_in)

    n_samples = int(clip_length * sr)
    midpoint = int((t_begin + t_end) * sr / 2)

    # Centre the window on the annotation, then slide it back inside the
    # recording when the annotation sits close to either end. Merely truncating
    # the window there would give a shorter clip that the resize below stretches
    # to the same size, i.e. a different time scale for those samples.
    start = midpoint - n_samples // 2
    start = max(0, min(start, len(wav) - n_samples))
    clipped_wav = wav[start:start + n_samples]

    # `y` is keyword-only in librosa >= 0.10.
    ms = librosa.feature.melspectrogram(
        y = clipped_wav, sr = sr,
        n_fft = window_size, hop_length = between_window,
        power = power, fmin = F_MIN, fmax = F_MAX
    ).T

    # normalize ms to [0, 1]; a constant (e.g. silent) clip has zero range and
    # would otherwise produce NaN from 0/0
    lowest, highest = ms.min(), ms.max()
    value_range = highest - lowest
    if value_range > 0:
        ms = (ms - lowest) / value_range
    else:
        ms = np.zeros_like(ms)

    # resize ms
    # scipy uses spline interpolation as opposed to skimage.transform.resize that takes a local average
    h, w = ms.shape
    ms = zoom(ms, (HEIGHT/h, WIDTH/w))
    assert ms.shape == (HEIGHT, WIDTH)

    # Spline interpolation can overshoot slightly below 0 / above 1; clip so the
    # result stays a valid image for later 8-bit conversion.
    ms = np.clip(ms, 0.0, 1.0)

    if plot:
        plt.imshow(ms)

    return ms

In [None]:
# Try the converter on a single annotation (row label 32) and plot the result.
i = 32
t_begin, t_end = data_tp.loc[i, ['t_min', 't_max']]
file_name = '/kaggle/input/rfcx-species-audio-detection/train/' + data_tp.at[i, 'recording_id'] + '.flac'

mss = to_ms(file_name, t_begin, t_end, plot=True)

In [None]:
# Convert every annotated interval to a spectrogram and persist the list.
# Each entry is (spectrogram, species_id, data_tp index label).
specs = []
OUTPUT = '/kaggle/working/spectrograms.pkl'
n_rows = len(data_tp)

# `n_done` counts processed rows independently of the DataFrame's index labels,
# so progress and checkpoints stay correct even if the index is not 0..n-1.
for n_done, (i, row) in enumerate(data_tp.iterrows(), start=1):
    ms = to_ms(
        '/kaggle/input/rfcx-species-audio-detection/train/' + row['recording_id'] + '.flac',
        row['t_min'],
        row['t_max']
    )

    specs.append((ms, row['species_id'], i))

    if n_done % 10 == 0:
        # multiply by 100: the value is printed with a '%' sign
        print('{}/{} ({:.1f}%) processed'.format(n_done, n_rows, 100 * n_done / n_rows))

    # periodic checkpoint so an interrupted run keeps most of its work
    if n_done % 100 == 0:
        with open(OUTPUT, 'wb') as f:
            pickle.dump(specs, f)

# Final write: without it, every row after the last multiple of 100 would be
# missing from the saved file.
with open(OUTPUT, 'wb') as f:
    pickle.dump(specs, f)

# Simple DataLoader

In [7]:
import torch
from torch.utils.data import TensorDataset, DataLoader, Dataset
import torchvision
import torchvision.transforms as transforms

In [8]:
BATCH_SIZE = 500


class CustomTensorDataset(Dataset):
    """Dataset over a tuple of tensors with an optional transform on the inputs.

    ``tensors[0]`` holds the inputs and ``tensors[1]`` the targets; all tensors
    must have the same size along dimension 0. When ``transform`` is truthy it
    is applied to each input as it is fetched; targets are returned untouched.
    """

    def __init__(self, tensors, transform=None):
        # every tensor must agree on its first dimension
        assert len({t.size(0) for t in tensors}) <= 1
        self.tensors = tensors
        self.transform = transform

    def __getitem__(self, index):
        sample = self.tensors[0][index]
        sample = self.transform(sample) if self.transform else sample
        return sample, self.tensors[1][index]

    def __len__(self):
        return self.tensors[0].size(0)

# Loading Data

In [3]:
# Read the pickled spectrogram list back from the current working directory
# (presumably the file written by the preprocessing loop - verify the path).
# NOTE: pickle.load can execute arbitrary code; only open files you produced yourself.
with open('spectrograms.pkl', 'rb') as f:
    spectrograms = pickle.load(f)
    
# number of loaded entries
len(spectrograms)

In [14]:
# Every entry of `spectrograms` is a 3-tuple:
#    (spectrogram [np.Array], class (int), true positive dataframe index (int))
# The transform round-trips each sample through a PIL image and back to a tensor.
X_transform = transforms.Compose([transforms.ToPILImage(), transforms.ToTensor()])

# Stack inputs and labels separately; the dataframe index is not needed here.
X = torch.tensor(np.array([spec for spec, _, _ in spectrograms]))
Y = torch.tensor(np.array([species for _, species, _ in spectrograms]))

In [28]:
# Pair inputs X with labels Y in the custom dataset (X_transform is applied to
# each input on access) and serve them in batches of BATCH_SIZE.
# NOTE(review): no `shuffle` argument is passed, so batches follow stored order -
# confirm that is intended if this loader feeds training.
d = CustomTensorDataset(tensors = (X,Y), transform=X_transform)
dl = DataLoader(d, batch_size = BATCH_SIZE)