In [150]:
import torch
import torch.nn as nn
import numpy as np
import librosa
import IPython.display

from src.data import NSynthDataset
from src.models import Autoencoder

In [3]:
# Rebuild the autoencoder and restore the `model-6000.weights` checkpoint.
model = Autoencoder(h_dim=128)
# This notebook only runs inference, so switch off any training-time behaviour
# (e.g. dropout / batch-norm statistics updates, if the model has such layers).
model.eval()
# map_location='cpu' lets the checkpoint load on a machine without a GPU even if
# it was saved from one; the model is never moved off the CPU in this notebook.
# Kept as the last expression so the cell still displays the key-matching report.
model.load_state_dict(
    torch.load(
        'train_results/autoencoder/20200430-175135/model-6000.weights',
        map_location='cpu',
    )
)

<All keys matched successfully>

In [17]:
# Reconstruction criterion: binary cross-entropy averaged over every element
# ('mean' is the default reduction, spelled out here for the reader).
loss_fn = nn.BCELoss(reduction='mean')

##### Real instrument: reconstruction loss on an NSynth validation example

In [2]:
# Validation split of NSynth, restricted to instrument_source 0 and requested
# with feature_type='mel' and scaling='normalize'.
dataset_options = dict(
    instrument_source=[0],
    feature_type='mel',
    scaling='normalize',
)
val_dataset = NSynthDataset('music-ml-gigioli', 'data/nsynth/nsynth-valid', **dataset_options)

# Fixed-order batches of 32 (no shuffling).
val_dataloader = torch.utils.data.DataLoader(
    dataset=val_dataset,
    batch_size=32,
    shuffle=False,
)

In [None]:
# Take the first validation example. The next cell converts it straight to a
# float tensor, so it is presumably a 2-D array-like of mel features (TODO confirm).
features = val_dataset[0]

In [12]:
# Shape the example as a single-item, single-channel batch: (1, 1, H, W).
# The reshape keeps only the last two axes of `features`, so re-running this cell
# is a no-op instead of stacking two more leading axes on every execution
# (the previous unsqueeze(0).unsqueeze(0) form was not idempotent).
# Assumes `features` is a 2-D (H, W) spectrogram on first execution — TODO confirm.
features = torch.as_tensor(features, dtype=torch.float32)
features = features.reshape(1, 1, *features.shape[-2:])

# Inference only: no autograd graph is needed for the reconstruction.
with torch.no_grad():
    outputs = model(features)

In [19]:
# Mean binary cross-entropy between the reconstruction and its own input,
# shown as a plain Python float.
loss_fn(outputs, features).item()

0.0014240677701309323

##### Noise: reconstruction loss on uniform random input

In [121]:
# Uniform random "spectrogram" with the same (1, 1, 128, 251) layout fed to the model.
# A privately seeded generator makes the noise (and therefore the loss reported
# below) reproducible across kernel restarts without touching NumPy's global RNG.
NOISE_SEED = 0
noise = np.random.RandomState(NOISE_SEED).rand(1, 1, 128, 251)
features = torch.tensor(noise, dtype=torch.float32)
# Min-max rescale so the values span exactly [0, 1]; BCELoss expects targets in [0, 1].
features = (features - features.min()) / (features.max() - features.min())

In [122]:
# Reconstruct the noise input. Inference only, so skip building an autograd graph.
with torch.no_grad():
    outputs = model(features)

In [123]:
# Mean binary cross-entropy between the reconstruction and the noise input,
# shown as a plain Python float.
loss_fn(outputs, features).item()

2.0569937229156494

##### Sample: reconstruction loss over sliding windows of an audio file

In [145]:
# Candidate recordings; change the index below to analyse a different one.
SAMPLE_FILES = (
    '269570__vonora__cuckoo-the-nightingale-duet.wav',
    '504900__soundmast123__restaurant-ambience.wav',
    '389684__meggiepie__pots-and-pans.wav',
)
sample_file = SAMPLE_FILES[2]

In [146]:
# Decode the recording as a mono waveform resampled to 16 kHz
# (mono=True is librosa's default, stated explicitly here).
x, sr = librosa.load(sample_file, sr=16000, mono=True)

In [177]:
# Cut the recording into overlapping windows: each window is 4 seconds long
# (4 * sr samples) and consecutive windows start one eighth of a window apart.
window_size = sr * 4
step_size = int(window_size / 8)

# Count only the start positions whose full window still fits inside `x`.
n_windows = int(len(x) / step_size - int(window_size / step_size) + 1)
window_starts = [k * step_size for k in range(n_windows)]
samples = [x[start:start + window_size] for start in window_starts]

In [178]:
# Number of windows extracted from the recording.
len(samples)

42

In [179]:
# Score every window by how well the autoencoder reconstructs its mel spectrogram:
# each entry is (window index, BCE loss), and a lower loss means a closer reconstruction.
# NOTE(review): the dataset above is built with scaling='normalize', whose definition
# is not visible here — confirm that per-window min-max scaling matches it.
scores = []
with torch.no_grad():  # inference only: no autograd graph per window
    for i, s in enumerate(samples):
        features = librosa.feature.melspectrogram(y=s, sr=sr, n_fft=1024, hop_length=256)
        # Min-max scale to [0, 1] as BCELoss requires. A constant window (e.g. digital
        # silence) has zero range; map it to all zeros instead of dividing by zero,
        # which would yield NaNs and an invalid loss.
        lo, hi = features.min(), features.max()
        features = (features - lo) / (hi - lo) if hi > lo else np.zeros_like(features)
        features = torch.tensor(features, dtype=torch.float32).unsqueeze(0).unsqueeze(0)
        outputs = model(features)

        loss = loss_fn(outputs, features).item()
        scores.append((i, loss))
# Ascending by loss. The key argument is named `pair` so it does not shadow the
# audio array `x` used elsewhere in the notebook.
scores = sorted(scores, key=lambda pair: pair[1])

In [180]:
# Display the (window index, loss) pairs, lowest loss first.
scores

[(13, 0.000837604224216193),
 (12, 0.0012548192171379924),
 (11, 0.0015360214747488499),
 (7, 0.0019142280798405409),
 (10, 0.0020305386278778315),
 (32, 0.0022692373022437096),
 (9, 0.002347674686461687),
 (31, 0.0024129075463861227),
 (8, 0.0026910528540611267),
 (27, 0.0028030863031744957),
 (30, 0.0028186531271785498),
 (6, 0.003143317298963666),
 (29, 0.0033314672764390707),
 (28, 0.0036818194203078747),
 (26, 0.004585650283843279),
 (3, 0.004614262375980616),
 (2, 0.006505002733319998),
 (14, 0.0067185587249696255),
 (24, 0.007126904558390379),
 (1, 0.007567770779132843),
 (15, 0.008465626277029514),
 (41, 0.0092714112251997),
 (0, 0.009518849663436413),
 (39, 0.009694493375718594),
 (40, 0.010723750106990337),
 (23, 0.01093742623925209),
 (34, 0.010952065698802471),
 (33, 0.011512466706335545),
 (5, 0.011850442737340927),
 (4, 0.012052669189870358),
 (25, 0.012174471281468868),
 (35, 0.012994305230677128),
 (38, 0.013134635984897614),
 (21, 0.014108049683272839),
 (20, 0.0143098

In [176]:
# Listen to one of the scored windows. The playback rate comes from the `sr`
# returned by librosa.load rather than a second hard-coded 16000, so it cannot
# drift from the rate the audio was actually loaded at.
window_idx = 26
window_start = window_idx * step_size
IPython.display.Audio(x[window_start:window_start + window_size], rate=sr)