## RAVE evaluation

In [1]:
import torch
import torch.nn as nn
import sys
import os

import torch.nn.functional as F
from librosa.filters import mel as librosa_mel_fn
from sklearn.decomposition import PCA
import numpy as np
import matplotlib.pyplot as plt
import IPython.display as ipd

sys.path.append("../src")

In [2]:
from deep_eurorack_control.models.rave import RAVE
from deep_eurorack_control.datasets.data_loaders import nsynth_data_loader
from deep_eurorack_control.config import settings



## Load trained models

In [3]:
models_dir = "/home/sarah/Projects/master_atiam/pam/deep-eurorack-control/models"

#### VAE n_band = 1

In [4]:
check_vae_1 = "n_synth_rave__n_band_1__latent_128__sr_16000__noise_False__init_weights_True__b_8__lr_0.0001__e_150__e_warmup_150__seed_0__vae.pt"
checkpoint_vae_1 = torch.load(os.path.join(models_dir, check_vae_1), map_location=torch.device('cpu'))

vae_1 = RAVE(
    n_band=1,
    latent_dim=128,
    hidden_dim=64,
    sampling_rate=16000,
    use_noise=False,
    init_weights=True
)

vae_1.encoder.load_state_dict(checkpoint_vae_1['encoder_state_dict'])
vae_1.decoder.load_state_dict(checkpoint_vae_1['decoder_state_dict'])
vae_1.encoder.eval()
vae_1.decoder.eval()
print("ok")

ok


#### VAE n_band = 4

In [6]:
check_vae_4 = ""
checkpoint_vae_4 = torch.load(os.path.join(models_dir, check_vae_4), map_location=torch.device('cpu'))

vae_4 = RAVE(
    n_band=4,
    latent_dim=128,
    hidden_dim=64,
    sampling_rate=16000,
    use_noise=False,
    init_weights=True
)

vae_4.encoder.load_state_dict(checkpoint_vae_4['encoder_state_dict'])
vae_4.decoder.load_state_dict(checkpoint_vae_4['decoder_state_dict'])
vae_4.encoder.eval()
vae_4.decoder.eval()
print("ok")

IsADirectoryError: [Errno 21] Is a directory: '/home/sarah/Projects/master_atiam/pam/deep-eurorack-control/models/'

#### VAE n_band = 8

In [11]:
check_vae_8 = "n_synth_rave__n_band_8__latent_128__sr_16000__noise_False__init_weights_True__b_8__lr_0.0001__e_150__e_warmup_150__seed_0__vae.pt"
checkpoint_vae_8 = torch.load(os.path.join(models_dir, check_vae_8), map_location=torch.device('cpu'))

vae_8 = RAVE(
    n_band=8,
    latent_dim=128,
    hidden_dim=64,
    sampling_rate=16000,
    use_noise=False,
    init_weights=True
)

vae_8.encoder.load_state_dict(checkpoint_vae_8['encoder_state_dict'])
vae_8.decoder.load_state_dict(checkpoint_vae_8['decoder_state_dict'])
vae_8.encoder.eval()
vae_8.decoder.eval()
print("ok")

ok


#### VAE n_band = 16

In [13]:
check_vae_16 = "n_synth_rave__n_band_16__latent_128__sr_16000__noise_False__init_weights_True__b_8__lr_0.0001__e_150__e_warmup_150__seed_0__vae.pt"
checkpoint_vae_16 = torch.load(os.path.join(models_dir, check_vae_16), map_location=torch.device('cpu'))

vae_16 = RAVE(
    n_band=16,
    latent_dim=128,
    hidden_dim=64,
    sampling_rate=16000,
    use_noise=False,
    init_weights=True
)

vae_16.encoder.load_state_dict(checkpoint_vae_16['encoder_state_dict'])
vae_16.decoder.load_state_dict(checkpoint_vae_16['decoder_state_dict'])
vae_16.encoder.eval()
vae_16.decoder.eval()
print("ok")

ok


#### VAE n_band = 8 GAN without noise

In [8]:
check_rave_8_no_noise = "n_synth_rave__n_band_8__latent_128__sr_16000__noise_False__init_weights_True__b_8__lr_0.0001__e_250__e_warmup_150__vae.pt"
checkpoint_rave_no_noise = torch.load(os.path.join(models_dir, check_rave_8_no_noise), map_location=torch.device('cpu'))

rave_no_noise = RAVE(
    n_band=8,
    latent_dim=128,
    hidden_dim=64,
    sampling_rate=16000,
    use_noise=False,
    init_weights=True
)

rave_no_noise.encoder.load_state_dict(checkpoint_rave_no_noise['encoder_state_dict'])
rave_no_noise.decoder.load_state_dict(checkpoint_rave_no_noise['decoder_state_dict'])
rave_no_noise.encoder.eval()
rave_no_noise.decoder.eval()
print("ok")

ok


#### VAE n_band = 8 GAN with noise

In [4]:
check_rave_8_noise = "n_synth_rave__n_band_8__latent_128__sr_16000__noise_True__init_weights_True__b_8__lr_0.0001__e_250__e_warmup_150__vae.pt"
checkpoint_rave_noise = torch.load(os.path.join(models_dir, check_rave_8_noise), map_location=torch.device('cpu'))

rave_noise = RAVE(
    n_band=8,
    latent_dim=128,
    hidden_dim=64,
    sampling_rate=16000,
    use_noise=True,
    init_weights=True
)

rave_noise.encoder.load_state_dict(checkpoint_rave_noise['encoder_state_dict'])
rave_noise.decoder.load_state_dict(checkpoint_rave_noise['decoder_state_dict'])
rave_noise.encoder.eval()
rave_noise.decoder.eval()
print("ok")

ok


## Load Test Dataset Nsynth

In [5]:
test_loader, _ = nsynth_data_loader(
    batch_size=8,
    data_dir="/home/sarah/Projects/master_atiam/pam/deep-eurorack-control/data",
    audio_dir="/home/sarah/Projects/master_atiam/pam/nsynth-test/audio",
    nsynth_json="nsynth_string_test.json",
    valid_ratio=0
)

### Latent space analysis

In [6]:
def latent_space_pca_analysis(model, test_loader, latent_dim=128):
    z_list = []
    s_list = []
    for s, _ in test_loader:
        s_list.append(s)
        s = torch.reshape(s, (s.shape[0], 1, -1))

        # 1. multi band decomposition pqmf
        s = model.multi_band_decomposition(s)

        # 2. Encode data
        mean, var = model.encoder(s)

        # z, _ = model.reparametrize(mean, var)
        z = mean
        z_list.append(z)
    
    z_valid = torch.cat(z_list, 0)
    print(f"nb samples : {z_valid.shape[0]}")
    z_valid = z_valid.reshape(-1, z_valid.shape[1])
    latent_mean = z_valid.mean(0)
    z_center = z_valid - latent_mean
    
    pca = PCA(latent_dim).fit(z_center.detach().cpu().numpy())
    components = pca.components_
    components = torch.from_numpy(components).to(z_center)

    var = pca.explained_variance_ / np.sum(pca.explained_variance_)
    var = np.cumsum(var)
    
    var_percent = [.8, .9, .95, .99]
    for p in var_percent:
        print(f"{p}%_manifold", np.argmax(var > p))

In [7]:
latent_space_pca_analysis(rave_noise, test_loader)

nb samples : 306
0.8%_manifold 2
0.9%_manifold 2
0.95%_manifold 5
0.99%_manifold 30


In [9]:
latent_space_pca_analysis(rave_no_noise, test_loader)

nb samples : 306
0.8%_manifold 0
0.9%_manifold 1
0.95%_manifold 1
0.99%_manifold 12


In [12]:
latent_space_pca_analysis(vae_8, test_loader)

nb samples : 306
0.8%_manifold 0
0.9%_manifold 1
0.95%_manifold 2
0.99%_manifold 12


In [14]:
latent_space_pca_analysis(vae_16, test_loader)

nb samples : 306
0.8%_manifold 1
0.9%_manifold 4
0.95%_manifold 11
0.99%_manifold 37


## Reconstruction error

In [6]:
device = torch.device('cpu')

In [7]:
def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
    return torch.log(torch.clamp(x, min=clip_val) * C)

def spectral_normalize_torch(magnitudes):
    output = dynamic_range_compression_torch(magnitudes)
    return output

In [8]:
def mel_spectrogram(x, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False):
    #if torch.min(x) < -1.:
    #    print('min value is ', torch.min(x))
    #if torch.max(x) > 1.:
    #    print('max value is ', torch.max(x))

    global mel_basis, hann_window, device
    if fmax not in mel_basis:
        mel = librosa_mel_fn(sampling_rate, n_fft, num_mels, fmin, fmax)
        mel_basis[str(fmax)+'_'+str(x.device)] = torch.from_numpy(mel).float().to(x.device)
        hann_window[str(x.device)] = torch.hann_window(win_size).to(x.device)

    
    x = torch.nn.functional.pad(x.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect')
    x = x.squeeze(1)

    melspectro = torch.stft(x, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[str(x.device)],
                      center=center, pad_mode='reflect', normalized=False, onesided=True)

    melspectro = torch.sqrt(melspectro.pow(2).sum(-1)+(1e-9))

    melspectro = torch.matmul(mel_basis[str(fmax)+'_'+str(x.device)], melspectro)
    melspectro = spectral_normalize_torch(melspectro)

    return melspectro

In [9]:
def melspectrogram_loss(x, x_gen, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False):
    
    x_mel = mel_spectrogram(x, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center)
    x_gen_mel = mel_spectrogram(x_gen, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center)
    
    loss_melspectro = F.l1_loss(x_mel, x_gen_mel)

    return loss_melspectro

In [10]:
mel_basis = {}
hann_window = {}

n_fft = 1024
num_mels = 80
sampling_rate = 48000
hop_size = 256
win_size = 1024
fmin = 0
fmax = 8000

In [11]:
audio, pitch = next(iter(test_loader))
melspectrogram_loss(audio, audio, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False)

  return _VF.stft(input, n_fft, hop_length, win_length, window,  # type: ignore[attr-defined]


tensor(0.)

In [13]:
def inference(model, test_loader):
    x_list = []
    y_list = []
    for x, _ in test_loader:
        x_list.append(x)
        x = torch.reshape(x, (x.shape[0], 1, -1))

        # 1. multi band decomposition pqmf
        x = model.multi_band_decomposition(x)

        # 2. Encode data
        mean, var = model.encoder(x)

        # z, _ = model.reparametrize(mean, var)
        z = mean
        
        y = model.decoder(z)
        y = model.multi_band_decomposition.inverse(y)
        y = y.reshape(y.shape[0], -1)
        y_list.append(y)
    
    x_test = torch.cat(x_list, 0)
    y_test = torch.cat(y_list, 0)
    return x_test, y_test

In [None]:
x_test, y_test = inference(rave_noise, test_loader)

In [12]:
def evaluate_model_mel_spec(model, test_loader):
    mel_loss = []
    for s, _ in test_loader:
        x = torch.reshape(s, (s.shape[0], 1, -1))

        # 1. multi band decomposition pqmf
        x = model.multi_band_decomposition(x)

        # 2. Encode data
        mean, var = model.encoder(x)

        # z, _ = model.reparametrize(mean, var)
        z = mean
        
        y = model.decoder(z)
        y = model.multi_band_decomposition.inverse(y)
        y = y.reshape(y.shape[0], y.shape[-1])
        #res.append(y.reshape(-1).to("cpu").detach().numpy())
        mel_basis = {}
        hann_window = {}
        loss_i = melspectrogram_loss(s, y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False)
        mel_loss.append(torch.mean(loss_i))
    return res     
        

In [None]:
evaluate_model_mel_spec(rave_noise, test_loader)

min value is  tensor(-1.0293, grad_fn=<MinBackward1>)
max value is  tensor(1.1945, grad_fn=<MaxBackward1>)
min value is  tensor(-1.1362, grad_fn=<MinBackward1>)
max value is  tensor(1.2580, grad_fn=<MaxBackward1>)
min value is  tensor(-1.0193, grad_fn=<MinBackward1>)
max value is  tensor(1.1029, grad_fn=<MaxBackward1>)
