In this notebook, we are interpolating between sounds in the "z" domain - the continuous latent space of the codec. The sounds are represented as a time series of latent vectors, so that means we are interpolating between different points in latent space at each point in time.  

The results of this kind of interpolation in the latent space of the codec are not particularly interesting, and sound more like a cross fade (except for the amplitude variation which interpolates between the envelopes of the two sounds at each frame).  

Still, it is interesting to see that the quantized space of the codec (with $(1024^9)^{100} = 1024^{900} \approx 10^{2709}$ possible code combinations) is dense enough to handle this range of sounds and mixes with such high fidelity. 

In [1]:
%pwd
%cd /app

import dac
from audiotools import AudioSignal

import torch

import numpy as np
import matplotlib.pyplot as plt
import soundfile as sf
import IPython.display as ipd

/app


In [2]:
# List every CUDA device visible to this kernel. Iterating over device_count()
# avoids querying a fixed index (index 1 does not exist on single-GPU hosts),
# and printing makes the information visible: only the last expression of a
# cell is displayed, so bare intermediate expressions are silently dropped.
for gpu_index in range(torch.cuda.device_count()):
    gpu_props = torch.cuda.get_device_properties(gpu_index)
    print(f"cuda:{gpu_index}: {gpu_props.name}, {gpu_props.total_memory / 1e9:.1f} GB")

# If the docker was started with --gpus all, a different index can be chosen here.
# Fall back to the CPU so the notebook still runs when no GPU is visible.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [3]:
# Fetch the pretrained 44 kHz DAC weights and remember where they are stored.
model_path = dac.utils.download(model_type="44khz")

### This model doesn't sound as good - because it was trained on different data???
# model_path = "/scratch/codecs/codec.pth" # /the default model from vampnet!

model = dac.DAC.load(model_path)

# This notebook only runs inference, so put the module in eval mode.
# NOTE(review): the residual quantizer appears to honour `n_quantizers` only
# when the model is NOT in training mode, which may explain why changing the
# number of quantizers seemed to have no effect - confirm against the DAC
# quantizer source. The trailing semicolon suppresses the module repr.
model.eval();

In [4]:
# Environment report. The per-device queries are only issued when a CUDA
# device is actually visible; asking for device 0 on a CPU-only host raises.
print('__CUDNN VERSION:', torch.backends.cudnn.version())
print('__Number CUDA Devices:', torch.cuda.device_count())
if torch.cuda.is_available():
    print('__CUDA Device Name:',torch.cuda.get_device_name(0))
    print('__CUDA Device Total Memory [GB]:',torch.cuda.get_device_properties(0).total_memory/1e9)
else:
    print('__CUDA Device Name: none (no CUDA device visible)')

__CUDNN VERSION: 8500
__Number CUDA Devices: 2
__CUDA Device Name: NVIDIA GeForce RTX 3090
__CUDA Device Total Memory [GB]: 25.447170048


In [5]:
# Move the model to the device selected earlier in the notebook.
# The trailing semicolon keeps the (long) module repr out of the cell output.
model.to(device); #wanna see the model? remove the semicolon


In [6]:
# Root directory under which the example audio files are looked up.
dataroot="/scratch/dacdevdata" 
# Uncomment to list the directory contents:
# !ls {dataroot}

In [7]:
datadir=dataroot+"/44kHz/N4/PisWinAppBee_sparse_recon"

N_QUANTIZERS = 9  ## SEEMS TO HAVE NO EFFECT - I guess because it is a property of the pretrained model?

snd1_wav ='/DSApplause--numClappers_exp-00.50.wav' 
#snd1_wav='/DSPistons--rate_exp-00.50.wav'
snd2_wav = '/DSBugs--busybodyFreqFactor-00.50.wav'
#snd2_wav ='/DSWind--strength-00.50.wav'

CORTADOFACTURA=3  #cut the wavefile lengths by this amount before loading so we don't overrun GPU memory


def load_and_encode(wav_path, n_quantizers=N_QUANTIZERS, cut_factor=CORTADOFACTURA):
    """Load one audio file, shorten it, and encode it with the codec.

    Both sounds go through exactly the same steps, so the pipeline lives in
    one place instead of being copy-pasted per sound.

    Args:
        wav_path: full path of the audio file to load.
        n_quantizers: value passed as the second argument of ``model.encode``.
        cut_factor: the signal is truncated to ``1/cut_factor`` of its length
            (along axis 2 of the AudioSignal) to keep GPU memory in check.

    Returns:
        Tuple ``(signal, x, z, codes, latents)``: the truncated AudioSignal,
        the preprocessed audio tensor, and the first three outputs of
        ``model.encode``.
    """
    #1) LOAD A SOUND
    snd = AudioSignal(wav_path)
    snd = snd[0, 0, : int(snd.shape[2] / cut_factor)]  # cortado, otherwise the computation will bust memory

    #2) PUT IT ON THE SAME DEVICE AS THE MODEL
    snd.to(model.device)

    # Inference only: without no_grad the encoder would keep its activations
    # alive for a backward pass that never happens, inflating GPU memory use.
    with torch.no_grad():
        #3) PREPROCESS (make sure sr agrees with model, i guess)
        snd_x = model.preprocess(snd.audio_data, snd.sample_rate)
        #4) ENCODE TO Z, C, and L
        z, codes, latents, _, _ = model.encode(snd_x, n_quantizers)

    return snd, snd_x, z, codes, latents


snd1, snd1_x, snd1_z, snd1_codes, snd1_latents = load_and_encode(datadir + snd1_wav)
snd2, snd2_x, snd2_z, snd2_codes, snd2_latents = load_and_encode(datadir + snd2_wav)

  data, sample_rate = librosa.load(
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


FileNotFoundError: [Errno 2] No such file or directory: '/scratch/dacdevdata/44kHz/N4/PisWinAppBee_sparse_recon/DSApplause--numClappers_exp-00.50.wav'

In [None]:
# Shape summary: the loaded signal first, then each encoder output for both sounds.
# A single print with a newline separator emits one line per entry.
print(
    f'snd1 Audio Signal is shape {snd1.shape}',
    f'snd1_z shape is: {snd1_z.shape}, and snd2_z shape is: {snd2_z.shape}',
    f'snd1_codes shape is: {snd1_codes.shape}, and snd2_codes shape is: {snd2_codes.shape}',
    f'snd1_latents shape is: {snd1_latents.shape}, and snd2_latents shape is: {snd2_latents.shape}',
    sep="\n",
)

In [None]:
# Let's check one code vector to see if we are using the number of quantizers specified above.
# Index 40 on the last axis is an arbitrary pick - presumably a time frame; confirm against the codes layout.
snd1_codes[0,:,40]

<div style="height: 10px; background-color: blue;"></div>
This next cell is just for reporting a bug to Descript.
It also busts the GPU memory if audio is longer than about a second, so we comment it out.

In [None]:
# # Try to encode the audio (snd1_x) with different numbers of quantizers
# foo1_z, foo1_c, _, _, _ = model.encode(snd1_x, n_quantizers=1)
# foo4_z, foo4_c, _, _, _ = model.encode(snd1_x, n_quantizers=4)
# fooN_z, fooN_c, _, _, _ = model.encode(snd1_x) # expected to use all 9 codebooks

# print(f' Example code slices: \n foo1_c: {foo1_c[0,:,40]} \n foo4_c: {foo4_c[0,:,40]} \n fooN_c: {fooN_c[0,:,40]}\n')
# print(f' And how about the z vectors that we will use to decode?\n')
# print(f' Are foo1_z and foo4_z tensors equal? Ans: {torch.equal(foo1_z, foo4_z)}')
# print(f' Are foo1_z and fooN_z tensors equal? Ans: {torch.equal(foo1_z, fooN_z)}')

In [None]:
### Project Latents Experiment!!!!!!!!!!!!!!!!!
### Let's do a reality check: if we "manually" take the 8D latents to 1024D z and then decode,
### it should be the same as the z we got from model encode.

# from_latents returns three values; only the first (the rebuilt z) is needed here.
snd2_z_from_l,_,_ = model.quantizer.from_latents(snd2_latents)
print(f'snd2_z_from_l shape is: {snd2_z_from_l.shape}')
# Euclidean (p=2) distance between the rebuilt z and the z returned by encode;
# it is the cell's displayed result, and a value near 0 means the two agree.
torch.dist(snd2_z_from_l, snd2_z, p=2)


### <font color='blue'> First decode both sounds and play them </font>

In [None]:
# Decode the second sound from the z that was rebuilt from its latents.
# Inference only: no_grad stops the decoder from retaining activations for a
# backward pass, which is what makes longer clips exhaust GPU memory.
with torch.no_grad():
    snd2recon = model.decode(snd2_z_from_l) #z_from_l or z from encode are the same

# First batch item, first channel, moved to host memory for plotting and playback.
snd2reconsignal = snd2recon[0,0,:].cpu().detach().numpy()
plt.plot(snd2reconsignal)
ipd.Audio(snd2reconsignal, rate=44100)


In [None]:
# Decode the first sound directly from the z returned by encode.
# Inference only: no_grad stops the decoder from retaining activations for a
# backward pass, which is what makes longer clips exhaust GPU memory.
with torch.no_grad():
    snd1recon = model.decode(snd1_z)

# First batch item, first channel, moved to host memory for plotting and playback.
snd1reconsignal = snd1recon[0,0,:].cpu().detach().numpy()
plt.plot(snd1reconsignal)
ipd.Audio(snd1reconsignal, rate=44100)

#original 
#snd1signal=snd1.audio_data.cpu().detach().numpy()[0,0,:]
#plt.plot(snd1signal)
#ipd.Audio(snd1signal, rate=44100)

In [None]:

# def assign_values_with_mask(value, v2, mask):
#     # Ensure v2 and mask have the same length
#     assert len(v2) == len(mask), "v2 and mask must have the same length"

#     # If v1 is a single number, expand it to have the same shape as v2
#     if isinstance(value, (int, float)):
#         v1 = torch.full_like(v2, value)
#     else :
#         print('error: first arg must be a float or int to fill vector where mask is `True`')

#     # Use the mask to assign values from v1 to v2
#     result = torch.where(mask, v1, v2)

#     return result

### <font color='blue'> Now do an interpolation in the *latent* space (the latent space is the "projected" space and has only 8 dimensions for each codebook) </font>

In [None]:
def interp(va, vb, i1, i2) : 
    """ "Morph" between two time-indexed sequences of latent variables.

    The mixing weight moves linearly from ``i1`` at the first step to ``i2``
    at the last step along axis 2, so the result goes from
    ``i1*va + (1-i1)*vb`` to ``i2*va + (1-i2)*vb``. The weight multiplies
    ``va``: a weight of 0 yields ``vb`` and a weight of 1 yields ``va``.

    Args:
        va, vb: tensors of identical shape with at least 3 axes; axis 2 is
            the one the weight ramps over.
        i1, i2: weight on ``va`` at the first and last step.

    Returns:
        Tensor with the same shape as the inputs.

    Raises:
        AssertionError: if the two shapes differ.
    """
    assert va.shape == vb.shape, "Tensors must have the same shape"
    timesteps = va.shape[2]
    # Build the ramp on the inputs' own device rather than a notebook-level
    # global, so the function works for CPU and GPU tensors alike.
    weight_a = torch.linspace(i1, i2, timesteps, device=va.device)
    weight_b = 1 - weight_a

    # The 1-D ramps broadcast against the trailing axis of va / vb.
    return weight_a * va + weight_b * vb

In [None]:

# la=snd2_latents
# lb=snd1_latents
# linterp=interp(la, lb, .6, .4)  ##Interpolate in the 8D projected space!
# print(f'linterp.shape is {linterp.shape}') 

# z_from_l,_,_ = model.quantizer.from_latents(linterp)
# y = model.decode(z_from_l)
# print(f'signal y.shape is {y.shape}') 
# mix_signal = y[0,0,:].cpu().detach().numpy()
# plt.plot(mix_signal)
# ipd.Audio(mix_signal, rate=44100)


### <font color='blue'> Now, rather than interpolating in latent space, swap the values in the dimensions with even index. Each sound is now made up of latents where half the values come from one sound and half from the other. </font>

In [None]:
def swap_elements_for_even_n(matrix1, matrix2):
    """Swap, between two tensors, the entries at even indices of axis 1.

    After the call each tensor keeps its own values at the odd positions of
    axis 1 and holds the other tensor's values at the even positions
    (0, 2, 4, ...), so every time step mixes values from both inputs.

    The swap is done IN PLACE: both arguments are modified. Pass clones if
    the originals are still needed.

    Args:
        matrix1, matrix2: tensors of the same shape; axis 1 is the one whose
            even positions are exchanged.

    Returns:
        The same two (now modified) tensors, ``(matrix1, matrix2)``.
    """
    n = matrix1.size(1)

    # Even positions along axis 1. The previous mask tested `% 2 == 1`, which
    # selected the odd positions despite the function's name and comments.
    even_idx = torch.arange(0, n, 2, device=matrix1.device)

    # Indexing with an index tensor returns copies, so these snapshots are not
    # affected by the assignments below (a plain slice would be a view and
    # would break the swap).
    even_from_matrix1 = matrix1[:, even_idx, :]
    even_from_matrix2 = matrix2[:, even_idx, :]

    # Exchange the even-position entries between the two tensors.
    matrix1[:, even_idx, :] = even_from_matrix2
    matrix2[:, even_idx, :] = even_from_matrix1

    return matrix1, matrix2

## test
# a= torch.rand(1, 4, 2)
# b= torch.rand(1, 4, 2)
# print(a)
# print(b)
# print('--------')
# swap_elements_for_even_n(a, b)
# print(a)
# print(b)


In [None]:
# Work on copies: swap_elements_for_even_n modifies its arguments in place, so
# passing the encoder outputs directly would corrupt snd1_latents/snd2_latents
# for every later cell and make re-running this cell swap the values back.
la=snd2_latents.clone()
lb=snd1_latents.clone()
# first do a swap of values in the latent space for even-numbered dimensions
la, lb = swap_elements_for_even_n(la, lb)
# The interp weight multiplies its first argument: (0, 0) returns lb only
# (the swapped snd1 latents), (1, 1) returns la only (the swapped snd2 latents).
linterp=interp(la, lb, 0, 0)
print(f'linterp.shape is {linterp.shape}') 

# Inference only: no_grad keeps the decoder from retaining activations on the GPU.
with torch.no_grad():
    z_from_l,_,_ = model.quantizer.from_latents(linterp)
    y = model.decode(z_from_l)
print(f'signal y.shape is {y.shape}') 
# First batch item, first channel, moved to host memory for plotting and playback.
mix_signal = y[0,0,:].cpu().detach().numpy()
plt.plot(mix_signal)
ipd.Audio(mix_signal, rate=44100)

### <font color='blue'> Now look at the histogram of the values in the latent variables (choose one in the next cell by assigning it to examinevector). </font>

In [None]:
# Pick the tensor whose value distribution the following cells inspect;
# assign a different tensor here to examine it instead.
examinevector=snd1_latents
# Displayed as the cell output.
examinevector.shape

In [None]:
# Extremes of the tensor under examination (reduced over all elements).
smallest_value = examinevector.min()
largest_value = examinevector.max()
print(
    f"The smallest value is {smallest_value.item()}",
    f"The largest value is {largest_value.item()} ",
    sep="\n",
)

In [None]:
# Flatten the tensor to 1D
numpy_array = examinevector.cpu().detach().numpy().flatten()

# Define the number of bins
num_bins = 100

# Create a histogram using NumPy to compute bin edges and counts
hist, bin_edges = np.histogram(numpy_array, bins=num_bins)

# Mean of the values that fall in each bin (0 for empty bins), kept in
# bin_avg_values for inspection. Histogramming the values with themselves as
# weights yields per-bin sums under exactly the same binning as `hist`, so a
# value sitting on a shared edge is counted once (the previous per-bin mask
# with two inclusive bounds counted it in both neighbours) and there is no
# Python-level pass over the data per bin. Weights are widened to float64 so
# the sums are not accumulated in the tensor's lower precision.
bin_sums = np.histogram(
    numpy_array, bins=bin_edges, weights=numpy_array.astype(np.float64)
)[0]
bin_avg_values = np.divide(
    bin_sums, hist, out=np.zeros(num_bins, dtype=np.float64), where=hist > 0
).tolist()

# Plot the histogram
fig, ax = plt.subplots(figsize=(8, 6))
ax.hist(numpy_array, bins=num_bins)
ax.set_title('Histogram of 3D Tensor Values')
ax.set_xlabel('Values')
ax.set_ylabel('Frequency')
ax.grid(True)
plt.show()

In [None]:
# Rebuild z from the second sound's latents and decode it back to audio.
# Inference only: no_grad keeps the decoder from retaining activations on the GPU.
with torch.no_grad():
    z,_,_ = model.quantizer.from_latents(snd2_latents)
    snd = model.decode(z) #z_from_l or z from encode are the same

# First batch item, first channel, moved to host memory for plotting and playback.
recon = snd[0,0,:].cpu().detach().numpy()
plt.plot(recon)
ipd.Audio(recon, rate=44100)

