In [None]:
# Query the current working directory, then switch to /app.
# NOTE(review): the relative data path used further down presumably resolves from /app — confirm.
%pwd
%cd /app

In [None]:
import torch
import dac
from audiotools import AudioSignal

import numpy as np
import random
import time

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import soundfile as sf
import IPython.display as ipd

In [None]:
# Download the "16khz" model variant and load it.
model_path = dac.utils.download(model_type="16khz")
model = dac.DAC.load(model_path)
# Prefer the GPU, but fall back to the CPU so this cell also runs on machines
# without CUDA (a hard-coded 'cuda' raises there).
model.to('cuda' if torch.cuda.is_available() else 'cpu');

In [None]:
# Directory (relative to the working directory) that the example .wav file is loaded from.
pdata = "dacdevdata/16kHz/orig"

In [None]:
# !ls {pdata}

In [None]:
# Read the example recording into an AudioSignal (a 2-second sound at 16kHz)
wav_path = pdata + '/DSPistons--rate_exp-00.00--x-02.wav'
signal = AudioSignal(wav_path)
signal.shape

<div style="height: 6px; background-color: gray;"></div>
                      ** ORIGINAL **

In [None]:
# Pick index 0 along the first two axes (presumably batch and channel — confirm)
# and bring the resulting 1-D waveform to NumPy for plotting and playback.
signal1 = signal.audio_data[0, 0, :].detach().cpu().numpy()
plt.plot(signal1)
ipd.Audio(signal1, rate=16000)

In [None]:
# Encode audio signal as one long file
# (may run out of GPU memory on long files)
# The return value of .to() is discarded, so this relies on the signal being
# moved in place — NOTE(review): confirm against the audiotools API.
signal.to(model.device)

# Preprocess the raw samples for the model, then encode without tracking gradients.
x = model.preprocess(signal.audio_data, signal.sample_rate)
with torch.no_grad():
    # encode returns five values; only the first three are kept.
    # z is what the decode cell feeds back into model.decode.
    z, codes, latents, _, _ = model.encode(x)

print(f'shapesare  x:{x.shape}, z:{z.shape}, codes:{codes.shape}, latents:{latents.shape}')

In [None]:
# # Try to code the AudioSignal (wind_x) with different numbers of quantizers
# # NOTE - this works for this 16kHz model, but not for the 44kHz model (see morphlatents.ipynb)
# foo1_z, foo1_c, _, _, _ = model.encode(x, n_quantizers=1)
# foo4_z, foo4_c, _, _, _ = model.encode(x, n_quantizers=4)
# fooN_z, fooN_c, _, _, _ = model.encode(x) # expected to use all 9 codebooks

# print(f' Example code slices: \n foo1_c: {foo1_c[0,:,40]} \n foo4_c: {foo4_c[0,:,40]} \n fooN_c: {fooN_c[0,:,40]}\n')
# print(f' And how about the z vectors that we will use to decode?\n')
# print(f' Are foo1_z and foo4_z tensors equal? Ans: {torch.equal(foo1_z, foo4_z)}')
# print(f' Are foo1_z and fooN_z tensors equal? Ans: {torch.equal(foo1_z, fooN_z)}')

In [None]:
# Decode audio signal
# z comes from the encode cell; gradients are not needed for reconstruction.
with torch.no_grad():
    y = model.decode(z)
print(f'shape of decoded y: {y.shape}')

<div style="height: 6px; background-color: gray;"></div>
                      ** DECODED **

In [None]:
# Pick index 0 along the first two axes of the decoded tensor and convert the
# resulting 1-D waveform to NumPy for plotting and playback.
y1 = y[0, 0, :].detach().cpu().numpy()
plt.plot(y1)
ipd.Audio(y1, rate=16000)

### <font color='green'> Since the original and reconstructed aren't exactly the same length, it is difficult to look at the difference between the two signals </font>

In [None]:
# Difference between the reconstruction (y1) and the original waveform.
# The two differ slightly in length, so leading samples are dropped from the
# longer one until both have the same length. A fixed offset of 8 only fits one
# particular length mismatch; deriving it from the lengths gives the same result
# in that case and keeps working for other inputs.
# NOTE(review): trimming at the start (rather than the end) is kept from the
# earlier fixed-offset version; the true alignment has not been verified.
orig1 = signal.audio_data.cpu().detach().numpy()[0, 0, :]
n_common = min(len(y1), len(orig1))
diffsig = y1[len(y1) - n_common:] - orig1[len(orig1) - n_common:]
plt.plot(diffsig)
ipd.Audio(diffsig, rate=16000)

### <font color='green'> Compress/decompress go to/from .dac files and signals </font>

In [None]:
# Alternatively, use the `compress` and `decompress` functions
# to compress long files.

# Rebind `signal` to the result of .cpu() before handing it to compress.
signal = signal.cpu()
x_compressed = model.compress(signal)

In [None]:
# Save the compressed representation to disk; later cells load it back from this same path.
x_compressed.save("/tmp/compressed.dac")

<div style="height: 6px; background-color: gray;"></div>
                      ** decompressed DAC **

In [None]:
# Load the saved .dac file and decompress it with the model.
x_loaded = dac.DACFile.load("/tmp/compressed.dac")
y2 = model.decompress(x_loaded)
# Convert to NumPy and keep index 0 along the first two axes as a 1-D waveform.
y3=y2.cpu().detach().numpy()[0,0,:]
plt.plot(y3)
ipd.Audio(y3, rate=16000)

<div style="height: 10px; background-color: blue;"></div>
<font size=14 color='blue'> Randomizing codebook entries </font>

### <font color='green'> Let's try to look at just a short excerpt of codes and the signal it would make. Dimensions are (B, Nq, T) </font>

In [None]:
import torch

In [None]:
# Open question: it is not clear how chunk_length and original_length should be
# chosen, nor how numframes (the number of code vectors) interacts with them.
# Errors arise if the values don't fit together.

lengthfactor = 12

numframes = 25 * lengthfactor      # number of code vectors
chunklength = 60                   # this seems to work for most length factors > 4
olength = 2466 * lengthfactor + 1

numcodes = 4  # I think 12 is the max the codec supports, can be as small as 1

# Indices are drawn from [1010, 1013), i.e. only 1010, 1011 or 1012 —
# a narrow band near the top of the 0..1023 index range.
baz = torch.randint(1010, 1013, (1, numcodes, numframes))

# Wrap the hand-made codes in a DACFile so the model can decompress them.
my_dac_file = dac.DACFile(
    codes=baz,
    sample_rate=16000,
    channels=1,
    chunk_length=chunklength,
    original_length=olength,
    input_db=-20,
    padding=False,
    dac_version='1.0.0',
)
#my_dac_file

In [None]:
# And this sounds like .....
# Decompress the hand-built DACFile, then convert to NumPy and keep index 0
# along the first two axes as a 1-D waveform for plotting and playback.
ydcompress = model.decompress(my_dac_file)
ysig=ydcompress.cpu().detach().numpy()[0,0,:]
plt.plot(ysig)
ipd.Audio(ysig, rate=16000)

In [None]:
# So let's string a bunch of them together.....
# Each pass overwrites my_dac_file.codes with indices drawn from a different
# 3-wide band, decompresses, and collects the resulting waveform.

segments = []
sdur = 1  # second (not used by the code below)
# The loop variable is `step`, not `x`, so the preprocessed tensor `x` created
# in the encode cell is not silently overwritten.
for step in range(8, 30):
    band_low = 1010 - 10 * step
    my_dac_file.codes = torch.randint(low=band_low, high=band_low + 3, size=(1, numcodes, numframes))
    #SAVE my_dac_file.codes=torch.randint(low=1010-4*step, high=1013-3*step, size=(1, numcodes, numframes))
    #SAVE my_dac_file.codes=torch.randint(low=1000-step, high=1003-step, size=(1, numcodes, numframes))
    ydcompress = model.decompress(my_dac_file)
    ysig = ydcompress.cpu().detach().numpy()[0, 0, :]
    segments.append(ysig)

# One concatenation instead of extending a Python list sample by sample;
# sbuf is therefore a NumPy array (len() and playback work the same).
sbuf = np.concatenate(segments)

print(f'sbuf.len={len(sbuf)}', flush=True)
# Plot the same concatenated buffer that is played back, not just the last segment.
plt.plot(sbuf)
ipd.Audio(sbuf, rate=16000)

### <font color='green'> OK, so some different kinda fun. Randomize code book indexes! </font>

In [None]:
# Random code indices restricted to the values 0, 1 and 2, shape (1, 12, 204).
rcodes = torch.randint(low=0, high=3, size=(1, 12, 204))
#rcodes = torch.ones( size=(1, 12, 204), dtype=torch.int64)*555
rcodes

In [None]:
# Load the saved .dac file, then replace its codes with the random rcodes
# before decompressing.
foo = dac.DACFile.load("/tmp/compressed.dac")
foo.codes=rcodes
ydcompress = model.decompress(foo)
# Convert to NumPy and keep index 0 along the first two axes as a 1-D waveform.
ysig=ydcompress.cpu().detach().numpy()[0,0,:]
plt.plot(ysig)
ipd.Audio(ysig, rate=16000)

### <font color='green'> Well, THAT is pretty interesting for a "universal decoder" - when the code indexes are random, we get something that sounds like a noisy babbling male voice!!!!!! </font>  


Now for other systematic manipulations..... they all sound like babbling male voices.....

In [None]:
# Build a (rows, cols, depth) tensor in which every entry is 4x its index along
# the last axis; the values are the same across the first two axes.

depth = 204  # size of the last axis
rows = 1     # size of the first axis
cols = 12    # size of the middle axis

# 0 .. depth-1, laid out with shape (1, 1, depth)
depth_indices = torch.arange(depth)[None, None, :]

# Broadcast (as a view) across the first two axes
broadcasted_indices = depth_indices.expand(rows, cols, depth)

# Scale the indices to obtain the final 3D tensor
result_tensor = 4 * broadcasted_indices

print(result_tensor)

In [None]:
# Load the saved .dac file, then replace its codes with the index-based
# result_tensor before decompressing.
foo = dac.DACFile.load("/tmp/compressed.dac")
foo.codes=result_tensor  ###############################
ydcompress = model.decompress(foo)
# Convert to NumPy and keep index 0 along the first two axes as a 1-D waveform.
ysig=ydcompress.cpu().detach().numpy()[0,0,:]
plt.plot(ysig)
ipd.Audio(ysig, rate=16000)

<div style="height: 10px; background-color: blue;"></div>