In [None]:
# Torch device that the DAC model is moved to after loading.
DEVICE='cpu' # either 'cuda' or 'cpu'

# IPython line magics: query the kernel's current working directory, then change it to /app.
# NOTE(review): /app is an absolute, environment-specific path (presumably a container
# mount point) — adjust it when running the notebook elsewhere.
%pwd
%cd /app

In [None]:
import dac
from audiotools import AudioSignal
import torch
import numpy as np
import random
import time
import numpy as np
import matplotlib.pyplot as plt
import soundfile as sf
import IPython.display as ipd

In [None]:
# Download the "44khz" DAC weights, load them, and place the model on DEVICE.
# (Set DEVICE to 'cuda' in the configuration cell to run on a GPU.)
model_path = dac.utils.download(model_type="44khz")
model = dac.DAC.load(model_path).to(DEVICE)

<div style="height: 10px; background-color: blue;"></div>
<span style="font-size: 24px; color:blue">Randomizing codebook entries </span>


In [None]:
def Codes2Audio(codes, numframes, verbose=0):
    """Decompress a tensor of codebook indices to an audio waveform.

    The indices are wrapped in an in-memory ``dac.DACFile`` and decoded with the
    notebook-level ``model`` (nothing is written to disk).

    Parameters
    ----------
    codes : torch.Tensor
        Integer codebook indices. The callers in this notebook pass a tensor of
        shape (1, numCodebooks, numframes). The tensor is NOT modified; a
        clamped copy is what gets decoded.
    numframes : int
        Number of frames in ``codes``; must be a multiple of 25.
    verbose : int, optional
        >= 1 prints the shape of ``codes``; >= 10 also prints the (unclamped)
        tensor itself.

    Returns
    -------
    numpy.ndarray
        1-D array holding batch item 0, channel 0 of the decoded signal.

    Raises
    ------
    AssertionError
        If ``numframes`` is not divisible by 25.
    """
    if verbose >= 1:
        print(f'shape of code tensor is {codes.shape}')
    if verbose >= 10:
        print(codes)

    # Valid index range for a codebook entry.
    # NOTE(review): 1023 presumably corresponds to a codebook size of 1024 — confirm against the model config.
    min_code, max_code = 0, 1023

    # Clip to preserve reconstruction sanity. torch.clamp returns a new tensor, so the
    # caller's `codes` is left untouched, and out-of-range entries are really clipped
    # to the nearest bound (as the messages below state) instead of being reset to 0.
    if torch.any(codes > max_code):
        print(f'clipping some codebook entries to max 1023')
    if torch.any(codes < min_code):
        print(f'clipping some codebook entries to min  0')
    codes = torch.clamp(codes, min=min_code, max=max_code)

    # TODO: find more DAC documentation to understand these length-unit parameters.
    # They are needed for the DACFile and seem to work for reloading and decoding.
    chunklength = 60  # this seems to work for most length factors > 4
    assert numframes % 25 == 0, f'frames must be divisible by 25 (for some reason)'
    # Empirical output length in samples: 2466 per 25 frames, plus one.
    olength = 2466 * (numframes // 25) + 1

    # Create the DACFile data structure; no need to write it to disk in order to decompress it
    my_dac_file = dac.DACFile(
        codes=codes,
        chunk_length=chunklength,
        original_length=olength,
        input_db=-20,
        channels=1,
        sample_rate=44100,
        padding=False,
        dac_version='1.0.0',
    )

    ydcompress = model.decompress(my_dac_file)
    # Move to CPU / numpy and keep only the first batch item and first channel.
    ysig = ydcompress.cpu().detach().numpy()[0, 0, :]
    return ysig

### <font color='green'> Let's look at some random code sequences and the signals they produce. Code tensor dimensions are (B, Nq, T): batch, number of codebooks, number of frames. </font>

In [None]:
# Generate random codebook entries and decode them to audio.
#     @a - lowest random value (inclusive)
#     @b - upper bound of the random values (exclusive, as with torch.randint's `high`)
#     @numframes - number of frames; Codes2Audio requires a multiple of 25
#     @numCodebooks - the number of codebook indices to use for each frame
#     @verbose - passed through to Codes2Audio

def codeMashRandom(a, b, numframes, numCodebooks=4, verbose=0):
    """Decode a (1, numCodebooks, numframes) tensor of uniform random indices drawn from [a, b)."""
    code_shape = (1, numCodebooks, numframes)
    random_codes = torch.randint(a, b, code_shape)
    return Codes2Audio(random_codes, numframes, verbose)

#----------------------------------------------------------------------------------------
# Generate a linear ramp of codebook entries (same value in every codebook at a given
# frame) and decode it to audio.
#     @a - start value (the code used at frame 0)
#     @b - step added per frame
#     @numframes - number of frames; Codes2Audio requires a multiple of 25
#     @numCodebooks - the number of codebook indices to use for each frame
#     @verbose - passed through to Codes2Audio
#
# Example: codeMashSequence(0, 1, 1200, verbose=10) prints (before Codes2Audio
# handles any out-of-range entries):
# shape of code tensor is torch.Size([1, 4, 1200])
# tensor([[[   0,    1,    2,  ..., 1197, 1198, 1199],
#          [   0,    1,    2,  ..., 1197, 1198, 1199],
#          [   0,    1,    2,  ..., 1197, 1198, 1199],
#          [   0,    1,    2,  ..., 1197, 1198, 1199]]])

def codeMashSequence(a, b, numframes, numCodebooks=4, verbose=0):
    """Decode codes that grow with time: frame t holds a + b*t in every codebook."""
    # Frame counter 0..numframes-1, shaped (1, 1, numframes)
    frame_idx = torch.arange(numframes)[None, None, :]
    # Repeat the counter across the codebook axis -> (1, numCodebooks, numframes)
    per_codebook = frame_idx.expand(1, numCodebooks, -1)
    # The arithmetic is applied after the expand so the result is a fresh tensor
    # rather than a view that shares memory across codebooks.
    codes = per_codebook * b + a

    return Codes2Audio(codes, numframes, verbose)

In [None]:
#### And this sounds like .....
# Decode 100*25 frames of random codes drawn across (almost) the whole index range,
# using 4 codebooks; verbose=10 also prints the code tensor.
ysig = codeMashRandom(
    a=0,
    b=1023,
    numframes=100*25,
    numCodebooks=4,
    verbose=10,
)
# Alternatives to try:
# ysig = codeMashRandom(604, 605, 25*25,  numCodebooks=9, verbose=10)
# ysig = codeMashSequence(0, 1, 48*25, verbose=10 )

# Waveform plot, then an audio player as the cell's displayed result
plt.plot(ysig)
ipd.Audio(ysig, rate=44100)

In [None]:
# So lets string a bunch of them together.....
# Each pass decodes one short chunk of random codes from a 3-wide index window that
# slides down by one per step; the chunks are appended to one long sample buffer.

framesperchunk = 100  # frames per chunk (Codes2Audio requires a multiple of 25)
sbuf = []             # concatenated samples of all chunks
sdur = 1              # sleep time in seconds (only referenced by the disabled playback below)

start_time = time.time()
for x in range(8, 30):

    #SAVE ysig=codeMash(1010-10*x, 1013-10*x)
    #SAVE ysig=codeMash(1010-4*x, 1013-3*x)
    ysig = codeMashRandom(a=1000 - x, b=1003 - x, numframes=framesperchunk)

    # Can't get this to play in the loop.....
    # ipd.Audio(audio_segment, rate=44100, autoplay=True)
    # time.sleep(sdur+.1)

    sbuf.extend(ysig)
end_time = time.time()

print(f"Codebook and audio generation time: {end_time - start_time} seconds")
print(f'sbuf.len={len(sbuf)}, sound is {len(sbuf)/44100} seconds long', flush=True)

plt.plot(ysig)  # just view last generated chunk of framesperchunk frames
ipd.Audio(sbuf, rate=44100)

<div style="height: 10px; background-color: blue;"></div>