In [1]:
import os

os.chdir('..')

from predict_ae import autoencode
import librosa
from IPython.display import Audio
import dienen
import copy

from swissknife.dsp import stft, get_default_window,calculate_synthesis_window, pghi, istft
import numpy as np
import copy
import threading
import pyaudio
import glob

In [2]:
config = {'input_frames': 16,
          'window_size': 1024,
          'hop_size': 256,
          'log': True,
          'crop_nyquist': True}

model_config = 'models/gsvqvae.yaml'
model_weights = 'weights/gsvqvae_16frames.hdf5'
ae_model = dienen.Model(model_config)
ae_model.build()
ae_model.core_model.model.summary()
ae_model.core_model.model.load_weights(model_weights)

encoder = copy.deepcopy(ae_model)

ae_model.modify([{'input_bottleneck': {'input': 'x', 'shape': [1,13,512], 'class': 'Input'}},
                 {'discrete_bottleneck/input': 'input_bottleneck'}],
               inputs=['input_bottleneck'],
               outputs=['estimated_spectrogram'])

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
spectrogram_in (InputLayer)  [(None, 16, 512)]         0         
_________________________________________________________________
translated_spectrogram (Tran (None, 16, 512)           0         
_________________________________________________________________
ExpandDims_0 (ExpandDims)    (None, 16, 512, 1)        0         
_________________________________________________________________
encoder_0/Conv_0 (Conv2D)    (None, 16, 128, 128)      12416     
_________________________________________________________________
encoder_0/BatchNormalization (None, 16, 128, 128)      512       
_________________________________________________________________
encoder_0/Activation_0 (Acti (None, 16, 128, 128)      0         
_________________________________________________________________
encoder_1/Conv_0 (Conv2D)    (None, 16, 64, 128)       393344

In [3]:
def interpolate_embeddings(key_points, n_frames, ae_model):
    key_points = [librosa.core.load(k,sr=None)[0] for k in key_points]
    stfts = [stft(x,config['window_size'],config['hop_size'],window=get_default_window(config['window_size'])[0]) for x in key_points]
    stfts = np.array([x[100:100+config['input_frames'],:] for x in stfts])
    if config['log']:
        stfts = np.log(stfts + 1e-16)
    if config['crop_nyquist']:
        stfts = stfts[:,:,:-1]
    embeddings = ae_model.predict(stfts,output='encoder_4/Activation_0')['encoder_4/Activation_0']
    frames_per_keypoint = n_frames//len(embeddings)
    interpolated_embeddings = []
    for embedding_start, embedding_end in zip(embeddings[:-1],embeddings[1:]):
        delta_embedding = embedding_end - embedding_start
        interpolation = embedding_start + delta_embedding*np.linspace(0,1,frames_per_keypoint)[:,np.newaxis,np.newaxis]
        interpolated_embeddings.append(interpolation)
            
    return interpolated_embeddings

def decode(embeddings, decoder_model):
    Y_frames = decoder_model.core_model.model.predict(embeddings)
    print(Y_frames.shape)
    Y_frames = np.squeeze(Y_frames)
    if config['crop_nyquist']:
        Y_frames = np.pad(Y_frames,((0,0),(0,0),(0,1))) + 1e-16
    if config['log']:
        Y_frames = np.exp(Y_frames)
        
    frame_win = np.hamming(16)
    Y_frames = frame_win[np.newaxis,:,np.newaxis]*Y_frames
    
    Y = np.zeros(shape=(Y_frames.shape[0]*(Y_frames.shape[1]//2)+16,Y_frames.shape[2]))
    for i,frame in enumerate(Y_frames):
        Y[i*8:i*8+16] += frame
    synth_window = calculate_synthesis_window(win_length=config['window_size'], hop_length=config['hop_size'], n_fft=config['window_size'],window=get_default_window(config['window_size'])[0])
    #Y = np.abs(Y)*np.exp(1.0j*pghi(Y,config['window_size'],config['hop_size'],synthesis_window=synth_window))
    Y = np.abs(Y)*np.exp(1.0j*np.random.uniform(low=-np.pi,high=np.pi,size=Y.shape))
    y = istft(Y,config['window_size'],config['hop_size'],synthesis_window=synth_window)
    
    return y

In [4]:
class AudioPlayer(threading.Thread):
    def __init__(self,fs=44100,buffer_size=1024,max_queue=1e6):
        self.queue = np.zeros((int(max_queue),))
        self.player = pyaudio.PyAudio()
        input_device = self.player.get_default_input_device_info()['index']
        output_device = self.player.get_default_output_device_info()['index']
        self.buffer_size=buffer_size
        self.stream = self.player.open(format = pyaudio.paFloat32,
                                       channels=1,
                                       rate=fs,
                                       output=True, 
                                       frames_per_buffer=buffer_size,
                                       input_device_index=input_device,
                                       output_device_index=output_device)
        self.read_index = 0
        self.write_index = 0
        self.is_playing = False
        
    def write(self,data):
        if self.read_index>0:
            self.queue[:-self.read_index] = self.queue[self.read_index:]
            self.queue[-self.read_index:] = 0
            self.write_index -= self.read_index
            self.read_index = 0

        self.queue[self.write_index: self.write_index + len(data)] = data
        self.write_index += len(data)
        
    def play(self):
        self.is_playing = True
    
    def run(self):
        while True:
            if self.read_index<self.write_index:
                data = self.queue[self.read_index:self.read_index+self.buffer_size]
                self.read_index += self.buffer_size
            else:
                data = np.zeros((self.buffer_size,))
                print('Waiting')
            data = data.astype(np.float32)
            data = data.tostring()
            self.stream.write(data)
        

In [5]:
all_nsynth = list(glob.glob('nsynth/nsynth-test.jsonwav/nsynth-test/audio/*.wav'))

audio_embedding_1 = all_nsynth[np.random.randint(low=0,high=len(all_nsynth))]
audio_embedding_2 = all_nsynth[np.random.randint(low=0,high=len(all_nsynth))]
audio_engine = AudioPlayer()
audio_thread = threading.Thread(target = audio_engine.run, daemon = True)
audio_thread.start()
audio_engine.play()
while True:
    key_points=[audio_embedding_1,audio_embedding_2]
    embeddings = interpolate_embeddings(key_points,300,encoder)
    interpolated_embeddings = np.concatenate(embeddings)
    interpolated_embeddings = np.expand_dims(interpolated_embeddings,axis=1)
    y = decode(np.array(interpolated_embeddings),ae_model)
    audio_engine.write(y)
    audio_embedding_1 = audio_embedding_2
    audio_embedding_2 = all_nsynth[np.random.randint(low=0,high=len(all_nsynth))]

Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting




Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting


  gsynth[l] = window[l]/denom


Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting


Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
(150, 16, 512, 1)
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waitin

Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting


Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
(150, 16, 512, 1)
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waitin

Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting


Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting
Waiting


KeyboardInterrupt: 

Waiting
