In [12]:
import argparse
import torch
from torchvision import utils
from model_drum import Generator
from tqdm import tqdm
import sys
sys.path.append('./melgan')
from modules import Generator_melgan
import yaml
import os, random
import librosa
import soundfile as sf
import numpy as np
from utils import *

from vscode_audio import *



In [13]:
N_LATENT = 512
N_MLP = 8
SIZE_OUTPUT = 64 # size of output image

#CHECKPOINT = "./looperman_one_bar_checkpoint.pt"
CHECKPOINT = "./freesound_checkpoint.pt"

DATAPATH = "./data/looperman/"

TRUNCATION = 1
TRUNCATION_MEAN = 4096

SR = 44100

device_name = "cpu"


In [14]:


generator = Generator(SIZE_OUTPUT, N_LATENT, N_MLP, channel_multiplier=2).to(device_name)
checkpoint = torch.load(CHECKPOINT, map_location=torch.device('cpu'))

generator.load_state_dict(checkpoint["g_ema"], strict=False)


if TRUNCATION < 1:
    with torch.no_grad():
        mean_latent = generator.mean_latent(TRUNCATION_MEAN)
else:
    mean_latent = None


In [15]:
def load_vocoder(device_name):
    feat_dim = 80
    mean_fp = f'{DATAPATH}/mean.mel.npy'
    std_fp = f'{DATAPATH}/std.mel.npy'
    v_mean = torch.from_numpy(np.load(mean_fp)).float().view(1, feat_dim, 1).to(device_name)
    v_std = torch.from_numpy(np.load(std_fp)).float().view(1, feat_dim, 1).to(device_name)
    vocoder_config_fp = './melgan/args.yml'
    vocoder_config = read_yaml(vocoder_config_fp)

    n_mel_channels = vocoder_config.n_mel_channels
    ngf = vocoder_config.ngf
    n_residual_layers = vocoder_config.n_residual_layers

    vocoder = Generator_melgan(n_mel_channels, ngf, n_residual_layers).to(device_name)
    vocoder.eval()

    vocoder_param_fp = os.path.join('./melgan', 'best_netG.pt')
    vocoder.load_state_dict(torch.load(vocoder_param_fp, map_location=torch.device('cpu')), strict=False)

    return vocoder, v_mean, v_std
VOCODER, V_MEAN, V_STD = load_vocoder(device_name)

In [16]:
def vocode(sample, vocoder=VOCODER, v_mean=V_MEAN, v_std=V_STD):
    de_norm = sample.squeeze(0) * v_std + v_mean
    audio_output = vocoder(de_norm)
    return audio_output

In [17]:
prev_sample_z = None

def generate(g_ema, device, mean_latent, ckpt_name, sample_z = None, truncation=TRUNCATION, prev_coef=0.0):
    global prev_sample_z
    #epoch = ckpt_name.split('.')[0]
    # os.makedirs(f'./tmp/{epoch}', exist_ok=True)
    # os.makedirs(f'./tmp/{epoch}/mel_80_320', exist_ok=True)


    with torch.no_grad():
        g_ema.eval()
#        for i in tqdm(range(args.pics)):
        if sample_z is None:
            if prev_sample_z is None or prev_coef == 0.0:
                sample_z = torch.randn(1, N_LATENT, device=device)
            else:
                sample_z = prev_sample_z + torch.randn(1, N_LATENT, device=device) * prev_coef

        sample, _ = g_ema([sample_z], truncation=truncation, truncation_latent=mean_latent)

        prev_sample_z = sample_z
#        np.save(f'./tmp/{epoch}/mel_80_320/{i}.npy', sample.squeeze().data.cpu().numpy())
        # print(sample)
        imagepath = f'/tmp/img_{random.randint(0, 10000)}.png'
        utils.save_image(sample, imagepath, nrow=1,normalize=True,range=(-1, 1))
        audio_output = vocode(sample)
        return audio_output.squeeze().cpu().numpy(), imagepath
            # sf.write(f'{args.store_path}/{epoch}/{i}.wav', audio_output.squeeze().detach().cpu().numpy(), sr)
            # print('generate {}th wav file'.format(i))

In [18]:
import math

z_coord = torch.randn(4, N_LATENT, device=device_name)

def coord_to_z(x, y):
    assert x >= 0 and x <= 1
    assert y >= 0 and y <= 1

    z =  ((1-math.sqrt(x**2 + y**2)) * z_coord[0] + (1 - math.sqrt((1-x)**2+y**2)) * z_coord[1] + \
    (1 - math.sqrt(x**2 + (1-y)**2)) * z_coord[2] + (1 - math.sqrt((1-x)**2+(1-y)**2)) * z_coord[3]) 
    z = torch.unsqueeze(z, 0)
    return z

In [19]:
sample_z = coord_to_z(0, 0)
print(sample_z[:10], z_coord[0][:10])
sample_z = coord_to_z(0, 1)
print(sample_z[:10], z_coord[1][:10])

tensor([[-2.9220e-01,  2.7360e-02,  6.8455e-01,  1.1071e+00, -1.0681e-01,
          2.2082e+00,  1.3873e+00, -9.1338e-01,  1.0332e+00,  6.3251e-01,
          1.0682e-02, -4.7676e-01, -1.6639e-01,  9.4867e-01,  8.5300e-01,
          5.4313e-01, -1.6336e-01, -4.6459e-01, -5.7149e-02,  6.0553e-01,
          1.9692e+00, -9.5882e-02, -5.8170e-01, -5.4579e-01,  4.3015e-01,
          1.1664e+00, -1.7634e-01,  1.5640e+00, -9.0305e-01, -5.3954e-01,
          1.0579e+00,  2.3639e-01, -5.5244e-01, -3.5434e-01,  3.6370e-01,
         -3.2123e-01, -1.1806e+00,  5.5049e-01, -9.1938e-01, -1.4136e+00,
          1.6112e-01, -9.9558e-01,  2.7219e-02, -8.0873e-01, -8.9362e-01,
          3.9096e-02, -4.7655e-01, -2.8376e-01, -1.0910e+00, -5.8217e-02,
          3.6904e-01,  2.0559e-01, -4.9028e-01, -5.2581e-01,  5.1628e-02,
         -1.1131e+00,  2.9565e-01,  7.7260e-01, -5.5058e-01, -3.6471e-01,
          5.5950e-01,  1.7781e+00, -2.1728e+00,  3.2702e-01,  8.5349e-02,
         -1.7176e-01, -1.6178e+00,  8.

In [20]:

# sample_z = coord_to_z(0, 0.5)
# audio_output = generate(generator, device_name, mean_latent, CHECKPOINT, sample_z=sample_z)
# audio_output.shape

# Audio(audio_output, sr=SR)

In [22]:
# from vscode_audio import *

# Audio(audio_output, sr=SR)

# Audio(audio_output, sr=SR)

In [24]:
from pythonosc import dispatcher
from pythonosc import osc_server, udp_client
import os, random

client = udp_client.SimpleUDPClient('127.0.0.1', 10018)

def generate_random(unused_addr, prev_coef):
    # try:
    audio_output, imagepath = generate(generator, device_name, mean_latent, CHECKPOINT, sample_z=None, prev_coef=prev_coef) # random sample
    
    filepath = f'/tmp/gem_{random.randint(0, 10000)}.wav'
    sf.write(filepath, audio_output, SR)

    client.send_message("/generated", (filepath, imagepath))
    #except Exception as exp:
    # print("Error in /find_loops", exp)        
dispatcher = dispatcher.Dispatcher()
dispatcher.map("/generate", generate_random)

server = osc_server.ThreadingOSCUDPServer(
    ('localhost', 10015), dispatcher)
print("Serving on {}".format(server.server_address))
server.serve_forever()




Serving on ('127.0.0.1', 10015)
