In [7]:
import argparse
import torch
from torchvision import utils
from model_drum import Generator
from tqdm import tqdm
import sys
sys.path.append('./melgan')
from modules import Generator_melgan
import yaml
import os, random
import librosa
import soundfile as sf
import numpy as np
from utils import *

from vscode_audio import *



In [8]:
# --- Generator architecture -------------------------------------------------
SIZE_OUTPUT = 64   # size of output image (first argument of Generator)
N_LATENT = 512     # width of the latent vectors fed to the generator
N_MLP = 8          # third argument of Generator

# --- Files on disk ----------------------------------------------------------
# Alternative checkpoint: "./looperman_one_bar_checkpoint.pt"
CHECKPOINT = "./freesound_checkpoint.pt"
# Directory that holds mean.mel.npy / std.mel.npy used to de-normalise mels.
DATAPATH = "./data/looperman/"

# --- Sampling ---------------------------------------------------------------
# A mean latent is only computed when TRUNCATION < 1; it is then computed
# by generator.mean_latent(TRUNCATION_MEAN).
TRUNCATION = 1
TRUNCATION_MEAN = 4_096

# --- Audio / runtime --------------------------------------------------------
SR = 44_100          # sample rate passed to soundfile when writing wav files
device_name = "cpu"  # torch device that models and tensors are moved to


In [9]:


# Build the generator and move it to the configured device.
generator = Generator(SIZE_OUTPUT, N_LATENT, N_MLP, channel_multiplier=2).to(device_name)

# NOTE: torch.load unpickles the file -- only load checkpoints from a trusted
# source. Tensors are mapped onto the configured device instead of a
# hard-coded CPU so that changing `device_name` is enough to switch devices.
checkpoint = torch.load(CHECKPOINT, map_location=torch.device(device_name))

# strict=False tolerates key mismatches between checkpoint and model. Report
# them so that a checkpoint that does not fit this architecture is noticed
# instead of silently leaving layers at their random initialisation.
load_result = generator.load_state_dict(checkpoint["g_ema"], strict=False)
if load_result.missing_keys or load_result.unexpected_keys:
    print(
        "Warning: checkpoint {} does not fully match the generator "
        "(missing keys: {}, unexpected keys: {})".format(
            CHECKPOINT, load_result.missing_keys, load_result.unexpected_keys
        )
    )

# The mean latent is only needed as truncation target when truncation is
# actually applied (TRUNCATION < 1).
if TRUNCATION < 1:
    with torch.no_grad():
        mean_latent = generator.mean_latent(TRUNCATION_MEAN)
else:
    mean_latent = None


In [10]:
def load_vocoder(device_name):
    """Load the MelGAN vocoder and the mel normalisation statistics.

    Args:
        device_name: torch device (name) that the vocoder and the statistics
            tensors are placed on.

    Returns:
        Tuple ``(vocoder, v_mean, v_std)``: the vocoder module in eval mode
        and the mean / std tensors, each reshaped to ``(1, 80, 1)``.
    """
    feat_dim = 80
    vocoder_dir = './melgan'

    def _load_stat(filename):
        # os.path.join avoids the doubled slash that string formatting
        # produced when DATAPATH already ends with a separator.
        stat = np.load(os.path.join(DATAPATH, filename))
        return torch.from_numpy(stat).float().view(1, feat_dim, 1).to(device_name)

    v_mean = _load_stat('mean.mel.npy')
    v_std = _load_stat('std.mel.npy')

    vocoder_config = read_yaml(os.path.join(vocoder_dir, 'args.yml'))
    vocoder = Generator_melgan(
        vocoder_config.n_mel_channels,
        vocoder_config.ngf,
        vocoder_config.n_residual_layers,
    ).to(device_name)
    vocoder.eval()

    # NOTE: torch.load unpickles the file -- only load trusted weights.
    # Map the weights onto the requested device rather than a fixed CPU.
    vocoder_param_fp = os.path.join(vocoder_dir, 'best_netG.pt')
    state_dict = torch.load(vocoder_param_fp, map_location=torch.device(device_name))
    vocoder.load_state_dict(state_dict, strict=False)

    return vocoder, v_mean, v_std
VOCODER, V_MEAN, V_STD = load_vocoder(device_name)

In [11]:
def vocode(sample, vocoder=VOCODER, v_mean=V_MEAN, v_std=V_STD):
    """Run the vocoder on one generator output.

    The leading axis of ``sample`` is squeezed, the result is multiplied by
    ``v_std`` and shifted by ``v_mean`` (undoing the normalisation), and the
    de-normalised tensor is passed through ``vocoder``.

    Returns:
        Whatever ``vocoder`` returns for the de-normalised input.
    """
    return vocoder(sample.squeeze(0).mul(v_std).add(v_mean))

In [12]:
# Latent batch used by the most recent generate() call; lets the next call
# sample in the neighbourhood of the previous result.
prev_sample_z = None

def generate(g_ema, device, mean_latent, ckpt_name, sample_z = None, truncation=TRUNCATION, prev_coef=0.0):
    """Generate a batch of samples and write them to /tmp as image and audio.

    Args:
        g_ema: generator module; called as
            ``g_ema([z], truncation=..., truncation_latent=...)``.
        device: torch device on which random latents are created.
        mean_latent: truncation latent forwarded to the generator (may be None).
        ckpt_name: unused; kept so existing callers keep working.
        sample_z: optional latent batch. When None, 4 latents are drawn: fresh
            noise, or -- if a previous batch exists and ``prev_coef`` is
            non-zero -- the previous batch plus noise scaled by ``prev_coef``.
        truncation: truncation value forwarded to the generator.
        prev_coef: scale of the noise added to the previous latent batch.

    Returns:
        ``(filepaths, imagepath)``: list of written wav file paths (one per
        generated sample) and the path of the PNG holding all samples.

    Side effects:
        Updates the module-level ``prev_sample_z`` and writes files to /tmp.
    """
    global prev_sample_z
    default_n_samples = 4

    with torch.no_grad():
        g_ema.eval()
        if sample_z is None:
            if prev_sample_z is None or prev_coef == 0.0:
                sample_z = torch.randn(default_n_samples, N_LATENT, device=device)
            else:
                sample_z = prev_sample_z + torch.randn(default_n_samples, N_LATENT, device=device) * prev_coef

        sample, _ = g_ema([sample_z], truncation=truncation, truncation_latent=mean_latent)

        prev_sample_z = sample_z

        imagepath = f'/tmp/img_{random.randint(0, 10000)}.png'
        # NOTE(review): newer torchvision releases renamed `range` to
        # `value_range` -- confirm against the installed version.
        utils.save_image(sample, imagepath, nrow=1, normalize=True,range=(-1, 1))

        filepaths = []
        randid = random.randint(0, 10000)
        # Iterate over what was actually generated: a caller-supplied
        # sample_z may hold fewer (or more) than the default 4 latents, and
        # indexing past the batch would raise IndexError.
        for i in range(sample.shape[0]):
            audio_output = vocode(sample[i])
            filepath = f'/tmp/gem_{randid}_{i}.wav'
            sf.write(filepath, audio_output.squeeze().detach().cpu().numpy(), SR)
            filepaths.append(filepath)
        return filepaths, imagepath

In [13]:
import math

# Four random anchor latents, one per corner of the unit square.
z_coord = torch.randn(4, N_LATENT, device=device_name)

def coord_to_z(x, y):
    """Blend the four anchor latents according to a point in the unit square.

    Anchor ``i`` sits at corner ``corners[i]`` and is weighted by
    ``1 - distance(point, corner)``; the weighted anchors are summed.

    Args:
        x, y: coordinates, each required to lie in [0, 1].

    Returns:
        The blended latent with a leading axis of size 1 added.

    Raises:
        AssertionError: if ``x`` or ``y`` is outside [0, 1].
    """
    assert 0 <= x <= 1
    assert 0 <= y <= 1

    # NOTE(review): the diagonal corner is up to sqrt(2) away, so its weight
    # can be negative and a corner point does not return its anchor exactly
    # -- confirm this is intended.
    corners = ((0, 0), (1, 0), (0, 1), (1, 1))
    blended = None
    for idx, (cx, cy) in enumerate(corners):
        weight = 1 - math.sqrt((x - cx)**2 + (y - cy)**2)
        term = weight * z_coord[idx]
        blended = term if blended is None else blended + term
    return blended.unsqueeze(0)

In [14]:
# sample_z = coord_to_z(0, 0)
# print(sample_z[:10], z_coord[0][:10])
# sample_z = coord_to_z(0, 1)
# print(sample_z[:10], z_coord[1][:10])

In [15]:

# sample_z = coord_to_z(0, 0.5)
# audio_output = generate(generator, device_name, mean_latent, CHECKPOINT, sample_z=sample_z)
# audio_output.shape

# Audio(audio_output, sr=SR)

In [16]:
# from vscode_audio import *

# Audio(audio_output, sr=SR)

# Audio(audio_output, sr=SR)

In [17]:
from pythonosc import dispatcher
from pythonosc import osc_server, udp_client
import os, random

# Client used to send the paths of generated files back to the OSC peer.
client = udp_client.SimpleUDPClient('127.0.0.1', 10018)

def generate_random(unused_addr, prev_coef=0.0):
    """OSC handler for ``/generate``: sample a new batch and report its files.

    Args:
        unused_addr: OSC address of the incoming message (not used).
        prev_coef: noise scale relative to the previous latent batch. It
            defaults to 0.0 so a ``/generate`` message without an argument
            does not raise a TypeError.

    Sends ``/generated`` with the wav paths followed by the image path.
    """
    audiopaths, imagepath = generate(generator, device_name, mean_latent, CHECKPOINT, sample_z=None, prev_coef=prev_coef) # random sample
    client.send_message("/generated", audiopaths + [imagepath])

# Use a distinct name for the instance: rebinding `dispatcher` would shadow
# the imported module and make this cell fail when it is run a second time.
osc_dispatcher = dispatcher.Dispatcher()
osc_dispatcher.map("/generate", generate_random)

server = osc_server.ThreadingOSCUDPServer(
    ('localhost', 10015), osc_dispatcher)
print("Serving on {}".format(server.server_address))
try:
    # Blocks until the kernel is interrupted.
    server.serve_forever()
finally:
    # Release the UDP port so the cell can be re-run without
    # "address already in use".
    server.server_close()




Serving on ('127.0.0.1', 10015)


