In [1]:
import argparse
import torch
from torchvision import utils
from model_drum import Generator
from tqdm import tqdm
import sys
sys.path.append('./melgan')
from modules import Generator_melgan
import yaml
import os
import librosa
import soundfile as sf
import numpy as np
from utils import *

from vscode_audio import *



In [2]:
N_LATENT = 512
N_MLP = 8
SIZE_OUTPUT = 64 # size of output image

CHECKPOINT = "./looperman_one_bar_checkpoint.pt"
DATAPATH = "./data/looperman/"

TRUNCATION = 1
TRUNCATION_MEAN = 4096

SR = 44100

device_name = "cpu"


In [3]:


generator = Generator(SIZE_OUTPUT, N_LATENT, N_MLP, channel_multiplier=2).to(device_name)
checkpoint = torch.load(CHECKPOINT, map_location=torch.device('cpu'))

generator.load_state_dict(checkpoint["g_ema"], strict=False)


if TRUNCATION < 1:
    with torch.no_grad():
        mean_latent = generator.mean_latent(TRUNCATION_MEAN)
else:
    mean_latent = None


In [4]:
def load_vocoder(device_name):
    feat_dim = 80
    mean_fp = f'{DATAPATH}/mean.mel.npy'
    std_fp = f'{DATAPATH}/std.mel.npy'
    v_mean = torch.from_numpy(np.load(mean_fp)).float().view(1, feat_dim, 1).to(device_name)
    v_std = torch.from_numpy(np.load(std_fp)).float().view(1, feat_dim, 1).to(device_name)
    vocoder_config_fp = './melgan/args.yml'
    vocoder_config = read_yaml(vocoder_config_fp)

    n_mel_channels = vocoder_config.n_mel_channels
    ngf = vocoder_config.ngf
    n_residual_layers = vocoder_config.n_residual_layers

    vocoder = Generator_melgan(n_mel_channels, ngf, n_residual_layers).to(device_name)
    vocoder.eval()

    vocoder_param_fp = os.path.join('./melgan', 'best_netG.pt')
    vocoder.load_state_dict(torch.load(vocoder_param_fp, map_location=torch.device('cpu')), strict=False)

    return vocoder, v_mean, v_std
VOCODER, V_MEAN, V_STD = load_vocoder(device_name)

In [5]:
def vocode(sample, vocoder=VOCODER, v_mean=V_MEAN, v_std=V_STD):
    de_norm = sample.squeeze(0) * v_std + v_mean
    audio_output = vocoder(de_norm)
    return audio_output

In [6]:
def generate(g_ema, device, mean_latent, ckpt_name, sample_z = None, truncation=TRUNCATION):
    epoch = ckpt_name.split('.')[0]

    # os.makedirs(f'./tmp/{epoch}', exist_ok=True)
    # os.makedirs(f'./tmp/{epoch}/mel_80_320', exist_ok=True)


    with torch.no_grad():
        g_ema.eval()
#        for i in tqdm(range(args.pics)):
        if sample_z is None:
            sample_z = torch.randn(1, N_LATENT, device=device)

        sample, _ = g_ema([sample_z], truncation=truncation, truncation_latent=mean_latent)
#        np.save(f'./tmp/{epoch}/mel_80_320/{i}.npy', sample.squeeze().data.cpu().numpy())
        # print(sample)

        # utils.save_image(
        #     sample,
        #     f"./tmp/{epoch}/{str(0).zfill(6)}.png",
        #     nrow=1,
        #     normalize=True,
        #     range=(-1, 1),
        # )
        audio_output = vocode(sample)
        return audio_output.squeeze().cpu().numpy()
            # sf.write(f'{args.store_path}/{epoch}/{i}.wav', audio_output.squeeze().detach().cpu().numpy(), sr)
            # print('generate {}th wav file'.format(i))

In [7]:
import math

z_coord = torch.randn(4, N_LATENT, device=device_name)

def coord_to_z(x, y):
    assert x >= 0 and x <= 1
    assert y >= 0 and y <= 1

    z =  ((1-math.sqrt(x**2 + y**2)) * z_coord[0] + (1 - math.sqrt((1-x)**2+y**2)) * z_coord[1] + \
    (1 - math.sqrt(x**2 + (1-y)**2)) * z_coord[2] + (1 - math.sqrt((1-x)**2+(1-y)**2)) * z_coord[3]) 
    z = torch.unsqueeze(z, 0)
    return z

In [8]:
sample_z = coord_to_z(0, 0)
print(sample_z[:10], z_coord[0][:10])
sample_z = coord_to_z(0, 1)
print(sample_z[:10], z_coord[1][:10])

tensor([[ 0.2094,  0.3866,  0.7886,  0.1635,  1.0082, -0.8196, -0.1038,  2.0085,
         -2.1548,  0.8463,  0.0132, -1.9810, -0.6655,  2.0665,  0.7414,  0.7535,
         -0.4752, -0.8683,  1.2735, -0.8899, -0.0126, -0.5358,  1.3031,  0.3025,
          1.2199, -1.7744,  0.0565,  1.2219, -1.0541, -0.0783,  1.2499,  0.5942,
          0.3651, -1.1126, -2.2210, -0.3115, -0.6507, -0.2760,  1.6378,  0.1825,
          0.3240, -0.1137, -0.0441,  0.4739,  0.3977, -2.0774,  0.0282, -0.3362,
          1.7041,  1.2501, -0.3809, -0.3759,  1.3588,  0.6635, -0.7365, -0.6704,
         -1.1189,  0.7476, -0.4769,  0.2899,  1.2749,  1.8551, -2.6581, -0.2213,
         -0.2358, -0.0570, -0.6630, -0.9779, -0.3815, -0.9431,  1.8882,  0.4530,
          0.8956, -0.3474,  0.4642,  0.1724,  0.0943,  0.8511, -0.0578,  1.1881,
         -0.4398,  0.8418, -0.2893,  0.8252,  0.3550, -0.0479,  1.1995, -1.7997,
         -0.3478, -1.4985, -0.3690,  1.6057,  0.1179,  1.4274,  1.1932,  0.5787,
         -3.0322,  1.4348, -

In [9]:

sample_z = coord_to_z(0, 0.5)
audio_output = generate(generator, device_name, mean_latent, CHECKPOINT, sample_z=sample_z)
audio_output.shape

Audio(audio_output, sr=SR)

In [10]:
from vscode_audio import *

Audio(audio_output, sr=SR)

Audio(audio_output, sr=SR)

In [13]:
from pythonosc import dispatcher
from pythonosc import osc_server, udp_client
import os, random

client = udp_client.SimpleUDPClient('127.0.0.1', 10018)

def generate_random(unused_addr, flag):
    # try:
    audio_output = generate(generator, device_name, mean_latent, CHECKPOINT, sample_z=None) # random sample
    
    filepath = f'/tmp/gem_{random.randint(0, 10000)}.wav'
    sf.write(filepath, audio_output, SR)

    client.send_message("/generated", filepath)
    #except Exception as exp:
    # print("Error in /find_loops", exp)        
dispatcher = dispatcher.Dispatcher()
dispatcher.map("/generate", generate_random)

server = osc_server.ThreadingOSCUDPServer(
    ('localhost', 10012), dispatcher)
print("Serving on {}".format(server.server_address))
server.serve_forever()


Serving on ('127.0.0.1', 10012)


In [12]:
!pip install python-osc

