In [11]:
import argparse
import torch
from torchvision import utils
from model_drum import Generator
from tqdm import tqdm
import sys
sys.path.append('./melgan')
from modules import Generator_melgan
import yaml
import os
import librosa
import soundfile as sf
import numpy as np
from utils import *

from vscode_audio import *



In [3]:
N_LATENT = 512
N_MLP = 8
SIZE_OUTPUT = 64 # size of output image

CHECKPOINT = "./looperman_one_bar_checkpoint.pt"
DATAPATH = "./data/looperman/"

TRUNCATION = 1
TRUNCATION_MEAN = 4096

SR = 44100

device_name = "cpu"


In [4]:


generator = Generator(SIZE_OUTPUT, N_LATENT, N_MLP, channel_multiplier=2).to(device_name)
checkpoint = torch.load(CHECKPOINT, map_location=torch.device('cpu'))

generator.load_state_dict(checkpoint["g_ema"], strict=False)


if TRUNCATION < 1:
    with torch.no_grad():
        mean_latent = generator.mean_latent(TRUNCATION_MEAN)
else:
    mean_latent = None


In [5]:
def load_vocoder(device_name):
    feat_dim = 80
    mean_fp = f'{DATAPATH}/mean.mel.npy'
    std_fp = f'{DATAPATH}/std.mel.npy'
    v_mean = torch.from_numpy(np.load(mean_fp)).float().view(1, feat_dim, 1).to(device_name)
    v_std = torch.from_numpy(np.load(std_fp)).float().view(1, feat_dim, 1).to(device_name)
    vocoder_config_fp = './melgan/args.yml'
    vocoder_config = read_yaml(vocoder_config_fp)

    n_mel_channels = vocoder_config.n_mel_channels
    ngf = vocoder_config.ngf
    n_residual_layers = vocoder_config.n_residual_layers

    vocoder = Generator_melgan(n_mel_channels, ngf, n_residual_layers).to(device_name)
    vocoder.eval()

    vocoder_param_fp = os.path.join('./melgan', 'best_netG.pt')
    vocoder.load_state_dict(torch.load(vocoder_param_fp, map_location=torch.device('cpu')), strict=False)

    return vocoder, v_mean, v_std
VOCODER, V_MEAN, V_STD = load_vocoder(device_name)

In [6]:
def vocode(sample, vocoder=VOCODER, v_mean=V_MEAN, v_std=V_STD):
    de_norm = sample.squeeze(0) * v_std + v_mean
    audio_output = vocoder(de_norm)
    return audio_output

In [7]:
def generate(g_ema, device, mean_latent, ckpt_name, sample_z = None, truncation=TRUNCATION):
    epoch = ckpt_name.split('.')[0]

    # os.makedirs(f'./tmp/{epoch}', exist_ok=True)
    # os.makedirs(f'./tmp/{epoch}/mel_80_320', exist_ok=True)


    with torch.no_grad():
        g_ema.eval()
#        for i in tqdm(range(args.pics)):
        if sample_z is None:
            sample_z = torch.randn(1, N_LATENT, device=device)

        sample, _ = g_ema([sample_z], truncation=truncation, truncation_latent=mean_latent)
#        np.save(f'./tmp/{epoch}/mel_80_320/{i}.npy', sample.squeeze().data.cpu().numpy())
        # print(sample)

        # utils.save_image(
        #     sample,
        #     f"./tmp/{epoch}/{str(0).zfill(6)}.png",
        #     nrow=1,
        #     normalize=True,
        #     range=(-1, 1),
        # )
        audio_output = vocode(sample)
        return audio_output.squeeze().cpu().numpy()
            # sf.write(f'{args.store_path}/{epoch}/{i}.wav', audio_output.squeeze().detach().cpu().numpy(), sr)
            # print('generate {}th wav file'.format(i))

In [14]:
import math

z_coord = torch.randn(4, N_LATENT, device=device_name)

def coord_to_z(x, y):
    assert x >= 0 and x <= 1
    assert y >= 0 and y <= 1

    z =  ((1-math.sqrt(x**2 + y**2)) * z_coord[0] + (1 - math.sqrt((1-x)**2+y**2)) * z_coord[1] + \
    (1 - math.sqrt(x**2 + (1-y)**2)) * z_coord[2] + (1 - math.sqrt((1-x)**2+(1-y)**2)) * z_coord[3]) 
    z = torch.unsqueeze(z, 0)
    return z

In [15]:
sample_z = coord_to_z(0, 0)
print(sample_z[:10], z_coord[0][:10])
sample_z = coord_to_z(0, 1)
print(sample_z[:10], z_coord[1][:10])

tensor([[-5.4225e-01, -1.2439e+00,  4.4051e-01, -5.4255e-01,  6.4718e-01,
          6.4411e-01, -4.8992e-01, -1.7818e-01, -1.8170e+00, -4.4962e-01,
         -2.1861e+00,  9.8345e-01, -1.4309e+00, -1.5730e+00,  1.3625e+00,
          4.5143e-01,  9.4706e-03,  9.8374e-01, -3.1166e-01, -4.5704e-01,
          3.8003e-01,  7.0931e-01, -4.4143e-01, -1.5496e+00,  2.0665e+00,
         -2.4195e+00,  1.8569e-01, -4.8350e-01, -1.5183e-01,  5.3861e-01,
          6.2562e-01, -3.4609e-01,  1.5957e+00,  1.8704e+00, -1.9565e+00,
          2.4411e+00, -5.8300e-01, -1.4639e+00,  1.7961e+00, -1.5334e+00,
         -3.3384e-01,  8.8696e-01,  3.4104e+00, -1.5283e+00,  4.2649e-01,
         -5.1776e-01,  6.0669e-02,  9.7769e-02, -1.0186e+00, -8.6344e-01,
         -1.4923e+00, -6.9572e-01, -1.4673e+00, -4.5158e-01, -4.9764e-01,
         -1.0536e+00, -4.9828e-02,  7.6105e-01, -2.3314e-01,  1.0616e+00,
         -2.5484e+00, -1.8781e+00, -7.0705e-01, -6.0587e-01, -8.1355e-01,
          1.8630e+00,  5.4978e-03, -5.

In [19]:

sample_z = coord_to_z(0, 0.5)
audio_output = generate(generator, device_name, mean_latent, CHECKPOINT, sample_z=sample_z)
audio_output.shape

Audio(audio_output, sr=SR)

In [23]:
from vscode_audio import *

Audio(audio_output, sr=SR)

Audio(audio_output, sr=SR)

In [20]:
from pythonosc import dispatcher
from pythonosc import osc_server, udp_client
import os

client = udp_client.SimpleUDPClient('127.0.0.1', 10016)
vis_client = udp_client.SimpleUDPClient('10.0.1.102', 10017)

def find_loops(unused_addr, filepath, topN=1, dist_metrics='euclidean'):
    # try:
    embeds, filepaths, rhythm_midi_path = process_recorded_file(filepath, n=topN, metrics=dist_metrics)
    #print(filepaths)
    
    embeds = embeds.tolist()
    client.send_message("/embedding/location", (embeds[0], embeds[1], embeds[2], embeds[3]))
    vis_client.send_message("/embedding/location", (embeds[0], embeds[1], embeds[2], embeds[3]))

    client.send_message("/midi_encode", rhythm_midi_path)
    #except Exception as exp:
    # print("Error in /find_loops", exp)        
dispatcher = dispatcher.Dispatcher()
dispatcher.map("/find_loops", find_loops)

server = osc_server.ThreadingOSCUDPServer(
    ('localhost', 10015), dispatcher)
print("Serving on {}".format(server.server_address))
server.serve_forever()


ModuleNotFoundError: No module named 'pythonosc'