In [1]:
import argparse
import torch
from torchvision import utils
from model_drum import Generator
from tqdm import tqdm
import sys
sys.path.append('./melgan')
from modules import Generator_melgan
import yaml
import os
import librosa
import soundfile as sf
import numpy as np
from utils import *


  from .autonotebook import tqdm as notebook_tqdm


In [6]:
N_LATENT = 512
N_MLP = 8
SIZE_OUTPUT = 64 # size of output image

CHECKPOINT = "./looperman_one_bar_checkpoint.pt"
DATAPATH = "./data/looperman/"

TRUNCATION = 1
TRUNCATION_MEAN = 4096

SR = 44100

device_name = "cpu"


In [8]:


generator = Generator(SIZE_OUTPUT, N_LATENT, N_MLP, channel_multiplier=2).to(device_name)
checkpoint = torch.load(CHECKPOINT, map_location=torch.device('cpu'))

generator.load_state_dict(checkpoint["g_ema"], strict=False)


if TRUNCATION < 1:
    with torch.no_grad():
        mean_latent = generator.mean_latent(TRUNCATION_MEAN)
else:
    mean_latent = None


In [9]:
def load_vocoder(device_name):
    feat_dim = 80
    mean_fp = f'{DATAPATH}/mean.mel.npy'
    std_fp = f'{DATAPATH}/std.mel.npy'
    v_mean = torch.from_numpy(np.load(mean_fp)).float().view(1, feat_dim, 1).to(device_name)
    v_std = torch.from_numpy(np.load(std_fp)).float().view(1, feat_dim, 1).to(device_name)
    vocoder_config_fp = './melgan/args.yml'
    vocoder_config = read_yaml(vocoder_config_fp)

    n_mel_channels = vocoder_config.n_mel_channels
    ngf = vocoder_config.ngf
    n_residual_layers = vocoder_config.n_residual_layers

    vocoder = Generator_melgan(n_mel_channels, ngf, n_residual_layers).to(device_name)
    vocoder.eval()

    vocoder_param_fp = os.path.join('./melgan', 'best_netG.pt')
    vocoder.load_state_dict(torch.load(vocoder_param_fp, map_location=torch.device('cpu')), strict=False)

    return vocoder, v_mean, v_std
VOCODER, V_MEAN, V_STD = load_vocoder(device_name)

In [10]:
def vocode(sample, vocoder=VOCODER, v_mean=V_MEAN, v_std=V_STD):
    de_norm = sample.squeeze(0) * v_std + v_mean
    audio_output = vocoder(de_norm)
    return audio_output

In [13]:
def generate(g_ema, device, mean_latent, ckpt_name, sample_z = None, truncation=TRUNCATION):
    epoch = ckpt_name.split('.')[0]

    # os.makedirs(f'./tmp/{epoch}', exist_ok=True)
    # os.makedirs(f'./tmp/{epoch}/mel_80_320', exist_ok=True)


    with torch.no_grad():
        g_ema.eval()
#        for i in tqdm(range(args.pics)):
        if sample_z is None:
            sample_z = torch.randn(1, N_LATENT, device=device)

        sample, _ = g_ema([sample_z], truncation=truncation, truncation_latent=mean_latent)
#        np.save(f'./tmp/{epoch}/mel_80_320/{i}.npy', sample.squeeze().data.cpu().numpy())
        # print(sample)

        # utils.save_image(
        #     sample,
        #     f"./tmp/{epoch}/{str(0).zfill(6)}.png",
        #     nrow=1,
        #     normalize=True,
        #     range=(-1, 1),
        # )
        audio_output = vocode(sample)
        return audio_output.squeeze().cpu().numpy()
            # sf.write(f'{args.store_path}/{epoch}/{i}.wav', audio_output.squeeze().detach().cpu().numpy(), sr)
            # print('generate {}th wav file'.format(i))

In [41]:
import math

z_coord = torch.randn(4, N_LATENT, device=device_name)

def coord_to_z(x, y):
    assert x >= 0 and x <= 1
    assert y >= 0 and y <= 1

    return ((1-math.sqrt(x**2 + y**2)) * z_coord[0] + (1 - math.sqrt((1-x)**2+y**2)) * z_coord[1] + \
    (1 - math.sqrt(x**2 + (1-y)**2)) * z_coord[2] + (1 - math.sqrt((1-x)**2+(1-y)**2)) * z_coord[3]) 

In [42]:
sample_z = coord_to_z(0, 0)
print(sample_z[:10], z_coord[0][:10])
sample_z = coord_to_z(0, 1)
print(sample_z[:10], z_coord[1][:10])

tensor([-1.6623, -0.7679, -1.2936, -0.4928, -2.1970, -0.5257,  0.0502, -1.1888,
         2.4618,  1.3922]) tensor([-1.1353, -1.2258, -0.5625, -0.8718, -1.6963, -0.7334,  0.4757, -0.9404,
         2.2476,  0.9442])
tensor([ 1.9554, -1.3748, -0.5211,  1.8020, -0.3428,  0.7335,  0.0226,  1.3370,
        -0.6853,  1.6763]) tensor([-0.3974,  0.9159, -1.3090, -0.2367, -0.1081,  0.2440, -0.9946, -0.0798,
         0.0587,  1.8916])


In [22]:

audio_output = generate(generator, device_name, mean_latent, CHECKPOINT)
audio_output.shape

tensor([[[[ 0.9705,  1.1472,  1.1440,  ...,  0.7225,  0.6922,  0.7820],
          [ 1.2935,  1.2985,  1.0406,  ...,  0.8812,  0.9145,  0.8953],
          [ 1.1250,  1.1472,  0.9669,  ...,  0.5601,  0.4380,  0.5690],
          ...,
          [ 1.4754,  1.8749,  2.3694,  ..., -0.3228, -0.4444,  0.5347],
          [ 1.6220,  2.0324,  2.4769,  ..., -0.5642, -0.5604,  0.5585],
          [ 1.5314,  2.0066,  2.5540,  ..., -0.8084, -0.5654,  0.5473]]]])


(81920,)

In [23]:
from vscode_audio import *

Audio(audio_output, sr=SR)

Audio(audio_output, sr=SR)