# LoopGAN — Loop generation with StyleGAN2 and MelGAN

In [1]:
import argparse
import torch
from torchvision import utils
from model_drum import Generator
import sys
sys.path.append('./melgan')
from modules import Generator_melgan
import os, random
import librosa
import soundfile as sf
import numpy as np
from utils import *
import IPython.display as ipd

### Download a pre-trained model in advance

``` 
$ gdown -O drumbeats1_230000.pt 1B3ZWTJFuZbPPH4uIIz-pCcBTLu9-w4nw
```

In [12]:

# Constants - do not change!
# These are passed to Generator(...) below and must match the pre-trained checkpoint.
N_LATENT = 512  # dimensionality of the latent vector z (Fixed)
N_MLP = 8  # third positional argument of Generator (Fixed) - presumably the mapping-network depth; confirm in model_drum
SIZE_OUTPUT = 64 # size of output image (Fixed)
SR = 44100  # frame rate (Hz) used when the generated audio is written to wav

# Number of samples (loops) in a batch
n_samples = 2  # batch size - 2 for stereo (one generated loop per output channel)

# name of pre-trained StyleGAN2 model
CHECKPOINT = "./drumbeats1_230000.pt"

# Directory holding mean.mel.npy / std.mel.npy: the mean / std of the training-data
# spectrograms. They are used to de-normalize generated spectrograms before vocoding.
DATAPATH = "./data/drumbeats_1bar/"

# file name of the pre-trained MelGAN model (looked up inside ./melgan)
MELGAN_MODEL_NAME = "best_netG.pt"

# Use "cuda" if you have GPUs on your machine
device_name = "cpu"


### Load models

In [14]:
# Load a StyleGAN2 model
def load_gan_generator(device_name):
    """Build the StyleGAN2 generator and restore its pre-trained weights.

    Args:
        device_name: device string (e.g. "cpu" or "cuda") on which the model
            is placed and onto which the checkpoint tensors are mapped.

    Returns:
        The Generator with the checkpoint's "g_ema" weights loaded.
    """
    net = Generator(SIZE_OUTPUT, N_LATENT, N_MLP, channel_multiplier=2)
    net = net.to(device_name)

    # map_location lets a checkpoint saved on GPU be restored on a CPU-only machine
    target_device = torch.device(device_name)
    state = torch.load(CHECKPOINT, map_location=target_device)

    # strict=False: keys that do not match between checkpoint and model are ignored
    net.load_state_dict(state["g_ema"], strict=False)
    return net


generator = load_gan_generator(device_name)

# Load a MelGAN vocoder model
def load_vocoder(device_name):
    """Load the MelGAN vocoder together with the mel normalization statistics.

    Args:
        device_name: device string (e.g. "cpu" or "cuda") for the model and
            the statistics tensors.

    Returns:
        A tuple (vocoder, v_mean, v_std): the vocoder in eval mode, and the
        mean / std tensors read from DATAPATH, each reshaped to
        (1, feat_dim, 1).
    """
    feat_dim = 80

    def _load_stat(path):
        # .npy file -> float tensor shaped (1, feat_dim, 1) on the target device
        values = torch.from_numpy(np.load(path)).float()
        return values.view(1, feat_dim, 1).to(device_name)

    v_mean = _load_stat(f'{DATAPATH}/mean.mel.npy')
    v_std = _load_stat(f'{DATAPATH}/std.mel.npy')

    # The network hyper-parameters come from the MelGAN training config
    config = read_yaml('./melgan/args.yml')
    vocoder = Generator_melgan(config.n_mel_channels, config.ngf, config.n_residual_layers)
    vocoder = vocoder.to(device_name)
    vocoder.eval()

    weights_fp = os.path.join('./melgan', MELGAN_MODEL_NAME)
    weights = torch.load(weights_fp, map_location=torch.device(device_name))
    # strict=False: keys that do not match between checkpoint and model are ignored
    vocoder.load_state_dict(weights, strict=False)

    return vocoder, v_mean, v_std


vocoder_model, V_MEAN, V_STD = load_vocoder(device_name)

def vocode(sample, vocoder=vocoder_model, v_mean=V_MEAN, v_std=V_STD):
    """Turn one normalized spectrogram into audio with the vocoder.

    Args:
        sample: normalized spectrogram tensor; a leading dimension of size 1
            is squeezed away before de-normalization.
        vocoder: vocoder model called on the de-normalized spectrogram.
        v_mean: mean used to undo the normalization.
        v_std: standard deviation used to undo the normalization.

    Returns:
        Whatever the vocoder returns for the de-normalized spectrogram.
    """
    mel = sample.squeeze(0)
    # undo the (x - mean) / std normalization before vocoding
    mel = mel * v_std + v_mean
    return vocoder(mel)



### Generation

To make the output easier to handle in Max/MSP, the generated loops are saved as a single wav file with `n_samples` channels (default: 2ch).

In [20]:
from pydub import AudioSegment

# main function
# g_ema: stylegan generator
# center_z: to specify the input latent z.  [n_samples, N_LATENT]=[2, 512]
# output_path (optional): to specify the file path of the generated audio file
# variation (optional): the scale of noise added to center_z (= variation within a batch) / add noise to create an interesting stereo effect
def generate(g_ema=generator, center_z=None, output_path=None, variation=0.10):
    """Generate a batch of loops and save them as one multi-channel wav file.

    A spectrogram image of the batch is also written to /tmp/img_<id>.png.

    Args:
        g_ema: StyleGAN2 generator used to synthesize the spectrograms.
        center_z: latent z to generate from, broadcastable to
            [n_samples, N_LATENT]. Anything torch.as_tensor accepts works.
            When None, one random z is drawn and shared by all channels.
        output_path: destination wav path. When None or empty, a file named
            /tmp/gem_<id>.wav is used.
        variation: scale of the Gaussian noise added independently to each
            channel's z (0 makes all channels use the same z).

    Returns:
        The path of the written wav file.
    """
    with torch.no_grad():
        g_ema.eval()

        if center_z is None:
            # random init: one z shared by every channel
            base_z = torch.randn(1, N_LATENT, device=device_name).repeat(n_samples, 1)
        else:
            # use the specified latent z; move it to the compute device so
            # that it can be combined with the noise below
            base_z = torch.as_tensor(center_z, device=device_name)
        noise = torch.randn(n_samples, N_LATENT, device=device_name) * variation
        sample_z = (base_z + noise).float()

        # Generate!!!
        sample, _ = g_ema([sample_z], truncation=1, truncation_latent=None)

        # Saving the generated spectrogram image
        randid = random.randint(0, 10000)
        imagepath = f'/tmp/img_{randid}.png'
        try:
            # current torchvision spells the normalization bounds `value_range`
            utils.save_image(sample, imagepath, nrow=1, normalize=True, value_range=(-1, 1))
        except TypeError:
            # older torchvision only knows the original `range` keyword
            utils.save_image(sample, imagepath, nrow=1, normalize=True, range=(-1, 1))

        # Saving multi channel audio file
        if not output_path:
            output_path = f'/tmp/gem_{randid}.wav'

        int16_max = np.iinfo(np.int16).max
        channels = []
        # convert each generated spectrogram into audio, one by one
        for spectrogram in sample:
            audio = vocode(spectrogram).squeeze().detach().cpu().numpy()
            # Clip to [-1, 1] first: values outside that range would wrap
            # around when cast to int16 and produce loud clicks.
            pcm = (np.clip(audio, -1.0, 1.0) * int16_max).astype("int16")
            channels.append(
                AudioSegment(pcm.tobytes(), sample_width=2,  # 16 bit
                             frame_rate=SR, channels=1)
            )

        # save as a wav file with one channel per generated loop
        multich = AudioSegment.from_mono_audiosegments(*channels)
        multich.export(output_path, format="wav")

        return output_path


### Test

Let's generate random loops with random input latent z

In [33]:
# Random generation:
# no center_z is passed, so generate() samples the latent z itself.
# variation=0.0 scales the per-channel noise to zero, so every channel is
# generated from the same z.
output_path = generate(variation=0.0)
ipd.display(ipd.Audio(output_path))

#### Stereo effect
Small Gaussian noise can be added to the input latent vector z. If you play the first and second samples in the generated batch as a stereo audio file, you'll get an interesting stereo effect. Try different values!

In [24]:
# variation=0.5: each channel's z gets its own Gaussian noise scaled by 0.5,
# so the channels of the generated file differ from each other.
output_path = generate(variation=0.5)
ipd.display(ipd.Audio(output_path))


You can specify the input latent z with the `center_z` parameter.

In [32]:
# Seed torch's RNG so that the z drawn below is the same on every run
seed = 1091
torch.manual_seed(seed)

# draw a single z and tile it, so that all channels start from the same latent
z = torch.randn(1, N_LATENT, device=device_name).repeat(n_samples, 1)

# variation=0. scales the per-channel noise to zero: the loop depends on z only
output_path = generate(center_z=z, variation=0.)
ipd.display(ipd.Audio(output_path))


# Interpolation
1. Randomly pick 4 latent z vectors.
2. Put them at the corners of the xy plane: (0, 0), (0, 1), (1, 1), (1, 0).
3. Interpolate in the xy plane.

In [42]:

# Four random latent vectors (z) that serve as the corner presets for morphing
z_presets = np.random.randn(4, N_LATENT)  # re-run this cell to randomize!


# Interpolation of z
def get_center_z(x, y):
    """Bilinearly blend the four preset latents at position (x, y).

    z_presets[0] sits at (0, 0), z_presets[1] at (1, 0), z_presets[2] at
    (0, 1) and z_presets[3] at (1, 1).

    Args:
        x: horizontal position, must be within [0, 1].
        y: vertical position, must be within [0, 1].

    Returns:
        A double-precision tensor holding the blended z repeated n_samples
        times along the first dimension.
    """
    assert 0 <= x <= 1.0
    assert 0 <= y <= 1.0

    # weight of each corner = product of the distances to the opposite edges
    w00 = (1 - x) * (1 - y)
    w10 = x * (1 - y)
    w01 = (1 - x) * y
    w11 = x * y
    blended = w00 * z_presets[0] + w10 * z_presets[1] + w01 * z_presets[2] + w11 * z_presets[3]

    return torch.tensor(blended, device=device_name).double().repeat(n_samples, 1)

# Interpolation sweep: move x from 0 to 0.5 in 6 evenly spaced steps while y
# stays at 0, i.e. morph from z_presets[0] halfway towards z_presets[1].
for x in np.linspace(0, 0.5, 6):
    y = 0.0
    print("interpolation: %.3f %.3f" % (x, y))
    z = get_center_z(x, y)
    # variation=0. -> no per-channel noise, every channel uses the interpolated z
    output_path = generate(center_z=z, variation=0.)
    ipd.display(ipd.Audio(output_path))

interpolation: 0.000 0.000


interpolation: 0.100 0.000


interpolation: 0.200 0.000


interpolation: 0.300 0.000


interpolation: 0.400 0.000


interpolation: 0.500 0.000
