### Introduction

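This notebook runs the `timedomain` and `ulyanov` audio style-transfer models from https://github.com/pkmital/time-domain-neural-audio-style-transfer on a content/style pair of recordings, then plots and plays back the results.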

In [None]:
import numpy as np
import librosa
import warnings

from IPython.display import Audio, display
from models import timedomain, ulyanov

# Silence every Python warning for the rest of the session (presumably to keep
# the notebook output uncluttered). A single catch-all filter is enough:
# filterwarnings('ignore') and simplefilter('ignore') with default arguments
# both match any message from any module.
warnings.simplefilter('ignore')

In [None]:
# Print the devices TensorFlow reports for this process, e.g. to check
# whether a GPU is visible before running the models.
from tensorflow.python.client import device_lib

visible_devices = device_lib.list_local_devices()
print(visible_devices)

In [None]:
def plot_spec(audio):
    """Draw the dB-scaled magnitude spectrogram of ``audio``.

    The spectrogram is computed with librosa's default STFT settings and
    rendered with ``librosa.display.specshow`` on the current matplotlib
    axes.

    Args:
        audio: Audio samples accepted by ``librosa.stft``; this notebook
            passes the arrays returned by ``librosa.core.load`` and by the
            style-transfer models.

    Returns:
        None. The function is called for its plotting side effect only.
    """
    # ``import librosa`` alone does not reliably expose the ``display``
    # submodule, so load it explicitly before using it.
    import librosa.display

    # The STFT is complex-valued; take the magnitude explicitly rather than
    # relying on amplitude_to_db to discard the phase (which it does with a
    # warning).
    magnitude = np.abs(librosa.stft(audio))
    spec_db = librosa.amplitude_to_db(magnitude, ref=np.max)
    librosa.display.specshow(spec_db)

In [None]:
# Sampling rate used when loading the inputs and passed to both
# style-transfer runs.
sr = 44100

# Input pair: the content recording and the style recording.
content, style = (
    './wavs/corpus/johntejada-1.wav',
    './wavs/target/beat-box-2.wav',
)

# Alternative input pair (uncomment to use):
# content, style = "./wavs/songs/imperial.mp3", "./wavs/songs/usa.mp3"

In [None]:
# Load the style recording at the notebook's sampling rate, then show its
# spectrogram followed by an inline audio player.
style_audio, _ = librosa.load(style, sr=sr)
plot_spec(style_audio)
style_player = Audio(style_audio, rate=sr)
display(style_player)

In [None]:
# Load the content recording at the notebook's sampling rate, then show its
# spectrogram followed by an inline audio player.
content_audio, _ = librosa.load(content, sr=sr)
plot_spec(content_audio)
content_player = Audio(content_audio, rate=sr)
display(content_player)

In [None]:
# Run the time-domain model on the content/style pair. The call yields the
# sampling rate and samples of the synthesized audio; 'outs/timedomain_out.wav'
# is the requested output file. Tuning notes below are the author's
# suggested ranges.
synth_sr, synth_audio = timedomain.run(
    content,
    style,
    output_fname='outs/timedomain_out.wav',
    sr=sr,
    # Content weight, 0.0001 to 0.01: larger values let more of the original
    # "content" bleed through.
    alpha=0.0005,
    # Optimisation steps, 100 to 1000: more is better quality but slower.
    iterations=300,
    # Analysis window, 512 to sr / 2: larger is better quality but slower.
    n_fft=2048,
    # Hop, 256 to n_fft / 2: smaller gives better temporal resolution.
    hop_length=256,
    # Network depth, 1 to 3: more layers is better quality but slower.
    n_layers=1,
    # Filters per layer, 512 to 4096: more is better quality but slower.
    n_filters=4096,
    # Kernel width, 3 to 5: larger can synthesize more complex patterns.
    k_w=3,
    # Stride, 1 to 3: smaller is better quality but slower.
    stride=1,
)

In [None]:
# Show the spectrogram of the synthesized audio, then an inline player that
# uses the sampling rate returned alongside it.
plot_spec(synth_audio)
synth_player = Audio(synth_audio, rate=synth_sr)
display(synth_player)

In [None]:
# Run the `ulyanov` model on the same content/style pair. This rebinds
# synth_sr and synth_audio to its result; 'outs/uly_out.wav' is the
# requested output file.
synth_sr, synth_audio = ulyanov.run(
    content,
    style,
    output_fname='outs/uly_out.wav',
    sr=sr,
    alpha=0.001,
    iterations=128,
    phase_iterations=256,
)

In [None]:
# Show the spectrogram of the most recently synthesized audio, then an inline
# player that uses the sampling rate returned alongside it.
plot_spec(synth_audio)
synth_player = Audio(synth_audio, rate=synth_sr)
display(synth_player)