In [None]:
import os
import warnings

import librosa
import matplotlib.pyplot as plt
import numpy as np
import soundfile as sf
import tensorflow as tf
from IPython.display import Audio, display

%matplotlib inline

# Silence Python warnings for the rest of the notebook session.
# NOTE(review): both calls install a blanket 'ignore' filter, so one of them
# looks redundant -- confirm before removing either.
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

### Load style and content

In [None]:
# Paths of the two input tracks, relative to the notebook's working directory.
# CONTENT_FILENAME is the track whose features feed the content loss;
# STYLE_FILENAME is the track whose Gram matrix feeds the style loss.
CONTENT_FILENAME = "wavs/songs/imperial.mp3"
STYLE_FILENAME = "wavs/songs/usa.mp3"

# Alternative input pair (disabled); swap with the assignments above to use it.
#CONTENT_FILENAME = './wavs/corpus/johntejada-1.wav'
#STYLE_FILENAME = './wavs/target/beat-box-2.wav'

In [None]:
# Embed audio players for the two input files so they can be auditioned
# before processing (content first, then style).
display(Audio(CONTENT_FILENAME))
display(Audio(STYLE_FILENAME))

In [None]:
# Default FFT size (in samples) used for every STFT in this notebook.
N_FFT = 2048


def read_audio_spectum(filename, nfft=N_FFT, max_frames=430):
    """Load an audio file and return its log-magnitude spectrogram.

    The signal is read with ``librosa.load`` and transformed with
    ``librosa.stft``. Only the magnitude is kept, compressed with
    ``log1p``; the Fourier phases are discarded.

    Parameters
    ----------
    filename : str
        Path of the audio file, passed straight to ``librosa.load``.
    nfft : int
        FFT size forwarded to ``librosa.stft`` as ``n_fft``.
    max_frames : int or None
        Keep at most this many leading STFT frames (columns). Defaults to
        430, the value that used to be hard-coded; ``None`` keeps every frame.

    Returns
    -------
    (S, fs) : tuple
        ``S`` is the 2-D array ``log1p(|STFT|)`` cropped to ``max_frames``
        columns; ``fs`` is the sampling rate returned by ``librosa.load``.
    """
    x, fs = librosa.load(filename)
    spectrum = librosa.stft(x, n_fft=nfft)
    # A ``None`` stop in the slice leaves the time axis untouched.
    log_magnitude = np.log1p(np.abs(spectrum[:, :max_frames]))
    return log_magnitude, fs

In [None]:
# Log-magnitude spectrograms of both tracks. ``fs`` ends up holding the
# sampling rate returned for the style file (the second call overwrites it).
a_content, fs = read_audio_spectum(CONTENT_FILENAME, N_FFT)
a_style, fs = read_audio_spectum(STYLE_FILENAME, N_FFT)

# Common shape of the two spectrograms: axis 0 -> N_CHANNELS, axis 1 -> N_SAMPLES.
N_CHANNELS, N_SAMPLES = (
    min(style_dim, content_dim)
    for style_dim, content_dim in zip(a_style.shape, a_content.shape)
)

# Crop both arrays to that common shape so they can share one network input.
a_content, a_style = (
    spec[:N_CHANNELS, :N_SAMPLES] for spec in (a_content, a_style)
)

### Visualize spectrograms for content and style tracks

In [None]:
# Side-by-side view of the two input log-spectrograms (first 400 rows only).
plt.figure(figsize=(10, 5))
for panel, (label, spectrogram) in enumerate(
        [('Content', a_content), ('Style', a_style)], start=1):
    plt.subplot(1, 2, panel)
    plt.title(label)
    plt.imshow(spectrogram[:400, :])
plt.show()

### Compute content and style features

In [None]:
# Number of output channels of the single random convolution layer.
N_FILTERS = 4096

# Transpose each spectrogram and add two leading singleton axes so it matches
# the placeholder shape [1, 1, N_SAMPLES, N_CHANNELS] declared below.
a_content_tf = np.ascontiguousarray(a_content.T[None,None,:,:])
a_style_tf = np.ascontiguousarray(a_style.T[None,None,:,:])

# filter shape is "[filter_height, filter_width, in_channels, out_channels]"
# The weights are random (never trained); `std` scales them, and the 11 in it
# matches the filter width used in the kernel shape on the next line.
std = np.sqrt(2) * np.sqrt(2.0 / ((N_CHANNELS + N_FILTERS) * 11))
kernel = np.random.randn(1, 11, N_CHANNELS, N_FILTERS) * std

# Dedicated graph, pinned to the CPU, used only to extract the target features.
filter_g = tf.Graph()
with filter_g.as_default(), filter_g.device('/cpu:0'), tf.Session() as sess:
    # data shape is "[batch, in_height, in_width, in_channels]",
    x = tf.placeholder('float32', [1, 1, N_SAMPLES, N_CHANNELS], name="x")

    kernel_tf = tf.constant(kernel, name="kernel", dtype='float32')
    conv = tf.nn.conv2d(
        x,
        kernel_tf,
        strides=[1, 1, 1, 1],
        padding="VALID",
        name="conv")
    
    net = tf.nn.relu(conv)

    # Evaluate the same conv+ReLU network once per input spectrogram.
    content_features = net.eval(feed_dict={x: a_content_tf})
    style_features = net.eval(feed_dict={x: a_style_tf})
    
    # Flatten the style activations to (positions, N_FILTERS) and form their
    # Gram matrix, normalised by N_SAMPLES; the optimisation cell uses the same
    # normalisation for the generated signal's Gram matrix.
    features = np.reshape(style_features, (-1, N_FILTERS))
    style_gram = np.matmul(features.T, features) / N_SAMPLES

### Optimize

In [None]:
# Weight of the content term relative to the style term in the total loss.
ALPHA = 1e-2
# Iteration cap handed to the L-BFGS-B optimizer.
MAX_ITER = 500

# Filled in with the optimized spectrogram once the session below finishes.
result = None

gen_g = tf.Graph()
with gen_g.as_default(), gen_g.device('/cpu:0'):
    # The quantity being optimized: a spectrogram-shaped variable initialised
    # with small Gaussian noise.
    x = tf.Variable(np.random.randn(1, 1, N_SAMPLES, N_CHANNELS).astype(np.float32)*1e-3, name="x")

    # Same random kernel and conv+ReLU layer that produced the target features.
    kernel_tf = tf.constant(kernel, name="kernel", dtype='float32')
    conv = tf.nn.conv2d(
        x,
        kernel_tf,
        strides=[1, 1, 1, 1],
        padding="VALID",
        name="conv")

    net = tf.nn.relu(conv)

    # tf.nn.l2_loss is sum(t ** 2) / 2, so the factor 2 yields a plain sum of
    # squared differences.
    content_loss = ALPHA * 2 * tf.nn.l2_loss(net - content_features)

    # Only the channel count is needed to flatten the activations.
    channels = net.get_shape().as_list()[-1]
    feats = tf.reshape(net, (-1, channels))
    gram = tf.matmul(tf.transpose(feats), feats) / N_SAMPLES
    style_loss = 2 * tf.nn.l2_loss(gram - style_gram)

    # Overall loss
    loss = content_loss + style_loss

    opt = tf.contrib.opt.ScipyOptimizerInterface(
          loss, method='L-BFGS-B', options={'maxiter': MAX_ITER})

    # Optimization
    with tf.Session() as sess:
        # Replaces the deprecated tf.initialize_all_variables().
        sess.run(tf.global_variables_initializer())

        print('Started optimization')
        opt.minimize(sess)

        print('Final loss:', loss.eval())
        result = x.eval()

### Invert spectrogram and save the result

In [None]:
# Undo the log1p compression applied when the spectrograms were computed;
# expm1 is the exact inverse of log1p.
a = np.zeros_like(a_content)
a[:N_CHANNELS, :] = np.expm1(result[0, 0].T)

# Phase reconstruction (Griffin-Lim-style): start from random phases in
# [-pi, pi), then repeatedly resynthesize the signal and re-estimate the
# phases while holding the magnitude `a` fixed.
N_PHASE_ITER = 128
p = 2 * np.pi * np.random.random_sample(a.shape) - np.pi
for _ in range(N_PHASE_ITER):
    S = a * np.exp(1j * p)
    x = librosa.istft(S)
    p = np.angle(librosa.stft(x, n_fft=N_FFT))

# Save the result so the playback cell that reads OUTPUT_FILENAME also works
# on a fresh kernel. The name is built from the input file stems and MAX_ITER.
content_stem, style_stem = (
    os.path.splitext(os.path.basename(path))[0]
    for path in (CONTENT_FILENAME, STYLE_FILENAME)
)
OUTPUT_FILENAME = os.path.join(
    'outputs', '{}-{}-{}.wav'.format(content_stem, style_stem, MAX_ITER))
# The output directory may not exist yet.
os.makedirs(os.path.dirname(OUTPUT_FILENAME), exist_ok=True)
sf.write(OUTPUT_FILENAME, x, fs)

In [None]:
# Play the reconstructed waveform straight from memory at sampling rate `fs`.
display(Audio(x, rate=fs))

In [None]:
# Play the result back from disk. OUTPUT_FILENAME must already be defined and
# must point to an audio file written by an earlier cell.
print(OUTPUT_FILENAME)
display(Audio(OUTPUT_FILENAME))

### Visualize spectrograms

In [None]:
# Content, style and synthesized result side by side (first 400 rows only).
# `a` holds linear magnitudes while a_content / a_style are log1p magnitudes,
# so the result is log1p-compressed here to put all three panels on one scale.
plt.figure(figsize=(15, 5))
panels = [
    ('Content', a_content),
    ('Style', a_style),
    ('Result', np.log1p(a)),
]
for position, (label, spectrogram) in enumerate(panels, start=1):
    plt.subplot(1, 3, position)
    plt.title(label)
    plt.imshow(spectrogram[:400, :])
plt.show()