<a href="https://colab.research.google.com/github/mahiidharv/GeneticAlgorithmTSP/blob/master/Copy_of_Style_Transfer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Mount Google Drive under /content/drive (Colab only); the audio paths used
# further down in this notebook live below that mount point.
from google.colab import drive
drive.mount('/content/drive')

#### Loading Libraries

In [0]:
import os
import librosa
from IPython.display import Audio,display
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
%matplotlib inline

In [0]:
# NOTE(review): tensorflow is already imported in the imports cell, and this
# session is never used afterwards -- the later cells open their own
# `tf.Session()` inside `with` blocks and rebind `sess`. Consider removing
# this cell so the session is not left open for the lifetime of the kernel.
import tensorflow as tf
sess = tf.Session()


In [0]:
# List the entries of the current working directory.
os.listdir()

In [0]:
# Shell out to list the Drive folder that holds the input audio files.
!ls ./drive/My\ Drive/Gem/MidiBeethovenandMozart

#### Loading Data from Google Drive

In [0]:
# Input audio on the mounted Drive: CONTENT_FILENAME is the clip whose features
# feed the content loss, STYLE_FILENAME the clip whose Gram matrix feeds the
# style loss.
CONTENT_FILENAME = "/content/drive/My Drive/Gem/MidiBeethovenandMozart/beethovenminuetinG.wav"
STYLE_FILENAME = "/content/drive/My Drive/Gem/MidiBeethovenandMozart/Mozartminuetk2.wav" 


In [0]:
#display(Audio(CONTENT_FILENAME))
#display(Audio(STYLE_FILENAME))

In [0]:
N_FFT = 2048  # FFT window size handed to librosa.stft


def read_audio_spectum(filename, n_frames=500):
    """Load an audio file and return its log-magnitude spectrogram.

    Args:
        filename: Path of the audio file, passed to ``librosa.load``.
        n_frames: Maximum number of leading STFT frames to keep. Defaults to
            500, the value that was previously hard-coded.

    Returns:
        A ``(S, fs)`` tuple: ``S`` is ``log1p(|STFT|)`` cropped to at most
        ``n_frames`` columns, and ``fs`` is the sample rate returned by
        ``librosa.load``.
    """
    x, fs = librosa.load(filename)
    # Pass n_fft by keyword: recent librosa releases accept only the signal
    # positionally, so the old `librosa.stft(x, N_FFT)` call raises TypeError.
    S = librosa.stft(x, n_fft=N_FFT)
    print(S.shape)  # shape of the full spectrogram, before cropping

    # Phase is discarded here; only the log-compressed magnitude is returned.
    S = np.log1p(np.abs(S[:, :n_frames]))
    return S, fs

In [0]:
# Load both clips as log-magnitude spectrograms of shape (channels, frames).
a_content, fs = read_audio_spectum(CONTENT_FILENAME)
a_style, fs = read_audio_spectum(STYLE_FILENAME)

# Crop BOTH spectrograms to the frame count they have in common. Cropping only
# the style array (as before) leaves the two with different widths whenever the
# style clip is the shorter one, and the fixed-size graph input that is built
# from N_SAMPLES then rejects the style array.
N_CHANNELS = a_content.shape[0]
N_SAMPLES = min(a_content.shape[1], a_style.shape[1])
a_content = a_content[:, :N_SAMPLES]
a_style = a_style[:N_CHANNELS, :N_SAMPLES]
assert a_content.shape == a_style.shape, "content/style spectrogram shapes differ"

print(N_SAMPLES)
print(N_CHANNELS)
print(a_content[:5,:5])
print(a_style[:5,:5])
print(a_style.shape)

In [0]:
# Dump every frequency bin of the second frame (column index 1) of the style spectrogram.
print(a_style[:, 1])

In [0]:
# Switch matplotlib to its 'classic' style sheet for the figures below.
plt.style.use('classic')

In [0]:
# Load the librosa.display submodule without binding a bare `display` name:
# `from librosa import display` would overwrite the `display` function imported
# from IPython.display at the top of the notebook. All plotting code below
# reaches the submodule as `librosa.display.specshow`.
import librosa.display

#### Plotting the Spectrograms

In [0]:
# Draw the content and style spectrograms next to each other, one subplot per
# input, each with its own colorbar.
plt.figure(figsize=(10, 5))
for panel, (title, spec) in enumerate([('Content', a_content), ('Style', a_style)], start=1):
    plt.subplot(1, 2, panel)
    plt.title(title)
    librosa.display.specshow(spec, y_axis='log', x_axis='time')
    plt.colorbar(format='%+2.0f dB')
plt.show()

#### Adding the Filter Initialization

In [0]:
# Number of output filters of the random convolution kernel.
N_FILTERS = 4096

# Transpose each (channels, frames) spectrogram and prepend two singleton axes,
# giving a contiguous (1, 1, frames, channels) array for the conv2d input.
a_content_tf = np.ascontiguousarray(a_content.T[np.newaxis, np.newaxis])
a_style_tf = np.ascontiguousarray(a_style.T[np.newaxis, np.newaxis])

In [0]:
from sys import stderr

In [0]:
# Random, untrained convolution kernel used as the feature extractor.
KERNEL_WIDTH = 11  # number of consecutive frames covered by each filter
KERNEL_SEED = 0    # fixed seed so the random features are the same on every run

# Standard deviation shrinks with (input channels + filters) * kernel width.
std = np.sqrt(2) * np.sqrt(2.0 / ((N_CHANNELS + N_FILTERS) * KERNEL_WIDTH))
# A private RandomState keeps the seeding local and leaves NumPy's global
# generator untouched.
kernel = np.random.RandomState(KERNEL_SEED).randn(1, KERNEL_WIDTH, N_CHANNELS, N_FILTERS) * std

# The placeholder `x` and constant `kernel_tf` that used to be created here
# were dropped: they were added to the default graph but never evaluated, and
# both names are rebuilt inside the dedicated graphs of the following cells.
# The constant alone held a float32 copy of the whole kernel.

#### Creating the Computational Graph

In [0]:
# Run both spectrograms through the random conv layer once to obtain the fixed
# targets for the optimization: `content_features` and `style_gram`.
g = tf.Graph()
# allow_soft_placement lets TensorFlow fall back to another device when the
# pinned '/gpu:0' is not available (for example on a CPU-only runtime) instead
# of failing at evaluation time.
session_config = tf.ConfigProto(allow_soft_placement=True)
with g.as_default(), g.device('/gpu:0'), tf.Session(config=session_config) as sess:
    # data shape is "[batch, in_height, in_width, in_channels]",
    x = tf.placeholder('float32', [1,1,N_SAMPLES,N_CHANNELS], name="x")
    print(x)
    kernel_tf = tf.constant(kernel, name="kernel", dtype='float32')
    conv = tf.nn.conv2d(
        x,
        kernel_tf,
        strides=[1, 1, 1, 1],
        padding="VALID",
        name="conv")

    # Despite the variable name, the activation applied here is ELU.
    relu_conv = tf.nn.elu(conv)
    # A 1x1 pooling window with stride 1 leaves the activations unchanged.
    net = tf.layers.max_pooling2d(inputs=relu_conv, pool_size=[1, 1], strides=1)

    # Evaluate the same network on each input.
    content_features = net.eval(feed_dict={x: a_content_tf})
    style_features = net.eval(feed_dict={x: a_style_tf})
    print(content_features.shape)
    print(style_features.shape)

    # Gram matrix of the style activations: flatten to (positions, N_FILTERS),
    # then correlate the filters with each other, normalised by N_SAMPLES.
    features = np.reshape(style_features, (-1, N_FILTERS))
    print(features.shape)
    style_gram = np.matmul(features.T, features) / N_SAMPLES
    print(style_gram.shape)

#### Minimizing the Total Loss

In [0]:
from sys import stderr

ALPHA = 1e-2          # weight of the content term in the total loss
learning_rate = 1e-3  # NOTE(review): not read anywhere in this cell
iterations = 1000     # iteration cap handed to the L-BFGS-B optimizer

result = None
with tf.Graph().as_default():

    # The variable being optimized is the output spectrogram itself, started
    # from low-amplitude Gaussian noise.
    x = tf.Variable(np.random.randn(1,1,N_SAMPLES,N_CHANNELS).astype(np.float32)*1e-3, name="x")
    print(x.shape)

    # Same fixed random conv layer that produced the target features.
    kernel_tf = tf.constant(kernel, name="kernel", dtype='float32')
    conv = tf.nn.conv2d(
        x,
        kernel_tf,
        strides=[1, 1, 1, 1],
        padding="VALID",
        name="conv")

    relu_conv = tf.nn.elu(conv)
    net = tf.layers.max_pooling2d(inputs=relu_conv, pool_size=[1, 1], strides=1)
    # Call get_shape(): printing the bare attribute only shows a bound method.
    print(net.get_shape())

    # Content term: tf.nn.l2_loss returns sum(t ** 2) / 2, so the factor 2
    # turns it into a plain sum of squared differences.
    content_loss = ALPHA * 2 * tf.nn.l2_loss(
            net - content_features)

    _, height, width, number = [dim.value for dim in net.get_shape()]
    print("height=",height)
    print("width=",width)
    print("number=",number)

    # Style term: squared distance between the Gram matrix of the current
    # activations and the precomputed style Gram matrix.
    feats = tf.reshape(net, (-1, number))
    print(feats.shape)
    gram = tf.matmul(tf.transpose(feats), feats)  / N_SAMPLES
    style_loss = 2 * tf.nn.l2_loss(gram - style_gram)

    # Overall loss
    loss = content_loss + style_loss

    # `iterations` now actually drives the optimizer; the cap was previously a
    # second, hard-coded 1000.
    opt = tf.contrib.opt.ScipyOptimizerInterface(
          loss, method='L-BFGS-B', options={'maxiter': iterations})

    # Optimization
    with tf.Session() as sess:
        # global_variables_initializer replaces the deprecated
        # initialize_all_variables.
        sess.run(tf.global_variables_initializer())

        print('Started optimization.')
        opt.minimize(sess)

        print ('Final loss:', loss.eval())
        # Copy the optimized spectrogram out before the session closes.
        result = x.eval()

#### Inverse FFT to get back Audio

In [0]:
# Undo the log1p compression applied when the spectrograms were loaded;
# expm1 is the exact inverse of log1p and avoids the rounding of `exp(v) - 1`
# for small values.
a = np.zeros_like(a_content)
a[:N_CHANNELS,:] = np.expm1(result[0,0].T)

# Iterative phase estimation: start from uniformly random phase in [-pi, pi),
# then repeatedly synthesize audio from the fixed magnitude `a` with the
# current phase and re-analyze it to obtain the next phase estimate.
N_PHASE_ITERATIONS = 500
p = 2 * np.pi * np.random.random_sample(a.shape) - np.pi
for i in range(N_PHASE_ITERATIONS):
    S = a * np.exp(1j*p)
    x = librosa.istft(S)
    # n_fft by keyword: recent librosa releases reject it as a positional
    # argument.
    p = np.angle(librosa.stft(x, n_fft=N_FFT))

OUTPUT_FILENAME = '/content/drive/My Drive/Gem/MidiBeethovenandMozart/Style4.wav'
# NOTE(review): `librosa.output` only exists in older librosa releases --
# confirm the installed version still provides write_wav.
librosa.output.write_wav(OUTPUT_FILENAME, x, fs)

In [0]:
#print( OUTPUT_FILENAME)
#display(Audio(OUTPUT_FILENAME))

#### Plotting the Results

In [0]:
# Three-panel comparison: both inputs plus the reconstructed magnitude `a`,
# each drawn with the same spectrogram settings and its own colorbar.
plt.figure(figsize=(15,5))
for panel, (title, spec) in enumerate(
        [('Content', a_content), ('Style', a_style), ('Result', a)], start=1):
    plt.subplot(1, 3, panel)
    plt.title(title)
    librosa.display.specshow(spec, y_axis='log', x_axis='time')
    plt.colorbar(format='%+2.0f dB')
plt.show()