In [1]:
import tensorflow as tf
import tensorflow_io as tfio
#import tensorflow_hub as hub
import numpy as np
import csv
import io
import librosa
import matplotlib.pyplot as plt
import IPython.display as display
from IPython.display import Audio
import scipy.io
import PIL.Image
import vggish_keras as vgk

In [2]:
# Modified from tutorial: https://www.tensorflow.org/tutorials/generative/style_transfer

In [3]:
# https://stackoverflow.com/questions/56719138/how-can-i-save-a-librosa-spectrogram-plot-as-a-specific-sized-image
def scale_minmax(X, min=0.0, max=1.0):
    X_std = (X - X.min()) / (X.max() - X.min())
    X_scaled = X_std * (max - min) + min
    return X_scaled

def preprocess(mels):
    img = scale_minmax(mels, 0, 255).astype(np.uint8)
    img = np.flip(img, axis=0) # put low frequencies at the bottom in image
    img = 255-img # invert. make black==more energy
    return img

In [4]:
def preprocess_spectrogram(img_arr):
    max_dim = 512
    img = img_arr
    
    # convert into 3D
    img = img.reshape((img.shape[0], img.shape[1], 1))
    img = np.tile(img, 3)
    
#     print('Image Shape', img.shape)
    img = tf.image.convert_image_dtype(img, tf.float32)
#     print('Image shape2', img.shape)
    shape = tf.cast(tf.shape(img)[:-1], tf.float32)
#     print('Shape', shape)
    long_dim = max(shape)
#     print('Longest Dim', long_dim)
    
    scale = max_dim / long_dim
    new_shape = tf.cast(shape * scale, tf.int32)
#     print('New Shape', new_shape)
    img = tf.image.resize(img, new_shape)
    img = img[tf.newaxis, :]
#     print('Final:', img.shape)
    return img

In [5]:
def load_img(path_to_img):
    """Preprocessing for VGG"""
    max_dim = 512
    img = tf.io.read_file(path_to_img)
    img = tf.image.decode_image(img, channels=3)
#     print(img.shape)
    img = tf.image.convert_image_dtype(img, tf.float32)
    shape = tf.cast(tf.shape(img)[:-1], tf.float32)
#     print(shape)
    long_dim = max(shape)
#     print(long_dim)
    scale = max_dim / long_dim
    new_shape = tf.cast(shape * scale, tf.int32)
    img = tf.image.resize(img, new_shape)
#     print(shape)
    img = img[tf.newaxis, :]
    return img

def imshow(image, title=None):
    if len(image.shape) > 3:
        image = tf.squeeze(image, axis=0)
        plt.imshow(image)
    if title:
        plt.title(title)

In [6]:
import pandas as pd
df_test_metadata = pd.read_csv('test_vox1_metadata.csv')

In [7]:
df_test_metadata

Unnamed: 0.1,Unnamed: 0,VoxCeleb1 ID,VGGFace1 ID,Gender,Nationality,Set,full_paths
0,269,id10270,Eartha_Kitt,f,USA,test,./data/spmel_test/id10270/x6uYqmx31kE_00002.npy
1,270,id10271,Ed_Westwick,m,UK,test,./data/spmel_test/id10271/zWyD72sHVwA_00001.npy
2,271,id10272,Eddie_Griffin,m,USA,test,./data/spmel_test/id10272/wb6ligRbbZ4_00001.npy
3,272,id10273,Eddie_Izzard,m,UK,test,./data/spmel_test/id10273/xN2fSTQHHkw_00002.npy
4,273,id10274,Eddie_Kaye_Thomas,m,USA,test,./data/spmel_test/id10274/xbGp2KSUr0E_00001.npy
5,274,id10275,Eddie_McClintock,m,USA,test,./data/spmel_test/id10275/mWZ3l5OoyVI_00002.npy
6,275,id10276,Edgar_Wright,m,UK,test,./data/spmel_test/id10276/3vWez3baO60_00002.npy
7,276,id10277,Eduardo_Noriega,m,Spain,test,./data/spmel_test/id10277/cELY3LdIo-0_00003.npy
8,277,id10278,Edward_Asner,m,USA,test,./data/spmel_test/id10278/y990f6UiEEM_00001.npy
9,278,id10279,Efren_Ramirez,m,USA,test,./data/spmel_test/id10279/4Q1IvdayPR8_00014.npy


In [8]:
import pandas as pd
df_test_metadata = pd.read_csv('test_vox1_metadata.csv')
images = {}
for row in df_test_metadata.iterrows():
    row_item = row[1]
    user_id = row_item['VoxCeleb1 ID']
    images[user_id] = row_item['full_paths'].replace('spmel_test','wavtest').replace('_0','/0')[:-4]+'.wav'
#     fig, ax = plt.subplots(2,1)
#     fig.suptitle(f'Image:{user_id}')
#     ax[0].imshow(np.load(row_item['full_paths']).T)
#     ax[0].set_title('Original')
    
#     ax[1].imshow(images[user_id][0])
#     ax[1].set_title('Preprocessed')

In [9]:
import itertools
image_combinations = list(map(dict, itertools.combinations(
    images.items(), 2)))

In [10]:
image_combinations

[{'id10270': './data/wavtest/id10270/x6uYqmx31kE/00002.wav',
  'id10271': './data/wavtest/id10271/zWyD72sHVwA/00001.wav'},
 {'id10270': './data/wavtest/id10270/x6uYqmx31kE/00002.wav',
  'id10272': './data/wavtest/id10272/wb6ligRbbZ4/00001.wav'},
 {'id10270': './data/wavtest/id10270/x6uYqmx31kE/00002.wav',
  'id10273': './data/wavtest/id10273/xN2fSTQHHkw/00002.wav'},
 {'id10270': './data/wavtest/id10270/x6uYqmx31kE/00002.wav',
  'id10274': './data/wavtest/id10274/xbGp2KSUr0E/00001.wav'},
 {'id10270': './data/wavtest/id10270/x6uYqmx31kE/00002.wav',
  'id10275': './data/wavtest/id10275/mWZ3l5OoyVI/00002.wav'},
 {'id10270': './data/wavtest/id10270/x6uYqmx31kE/00002.wav',
  'id10276': './data/wavtest/id10276/3vWez3baO60/00002.wav'},
 {'id10270': './data/wavtest/id10270/x6uYqmx31kE/00002.wav',
  'id10277': './data/wavtest/id10277/cELY3LdIo-0/00003.wav'},
 {'id10270': './data/wavtest/id10270/x6uYqmx31kE/00002.wav',
  'id10278': './data/wavtest/id10278/y990f6UiEEM/00001.wav'},
 {'id10270': './

In [11]:
content_layers = ['conv4/conv4_2'] 

style_layers = ['conv1',
                'conv2',
                'conv3/conv3_1', 
                'conv3/conv3_2', 
                'conv4/conv4_1']

In [12]:
num_content_layers = len(content_layers)
num_style_layers = len(style_layers)

In [58]:
def vgg_layers(layer_names):
    """ Creates a vgg model that returns a list of intermediate output values."""
    # Load our model. Load pretrained VGG, trained on imagenet data
    vgg = vgk.VGGish(include_top=False, weights='audioset')
    #get activations from layers we're interested in
    outputs = [vgg.get_layer(name).output for name in layer_names]
    model = tf.keras.Model([vgg.input], outputs)
    return model

In [59]:
def gram_matrix(input_tensor):
    result = tf.linalg.einsum('bijc,bijd->bcd', input_tensor, input_tensor)
    input_shape = tf.shape(input_tensor)
    num_locations = tf.cast(input_shape[1]*input_shape[2], tf.float32)
    return result/(num_locations)

In [60]:
class StyleContentModel(tf.keras.models.Model):
    def __init__(self, style_layers, content_layers):
        super(StyleContentModel, self).__init__()
        self.vgg = vgg_layers(style_layers+content_layers)
        self.style_layers = style_layers
        self.content_layers = content_layers
        self.num_style_layers = len(style_layers)
        self.vgg.trainable = False 
        
    def call(self, inputs):
        """Expects float input in [0,1]"""
        outputs = self.vgg(inputs)
        style_outputs, content_outputs = (outputs[:self.num_style_layers], 
                                      outputs[self.num_style_layers:])
        style_outputs = [gram_matrix(style_output) for style_output in style_outputs]
        content_dict = {content_name:value for content_name,value in zip(self.content_layers, content_outputs)}
        style_dict = {
            style_name:value 
            for style_name, value
            in zip(self.style_layers, style_outputs)
        }
        return {'content': content_dict, 'style': style_dict}

In [61]:
def tensor_to_image(tensor):
    tensor = tensor*255
    tensor = np.array(tensor, dtype=np.uint8)
    if np.ndim(tensor)>3:
        assert tensor.shape[0] == 1
        tensor = tensor[0]
    return PIL.Image.fromarray(tensor)

In [62]:
def tensor_to_audio(tensor):
    tensor = tensor*255
    tensor = np.array(tensor, dtype=np.uint8)
    if np.ndim(tensor)>3:
        assert tensor.shape[0] == 1
        tensor = tensor[0]
    tensor = tf.reduce_mean(tensor,axis=-1)
    audio_arr = librosa.feature.inverse.mel_to_audio(tensor.numpy().astype(np.float64), sr=16000, n_fft=1024, hop_length=256, n_iter=64)
    return audio_arr

In [63]:
def get_audio_data(pair):

    def clip_0_1(image):
        return tf.clip_by_value(image, clip_value_min=0.0, clip_value_max=1.0)

    def style_content_loss(outputs):
        style_outputs = outputs['style']
        print(style_outputs)
        content_outputs = outputs['content']
        print(content_outputs)
        style_loss = 0 
        
        for name in style_outputs.keys():
            print(tf.reduce_mean((style_outputs[name]-style_targets[name])**2))
                  
        style_loss = tf.add_n([tf.reduce_mean((style_outputs[name]-style_targets[name])**2) for name in style_outputs.keys()])
        
        
        print('Style Loss', style_loss)
        content_loss = tf.add_n([tf.reduce_mean((content_outputs[name]-content_targets[name])**2) 
                                 for name in content_outputs.keys()])
        print('Content Loss', content_loss)
        content_loss *= content_weight / num_content_layers
        loss = style_loss + content_loss
        return loss

    @tf.function()
    def train_step(image):
        with tf.GradientTape() as tape:
            outputs = extractor(image)
            loss = style_content_loss(outputs)
        grad = tape.gradient(loss, image)
        opt.apply_gradients([(grad, image)])
        image.assign(clip_0_1(image))
    
    
    
    opt = tf.optimizers.Adam(learning_rate=0.02, beta_1=0.99, epsilon=1e-1)
    image = None
    style_weight=1e-2
    content_weight=1e4
    
    content_image_id, style_image_id = pair.keys()
    content_image = vgk.get_features(pair[content_image_id])
    style_image =  vgk.get_features(pair[style_image_id])
    extractor = StyleContentModel(style_layers, content_layers)
    style_targets = extractor(style_image)['style']
    content_targets = extractor(content_image)['content']
    
    if image is None:
        image = tf.Variable(content_image)
    else:
        image.assign(content_image)
    audio_arr = None
    spec_img = None
    for n in range(epochs):
        for m in range(steps_per_epoch):
            train_step(image)
            print(".", end='')
        
        spec_img = image[:,:,0]
        audio_arr = tensor_to_audio(image)
        print("Train step: {}".format(m))
    filename = f'{content_image_id}x{style_image_id}_vggish'
    print(filename)
    if audio_arr is not None:
        scipy.io.wavfile.write(filename+'.wav', 16000, audio_arr)
        plt.imsave(filename+'.png', np.array(spec_img)[0])

In [57]:
extractor = StyleContentModel(style_layers, content_layers)

In [66]:
epochs = 1 # 10
steps_per_epoch = 10 #100

for pair in image_combinations:
    print(pair.values())
    get_audio_data(pair)
#     content_image_id, style_image_id = pair.keys()
#     content_image = vgk.get_features(pair[content_image_id])
#     print(content_image.shape)
    
#     style_image =  vgk.get_features(pair[style_image_id])
#     print(style_image.shape)
    
#     style_targets = extractor(style_image)['style']
#     print(style_targets.keys())
#     content_targets = extractor(content_image)['content']
#     print(content_targets.keys())

dict_values(['./data/wavtest/id10270/x6uYqmx31kE/00002.wav', './data/wavtest/id10271/zWyD72sHVwA/00001.wav'])
(1, 96, 64, 1)
(3, 96, 64, 1)
{'conv1': <tf.Tensor 'style_content_model_32/truediv:0' shape=(1, 64, 64) dtype=float32>, 'conv2': <tf.Tensor 'style_content_model_32/truediv_1:0' shape=(1, 128, 128) dtype=float32>, 'conv3/conv3_1': <tf.Tensor 'style_content_model_32/truediv_2:0' shape=(1, 256, 256) dtype=float32>, 'conv3/conv3_2': <tf.Tensor 'style_content_model_32/truediv_3:0' shape=(1, 256, 256) dtype=float32>, 'conv4/conv4_1': <tf.Tensor 'style_content_model_32/truediv_4:0' shape=(1, 512, 512) dtype=float32>}
{'conv4/conv4_2': <tf.Tensor 'style_content_model_32/functional_65/conv4/conv4_2/Relu:0' shape=(1, 12, 8, 512) dtype=float32>}
Tensor("Mean:0", shape=(), dtype=float32)
Tensor("Mean_1:0", shape=(), dtype=float32)
Tensor("Mean_2:0", shape=(), dtype=float32)
Tensor("Mean_3:0", shape=(), dtype=float32)
Tensor("Mean_4:0", shape=(), dtype=float32)
Style Loss Tensor("AddN:0", s

KeyboardInterrupt: 