## Colab Configuration and Imports

In [None]:
# Install/upgrade pytube; it is imported in the next cell and used later to fetch audio from a YouTube link.
!pip install --upgrade pytube

Collecting pytube
  Downloading pytube-11.0.1-py3-none-any.whl (56 kB)
[?25l[K     |█████▉                          | 10 kB 30.3 MB/s eta 0:00:01[K     |███████████▋                    | 20 kB 22.1 MB/s eta 0:00:01[K     |█████████████████▌              | 30 kB 10.9 MB/s eta 0:00:01[K     |███████████████████████▎        | 40 kB 9.0 MB/s eta 0:00:01[K     |█████████████████████████████▏  | 51 kB 5.1 MB/s eta 0:00:01[K     |████████████████████████████████| 56 kB 2.6 MB/s 
[?25hInstalling collected packages: pytube
Successfully installed pytube-11.0.1


In [None]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import tensorflow as tf

import numpy as np
import librosa
import librosa.display
import sklearn
from tqdm import tqdm
from datetime import datetime
import os
import time
import sys
import pandas as pd
import scipy as sp
from scipy.stats import truncnorm
import moviepy.editor as mpy
import random
import os
import subprocess
import pytube

# filter out warnings regarding librosa.load for mp3s
import warnings
warnings.filterwarnings('ignore', '.*PySoundFile failed. Trying audioread instead*', )


## Download the Valence-Arousal inference model


In [None]:
#@markdown Setting the **vggish** to True allows to load and use the pre-trained 
#@markdown Vggish model to perform the Valence-Arousal regression of the audiotrack.
#@markdown If **vggish** is False, then the MEL CNN Model is loaded.
vggish = False #@param ["False", "True"] {type:"raw"} 

if vggish:
  !wget --no-check-certificate -r "https://drive.google.com/uc?export=download&id=1-DRbXRCEkB8qBZRdcE5miUW32szmiJy0" -O "VGGish_VA_model.h5"
else:
  !wget --no-check-certificate -r "https://drive.google.com/uc?export=download&id=1-sd4wy_OSB7ZoqSXkg-qOSLGwEiUqymA" -O "VA_model.h5"


will be placed in the single file you specified.

--2021-09-08 12:00:08--  https://drive.google.com/uc?export=download&id=1-sd4wy_OSB7ZoqSXkg-qOSLGwEiUqymA
Resolving drive.google.com (drive.google.com)... 108.177.111.138, 108.177.111.139, 108.177.111.100, ...
Connecting to drive.google.com (drive.google.com)|108.177.111.138|:443... connected.
HTTP request sent, awaiting response... 302 Moved Temporarily
Location: https://doc-00-ak-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/s9ip8hka7qa0kgnqvtakr3pkm1sahb33/1631102400000/06980489334440521129/*/1-sd4wy_OSB7ZoqSXkg-qOSLGwEiUqymA?e=download [following]
--2021-09-08 12:00:09--  https://doc-00-ak-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/s9ip8hka7qa0kgnqvtakr3pkm1sahb33/1631102400000/06980489334440521129/*/1-sd4wy_OSB7ZoqSXkg-qOSLGwEiUqymA?e=download
Resolving doc-00-ak-docs.googleusercontent.com (doc-00-ak-docs.googleusercontent.com)... 142.250.152.132, 2607:f8b0:4001:c56::84
Con

## Load Model for the inference **of** Valence-Arousal 

In [None]:
# Pick the weights file that matches the selected architecture
# (VGGish CNN when `vggish` is set, MEL CNN otherwise) and load it with Keras.
model_path = '/content/VGGish_VA_model.h5' if vggish else '/content/VA_model.h5'
model = tf.keras.models.load_model(model_path)

## Load the audio file to be used for the video creation

Load the file from directory

In [None]:
from google.colab import files 
# Open the Colab upload dialog and use the first key of the returned mapping
# as the name of the audio file to process.
uploaded = files.upload()
uploaded_names = list(uploaded.keys())
filename = uploaded_names[0]

Or load the file using a youtube link

In [None]:
link = "https://www.youtube.com/watch?v=ho9rZjlsyYY&ab_channel=MovieMongerHZ"
filename = "/content/audiofile.mp4"
# Keep only the audio-only streams and download the second one to `filename`.
# The stream query is indexed directly: its `.all()` helper is deprecated in
# pytube and only returned the very same sequence of streams.
pytube.YouTube(link).streams.filter(only_audio=True)[1].download(filename=filename)

'/content/audiofile.mp4'

In [None]:
#@markdown Duration of the video output in seconds.
#@markdown It can be useful to generate shorter videos while you are tweaking the other visualizer parameters. 
#@markdown Once you find your preferred parameters, remove the duration argument to generate the video but for the full duration of the song (setting Duration = None).
duration =  60#@param type number

# The VGGish path loads the audio at 16 kHz; otherwise librosa's default
# sampling rate is kept by not passing `sr` at all.
load_kwargs = {'duration': duration}
if vggish:
  load_kwargs['sr'] = 16000
audio, sample_rate = librosa.load(filename, **load_kwargs)

import soundfile as sf
# Write the loaded (possibly trimmed/resampled) samples to a 24-bit PCM WAV file.
# The video export cell later opens '/content/audiofile.wav' as the soundtrack
# (presumably this same file when the working directory is /content).
sf.write('audiofile.wav', audio, sample_rate, 'PCM_24')

# Length of the loaded audio in whole seconds
audio_duration = np.floor(len(audio)/sample_rate)

## Analyse the audio file in segments and compute the Valence-Arousal values for each segment

In [None]:
# CNN input layer dimensions
spec_h = 128
spec_w = 216

# Scale the waveform. The scikit-learn scalers work on 2-D input, so the samples
# are reshaped into a single column for the fit and flattened again afterwards.
if vggish:
  # VGGish input is scaled to [-1.0, 1.0] as per the embedding input requirement
  scaler = sklearn.preprocessing.MinMaxScaler(feature_range=(-1.0, 1.0))
else:
  # The MEL CNN input is standardised
  scaler = sklearn.preprocessing.StandardScaler()
audio = scaler.fit_transform(np.expand_dims(audio, axis=1)).flatten()

waveforms = [] #VGGish INPUT 
specs = [] #CNN MEL INPUT

# The audio is cut into consecutive 5-second segments; each one yields a
# mel spectrogram in dB units (and, for VGGish, a zero-padded waveform).
step = 5 
segment_samples = step * sample_rate
whole_seconds = int(np.floor(len(audio)/sample_rate))
for start_second in tqdm(range(0, whole_seconds, step)):
  first_sample = start_second * sample_rate
  signal = audio[first_sample : first_sample + segment_samples]

  if vggish:
    # Zero-pad a shorter trailing segment so every waveform has the same length
    waveforms.append(np.pad(signal, (0, segment_samples - signal.shape[0])))

  # Mel spectrogram (mel scale magnitude) converted to decibels relative to its peak
  mel_power = librosa.feature.melspectrogram(y=signal, sr=sample_rate, n_mels=128, hop_length=int(2048/4))
  spec = librosa.power_to_db(mel_power, ref=np.max)

  # Add a channel axis and resize to the CNN input dimensions
  spec = np.expand_dims(spec, axis=-1)
  specs.append(tf.image.resize(spec,[spec_h, spec_w]))

  sys.stderr.flush()

# Collect the per-segment inputs into numpy arrays
specs = np.asarray(specs, dtype=np.float16)
waveforms = np.asarray(waveforms)

100%|██████████| 12/12 [00:00<00:00, 50.61it/s]


## Predict the Valence-Arousal values for each segment

In [None]:
if vggish:
  # The VGGish regression model consumes embeddings, so the pretrained
  # VGGish embedding model is fetched from TF Hub first.
  import tensorflow_hub as hub
  vggish_embeddings_model = hub.load('https://tfhub.dev/google/vggish/1')

  # One embedding per 5-second waveform, with a trailing axis added.
  vggish_features = np.array([
      np.expand_dims(vggish_embeddings_model(waveform), axis=-1)
      for waveform in waveforms
  ])

  # Predict the V-A values using the VGGish trained model
  preds = model.predict(vggish_features)
else:
  # The MEL CNN predicts directly from the resized spectrograms
  preds = model.predict(specs)

Write the Valence-Arousal values in a .json file.

In [None]:
!mkdir to_processing

mkdir: cannot create directory ‘to_processing’: File exists


In [None]:
import json

# Store the per-segment predictions as nested lists so they are JSON-serialisable.
predictions_path = 'to_processing/predictions.json'
with open(predictions_path, 'w') as predictions_file:
    json.dump(preds.tolist(), predictions_file)

## Load the ImageNet classes Valence-Arousal mapping

In [None]:
!wget --no-check-certificate -r "https://drive.google.com/uc?export=download&id=1CMnriEqxTi_HQXqbYkmbBOsOadIHthP3" -O "VA_ImageNet.csv.zip"
!unzip "/content/VA_ImageNet.csv.zip"
brm_df = pd.read_csv('/content/imagenet-classes-v-a_scaled.csv')

will be placed in the single file you specified.

--2021-09-08 12:00:20--  https://drive.google.com/uc?export=download&id=1CMnriEqxTi_HQXqbYkmbBOsOadIHthP3
Resolving drive.google.com (drive.google.com)... 74.125.201.101, 74.125.201.138, 74.125.201.139, ...
Connecting to drive.google.com (drive.google.com)|74.125.201.101|:443... connected.
HTTP request sent, awaiting response... 302 Moved Temporarily
Location: https://doc-0c-9c-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/6jlqh22qjvbv44teml4qcna6v7h6rgvi/1631102400000/17562262531043783088/*/1CMnriEqxTi_HQXqbYkmbBOsOadIHthP3?e=download [following]
--2021-09-08 12:00:21--  https://doc-0c-9c-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/6jlqh22qjvbv44teml4qcna6v7h6rgvi/1631102400000/17562262531043783088/*/1CMnriEqxTi_HQXqbYkmbBOsOadIHthP3?e=download
Resolving doc-0c-9c-docs.googleusercontent.com (doc-0c-9c-docs.googleusercontent.com)... 142.250.152.132, 2607:f8b0:4001:c56::84
Connect

Search for the classes that correspond to the Valence Arousal of each segment spectrogram

In [None]:
# Drop the identifier/label columns so that only the remaining columns
# (presumably the valence/arousal coordinates - TODO confirm against the CSV)
# are used as points for the nearest-neighbour search.
non_coordinate_columns = ['Id','Class', 'WordBRM']
brm_df_drop = brm_df.drop(columns=non_coordinate_columns)
brm_arr = brm_df_drop.to_numpy()


def find_classes(valence, arousal, length, np_dataset, df_dataset):
    """Return the rows of ``df_dataset`` nearest to a (valence, arousal) point.

    Params:
        valence: valence coordinate of the query point.
        arousal: arousal coordinate of the query point.
        length: number of nearest neighbours to return.
        np_dataset: 2-D array of points to search; row ``i`` must correspond
            to row ``i`` of ``df_dataset``.
        df_dataset: DataFrame whose rows are returned for the matches.
    Output:
        2-D array of shape (length, number of columns of df_dataset), ordered
        from the closest match to the farthest.
    """
    # Imported explicitly: `import scipy as sp` on its own does not guarantee
    # that the `scipy.spatial` submodule is loaded on older SciPy releases.
    from scipy.spatial import KDTree

    tree = KDTree(np_dataset)
    _, brm_indices = tree.query([valence, arousal], length)
    # With length == 1 the query yields a scalar index; promoting it to 1-D
    # keeps the result two-dimensional so callers can always slice `[:, 0]`.
    return df_dataset.values[np.atleast_1d(brm_indices)]

## Configure the Visualizer

In [None]:
#@markdown Select the **resolution** of the images to be generated
# Interpolated into the TF Hub BigGAN module URL when the generator is loaded.
resolution = 512 #@param [128, 256, 512]

#@markdown The **frame length** controls the number of audio frames per video frame in the output.
#@markdown If you want a higher frame rate for visualizing very rapid music, lower the frame_length   
#@markdown (Range: Multiples of 2^6).
# Passed as `hop_length` to the mel spectrogram and chromagram further down,
# i.e. one video frame is produced per `frame_length` audio samples.
frame_length = 512 #@param {type:"slider", min:64, max:1024, step:64}


batch_size = 15 # Set batch size
# BigGAN generates the images in batches of size [batch_size]. 
# The only reason to reduce batch size from the default of 30 is if you run out of memory on a GPU. 
# Reducing the batch size will slightly increase overall runtime.


# Number of batches to generate. One video frame is produced per `frame_length`
# samples of the loaded audio, so the frame count must be derived from the
# rate the audio was actually loaded at (`sample_rate`, 16 kHz for VGGish)
# instead of a hard-coded 22050 Hz.
if duration:
    seconds = duration
    frame_lim = int(np.floor(seconds*sample_rate/frame_length/batch_size))
else:
    # No duration limit: cover every sample that was loaded.
    frame_lim = int(np.floor(len(audio)/frame_length/batch_size))


#@markdown The **truncation** trick provides a trade-off between image quality or fidelity and image variety.
#@markdown A more narrow sampling range results in better quality, whereas a larger sampling range results in more variety in sampled images
#@markdown (scalar truncation value in [0.0, 1.0]).

#@markdown The truncation trick involves using a different distribution for the generator’s 
#@markdown latent space during training than during inference or image synthesis.
#@markdown A Gaussian distribution is used during training, and a truncated Gaussian is 
#@markdown used during inference. This is referred to as the “truncation trick.”
#@markdown We call this the Truncation Trick: truncating a z vector by resampling the values
#@markdown with magnitude above a chosen threshold leads to improvement in individual sample
#@markdown quality at the cost of reduction in overall sample variety.

#@markdown The truncation controls the variability of images that BigGAN generates by
#@markdown limiting the max values in the noise vector. 
#@markdown Truncations closer to 1 yield more variable images, and truncations closer to 
#@markdown 0 yield simpler images with more recognizable, normal-looking objects.
truncation = 1  #@param {type:"slider", min:0.0, max:1.0, step:0.1}

#@markdown The **pitch sensitivity** controls how rapidly the class vector (thematic content of the video)
#@markdown will react to changes in pitch. The higher the number, the higher the sensitivity.
#@markdown Recommended Range: [200, 295]
pitch_sensitivity = 220 #@param {type:"slider", min:1, max:299, step:1}

# Invert the slider value (a higher sensitivity gives a smaller number, which is
# used as a divisor when the class vectors are updated) and rescale it relative
# to a reference frame length of 512.
pitch_sensitivity = (300 - pitch_sensitivity) * 512 / frame_length

#@markdown The **tempo sensitivity** controls how rapidly the noise vector
#@markdown will react to changes in audio. The higher the number, the higher the sensitivity.
#@markdown (Range: [0.05, 0.8]).
tempo_sensitivity = 0.25 # @param {type:"slider", min:0.05, max:0.8, step:0.05}

# Scale the per-frame noise step with the frame length (relative to 512).
tempo_sensitivity = tempo_sensitivity * frame_length / 512

#@markdown The **depth** specifies the max value of the class vector units. Numbers closer to 1 
#@markdown seem to yield more thematically rich content. Numbers closer to 0 seem to yield
#@markdown more 'deep' structures like human and dog faces. However, this depends heavily
#@markdown on the specific classes you are using (Range: [0.01, 1.0])
depth = 1 #@param {type:"slider", min:0.01, max:1.0, step:0.01}

#@markdown Set **number of classes**, Default: Twelve random indices between 0-999.
#@markdown Default is twelve, corresponding to the twelve musical pitches (A, A#, B, etc.)). 
num_classes = 12 # @param {type:"slider", min:1, max:12, step:1}

#@markdown The **jitter** prevents the same exact noise vectors from cycling repetitively during repetitive music
#@markdown so that the video output is more interesting. If you do want to cycle repetitively, set jitter to 0.
#@markdown (Range: [0.0, 1.0])
jitter = 0.8 # @param {type:"slider", min:0.0, max:1.0, step:0.1} 

use_previous_vectors = 0 # Set use_previous_vectors: 1 reloads the class/noise vectors saved by a previous run
use_previous_classes = 0 # Set use_previous_classes (not read anywhere in the visible cells)
    
outname = 'convolvulaceae-visio.mp4' # Set output name

#@markdown After the class vectors have been generated, they are smoothed by interpolating linearly between
#@markdown the means of class vectors in bins of size [smooth_factor]. This is performed because small local 
#@markdown fluctuations in pitch can cause the video frames to fluctuate back and forth. 
#@markdown If you want to visualize very fast music with rapid changes in pitch, you can lower the smooth factor. 
#@markdown You may also want to lower the frame_length in that case. However, for most songs, it is difficult to avoid 
#@markdown rapid fluctuations with smooth factors less than 10.
#@markdown Set smooth factor in Range [1 – 30]
smooth_factor = 1 #@param {type:"slider", min:1, max:30, step:1}

# Rescale the bin size relative to a reference frame length of 512.
# A factor of 1 (no smoothing) is left untouched.
if smooth_factor > 1:
    smooth_factor = int(smooth_factor * 512 / frame_length)


## Adding the Visualizer functions

In [None]:
# Create a truncated noise sample
def truncated_noise_sample(batch_size=1, dim_z=128, truncation=1., seed=None):
    """ Draw a batch of noise vectors from a truncated standard normal.
        Values are sampled from a normal distribution truncated to [-2, 2]
        and then multiplied by the truncation value.
        Params:
            batch_size: number of vectors to draw.
            dim_z: length of each vector.
            truncation: factor the samples are multiplied by.
            seed: optional seed; when given, a dedicated RandomState makes
                  the draw reproducible.
        Output:
            float32 array of shape (batch_size, dim_z)
    """
    rng = np.random.RandomState(seed) if seed is not None else None
    samples = truncnorm.rvs(-2, 2, size=(batch_size, dim_z), random_state=rng)
    scaled = truncation * samples.astype(np.float32)
    return scaled.astype(np.float32)


def convert_to_images(obj):
    """ Convert a batch of generated images into a list of Pillow images.
        Params:
            obj: numpy array, or a tensor-like object exposing
                 ``detach().numpy()``, whose first axis indexes the images.
                 Each element must be an array that ``PIL.Image.fromarray``
                 accepts, e.g. (height, width, channels). Values are cast to
                 uint8 without rescaling.
        Output:
            list of Pillow Images, one per element of the batch
        Raises:
            ImportError: if Pillow is not installed.
    """
    try:
        # Import the submodule itself: a bare `import PIL` does not guarantee
        # that the `PIL.Image` attribute is available.
        from PIL import Image
    except ImportError:
        raise ImportError("Please install Pillow to use images: pip install Pillow")

    if not isinstance(obj, np.ndarray):
        # `detach` is a method and has to be called before `.numpy()`.
        obj = obj.detach().numpy()

    return [Image.fromarray(np.asarray(np.uint8(frame), dtype=np.uint8)) for frame in obj]


def predict(sess, noise, label, truncation=1.):
  """Run the generator graph for one batch and return uint8 images.

  Relies on the module-level graph tensors `input_z`, `input_y`,
  `input_trunc` and `output` created when the BigGAN module is loaded.

  Params:
      sess: session used to run the graph.
      noise: batch of noise vectors (converted with np.asarray).
      label: batch of class vectors (converted with np.asarray).
      truncation: truncation value fed to the generator.
  Output:
      uint8 array of generated images with values in [0, 255].
  """
  feed_dict = {input_z: np.asarray(noise), input_y: np.asarray(label), input_trunc: truncation}
  ims = sess.run(output, feed_dict=feed_dict)
  # Map the generator output (presumably in [-1, 1]) onto the 0..255 range.
  ims = np.clip(((ims + 1) / 2.0) * 256, 0, 255)
  return np.uint8(ims)

def imshow(a, format='png', jpeg_fallback=True):
  """Display an image array inline in the notebook.

  Params:
      a: image data, converted to a uint8 array before encoding.
      format: image format used to encode the array.
      jpeg_fallback: when the display raises IOError and the format is not
          already 'jpeg', retry once with jpeg encoding.
  Output:
      the value returned by IPython's display call.
  Raises:
      IOError: if displaying fails and no fallback applies.
  """
  # Local imports: none of these names are bound at module level in this notebook.
  import io
  from PIL import Image as PILImage
  from IPython.display import Image as IPyImage, display

  a = np.asarray(a, dtype=np.uint8)
  data = io.BytesIO()
  PILImage.fromarray(a).save(data, format)
  im_data = data.getvalue()
  try:
    disp = display(IPyImage(im_data))
  except IOError:
    if jpeg_fallback and format != 'jpeg':
      print(('Warning: image was too large to display in format "{}"; '
             'trying jpeg instead.').format(format))
      return imshow(a, format='jpeg')
    else:
      raise
  return disp

def imgrid(imarray, cols=5, pad=1):
  """Tile a batch of images into a single grid image.

  Params:
      imarray: uint8 array of shape (N, H, W, C).
      cols: number of images per grid row (>= 1).
      pad: width in pixels of the white (255) separator between tiles (>= 0).
  Output:
      uint8 array containing the tiled images; missing tiles in the last
      row are filled with 255.
  Raises:
      ValueError: if imarray is not uint8.
  """
  if imarray.dtype != np.uint8:
    raise ValueError('imgrid input imarray must be uint8')
  pad = int(pad)
  assert pad >= 0
  cols = int(cols)
  assert cols >= 1

  count, height, width, channels = imarray.shape
  # Ceiling division: a partially filled last row still needs a full row.
  rows = -(-count // cols)
  missing_tiles = rows * cols - count
  assert missing_tiles >= 0

  # Add blank tiles at the end of the batch and a white border on the
  # bottom/right edge of every tile.
  padded = np.pad(
      imarray,
      ((0, missing_tiles), (0, pad), (0, pad), (0, 0)),
      'constant',
      constant_values=255)
  tile_h = height + pad
  tile_w = width + pad

  grid = padded.reshape(rows, cols, tile_h, tile_w, channels)
  grid = grid.transpose(0, 2, 1, 3, 4)
  grid = grid.reshape(rows * tile_h, cols * tile_w, channels)

  # Trim the border that would otherwise trail the last row and column.
  return grid[:-pad, :-pad] if pad else grid

# Get new jitters
def new_jitters(jitter):
  """Build a 128-element vector of per-unit jitter multipliers.

  Each unit independently gets 1 (unchanged) with probability 0.5 and
  1 - jitter otherwise, using Python's `random` module.
  """
  return np.array(
      [1 if random.uniform(0, 1) < 0.5 else 1-jitter for _ in range(128)],
      dtype=float)


# Get new update directions
def new_update_dir(nv2, update_dir):
  """Flip the update direction of noise units that reached a bound.

  Uses the module-level `truncation` and `tempo_sensitivity`. Units at or
  above the upper bound are set to move down (-1), units below the lower
  bound are set to move up (1); all others keep their direction.
  `update_dir` is modified in place and also returned.
  """
  upper_bound = 2*truncation - tempo_sensitivity
  lower_bound = tempo_sensitivity - 2*truncation
  for unit, value in enumerate(nv2):
      if value >= upper_bound:
          update_dir[unit] = -1
      elif value < lower_bound:
          update_dir[unit] = 1
  return update_dir


# Smooth class vectors
def smooth(class_vectors, smooth_factor):
  """Smooth class vectors by interpolating between consecutive bin means.

  The vectors are grouped into bins of `smooth_factor` frames. For every
  bin that has a following bin, `smooth_factor` frames are emitted that move
  linearly from the mean of the bin to the mean of the next one.
  With a factor of 1 the input is returned unchanged (as an array).
  """
  if smooth_factor == 1:
      return np.array(class_vectors)

  bin_count = int(np.floor(len(class_vectors)/smooth_factor) - 1)
  interpolated = []
  for bin_index in range(bin_count):
      start = int(bin_index*smooth_factor)
      middle = start + smooth_factor
      mean_here = np.mean(class_vectors[start:middle], axis=0)
      mean_next = np.mean(class_vectors[middle:middle + smooth_factor], axis=0)

      for frame in range(smooth_factor):
          weight = frame/(smooth_factor - 1)
          interpolated.append(mean_here*(1 - weight) + mean_next*weight)

  return np.array(interpolated)


# Normalize class vector between 0-1
def normalize_cv(cv2):
  """Rescale a class vector to the 0-1 range.

  Zero entries are first overwritten (in place, on the passed array) with
  the smallest non-zero value, then the vector is shifted by that value and
  divided by its peak-to-peak range. The rescaled vector is returned as a
  new array.
  """
  floor_value = min(cv2[cv2 != 0])
  cv2[cv2 == 0] = floor_value
  return (cv2 - floor_value)/np.ptp(cv2)


## Compute the audio mel spectrogram and chromagram. 
 - The mel spectrogram yields the mean power and its gradient is used to update the noise vectors.
 - The chromagram yields the power corresponding to each of the 12 pitches, and it is used to initialize the class vectors.

In [None]:
# Mel spectrogram of the whole track, one column per `frame_length` samples
spec = librosa.feature.melspectrogram(y=audio, sr=sample_rate, n_mels=128, fmax=8000, hop_length=frame_length)

# Mean power at each time point (kept unnormalised until the gradient is taken)
mean_power = np.mean(spec, axis=0)

# Power gradient across time, scaled so its maximum is 1,
# with negative (decreasing power) time points set to zero
gradm = np.gradient(mean_power)
gradm = (gradm/np.max(gradm)).clip(min=0)

# Mean power normalized between 0-1
specm = (mean_power - np.min(mean_power))/np.ptp(mean_power)

# Chromagram of pitches X time points
chroma = librosa.feature.chroma_cqt(y=audio, sr=sample_rate, hop_length=frame_length)

# Pitch indices ordered from highest to lowest overall power
mean_chroma = np.mean(chroma, axis=1)
chromasort = np.argsort(mean_chroma)[::-1]

In [None]:
classes = np.zeros((len(gradm),num_classes))

# Number of spectrogram frames spanned by one 5-second prediction segment.
frames_per_segment = int(np.ceil((5*sample_rate)/frame_length))

prediction_index = -1
segment_classes = None
for frame_index in tqdm(range(len(gradm))):
  if frame_index % frames_per_segment == 0:
    # Move on to the next segment's prediction. Predictions exist only for
    # segments starting within the whole seconds of audio, so a fractional
    # tail can produce frames past the last prediction: clamp the index
    # instead of running off the end of `preds`.
    prediction_index = min(prediction_index + 1, len(preds) - 1)
    # The nearest classes only change with the prediction, so the lookup is
    # done once per segment rather than once per frame.
    nearest = find_classes(preds[prediction_index, 0], preds[prediction_index, 1], num_classes, brm_arr, brm_df)
    # Keep the first column of every matched row (the value later matched
    # against brm_df['Id']); atleast_2d covers a single-row result.
    segment_classes = np.atleast_2d(nearest)[:, 0]
  classes[frame_index][:] = segment_classes

  sys.stderr.flush()

# Initialize first class vector
cv1 = np.zeros(1000,dtype=np.float32)
# Earliest frame at which some pitch has non-zero power (identical for every
# class, so it is computed once).
first_active_frame = np.min([np.where(chrow > 0)[0][0] for chrow in chroma])
for pi, p in enumerate(chromasort[:num_classes]):
  # With fewer than 12 classes the class slot follows the rank of the pitch,
  # otherwise it follows the pitch index itself.
  class_slot = pi if num_classes < 12 else p
  cv1[int(classes[0][class_slot])] = chroma[p][first_active_frame]

# Initialize first noise vector
nv1 = truncated_noise_sample(truncation=truncation)[0]

# Initialize list of class and noise vectors
class_vectors = [cv1]
noise_vectors = [nv1]

# Initialize previous vectors (will be used to track the previous frame)
cvlast = cv1
nvlast = nv1

# Initialize the direction of noise vector unit updates:
# negative units start moving up (1), all others start moving down (-1)
update_dir = np.where(nv1 < 0, 1.0, -1.0)

# initialize noise unit update
update_last = np.zeros(128)

100%|██████████| 2584/2584 [00:32<00:00, 80.52it/s]


Generating input vectors

In [None]:
# Build one noise vector and one class vector per spectrogram frame. Every
# iteration depends on the state left by the previous one (nvlast, cvlast,
# update_last, update_dir), so the statement order matters.
for i in tqdm(range(len(gradm))):   
    
  # No-op placeholder; progress is reported by tqdm
  pass
  
  # Update jitter vector every 200 frames by setting ~half of noise vector units to lower sensitivity
  if i % 200 == 0:
      jitters = new_jitters(jitter)
  
  # Get last noise vector
  nv1 = nvlast
  
  # Set noise vector update based on direction, sensitivity, jitter, and combination of
  # overall power and gradient of power
  update = np.array([tempo_sensitivity for k in range(128)]) * (gradm[i]+specm[i]) * update_dir * jitters 
  
  # Smooth the update with the previous update (to avoid overly sharp frame transitions):
  # the previous update is weighted 3:1 against the new one
  update = (update+update_last * 3) / 4
  
  # Set last update
  update_last = update
      
  # Update noise vector
  nv2 = nv1+update
  
  # Append to noise vectors
  noise_vectors.append(nv2)
  
  # Set last noise vector
  nvlast = nv2
                 
  # Update the direction of noise units
  update_dir = new_update_dir(nv2, update_dir)
  
  # Get last class vector (cv1 is not read again in this loop; cvlast is used directly below)
  cv1 = cvlast
  
  # Generate new class vector: for each selected class, blend its previous weight
  # with the power of the j-th strongest pitch at this frame. A larger
  # pitch_sensitivity value gives the new pitch power less influence.
  cv2 = np.zeros(1000)
  for j in range(num_classes):
    cv2[int(classes[i][j])] = (cvlast[int(classes[i][j])] + ((chroma[chromasort[j]][i])/pitch_sensitivity))/(1 + (1/pitch_sensitivity))
  
  # If more than 6 classes, normalize new class vector between 0 and 1, else simply set max class val to 1
  if num_classes > 6:
    cv2 = normalize_cv(cv2)
  else:
    cv2 = cv2/np.max(cv2)
  
  # Adjust depth
  cv2 = cv2*depth
  
  # This prevents rare bugs where all classes are the same value
  if np.std(cv2[np.where(cv2 != 0)]) < 0.0000001:
    cv2[int(classes[i][0])] = cv2[int(classes[i][0])] + 0.01  
  
  # Append new class vector
  class_vectors.append(cv2)

  # Set last class vector
  cvlast = cv2


# Smooth the frames by interpolating between class vectors in bins of size [smooth_factor]
class_vectors = smooth(class_vectors, smooth_factor)


# Either keep a record of the vectors just computed, or (when requested)
# replace them with the ones saved by a previous run
if use_previous_vectors != 1:
    np.save('class_vectors.npy', class_vectors)
    np.save('noise_vectors.npy', noise_vectors)
else:
    class_vectors = np.load('class_vectors.npy')
    noise_vectors = np.load('noise_vectors.npy')



100%|██████████| 2584/2584 [00:03<00:00, 834.56it/s]


Write the classes with their Valence and Arousal into a .json file to be used for Processing visualization.

In [None]:
# One entry per 5-second segment: a dict mapping each class id selected for
# that segment to its "Class" ... "A.Mean.Sum" columns from the mapping table.
frames_per_segment = np.ceil((5*sample_rate)/frame_length)
classes_unique = []

for frame_index, class_vector in enumerate(classes):
    # Only the first frame of every segment is recorded
    if frame_index % frames_per_segment != 0:
        continue
    classes_unique.append({
        int(cl): brm_df.loc[brm_df['Id'] == int(cl), "Class":"A.Mean.Sum"].values[0].tolist()
        for cl in class_vector
    })

# Scratch dict kept (empty) for parity with earlier versions of this cell
class_va = dict()

import json

with open('to_processing/classes_va.json', 'w') as classes_file:
    json.dump(classes_unique, classes_file)

Load BigGAN generator module from TF Hub

In [None]:
# The BigGAN generator is driven below through the TF1-style API
# (hub.Module, placeholders and a Session), so the v1 compatibility layer is
# imported and TF2 behaviour is switched off from this point on.
import tensorflow.compat.v1 as tf1
tf1.disable_v2_behavior()
import tensorflow_hub as hub

Instructions for updating:
non-resource variables are not supported in the long term


In [None]:
module_path = f'https://tfhub.dev/deepmind/biggan-deep-{resolution}/1'
print('Loading BigGAN module from:', module_path)

tf1.reset_default_graph()
module = hub.Module(module_path)

# Create one placeholder per input declared by the module, named after the
# input key and using the dtype/shape the module reports for it.
inputs = {}
for input_name, input_info in module.get_input_info_dict().items():
  inputs[input_name] = tf1.placeholder(input_info.dtype, input_info.get_shape().as_list(), input_name)
output = module(inputs)

# Handles used by predict() to feed the generator
input_z, input_y, input_trunc = inputs['z'], inputs['y'], inputs['truncation']

Loading BigGAN module from: https://tfhub.dev/deepmind/biggan-deep-512/1
INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


Create a TensorFlow session and initialize variables

In [None]:
# Open the session that predict() runs the generator in, then initialize
# the graph's global variables inside it.
sess = tf1.Session()
initializer = tf1.global_variables_initializer()
sess.run(initializer)

Generate frames in batches of batch_size

In [None]:
frames = []

for i in tqdm(range(frame_lim)):

  # Stop as soon as a full batch of vectors is no longer available
  if (i + 1) * batch_size > class_vectors.shape[0]:
      tf1.keras.backend.clear_session()
      break

  # Get batch
  noise_vector = np.asarray(noise_vectors[i*batch_size:(i + 1)*batch_size], dtype=np.float32)
  class_vector = np.asarray(class_vectors[i*batch_size:(i + 1)*batch_size], dtype=np.float32)
  # Generate images
  predictions = predict(sess, noise_vector, class_vector, truncation=truncation)

  # Convert to image array and add to frames
  output_images = convert_to_images(predictions)
  for out in output_images:
      frames.append(np.array(out))

# Save video
aud = mpy.AudioFileClip('/content/audiofile.wav', fps=44100)

# One frame was generated per `frame_length` audio samples, so the video frame
# rate has to follow the rate the audio was loaded at (`sample_rate`). A fixed
# 22050 would play the video out of sync when the audio was loaded at 16 kHz.
clip = mpy.ImageSequenceClip(frames, fps=sample_rate/frame_length)
clip = clip.set_audio(aud)
clip.write_videofile(outname, audio_codec='aac')

100%|██████████| 172/172 [06:02<00:00,  2.11s/it]
  result = np.fromstring(s, dtype=dt)



[MoviePy] >>>> Building video convolvulaceae-visio.mp4
[MoviePy] Writing audio in convolvulaceae-visioTEMP_MPY_wvf_snd.mp4


100%|██████████| 1324/1324 [00:02<00:00, 562.19it/s]

[MoviePy] Done.
[MoviePy] Writing video convolvulaceae-visio.mp4



100%|██████████| 2580/2580 [01:21<00:00, 31.81it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: convolvulaceae-visio.mp4 



Download Files to be used for Processing visualization.

In [None]:
!zip /content/convolvulaceae-visio.zip /content/convolvulaceae-visio.mp4
!zip -r /content/to_processing.zip /content/to_processing


from google.colab import files
files.download("/content/convolvulaceae-visio.zip")
files.download("/content/to_processing.zip")
files.download("/content/audiofile.wav")

updating: content/convolvulaceae-visio.mp4 (deflated 0%)
updating: content/to_processing/ (stored 0%)
updating: content/to_processing/predictions.json (deflated 52%)
updating: content/to_processing/classes_va.json (deflated 88%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>