<a href="https://colab.research.google.com/github/manashpratim/Bosch-Summer-Internship/blob/master/AudioPretrained.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#Data is available at https://drive.google.com/drive/folders/1NFYIaXjL8V5kvZo3g9JEafLQ3scslWic?usp=sharing

#Run every cell of this notebook in order, from top to bottom, to generate the pretrained audio features

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive. Later cells read the MOSI archive from
# '/content/drive/My Drive/mosi_data/' and write the feature pickle back to it.
drive.mount('/content/drive')

In [None]:
!pip install numpy scipy
!pip install resampy tensorflow six
!pip install tf-slim
!git clone https://github.com/tensorflow/models.git
!curl -O https://storage.googleapis.com/audioset/vggish_model.ckpt
!curl -O https://storage.googleapis.com/audioset/vggish_pca_params.npz

In [None]:
# List the current directory to check what the previous cell cloned and downloaded.
!ls

In [None]:
# Show the contents of the AudioSet directory inside the cloned models repository.
!ls models/research/audioset

In [None]:
# Copy everything under models/research/audioset into the current directory.
!cp -r models/research/audioset/* .

In [None]:
# Copy the VGGish files into /content so that modules such as vggish_slim can be imported
# below (assumes /content is the working directory, as on Colab -- TODO confirm).
!cp /content/vggish/* /content

In [None]:
# List the current directory again; the copied VGGish files should now appear here.
!ls

In [None]:
from vggish_smoke_test import *

In [None]:
import vggish_slim
import vggish_params
import vggish_input

def CreateVGGishNetwork(hop_size=0.96):   # Hop size is in seconds.
  """Build the VGGish graph, restore its weights and collect its tensors.

  Uses the notebook-level globals `sess` (the session the checkpoint is loaded
  into) and `tf`.

  Args:
    hop_size: value (in seconds) written to vggish_params.EXAMPLE_HOP_SECONDS.

  Returns:
    A dict with three entries:
      'features':  the tensor named vggish_params.INPUT_TENSOR_NAME,
      'embedding': the tensor named vggish_params.OUTPUT_TENSOR_NAME,
      'layers':    a dict mapping a short layer label to that layer's output tensor.
  """
  vggish_slim.define_vggish_slim()
  vggish_params.EXAMPLE_HOP_SECONDS = hop_size
  vggish_slim.load_vggish_slim_checkpoint(sess, 'vggish_model.ckpt')

  # Input and output tensors are looked up on the session's graph.
  model_input = sess.graph.get_tensor_by_name(vggish_params.INPUT_TENSOR_NAME)
  model_output = sess.graph.get_tensor_by_name(vggish_params.OUTPUT_TENSOR_NAME)

  # Short label -> name of the op whose first output (':0') is exposed.
  op_names = {
      'conv1': 'vggish/conv1/Relu',
      'pool1': 'vggish/pool1/MaxPool',
      'conv2': 'vggish/conv2/Relu',
      'pool2': 'vggish/pool2/MaxPool',
      'conv3': 'vggish/conv3/conv3_2/Relu',
      'pool3': 'vggish/pool3/MaxPool',
      'conv4': 'vggish/conv4/conv4_2/Relu',
      'pool4': 'vggish/pool4/MaxPool',
      'fc1': 'vggish/fc1/fc1_2/Relu',
      'fc2': 'vggish/fc2/Relu',
      'embedding': 'vggish/embedding',
      'features': 'vggish/input_features',
  }
  default_graph = tf.compat.v1.get_default_graph()
  layer_tensors = {
      label: default_graph.get_tensor_by_name(op_name + ':0')
      for label, op_name in op_names.items()
  }

  return {
      'features': model_input,
      'embedding': model_output,
      'layers': layer_tensors,
  }

In [None]:
def ProcessWithVGGish(vgg, x, sr):
  '''Run the VGGish model on a sound and return one postprocessed embedding.

  Uses the notebook-level global `sess` to run the model.

  Args:
    vgg: dict returned by CreateVGGishNetwork (its 'features' and 'embedding'
      tensors are used).
    x: the sound samples; must be scaled to be floats between -1 and +1.
    sr: sample rate of x.

  Returns:
    The postprocessed (whitened, quantized) embedding of the FIRST example only,
    i.e. element 0 of the postprocessed batch; later examples are discarded.
  '''
  # Imported here so the function no longer depends on the name leaking into the
  # notebook namespace through `from vggish_smoke_test import *`.
  import vggish_postprocess

  # Produce a batch of log mel spectrogram examples.
  input_batch = vggish_input.waveform_to_examples(x, sr)

  [embedding_batch] = sess.run([vgg['embedding']],
                               feed_dict={vgg['features']: input_batch})

  # Postprocess the results to produce whitened quantized embeddings.
  pca_params_path = 'vggish_pca_params.npz'

  pproc = vggish_postprocess.Postprocessor(pca_params_path)
  postprocessed_batch = pproc.postprocess(embedding_batch)
  return postprocessed_batch[0]


In [None]:
# Bind the TF1 compatibility API to `tf` and turn off TF2 behavior; this notebook
# works with graphs and sessions (see the Session created in the next cell).
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

In [None]:
# Create the session and the VGGish network. `sess` and `vgg` are notebook-level
# globals that the functions defined in this notebook read.
# NOTE: this import rebinds `tf` to the top-level tensorflow package (the previous
# cell bound it to tensorflow.compat.v1), hence the explicit tf.compat.v1 prefixes.
import tensorflow as tf
# Presumably resets the graph so that re-running this cell does not define the model twice.
tf.compat.v1.reset_default_graph()
sess = tf.compat.v1.Session()
# Hop size of 0.06 seconds instead of the function's 0.96 default.
vgg = CreateVGGishNetwork(0.06)

In [None]:
def EmbeddingsFromVGGish(vgg, x, sr):
  '''Evaluate every tensor in vgg['layers'] for the sound x at sample rate sr.

  Uses the notebook-level global `sess` to run the model.

  Returns a dict with the same keys as vgg['layers'], each mapped to the value
  computed for that layer.'''
  # Turn the waveform into a batch of log mel spectrogram examples.
  examples = vggish_input.waveform_to_examples(x, sr)

  # Fix the key order once so fetched values can be paired back with their names.
  names = list(vgg['layers'])
  fetches = [vgg['layers'][name] for name in names]

  outputs = sess.run(fetches, feed_dict={vgg['features']: examples})

  return dict(zip(names, outputs))

In [None]:
#Unzip the MOSI data
!unzip -q '/content/drive/My Drive/mosi_data/mosi.zip'

In [None]:
#Function to get the audio file name. Arguments are path of the audio files and the name of the save file
def get_file_names(mypath,savefile):
  """List the files in a directory without their extensions and save the list.

  Args:
    mypath: directory to scan; sub-directories are ignored.
    savefile: path of the text file that receives one name per line.

  Returns:
    The sorted list of file names with the final extension removed.
  """
  from os import listdir
  from os.path import isfile, join, splitext
  # splitext removes only the last extension, so `name + '.wav'` still points at the
  # original file, and a name without any dot is returned unchanged (slicing at
  # str.find('.') would cut off its last character). Sorting makes the result
  # independent of the arbitrary order in which listdir returns entries.
  onlyfiles = sorted(splitext(f)[0] for f in listdir(mypath) if isfile(join(mypath, f)))
  with open(savefile, 'w') as f:
    f.writelines(item + '\n' for item in onlyfiles)
  return onlyfiles

In [None]:
# mypath: directory containing the segmented .wav files (presumably extracted from mosi.zip -- verify).
# savefile: text file that get_file_names fills with the segment names, one per line.
mypath = '/content/Raw/Audio/WAV_16000/Segmented' 
savefile = 'audiofiles.txt'

In [None]:
audiofiles = get_file_names(mypath,savefile)              #Get the names of the audio files (without extension); they are also written to savefile

In [None]:
from tqdm.notebook import tqdm

#Function to get the pretrained audio features
def get_audio_features(mypath,audiofiles):
    """Compute the VGGish 'embedding' output for every listed audio file.

    Uses the notebook-level global `vgg` and the function EmbeddingsFromVGGish.

    Args:
      mypath: directory containing the .wav files.
      audiofiles: file names without the '.wav' extension.

    Returns:
      A dict mapping each name to the 'embedding' value returned by
      EmbeddingsFromVGGish, or to an empty list when extraction failed.
    """
    import librosa
    mydic = {}
    # The context manager closes the progress bar even if loading a file raises.
    with tqdm(total=len(audiofiles), desc='Extracting...', position=0) as outer:
      for file in audiofiles:
        outer.update(1)
        audiofile = mypath + '/' + file + '.wav'
        x, sr = librosa.load(audiofile, sr  = 16000, res_type='kaiser_fast')        #I am extracting frames at 16KHz.
        try:
          resdict = EmbeddingsFromVGGish(vgg, x, sr)
          mydic[file] = resdict['embedding']
        #Some of the segments are bad. If you run the notebook without changing any parameters, there will be 28 bad segments out of 2199
        # Catch Exception rather than everything, so that interrupting the kernel
        # (KeyboardInterrupt) still stops the loop instead of marking a segment as bad.
        except Exception:
          mydic[file] = []

    return mydic

In [None]:
audio =  get_audio_features(mypath,audiofiles)              #get the pretrained audio features: dict of segment name -> embedding ([] for segments that failed)

In [None]:
# This block changes the format of the audio data to match the other data modalities
# Segment names have the form '<prefix>_<index>'. The features are regrouped by
# prefix so that each prefix maps to the list of its segment features, ordered by index.
files = audiofiles

# dic: segment name -> prefix (everything before the LAST underscore).
dic = {}
for file in files:
  prefix, separator, _ = file.rpartition('_')
  if not separator:
    # Without an underscore there is no segment index to sort by; fail with a clear
    # message instead of silently grouping the segment under an empty prefix.
    raise ValueError("Audio file name %r does not end in '_<index>'" % file)
  dic[file] = prefix

# newdic:  prefix -> {segment index (int) -> features}
# newdic1: prefix -> {full segment name   -> features}
newdic = {}
newdic1 = {}
for key in audio:
  newkey = dic[key]
  segment_index = int(key.rpartition('_')[2])
  newdic1.setdefault(newkey, {})[key] = audio[key]
  newdic.setdefault(newkey, {})[segment_index] = audio[key]

# nd: prefix -> list of features in ascending segment-index order.
nd = {}
for key in newdic:
  nd[key] = [newdic[key][k] for k in sorted(newdic[key])]

In [None]:
#Save the pretrained audio features as a pickle file
# `nd` maps each segment-name prefix (the name without its trailing '_<index>') to the
# list of that prefix's segment features, ordered by index. The file is written to the
# mounted Google Drive folder, using the highest pickle protocol available.
import pickle
with open('/content/drive/My Drive/mosi_data/audio_pretrained_features_joined.pickle', 'wb') as handle:
    pickle.dump(nd, handle, protocol=pickle.HIGHEST_PROTOCOL)