In [None]:
# install librosa and matplotlib if not yet installed
# !pip install librosa
# !pip install matplotlib

In [1]:
#imports
import pandas as pd
import glob
import os
import numpy as np
import librosa
import librosa.display
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split 
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import layers
from sklearn.preprocessing import LabelEncoder

# Root folder of the audio clips, laid out as <dataset_path>/<label>/<clip file>.
# The path is assembled component by component so that no backslash ends up
# inside a string literal (where '\c', '\C', '\A' and '\*' are invalid escape
# sequences) and so that the platform's own separator is used.
dataset_path = os.path.join(os.getcwd(), '..', 'cmu-mosi', 'Custom', 'Audio')

# One [path, label] record per entry found two levels below dataset_path; the
# label is the name of the folder that directly contains the entry.
data_array = [
    [clip_path, os.path.basename(os.path.dirname(clip_path))]
    for clip_path in glob.glob(os.path.join(dataset_path, '*', '*'))
]

df = pd.DataFrame(data_array, columns=['path','label'])


In [None]:
# install required modules
# !pip install resampy
# !pip install tf-slim

In [None]:
# !git clone https://github.com/tensorflow/models.git

In [None]:
# !curl -O https://storage.googleapis.com/audioset/vggish_model.ckpt
# !curl -O https://storage.googleapis.com/audioset/vggish_pca_params.npz

In [None]:
# Copy the source files to the current directory.
# !cp models/research/audioset/vggish/* .
# !xcopy .\models\research\audioset\vggish\ .

In [None]:
# run smoke test
# from vggish_smoke_test import *

In [2]:
import vggish_slim
import vggish_params
import vggish_input
import tensorflow.compat.v1 as tf
import vggish_postprocess
import time
def CreateVGGishNetwork(hop_size=0.96):   # Hop size is in seconds.
  '''Define the VGGish model, restore its checkpoint and collect its tensors.

  Relies on the module-level session `sess`, which must exist before this
  function is called.

  Args:
    hop_size: value stored in vggish_params.EXAMPLE_HOP_SECONDS (seconds).

  Returns:
    A dict with three entries:
      'features'  - tensor named by vggish_params.INPUT_TENSOR_NAME,
      'embedding' - tensor named by vggish_params.OUTPUT_TENSOR_NAME,
      'layers'    - dict mapping a short layer name to the ':0' output tensor
                    of the corresponding op in the default graph.
  '''
  vggish_slim.define_vggish_slim()
  vggish_params.EXAMPLE_HOP_SECONDS = hop_size
  vggish_slim.load_vggish_slim_checkpoint(sess, 'vggish_model.ckpt')

  # Input and output tensors are looked up on the session's own graph.
  input_tensor = sess.graph.get_tensor_by_name(vggish_params.INPUT_TENSOR_NAME)
  output_tensor = sess.graph.get_tensor_by_name(vggish_params.OUTPUT_TENSOR_NAME)

  # Short name -> op name for every intermediate activation that is exposed.
  # ('fc2': 'vggish/fc2/Relu' is intentionally left out, as in the original.)
  op_names = {
      'conv1': 'vggish/conv1/Relu',
      'pool1': 'vggish/pool1/MaxPool',
      'conv2': 'vggish/conv2/Relu',
      'pool2': 'vggish/pool2/MaxPool',
      'conv3': 'vggish/conv3/conv3_2/Relu',
      'pool3': 'vggish/pool3/MaxPool',
      'conv4': 'vggish/conv4/conv4_2/Relu',
      'pool4': 'vggish/pool4/MaxPool',
      'fc1': 'vggish/fc1/fc1_2/Relu',
      'embedding': 'vggish/embedding',
      'features': 'vggish/input_features',
  }
  default_graph = tf.get_default_graph()
  layer_tensors = {
      short_name: default_graph.get_tensor_by_name(op_name + ':0')
      for short_name, op_name in op_names.items()
  }

  return {
      'features': input_tensor,
      'embedding': output_tensor,
      'layers': layer_tensors,
  }

In [3]:
# Post-processors keyed by parameter-file path, so the .npz file is loaded once
# rather than on every call to ProcessWithVGGish.
_POSTPROCESSOR_CACHE = {}


def _GetPostprocessor(pca_params_path):
  '''Return the Postprocessor for pca_params_path, creating it on first use.'''
  if pca_params_path not in _POSTPROCESSOR_CACHE:
    _POSTPROCESSOR_CACHE[pca_params_path] = vggish_postprocess.Postprocessor(
        pca_params_path)
  return _POSTPROCESSOR_CACHE[pca_params_path]


def ProcessWithVGGish(vgg, x, sr):
  '''Run the VGGish model on a sound and return its first post-processed embedding.

  Relies on the module-level session `sess`.

  Args:
    vgg: dict returned by CreateVGGishNetwork; its 'features' and 'embedding'
      tensors are used.
    x: the sound samples. Must be scaled to be floats between -1 and +1.
    sr: sample rate of x.

  Returns:
    The first row of the post-processed embedding batch. Embeddings of any
    further examples produced from x are discarded.

  Raises:
    ValueError: if x yields no examples at all.
  '''
  # Produce a batch of log mel spectrogram examples.
  input_batch = vggish_input.waveform_to_examples(x, sr)
  if len(input_batch) == 0:
    # Without this check the failure would only surface as an IndexError on
    # the final [0] lookup. Presumably the clip is shorter than one example
    # window -- TODO confirm.
    raise ValueError('VGGish produced no examples for this waveform')

  [embedding_batch] = sess.run([vgg['embedding']],
                               feed_dict={vgg['features']: input_batch})

  # Postprocess the results to produce whitened quantized embeddings.
  pproc = _GetPostprocessor('vggish_pca_params.npz')
  postprocessed_batch = pproc.postprocess(embedding_batch)
  return postprocessed_batch[0]


In [4]:
# TF1-style graph execution has to be selected before any graph or session is
# created, so eager execution is disabled first.
tf.compat.v1.disable_eager_execution()
tf.reset_default_graph()
sess = tf.Session()

# Extract VGGish features for every clip; they are fed to the custom classifier
# built later in the notebook.
# NOTE(review): ProcessWithVGGish returns only the first embedding of each clip,
# so the 0.01 s hop appears to add compute without changing the result -- confirm.
vgg = CreateVGGishNetwork(0.01)

SAMPLE_RATE = 44100  # every clip is resampled to this rate when loaded

audio_arr = []        # one embedding per successfully processed clip
failed_indices = []   # df index labels of clips that could not be processed

start = time.time()
for index, row in df.iterrows():
    if index % 100 == 0:
        print("{} minutes has passed, index is {}".format(int(time.time() - start)/60, index))
    try:
        waveform, _ = librosa.load(row['path'], sr=SAMPLE_RATE)
        audio_arr.append(ProcessWithVGGish(vgg, waveform, SAMPLE_RATE))
    except Exception as exc:
        # Best effort: report the clip and the reason, then carry on.
        print('FAILED: ', row['path'], '-', repr(exc))
        failed_indices.append(index)

# DataFrame.drop returns a new frame, so the result must be assigned; otherwise
# the failed rows stay in df and its labels no longer line up with audio_arr.
df = df.drop(index=failed_indices).reset_index(drop=True)
assert len(audio_arr) == len(df), "features and labels are out of sync"
    

# data = FeaturesFromVGGish(vgg,data_frame.iloc[0]['lists'],44100)


INFO:tensorflow:Restoring parameters from vggish_model.ckpt




0.0 minutes has passed, index is 0
FAILED:  C:\Users\User\Documents\School\Term 6\CDS\Project\speech-emotion\..\cmu-mosi\Custom\Audio\negative\0h-zjBukYpk_12.wav
2.7666666666666666 minutes has passed, index is 100
5.45 minutes has passed, index is 200
FAILED:  C:\Users\User\Documents\School\Term 6\CDS\Project\speech-emotion\..\cmu-mosi\Custom\Audio\negative\BvYR0L6f2Ig_22.wav
7.683333333333334 minutes has passed, index is 300
10.266666666666667 minutes has passed, index is 400
FAILED:  C:\Users\User\Documents\School\Term 6\CDS\Project\speech-emotion\..\cmu-mosi\Custom\Audio\negative\d6hH302o4v8_16.wav
FAILED:  C:\Users\User\Documents\School\Term 6\CDS\Project\speech-emotion\..\cmu-mosi\Custom\Audio\negative\d6hH302o4v8_40.wav
FAILED:  C:\Users\User\Documents\School\Term 6\CDS\Project\speech-emotion\..\cmu-mosi\Custom\Audio\negative\d6hH302o4v8_5.wav
12.716666666666667 minutes has passed, index is 500
FAILED:  C:\Users\User\Documents\School\Term 6\CDS\Project\speech-emotion\..\cmu-mosi\

In [None]:
from tensorflow import keras

# Turn the list of per-clip embeddings into a single array; shorter entries
# (if any) are zero-padded at the end.
x = keras.preprocessing.sequence.pad_sequences(
    audio_arr, padding="post"
)

# Labels: strings -> integer ids -> one-hot rows.
le = LabelEncoder()
y = to_categorical(le.fit_transform(np.array(df['label'].tolist())))

# Each feature row needs exactly one label row; a mismatch means df still
# contains clips for which no embedding was stored in audio_arr.
if len(x) != len(y):
    raise ValueError(
        "Got {} feature rows but {} labels; rows of df whose audio could not "
        "be processed must be removed before splitting".format(len(x), len(y))
    )

SPLIT_SEED = 42  # fixed so the train/test split is the same on every run
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=SPLIT_SEED
)


In [None]:
from tensorflow import keras
# y_train is one-hot encoded, so its second dimension is the number of classes.
# Deriving the output width from it keeps the classifier in step with the
# labels instead of relying on a hard-coded 3.
num_classes = y_train.shape[1]

# Linear softmax classifier on top of the flattened VGGish embedding.
model = keras.Sequential([
    layers.Flatten(),
    layers.Dense(num_classes),
    layers.Activation('softmax'),
])

In [None]:
# Training hyper-parameters.
num_epochs = 30
num_batch_size = 32

# Targets are one-hot rows, hence categorical cross-entropy; accuracy is
# tracked so it can be reported after training.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model.fit(
    x_train,
    y_train,
    batch_size=num_batch_size,
    epochs=num_epochs,
    verbose=1,
)



In [None]:
# Report accuracy on the training split first, then on the held-out split.
# model.evaluate returns [loss, accuracy] given the metrics passed to compile.
for report_template, features, targets in (
    ("Training Accuracy: {0:.2%}", x_train, y_train),
    ("Testing Accuracy: {0:.2%}", x_test, y_test),
):
    score = model.evaluate(features, targets, verbose=0)
    print(report_template.format(score[1]))
