# VGGish implementation

This notebook takes raw audio, runs the VGGish model to extract a 128-feature embedding for each 0.96 s of audio, and then saves the result.

In [None]:
# Required dependencies for the model
from pathlib import Path
import librosa
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
import pandas as pd

In [None]:
# Extract VGGish embeddings: one 128-value feature vector per analysis frame
# (0.96 s at 16 kHz) for every WAV file found under AUDIO_DIR.
# Note: only files with a .wav extension (any letter case) are matched.

# The expected frame duration (seconds) and sample rate (Hz) of VGGish input
VGGISH_WINDOW_SIZE = 0.96
VGGISH_SR = 16_000

# The sample rate at which the audio is loaded.
# Set to None if you don't wish to resample the audio.
SAMPLE_RATE = 16_000

# Directory containing all the audio files to analyze
AUDIO_DIR = Path("File_path")

# Load the model from its TensorFlow Hub URL
model = hub.load("https://tfhub.dev/google/vggish/1")

# Get all the WAV files in AUDIO_DIR and its subdirectories.
# The directory listing order is not guaranteed, so sort the paths to make
# the order of the output rows reproducible between runs.
audio_files = sorted(AUDIO_DIR.glob("**/*.[wW][aA][vV]"))

results = []
for path in audio_files:
    try:
        # Load the audio. Passing `sr` makes librosa resample to that rate;
        # with `sr=None` the file's own sample rate is kept and returned.
        waveform, sr = librosa.load(str(path), sr=SAMPLE_RATE)
    except (RuntimeError, EOFError):
        # Some audio files might be corrupted. Ignore and go to the next one.
        print(f"The audio file {path} is corrupted")
        continue

    # Run the model on the audio.
    # VGGish will cut longer audios into 0.96 seconds @ 16kHz frames and
    # process each individually.
    embeddings = model(waveform)

    # Each frame should have 128 features. Make sure this is the case.
    embeddings.shape.assert_is_compatible_with([None, 128])

    # If the sample rate is different from the default one, the duration of
    # audio that VGGish processes per frame is no longer 0.96 seconds,
    # so the frame length in seconds has to be rescaled.
    window_size = VGGISH_WINDOW_SIZE * (VGGISH_SR / sr)

    # Store info of the embeddings of each frame.
    # The tensor is converted to a NumPy array once, so that each stored
    # embedding is a plain ndarray row rather than a TensorFlow tensor that
    # has to be sliced out of the batch one frame at a time.
    for index, embedding in enumerate(embeddings.numpy()):
        results.append(
            {
                "recording": path,
                "embeddings": embedding,
                "start_time": window_size * index,
                "end_time": window_size * (index + 1),
            }
        )

# Make an empty run visible here instead of failing later with an unrelated
# error when the (empty) results are turned into a table.
if not results:
    print(f"No embeddings were extracted from {AUDIO_DIR}")

In [None]:
# Turn the list of per-frame records into a table: one row per frame, with
# the columns "recording", "embeddings", "start_time" and "end_time".
results = pd.DataFrame(data=results)

In [None]:
# Show the results DataFrame (a bare expression on the last line of a
# notebook cell is rendered as that cell's output).
results

In [None]:
# Drop the frame timing columns, keeping only the recording path and the
# embedding of each frame.
indices_features = results.drop(columns=["start_time", "end_time"])

In [None]:
# Keep the recording path of every frame, both as a Series (`recording`)
# and as a one-column DataFrame (`recordings_df`) sharing the same row index.
recording = indices_features.loc[:, "recording"]
recordings_df = recording.to_frame()

In [None]:
# Stack the per-frame embeddings into a single 2-D array (one row per frame)
# and write them into `indices_features` as the columns feat_0 ... feat_127,
# i.e. one column for each of the 128 features.
features = np.stack(indices_features["embeddings"])
feature_columns = [f"feat_{n}" for n in range(128)]
indices_features.loc[:, feature_columns] = features

In [None]:
# Build a table holding only the feature values (one column per feature,
# labelled by position) and tag every row with the same site identifier.
df = pd.DataFrame(features)
site_ids = ["Site_id"] * df.shape[0]
df = df.assign(SiteID=site_ids)
df

In [None]:
# Attach the recording path to every feature row (matched on the row index)
# so that autocorrelation effects within a recording can be examined.
merged_df = df.merge(
    recordings_df,
    how="inner",
    left_index=True,
    right_index=True,
)

In [None]:
# Show the merged table: the feature columns and SiteID, plus the recording
# column joined on the row index.
merged_df

In [None]:
# Write the merged table to a CSV file; the row index is left out of the file.
file_path = "Path_to_file_name.CSV"
merged_df.to_csv(path_or_buf=file_path, index=False)