# VGG16 feature extraction

VGG16 is a convolutional neural network architecture, widely used with weights pretrained on the ImageNet dataset for image classification.
- Although designed for visual inputs, it can be repurposed for audio analysis by converting audio signals into spectrograms—visual representations of sound. 
- By removing the top classification layers, VGG16 can be used as a general-purpose feature extractor for audio spectrograms, capturing rich, hierarchical features useful for downstream audio processing tasks.

In [None]:
import os
import numpy as np
import umap
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
import plotly.express as px
from vgg16_functions import extract_image_features_vgg16

In [None]:
## Apply defined function to extract features with VGG16 model from png spectrogram files

# Folder of png spectrograms
spectrogram_dir = "/species_name"

# Collect the PNG file names in sorted order.
# os.listdir() returns entries in arbitrary order, so sorting keeps the row
# order of the feature array (and of anything computed from it with a fixed
# random seed) the same from one run or machine to the next.
file_names = sorted(
    file for file in os.listdir(spectrogram_dir) if file.endswith(".png")
)

# Fail early with a clear message instead of handing an empty array to later steps
if not file_names:
    raise FileNotFoundError(f"No .png spectrograms found in {spectrogram_dir!r}")

# Extract VGG16 features for each spectrogram and convert to a numpy array;
# features[i] is the result for file_names[i]
features = np.array(
    [
        extract_image_features_vgg16(os.path.join(spectrogram_dir, file))
        for file in file_names
    ]
)

In [None]:
# Standardise the features: build the scaler, then fit it on the extracted
# features and transform them in a single call
scaler = StandardScaler()
features_norm = scaler.fit_transform(features)

In [None]:
# Perform UMAP: project the standardised features down to two dimensions
reducer_2d = umap.UMAP(
    n_components=2,
    metric='euclidean',
    min_dist=0.2,  # Can play around with this parameter
    random_state=42,  # fixed seed
    # n_neighbors is left at the library default; uncomment to experiment:
    # n_neighbors=5,
)
umap_embedding = reducer_2d.fit_transform(features_norm)

# Create a DataFrame pairing each embedded point with its source filename
umap_df = pd.DataFrame(
    {
        'UMAP1': umap_embedding[:, 0],
        'UMAP2': umap_embedding[:, 1],
        'filename': file_names,
    }
)

In [None]:
### Centroid methodology

# Calculate the centroid (mean) of the points.
# The result is a Series labelled 'UMAP1' / 'UMAP2', so its values are read by
# label: integer keys such as centroid[0] on a string-labelled Series rely on a
# positional fallback that pandas has deprecated (FutureWarning since 2.1).
centroid = umap_df[['UMAP1', 'UMAP2']].mean()
centroid_x = centroid['UMAP1']
centroid_y = centroid['UMAP2']

# Calculate the Euclidean distance of each point from the centroid
umap_df['distance_from_centroid'] = (
    (umap_df['UMAP1'] - centroid_x) ** 2 + (umap_df['UMAP2'] - centroid_y) ** 2
) ** 0.5

# Visualise the UMAP embedding, colouring each point by its distance from the centroid
plt.figure(figsize=(10, 8))

plt.scatter(umap_df['UMAP1'], umap_df['UMAP2'],
            c=umap_df['distance_from_centroid'], cmap='viridis',
            alpha=0.6, s=25)

plt.colorbar(label='Distance from Centroid')

plt.title('VGG16 UMAP projection with Distance from Centroid (species)')

# Add centroid location
plt.scatter(centroid_x, centroid_y, color='red', s=100, label='Centroid', alpha=0.8)

# Show the legend so the 'Centroid' label set above is actually displayed
plt.legend()

# Select outliers: points further from the centroid than the threshold
# (choose a threshold based on your data)
outliers_threshold = 6  # Play around with this parameter to flag the farther-afield data points
outliers = umap_df[umap_df['distance_from_centroid'] > outliers_threshold]

# Optionally annotate outlier filenames on the plot (not suitable for large sample sizes)
#for i, row in outliers.iterrows():
#   plt.annotate(row['filename'], (row['UMAP1'], row['UMAP2']), textcoords="offset points", xytext=(0, 5), ha='center', fontsize=9)

# Plot and print outliers
plt.show()
print("Outliers:", outliers)

In [None]:
### Apply Isolation Forest on the UMAP-reduced features

# Set model params
iso_forest = IsolationForest(
    n_estimators=200,
    contamination=0.05,  # Play around with this to add or remove outliers (0.05-0.5)
    random_state=42,
)

# Fit on the 2D embedding and label every point in one step
outliers = iso_forest.fit_predict(umap_embedding)

# Rebuild the DataFrame from the UMAP results, then attach labels and filenames
umap_df = pd.DataFrame(umap_embedding, columns=['UMAP1', 'UMAP2']).assign(
    Outlier=outliers,  # -1 = outlier, 1 = inlier
    filename=file_names,
)

# Visualise: points coloured by their Isolation Forest label
plt.figure(figsize=(10, 8))
plt.scatter(
    umap_df['UMAP1'],
    umap_df['UMAP2'],
    c=umap_df['Outlier'],
    cmap='coolwarm',
    alpha=0.6,
    s=25,
)
plt.colorbar(label='Outlier (1: Inlier, -1: Outlier)')
plt.title('VGG16 UMAP Projection with Isolation Forest Outliers (species)')
plt.show()

In [None]:
## 3D plotly interactive plot, isolation forest

# 3D UMAP: same settings as the 2D projection, but with three output components
reducer_3d = umap.UMAP(
    n_components=3,
    metric='euclidean',
    min_dist=0.2,
    random_state=42,
)
umap_3d = reducer_3d.fit_transform(features_norm)

# Isolation Forest fitted on the 3D embedding
iso_forest = IsolationForest(n_estimators=200, contamination=0.05, random_state=42)
outliers = iso_forest.fit_predict(umap_3d)

# DF with coordinates, numeric outlier flag and source filename
umap_df = pd.DataFrame(umap_3d, columns=['UMAP1', 'UMAP2', 'UMAP3']).assign(
    Outlier=outliers,  # -1 = outlier, 1 = inlier
    filename=file_names,
)

# Map the numeric flag to a readable string label used for colouring
label_by_flag = {1: 'Inlier', -1: 'Outlier'}
umap_df['Outlier_Label'] = umap_df['Outlier'].map(label_by_flag)

# Plot: interactive 3D scatter, filename shown on hover
colour_by_label = {'Inlier': 'blue', 'Outlier': 'red'}
fig = px.scatter_3d(
    umap_df,
    x='UMAP1',
    y='UMAP2',
    z='UMAP3',
    color='Outlier_Label',
    color_discrete_map=colour_by_label,
    hover_name='filename',
    title='VGG16 3D UMAP Projection with Isolation Forest Outliers (species)',
)

marker_style = dict(size=4, opacity=0.7)
fig.update_traces(marker=marker_style)
fig.show()