# PCA analysis of VGGish features
Follow the VGGish script to extract features from each set of recordings and sites. For each 0.96 s of audio, a 128-feature embedding will have been extracted. These relate to various features in the spectrogram.

To model this data, dimensionality reduction is required. In prerequisite studies (UMAP vs PCA), UMAP excelled at visualisation. However, it performed badly in modelling, and the model residuals were all over the place. PCA to three components captured the majority of the variation, and the model tests were much better aligned.

Once you have all the VGGish features sorted, with time and date, into separate CSV files, move on to this notebook.

Here we first combine the VGGish features into one DataFrame, then drop the non-numeric columns, carry out the PCA, and finally add the non-numeric columns back on.

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Function to read all mean CSV files and combine them into a single DataFrame
def combine_mean_csv_files(folder_path):
    """Read every ``*_mean.csv`` file in *folder_path* into one DataFrame.

    Files are read in sorted filename order so that the row order of the
    result is reproducible; ``os.listdir`` returns entries in arbitrary
    order.

    Args:
        folder_path: Directory to scan. Sub-directories are not searched.

    Returns:
        A DataFrame holding the rows of all matching files stacked on top
        of each other, with a fresh 0..n-1 index.

    Raises:
        ValueError: If no file name in the folder ends with ``_mean.csv``.
    """
    mean_files = sorted(
        name for name in os.listdir(folder_path) if name.endswith('_mean.csv')
    )

    # Fail with an explicit message instead of pandas' generic
    # "No objects to concatenate" (same exception type as before).
    if not mean_files:
        raise ValueError(f"No '*_mean.csv' files found in {folder_path!r}")

    frames = [pd.read_csv(os.path.join(folder_path, name)) for name in mean_files]

    # ignore_index=True discards the per-file row indices
    return pd.concat(frames, ignore_index=True)

# Folder holding the '*_mean.csv' files to combine
mean_folder_path = '/mean_folder/'

# Merge every mean CSV file into one big DataFrame
big_df = combine_mean_csv_files(mean_folder_path)

# Print the combined table as a quick sanity check
print(big_df)

# Make sure the destination folder exists before writing to it
output_folder = '/mean_combined/'
os.makedirs(output_folder, exist_ok=True)

# Build the destination path. This assignment was previously commented out,
# which made the to_csv call below fail with a NameError.
output_file_path = os.path.join(output_folder, 'combined_mean_values_10min.csv')

# Write the combined DataFrame without the row index
big_df.to_csv(output_file_path, index=False)

In [None]:
# Order the rows by site
big_df = big_df.sort_values('SiteID')

# Remove the non-numeric metadata columns; df_1 is what gets passed to the
# scaler and PCA
df_1 = big_df.drop(['date', 'time', 'SiteID', 'recording'], axis=1)

In [None]:
# Scree plot: run PCA with 10 components on the standardised columns of df_1
# and plot the share of variance explained by each component.

# Standardize the data
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df_1)

# Define the number of components for PCA
n_components = 10

# Create a PCA instance (fixed random_state for reproducibility)
pca = PCA(n_components=n_components, random_state=42)

# Fit and transform the data using PCA
reduced_data = pca.fit_transform(scaled_data)

# Fraction of the total variance explained by each component
explained_variance = pca.explained_variance_ratio_

# Create a scree plot with customizations
plt.figure(figsize=(8, 6))
plt.plot(range(1, n_components + 1), explained_variance, marker='o', linestyle='-', color='b')
plt.xlabel('Number of Components', fontsize=14)
plt.ylabel('Explained Variance Ratio', fontsize=14)
plt.title('Scree Plot of VGGish Principal components analysis', fontsize=14)
plt.xticks(np.arange(1, n_components + 1), fontsize=12)
plt.yticks(fontsize=12)
plt.grid(True, linestyle='--', alpha=0.7)
plt.tight_layout()

# Save BEFORE showing: plt.show() can release the current figure (the
# notebook inline backend does), so saving afterwards may write a blank image.
plt.savefig('scree_plot.png', dpi=300)

plt.show()

In [None]:
# Final reduction: project the standardised columns of df_1 onto three
# principal components.

# Keep three components; random_state is fixed for reproducibility
n_components = 3
pca = PCA(n_components=n_components, random_state=42)

# Rescale df_1 with a fresh StandardScaler
scaler = StandardScaler()
standardized_data = scaler.fit_transform(df_1)

# Fit the PCA and project the data in a single step
reduced_data = pca.fit_transform(standardized_data)

In [None]:
# Wrap the three PCA outputs in a DataFrame and re-attach the non-numeric
# columns from big_df. `.values` copies them positionally, ignoring the
# (sorted) index of big_df.
df = pd.DataFrame(
    reduced_data, columns=['VGGish_1', 'VGGish_2', 'VGGish_3']
).assign(
    Site_code=big_df['SiteID'].values,
    Date=big_df['date'].values,
    Time=big_df['time'].values,
    Recording=big_df['recording'].values,
)

In [None]:
# Scatter plot of the first two principal components, coloured by site, with
# each site's centroid drawn on top of the point clouds.
unique_labels = df['Site_code'].unique()
color_map = plt.get_cmap('tab20')
num_colors = len(unique_labels)

# One colour per site, spread evenly over the colormap
colors = color_map(np.linspace(0, 1, num_colors))

# Tie each site to its colour explicitly. unique() lists sites in order of
# appearance while groupby() sorts them, so matching colours by position is
# only correct when df happens to be sorted by site.
site_colors = dict(zip(unique_labels, colors))

for label in unique_labels:
    label_data = df[df['Site_code'] == label]
    plt.scatter(label_data['VGGish_1'], label_data['VGGish_2'],
                color=site_colors[label], label=label, s=0.5)

# Calculate centroid coordinates (mean of each component) for each site
site_centroids = df.groupby('Site_code')[['VGGish_1', 'VGGish_2']].mean()
centroids = site_centroids.values

# Plot the centroid markers on top, using the colour of the matching site
for label, centroid in zip(site_centroids.index, centroids):
    plt.scatter(centroid[0], centroid[1],
                color=site_colors[label], marker='o',
                edgecolor='black', linewidth=1, s=50)

# Adding axis labels
plt.xlabel('VGGish_1', size=15)
plt.ylabel('VGGish_2', size=15)

# Panel caption placed in the top-left corner, in axes coordinates
plt.text(0.03, 0.97, 'Site (PCA)', transform=plt.gca().transAxes,
         fontsize=15, verticalalignment='top')


# Save before showing so the written image contains the figure
plt.savefig('/Site_PCA.png', dpi=300, bbox_inches='tight')

# Display the plot
plt.show()

In [None]:
# Save the PCA components together with the site/date/time/recording columns
# to CSV, omitting the row index
df.to_csv('/PCA_10min.csv', index=False)