# Mean calculation
Follow the VGGish script to extract features from each set of recordings and sites. For each 0.96 s of audio, a 128-feature embedding will have been extracted. These features relate to various characteristics of the spectrogram.

For 120 hours of recordings, this gives ~450,000 rows (0.96 s per row).

This script first splits out the date and time into the required format.
Then, for each 10-minute window (625 rows), the mean is calculated.

In [None]:
import os
import pandas as pd

# Function to process each file
def process_file(file_path, window_rows=625):
    """Average the numeric columns of a feature CSV over fixed-size row windows.

    The CSV must contain ``date``, ``time``, ``SiteID`` and ``recording``
    columns. ``date`` is parsed as ``%Y%m%d`` and ``time`` as ``%H%M%S``
    (``time`` is zero-padded to six digits first, so ``93000`` is read as
    ``09:30:00``). Rows whose timestamp cannot be parsed are coerced to
    ``NaT`` instead of raising.

    Windows are formed purely by row position: rows ``0..window_rows-1`` are
    window 0, the next ``window_rows`` rows are window 1, and so on. The last
    window may hold fewer rows. Each window is labelled with the first
    non-null ``date``, ``time``, ``SiteID`` and ``recording`` found in it.

    Args:
        file_path: Path of the CSV file to read.
        window_rows: Number of consecutive rows averaged into one output row.
            The default of 625 corresponds to 10 minutes at 0.96 s per row.

    Returns:
        A DataFrame with one row per window: the mean of every numeric
        column, followed by ``date`` (``YYYY-MM-DD``), ``time``
        (``HH:MM:SS``), ``SiteID`` and ``recording``.

    Raises:
        ValueError: If ``window_rows`` is less than 1.
    """
    if window_rows < 1:
        raise ValueError("window_rows must be a positive integer")

    # Read the CSV file into a DataFrame (default RangeIndex, used below
    # as the row position for windowing)
    df = pd.read_csv(file_path)

    # Ensure time column is always 6 digits by zero-padding; a time read as
    # the integer 93000 would otherwise lose its leading zero
    df['time'] = df['time'].astype(str).str.zfill(6)

    # Convert date and time columns to datetime
    df['datetime'] = pd.to_datetime(
        df['date'].astype(str) + df['time'],
        format='%Y%m%d%H%M%S',
        errors='coerce',
    )

    # Replace the raw date/time columns with formatted strings
    df['date'] = df['datetime'].dt.date.astype(str)
    df['time'] = df['datetime'].dt.time.astype(str)

    # Only numeric columns are averaged; the helper/label columns are
    # excluded explicitly so a numeric SiteID or recording is never averaged
    numeric_cols = (
        df.drop(['datetime', 'SiteID', 'recording'], axis=1)
        .select_dtypes(include=['number'])
        .columns
    )

    # Build the positional windows once and reuse them for every aggregate
    # NOTE(review): windows ignore recording boundaries and actual
    # timestamps; confirm this is intended when files hold several recordings
    windows = df.groupby(df.index // window_rows)

    mean_values = windows[numeric_cols].mean()

    # Add back the date, time, SiteID, and recording columns (first value of
    # each window); these share the window index with mean_values
    for label in ('date', 'time', 'SiteID', 'recording'):
        mean_values[label] = windows[label].first()

    # Reset index and return DataFrame
    return mean_values.reset_index(drop=True)

# Input folder of CSV files to process and output folder for the averaged
# results; the output folder is created if it does not exist yet
folder_path = '/input/'
output_folder = '/output/'
os.makedirs(output_folder, exist_ok=True)

# Walk the input folder, skipping every entry that is not a CSV file
for filename in os.listdir(folder_path):
    if not filename.endswith('.csv'):
        continue

    # Compute the windowed means for this file
    file_path = os.path.join(folder_path, filename)
    mean_df = process_file(file_path)

    # Name the output after the input (extension removed) plus "_mean.csv"
    clean_filename = os.path.splitext(filename)[0]
    output_path = os.path.join(output_folder, f'{clean_filename}_mean.csv')
    mean_df.to_csv(output_path, index=False)