## Prepare SALVE data for training. 

In [2]:
import os
import pandas as pd
import numpy as np
import soundfile as sf
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import src.salve as salve
import pathlib
import src.soundscape_analysis as ssa
from concurrent.futures import ThreadPoolExecutor

DATASET_DIR = pathlib.Path("../data/SALVE001")
TABLES_DIR = pathlib.Path("../data/tables")

  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


Looking into the raw audio recordings, there is a clear DC offset, which might potenitally lead to misleading analysis results and may even cause issues while training the models. Therefore we will remove it by subtracting the mean of each file.

Also, the [recommendations for training a RAVE model](https://forum.ircam.fr/article/detail/training-rave-models-on-custom-data/) suggest to normalize low amplitude data, which we will do. To stay consistent within the dataset we will not normalize each recording individually, but instead normalize it with reference to the max amplitude within the whole dataset.

In [28]:
# Use an example audio file from the raw data to visualise the DC offset

y, sr = sf.read(DATASET_DIR / "raw" / "S4A08684_20190508_071500.wav")
# Downsample for visualisation
y = y[::1000]
fig = px.line()
fig.add_scatter(x=np.linspace(0, len(y) / (sr // 1000), len(y)),y=y, mode="lines", showlegend=False)
fig.add_scatter(x=[0, len(y) / (sr // 1000)], y=[np.mean(y)] * 2, mode="lines", showlegend=False)

fig.show()


In [39]:
def remove_dc_offset_and_calculate_absmax(filename):
    y, sr = sf.read(filename)
    y = y - np.mean(y) # Remove DC offset
    sf.write(DATASET_DIR / filename.name, data=y, samplerate=sr)
    return np.max(np.abs(y)) # Calculate abs max
    
    
raw_files = list((DATASET_DIR / "raw").glob("*.wav"))
with ThreadPoolExecutor(8) as pool:
    result = list(pool.map(remove_dc_offset_and_calculate_absmax, raw_files))
    
# After removing the DDC offset it may be, that clipings in the raw audio could result 
# in max values bigger than 1. However we want to ignore those cases for the normalization,
# as we can consider the information of those samples as lost information anyways and we
# don't want the overall amplitude to be reduced additionally.
total_max = min(np.max(result), 1)
print("Maximal absolute amplitude of the full dataset:", total_max)


def normalize_with_total_max(filename):
    y, sr = sf.read(filename)
    y = y / total_max # Normalize
    # Override pervious file with normalized one
    sf.write(filename, data=y, samplerate=sr) 
    
files = DATASET_DIR.glob("*.wav")
with ThreadPoolExecutor(8) as pool:
    pool.map(normalize_with_total_max, files)

Maximal absolute amplitude of the full dataset: 1


In [40]:
# Visualise previous example but normalised and with removed DC offset

y, sr = sf.read(DATASET_DIR / "S4A08684_20190508_071500.wav")
# Downsample for visualisation
y = y[::1000]
fig = px.line()
fig.add_scatter(x=np.linspace(0, len(y) / (sr // 1000), len(y)),y=y, mode="lines", showlegend=False)
fig.add_scatter(x=[0, len(y) / (sr // 1000)], y=[np.mean(y)] * 2, mode="lines", showlegend=False)

fig.show()


Now we extract some meta data from the audio files. For the SALVE dataset some information like the datetime can be derived from the audio file names, which will be referred as the Recording_IDs. Other information like samplerate can be derived from the file headers. 

In [41]:
# Create initial dataframe based on dataset.
df = salve.read_metadata_from_directory(DATASET_DIR)
os.makedirs(TABLES_DIR, exist_ok=True)
df.to_json(TABLES_DIR / "SALVE001_meta.json", orient="records", default_handler=str)

In [42]:
# Extend dataframe with some meta information, that requires a read of the file header

with ThreadPoolExecutor(8) as pool:
    result = pool.map(ssa.get_file_info, df["Path"])
    
df_meta = pd.DataFrame(result)
df = df.set_index("Recording_ID").join(df_meta.set_index("Recording_ID"))


In [43]:
# Extract date and day hours from recording datetimes
df["Date"] = pd.to_datetime(
    df["Datetime"], format=salve.SALVE_DATETIME_FORMAT
).dt.date

df["Dayhour"] = pd.to_datetime(
    df["Datetime"], format=salve.SALVE_DATETIME_FORMAT
).dt.hour

In [44]:
# Calculate bioacoustic indices

with ThreadPoolExecutor(8) as pool:
    result = pool.map(ssa.get_bioindex_suite, df["Path"])

df_indices = pd.DataFrame(result)
df_indices.to_json(TABLES_DIR / "SALVE001_BAI.json", orient="records", default_handler=str)
df_indices


Unnamed: 0,Recording_ID,NP,Bio,NDSI,ACI,ADI,AEI,Ht,Hf,H
0,S4A09093_20190507_074400.wav,7,61.035393,-0.930256,296.593855,1.792245,0.774731,0.959884,0.283144,0.271785
1,S4A09106_20190508_192000.wav,8,12.353062,-0.883136,296.307642,2.981320,0.091794,0.982501,0.106528,0.104664
2,S4A09093_20190506_162600.wav,6,32.592310,-0.939901,295.662628,1.174805,0.878583,0.881205,0.275963,0.243180
3,S4A09106_20190507_100900.wav,31,118.970918,-0.989708,366.805988,0.191948,0.965323,0.799960,0.462418,0.369916
4,S4A08684_20190507_175300.wav,20,17.736899,-0.985142,297.557323,0.258188,0.958015,0.854084,0.356205,0.304229
...,...,...,...,...,...,...,...,...,...,...
395,S4A08684_20190507_234100.wav,6,3.606127,-0.832509,295.943092,2.981106,0.078974,0.957387,0.055460,0.053096
396,S4A09106_20190508_140100.wav,11,60.419587,-0.599987,304.011092,2.331363,0.577582,0.967990,0.145403,0.140749
397,S4A09093_20190507_224300.wav,15,66.796376,-0.856710,302.395515,1.807190,0.767718,0.911650,0.239363,0.218215
398,S4A08697_20190507_054800.wav,17,39.851829,-0.428520,299.365315,1.978506,0.723382,0.970530,0.148787,0.144403


As the SALVE dataset is not a prepared benchmark dataset, we need to split it into train and test data. As a sample strategy we assume, that the quality of soundscapes differs based on daytimes as well as on the recording locations. (We may even want to consider the weekend days as special case as there might be less traffic noise in some locations compared to workdays. However, in the current version this is not part of the sampling strategy.) 

For the following we can assume that different recording locations are represented by different Device_IDs. This is true for the dataset used in this example but may not be true for other datasets.

In [45]:
BIOACOUSTICS_INDEX_LABELS = ("NP", "Bio", "NDSI", "ACI", "ADI", "AEI", "Ht", "Hf", "H")

# Merge meta informations and calculated indices in a single table
df_joined = df.join(df_indices.set_index("Recording_ID")).sort_values("Datetime")

# Plot bioacoustic indices over Dayhours for a single location
device_id = "S4A08697"
df_single_location = df_joined.loc[df_joined["Device_ID"] == device_id]
fig = make_subplots(rows=9, cols=1, horizontal_spacing=0.01, vertical_spacing=0.02)
for i, index in enumerate(BIOACOUSTICS_INDEX_LABELS):
    fig.add_trace(
        go.Box(x=df_single_location["Dayhour"], y=df_single_location[index], name=index),
        row=i + 1,
        col=1
    )
fig.update_layout(height=1000, title="Bioacoustic Indices over Dayhours (single location)")
fig.show()

# Plot ADI as an example for differences between locations
fig2 = px.box(df_joined, x="Device_ID", y="ADI")
fig2.update_layout(title="Acoustic Diversity Index (ADI) for different recording locations.")
fig2.show()

As the plots show, the assumption, that the quality of soundscapes might vary depending on daytime as well as on recording location seems to be right at least for some indices. 

Therefore we want to sample test data equally distributed over daytime and locations. To achieve a good estimation, we will form daytime categories that represent different acoustic daytime qualities:
1. night (21:00 - 02:59)
2. morning (03:00 - 8:59)
3. noon (09:00 - 14:59)
4. evening (15:00 - 20:59)

In [46]:
# Add daytime categories to table

def daytime_category(dayhour):
    if 3 < dayhour < 9:
        return "morning"
    elif 9 < dayhour < 15:
        return "noon"
    elif 15 < dayhour < 21:
        return "evening"
    else:
        return "night"
    

df["Daytime_cat"] = df_joined["Dayhour"].apply(daytime_category)
df

Unnamed: 0_level_0,Device_ID,Datetime,Path,Channels,Samplerate,Length_sec,Date,Dayhour,Daytime_cat
Recording_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
S4A09093_20190507_074400.wav,S4A09093,20190507_074400,../data/SALVE001/S4A09093_20190507_074400.wav,1,44100,179.989478,2019-05-07,7,morning
S4A09106_20190508_192000.wav,S4A09106,20190508_192000,../data/SALVE001/S4A09106_20190508_192000.wav,1,44100,179.989478,2019-05-08,19,evening
S4A09093_20190506_162600.wav,S4A09093,20190506_162600,../data/SALVE001/S4A09093_20190506_162600.wav,1,44100,179.989478,2019-05-06,16,evening
S4A09106_20190507_100900.wav,S4A09106,20190507_100900,../data/SALVE001/S4A09106_20190507_100900.wav,1,44100,179.989478,2019-05-07,10,noon
S4A08684_20190507_175300.wav,S4A08684,20190507_175300,../data/SALVE001/S4A08684_20190507_175300.wav,1,44100,179.989478,2019-05-07,17,evening
...,...,...,...,...,...,...,...,...,...
S4A08684_20190507_234100.wav,S4A08684,20190507_234100,../data/SALVE001/S4A08684_20190507_234100.wav,1,44100,179.989478,2019-05-07,23,night
S4A09106_20190508_140100.wav,S4A09106,20190508_140100,../data/SALVE001/S4A09106_20190508_140100.wav,1,44100,179.989478,2019-05-08,14,noon
S4A09093_20190507_224300.wav,S4A09093,20190507_224300,../data/SALVE001/S4A09093_20190507_224300.wav,1,44100,179.989478,2019-05-07,22,night
S4A08697_20190507_054800.wav,S4A08697,20190507_054800,../data/SALVE001/S4A08697_20190507_054800.wav,1,44100,179.989478,2019-05-07,5,morning


In [49]:
# Mark one audio file per location per datetime category for each date as testdata
np.random.seed(1)
df["Is_testdata"] = False

# Group by location which is in this case represented by the Device_ID
for device, df_location in df.groupby("Device_ID"):
    # Additionally group by Date
    for date, df_date in df_location.groupby("Date"):
        # For each date and each location sample one recording per daytime category
        for daytime_cat, df_daytime in df_date.groupby("Daytime_cat"):
            df.loc[df_daytime.sample(2, replace=True).index, "Is_testdata"] = True
            
# Compute the percentage of test data
testdata_n = len(df[df["Is_testdata"]])
print(f"{testdata_n}/{len(df)} files were marked as testdata.")
print(f"That is {testdata_n / len(df) * 100}% of the total data.")


75/400 files were marked as testdata.
That is 18.75% of the total data.


In [50]:
fig = px.scatter_3d(df, x="Date", y="Dayhour", z="Device_ID", color="Is_testdata")
fig.update_layout(title="Test data sampling")
fig.show()

Now we need to separate the actual data based on the markers in the metadata, because the RAVE CLI requires a directory as test data input.

In [51]:
import os
import shutil

# Create separate train and test folders
train_path = DATASET_DIR / "train"
test_path = DATASET_DIR / "test"
os.makedirs(train_path, exist_ok=True)
os.makedirs(test_path, exist_ok=True)

# Move files based on their Is_testdata flag and update their path in the table
for recording_id, entry in df.iterrows():
    new_path = test_path / recording_id if entry["Is_testdata"] else train_path / recording_id
    shutil.move(entry["Path"], new_path)
    entry["Path"] = new_path
        
# Save updated table
df.to_json(TABLES_DIR / "SALVE001_meta.json", orient="records", default_handler=str)
        

We can now run the RAVE preprocessor on the training data to generate a train dataset.