## Prepare data for training. 

In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import src.salve as salve
import pathlib
import src.soundscape_analysis as ssa
from concurrent.futures import ThreadPoolExecutor

DATASET_DIR = pathlib.Path("../data/SALVE001")
TABLES_DIR = pathlib.Path("../data/tables")

In [2]:
# Create initial dataframe based on dataset.
df = salve.read_metadata_from_directory(DATASET_DIR)
df.to_json(TABLES_DIR / "SALVE001_meta.json", orient="records", default_handler=str)

In [3]:
# Extend dataframe with some meta information, that requires a read of the file header

with ThreadPoolExecutor() as pool:
    result = pool.map(ssa.get_file_info, df["Path"])
    
df_meta = pd.DataFrame(result)
df = df.set_index("Recording_ID").join(df_meta.set_index("Recording_ID"))


In [4]:
# Extract date and day hours from recording datetimes
df["Date"] = pd.to_datetime(
    df["Datetime"], format=salve.SALVE_DATETIME_FORMAT
).dt.date

df["Dayhour"] = pd.to_datetime(
    df["Datetime"], format=salve.SALVE_DATETIME_FORMAT
).dt.hour

In [5]:
# Calculate bioacoustic indices

with ThreadPoolExecutor(4) as pool:
    result = pool.map(ssa.get_bioindex_suite, df["Path"])

df_indices = pd.DataFrame(result)
df_indices.to_json(TABLES_DIR / "SALVE001_BAI.json", orient="records", default_handler=str)
df_indices


Unnamed: 0,Recording_ID,NP,Bio,NDSI,ACI,ADI,AEI,Ht,Hf,H
0,S4A08697_20190506_103800.wav,10,23.779898,-0.943948,297.596900,1.168205,0.880533,0.999503,0.189659,0.189565
1,S4A08684_20190506_175300.wav,3,2.493341,-0.942478,296.083631,0.252353,0.953850,0.999980,0.095842,0.095840
2,S4A09093_20190507_064600.wav,7,84.855497,-0.950118,295.763878,1.175899,0.879668,0.998875,0.278172,0.277860
3,S4A09106_20190507_113600.wav,30,32.061527,-0.805253,328.523564,0.595408,0.929116,0.914141,0.389377,0.355946
4,S4A09106_20190508_133200.wav,12,60.987764,-0.383617,309.005403,2.254969,0.601929,0.999495,0.110658,0.110602
...,...,...,...,...,...,...,...,...,...,...
395,S4A08684_20190506_234100.wav,2,2.667609,-0.910363,295.899920,2.888787,0.198693,0.999988,0.070796,0.070795
396,S4A09106_20190509_035200.wav,14,53.823545,0.125425,298.990579,2.117843,0.665236,0.999794,0.097639,0.097619
397,S4A09093_20190506_162600.wav,6,32.592310,-0.939901,295.662628,1.174805,0.878583,0.999704,0.275963,0.275881
398,S4A08697_20190507_005800.wav,18,31.296283,-0.689004,302.175361,2.443275,0.557261,0.999714,0.164733,0.164686


As the SALVE dataset is not a prepared benchmark dataset, we need to split it into train and test data. As a sample strategy we assume, that the quality of soundscapes differs based on daytimes as well as on the recording locations. (We may even want to consider the weekend days as special case as there might be less traffic noise in some locations compared to workdays. However, in the current version this is not part of the sampling strategy.) 

For the following we can assume that different recording locations are represented by different Device_IDs. This is true for the dataset used in this example but may not be true for other datasets.

In [6]:
BIOACOUSTICS_INDEX_LABELS = ("NP", "Bio", "NDSI", "ACI", "ADI", "AEI", "Ht", "Hf", "H")

# Merge meta informations and calculated indices in a single table
df_joined = df.join(df_indices.set_index("Recording_ID")).sort_values("Datetime")

# Plot bioacoustic indices over Dayhours for a single location
device_id = "S4A08697"
df_single_location = df_joined.loc[df_joined["Device_ID"] == device_id]
fig = make_subplots(rows=9, cols=1, horizontal_spacing=0.01, vertical_spacing=0.02)
for i, index in enumerate(BIOACOUSTICS_INDEX_LABELS):
    fig.add_trace(
        go.Box(x=df_single_location["Dayhour"], y=df_single_location[index], name=index),
        row=i + 1,
        col=1
    )
fig.update_layout(height=1000, title="Bioacoustic Indices over Dayhours (single location)")
fig.show()

# Plot ADI as an example for differences between locations
fig2 = px.box(df_joined, x="Device_ID", y="ADI")
fig2.update_layout(title="Acoustic Diversity Index (ADI) for different recording locations.")
fig2.show()

As the plots show, the assumption, that the quality of soundscapes might vary depending on daytime as well as on recording location seems to be right at least for some indices. 

Therefore we want to sample test data equally distributed over daytime and locations. To achieve a good estimation, we will form daytime categories that represent different acoustic daytime qualities:
1. night (21:00 - 02:59)
2. morning (03:00 - 8:59)
3. noon (09:00 - 14:59)
4. evening (15:00 - 20:59)

In [7]:
# Add daytime categories to table

def daytime_category(dayhour):
    if 3 < dayhour < 9:
        return "morning"
    elif 9 < dayhour < 15:
        return "noon"
    elif 15 < dayhour < 21:
        return "evening"
    else:
        return "night"
    

df["Daytime_cat"] = df_joined["Dayhour"].apply(daytime_category)
df

Unnamed: 0_level_0,Device_ID,Datetime,Path,Channels,Samplerate,Length_sec,Date,Dayhour,Daytime_cat
Recording_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
S4A08697_20190506_103800.wav,S4A08697,20190506_103800,../data/SALVE001/S4A08697_20190506_103800.wav,1,44100,179.989478,2019-05-06,10,noon
S4A08684_20190506_175300.wav,S4A08684,20190506_175300,../data/SALVE001/S4A08684_20190506_175300.wav,1,44100,179.989478,2019-05-06,17,evening
S4A09093_20190507_064600.wav,S4A09093,20190507_064600,../data/SALVE001/S4A09093_20190507_064600.wav,1,44100,179.989478,2019-05-07,6,morning
S4A09106_20190507_113600.wav,S4A09106,20190507_113600,../data/SALVE001/S4A09106_20190507_113600.wav,1,44100,179.989478,2019-05-07,11,noon
S4A09106_20190508_133200.wav,S4A09106,20190508_133200,../data/SALVE001/S4A09106_20190508_133200.wav,1,44100,179.989478,2019-05-08,13,noon
...,...,...,...,...,...,...,...,...,...
S4A08684_20190506_234100.wav,S4A08684,20190506_234100,../data/SALVE001/S4A08684_20190506_234100.wav,1,44100,179.989478,2019-05-06,23,night
S4A09106_20190509_035200.wav,S4A09106,20190509_035200,../data/SALVE001/S4A09106_20190509_035200.wav,1,44100,179.989478,2019-05-09,3,night
S4A09093_20190506_162600.wav,S4A09093,20190506_162600,../data/SALVE001/S4A09093_20190506_162600.wav,1,44100,179.989478,2019-05-06,16,evening
S4A08697_20190507_005800.wav,S4A08697,20190507_005800,../data/SALVE001/S4A08697_20190507_005800.wav,1,44100,179.989478,2019-05-07,0,night


In [8]:
# Mark one audio file per location per datetime category for each date as testdata
np.random.seed(1)
df["Is_testdata"] = False

# Group by location which is in this case represented by the Device_ID
for device, df_location in df.groupby("Device_ID"):
    # Additionally group by Date
    for date, df_date in df_location.groupby("Date"):
        # For each date and each location sample one recording per daytime category
        for daytime_cat, df_daytime in df_date.groupby("Daytime_cat"):
            df.loc[df_daytime.sample(2, replace=True).index, "Is_testdata"] = True
            
# Compute the percentage of test data
testdata_n = len(df[df["Is_testdata"]])
print(f"{testdata_n}/{len(df)} files were marked as testdata.")
print(f"That is {testdata_n / len(df) * 100}% of the total data.")


75/400 files were marked as testdata.
That is 18.75% of the total data.


In [9]:
fig = px.scatter_3d(df, x="Date", y="Dayhour", z="Device_ID", color="Is_testdata")
fig.update_layout(title="Test data sampling")
fig.show()

Now we need to separate the actual data based on the markers in the metadata, because the RAVE CLI requires a directory as test data input.

In [15]:
import os
import shutil

# Create separate train and test folders
train_path = DATASET_DIR / "train"
test_path = DATASET_DIR / "test"
os.makedirs(train_path, exist_ok=True)
os.makedirs(test_path, exist_ok=True)

# Move files based on their Is_testdata flag and update their path in the table
for recording_id, entry in df.iterrows():
    new_path = test_path / recording_id if entry["Is_testdata"] else train_path / recording_id
    shutil.move(entry["Path"], new_path)
    entry["Path"] = new_path
        
# Save updated table
df.to_json(TABLES_DIR / "SALVE001_meta.json", orient="records", default_handler=str)
        