In [29]:
from datasets import Dataset, Audio, ClassLabel, Features, load_dataset
import pandas as pd
import numpy as np
import os
import torchaudio
import torch.nn as nn
import torch

In [None]:
# Load metadata CSV (assuming dataset includes a CSV file linking audio to perceptual attributes)
metadata = pd.read_excel(r"C:\Users\pepij\Downloads\noorderplantsoen.xlsx")

metadata["audio_path"] = metadata["GroupID"].apply(lambda x: r"C:\Users\pepij\OneDrive - Delft University of Technology\THESIS\data\WAV_Groningen_1\WAV_Groningen_1\Noorderplantsoen\NP" + x[2:] + ".wav")

# Keep only rows where the file exists
metadata = metadata[metadata["audio_path"].apply(os.path.exists)]

# Reset index after filtering
metadata.reset_index(drop=True, inplace=True)

metadata = metadata[['GroupID', 'pleasant', 'chaotic', 'vibrant', 'uneventful', 'calm', 'annoying', 'eventful', 'monotonous', 'audio_path']]

columns_to_convert = [
    "pleasant", "chaotic", "vibrant", "uneventful", 
    "calm", "annoying", "eventful", "monotonous"
]

metadata[columns_to_convert] = metadata[columns_to_convert].astype(float).values

# Before training — scale your labels
means = metadata[columns_to_convert].mean()
stds = metadata[columns_to_convert].std()

metadata[columns_to_convert] = (metadata[columns_to_convert] - means) / stds


Unnamed: 0,LocationID,SessionID,GroupID,RecordID,start_time,end_time,latitude,longitude,Language,Survey_Version,...,RA_cp90,RA_cp95,THD_THD,THD_Min,THD_Max,THD_L5,THD_L10,THD_L50,THD_L90,THD_L95
0,Noorderplantsoen,Noorderplantsoen1,NP101,2,2020-03-11 08:54:00,2020-03-11 09:04:00,,,nld,nldSSIDv1,...,198.0,152.0,-6.0,-1312.0,5543.0,2294.0,1909.0,533.0,-993.0,-1104.0
1,Noorderplantsoen,Noorderplantsoen1,NP101,73,2020-03-13 00:49:00,2020-03-13 00:51:00,,,nld,nldSSIDv1,...,198.0,152.0,-6.0,-1312.0,5543.0,2294.0,1909.0,533.0,-993.0,-1104.0
2,Noorderplantsoen,Noorderplantsoen1,NP102,88,2020-03-13 12:04:00,2020-03-13 12:08:00,,,nld,nldSSIDv1,...,295.0,23.0,-275.0,-1402.0,6462.0,3921.0,323.0,1115.0,-1188.0,-126.0
3,Noorderplantsoen,Noorderplantsoen1,NP103,89,2020-03-13 12:12:00,2020-03-13 12:14:00,,,nld,nldSSIDv1,...,,,,,,,,,,
4,Noorderplantsoen,Noorderplantsoen1,NP106,98,2020-03-13 13:25:00,2020-03-13 13:32:00,,,nld,nldSSIDv1,...,257.0,203.0,-624.0,-737.0,6889.0,2914.0,2397.0,969.0,-352.0,-447.0


In [97]:
metadata

Unnamed: 0,GroupID,pleasant,chaotic,vibrant,uneventful,calm,annoying,eventful,monotonous,audio_path
0,NP101,1.095060,-0.569061,-0.454905,-0.081937,-0.177602,-0.937851,-0.933510,-0.731724,C:\Users\pepij\OneDrive - Delft University of ...
1,NP101,1.095060,-0.569061,-0.454905,-0.994947,0.688207,-0.937851,0.204205,-0.731724,C:\Users\pepij\OneDrive - Delft University of ...
2,NP102,-0.059192,-1.439390,-0.454905,0.831073,0.688207,-0.937851,0.204205,-0.731724,C:\Users\pepij\OneDrive - Delft University of ...
3,NP106,1.095060,0.301268,-0.454905,1.744084,-0.177602,1.066324,0.204205,0.457327,C:\Users\pepij\OneDrive - Delft University of ...
4,NP107,1.095060,0.301268,-0.454905,0.831073,0.688207,1.066324,-0.933510,-0.731724,C:\Users\pepij\OneDrive - Delft University of ...
...,...,...,...,...,...,...,...,...,...,...
73,NP158,1.095060,-0.569061,0.812330,-0.081937,0.688207,-0.937851,0.204205,-0.731724,C:\Users\pepij\OneDrive - Delft University of ...
74,NP160,1.095060,-1.439390,-1.722139,0.831073,0.688207,0.064236,-2.071226,-0.731724,C:\Users\pepij\OneDrive - Delft University of ...
75,NP160,-0.059192,0.301268,0.812330,-0.994947,0.688207,0.064236,0.204205,0.457327,C:\Users\pepij\OneDrive - Delft University of ...
76,NP161,-0.059192,0.301268,0.812330,-0.994947,0.688207,1.066324,1.341921,-0.731724,C:\Users\pepij\OneDrive - Delft University of ...


In [54]:
import numpy as np
from datasets import Audio, ClassLabel
from transformers import ASTFeatureExtractor


In [55]:
def load_audio(row):
    waveform, sr = torchaudio.load(row['audio_path'])
    return {
        'filename': f"{row['GroupID']}.wav",
        'labels': {k: row[k] for k in ['pleasant', 'chaotic', 'vibrant', 'uneventful', 'calm', 'annoying', 'eventful', 'monotonous']},
        # 'labels': [
        #     row['pleasant'], row['chaotic'], row['vibrant'], row['uneventful'],
        #     row['calm'], row['annoying'], row['eventful'], row['monotonous']
        # ],
        'audio': {
            'path': row['audio_path'],
            'array': waveform.squeeze().numpy().reshape(-1),
            'sampling_rate': sr
        }
    }

# Step 1: Apply transformation to entire DataFrame
records = metadata.apply(load_audio, axis=1)

# Step 2: Apply to rows and collect results as list of dicts
records = [load_audio(row) for _, row in metadata.iterrows()]

# Step 3: Create a new DataFrame directly
new_df = pd.DataFrame(records)

In [56]:
dataset = Dataset.from_pandas(new_df)

In [57]:
dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))

In [58]:
dataset[0]

{'filename': 'NP101.wav',
 'labels': {'annoying': 1.0,
  'calm': 3.0,
  'chaotic': 2.0,
  'eventful': 3.0,
  'monotonous': 1.0,
  'pleasant': 5.0,
  'uneventful': 2.0,
  'vibrant': 4.0},
 'audio': {'path': None,
  'array': array([ 9.21832398e-05,  2.42332753e-04,  1.83250348e-04, ...,
         -2.55998340e-04, -3.29684844e-04,  0.00000000e+00]),
  'sampling_rate': 16000}}

In [59]:
dataset[0]['audio']['array'].shape

(988502,)

In [60]:
# dataset[0]
# De oude format als je eerst numpy array zou maken maar is dus niet nodig zie hierboven

In [61]:
# Define the pretrained model and instantiate the feature extractor
pretrained_model = "MIT/ast-finetuned-audioset-10-10-0.4593"
feature_extractor = ASTFeatureExtractor.from_pretrained(pretrained_model)
model_input_name = feature_extractor.model_input_names[0]
SAMPLING_RATE = feature_extractor.sampling_rate

In [62]:
label_keys = ["pleasant", "chaotic", "vibrant", "uneventful", "calm", "annoying", "eventful", "monotonous"]

# Preprocessing function
def preprocess_audio(batch):
    wavs = [audio["array"] for audio in batch["input_values"]]
    inputs = feature_extractor(wavs, sampling_rate=SAMPLING_RATE, return_tensors="pt")
    labels = [[row[k] for k in label_keys] for row in batch["labels"]]
    return {model_input_name: inputs[model_input_name], "labels": torch.tensor(labels, dtype=torch.float32)}


In [63]:
# split training data
if "test" not in dataset:
    dataset = dataset.train_test_split(
        test_size=0.2, shuffle=True, seed=0)

In [64]:
dataset = dataset.cast_column("audio", Audio(sampling_rate=feature_extractor.sampling_rate))
dataset = dataset.rename_column("audio", "input_values")

In [65]:
# Apply transforms
dataset["train"].set_transform(preprocess_audio, output_all_columns=False)
dataset["test"].set_transform(preprocess_audio, output_all_columns=False)

In [67]:
dataset['test'][0]

{'input_values': tensor([[-0.6395, -1.0403, -0.6634,  ..., -0.6391, -0.8450, -1.0111],
         [-0.6513, -0.9367, -0.5599,  ..., -0.6642, -0.8131, -0.9661],
         [-0.6521, -1.1107, -0.7339,  ..., -0.5819, -0.7898, -1.1639],
         ...,
         [-0.9587, -1.1572, -0.7804,  ..., -0.9773, -1.0858, -1.2776],
         [-0.9500, -1.2287, -0.8519,  ..., -0.9749, -1.1984, -1.2776],
         [-0.9315, -1.1641, -0.7873,  ..., -0.9093, -1.0075, -1.2776]]),
 'labels': tensor([4., 2., 3., 5., 2., 3., 4., 3.])}

In [70]:
import evaluate
from transformers import ASTConfig, ASTForAudioClassification, TrainingArguments, Trainer

In [71]:
# Load configuration from the pretrained model
config = ASTConfig.from_pretrained(pretrained_model)

# Modify the model's final layer for regression (8 outputs)
model = ASTForAudioClassification.from_pretrained(pretrained_model, config=config, ignore_mismatched_sizes=True)
model.classifier = nn.Linear(config.hidden_size, 8)
model.init_weights()

In [76]:
from transformers import Trainer
import torch.nn as nn

class RegressionTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss = nn.MSELoss()(logits, labels)
        return (loss, outputs) if return_outputs else loss



In [79]:
# Configure training arguments
training_args = TrainingArguments(
    output_dir="./runs/ast_regressor_test",
    logging_dir="./logs/ast_regressor_test",
    report_to="none",
    learning_rate=5e-2,  # Learning rate
    push_to_hub=False,
    num_train_epochs=3,  # Number of epochs
    per_device_train_batch_size=8,  # Batch size
    eval_strategy="epoch",  # Evaluate after each epoch
    save_strategy="no", # Set false to save time
    save_total_limit=2,
    load_best_model_at_end=False, # Set false to save time
    metric_for_best_model="rmse",
    logging_strategy="steps",
    logging_steps=5, # Decrease to save time
    remove_unused_columns=False,
)

In [81]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

def compute_metrics(eval_pred):
    logits = eval_pred.predictions
    labels = eval_pred.label_ids

    mse_value = mean_squared_error(labels, logits)
    rmse_value = np.sqrt(mse_value)
    mae_value = mean_absolute_error(labels, logits)
    r2_value = r2_score(labels, logits)

    return {
        "mse": mse_value,
        "rmse": rmse_value,
        "mae": mae_value,
        "r2": r2_value
    }


In [None]:
# OPTIONAL
# If you ever want per-attribute RMSE for better model diagnostics:

# def compute_metrics(eval_pred):
#     logits = eval_pred.predictions
#     labels = eval_pred.label_ids

#     metrics = {}
#     for i, name in enumerate(["pleasant", "chaotic", "vibrant", "uneventful", "calm", "annoying", "eventful", "monotonous"]):
#         mse = mean_squared_error(labels[:, i], logits[:, i])
#         rmse = np.sqrt(mse)
#         metrics[f"{name}_rmse"] = rmse

#     metrics["rmse"] = np.sqrt(mean_squared_error(labels, logits))
#     return metrics


In [82]:
trainer = RegressionTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    compute_metrics=compute_metrics,
)
trainer.train()


Epoch,Training Loss,Validation Loss,Mse,Rmse,Mae,R2
1,1372.2022,104.873871,104.873878,10.240795,9.324855,-109.490479


KeyboardInterrupt: 

In [95]:
trainer.predict(dataset["test"])

PredictionOutput(predictions=array([[ 7.0447845, 15.680937 ,  1.3311803, 11.641748 , 13.1720495,
        15.890659 , 13.096946 , 14.902811 ],
       [ 7.044776 , 15.680927 ,  1.3311661, 11.641735 , 13.172041 ,
        15.890648 , 13.096932 , 14.9028015],
       [ 7.044776 , 15.680927 ,  1.3311661, 11.641735 , 13.172041 ,
        15.890648 , 13.096932 , 14.9028015],
       [ 7.0447946, 15.680954 ,  1.3311999, 11.641763 , 13.172063 ,
        15.890678 , 13.096966 , 14.902826 ],
       [ 7.04479  , 15.680944 ,  1.3311898, 11.641752 , 13.172055 ,
        15.890673 , 13.096958 , 14.902819 ],
       [ 7.0448003, 15.6809635,  1.3312078, 11.64177  , 13.172068 ,
        15.89069  , 13.096977 , 14.902836 ],
       [ 7.044779 , 15.680928 ,  1.3311718, 11.641739 , 13.172044 ,
        15.890654 , 13.096937 , 14.902804 ],
       [ 7.0447917, 15.680948 ,  1.3311951, 11.641758 , 13.172059 ,
        15.890675 , 13.096962 , 14.902825 ],
       [ 7.0448046, 15.680967 ,  1.3312143, 11.641773 , 13.172072 ,