# In [24]:
from datasets import Dataset, Audio, ClassLabel, Features, load_dataset
import pandas as pd
import numpy as np
import os
import torchaudio
import torch.nn as nn
# Load the metadata workbook (an Excel file, read with pd.read_excel) that
# holds one row per recording together with its perceptual attribute ratings.
METADATA_XLSX = r"C:\Users\pepij\Downloads\noorderplantsoen.xlsx"
# Every audio path is this prefix + GroupID without its first two characters + ".wav".
AUDIO_PATH_PREFIX = r"C:\Users\pepij\OneDrive - Delft University of Technology\THESIS\data\WAV_Groningen_1\WAV_Groningen_1\Noorderplantsoen\NP"

metadata = pd.read_excel(METADATA_XLSX)

# Preview dataset
display(metadata.head())

metadata["audio_path"] = metadata["GroupID"].apply(
    lambda x: AUDIO_PATH_PREFIX + x[2:] + ".wav"
)

# The eight rating columns that are used as regression targets.
columns_to_convert = [
    "pleasant", "chaotic", "vibrant", "uneventful",
    "calm", "annoying", "eventful", "monotonous",
]

# Keep only rows whose audio file exists and only the columns used later.
# Selecting rows and columns in a single .loc call and then resetting the index
# yields an independent DataFrame, so the assignment below writes to a real
# frame rather than to a slice of the original (no SettingWithCopyWarning).
file_exists = metadata["audio_path"].apply(os.path.exists)
metadata = metadata.loc[
    file_exists, ["GroupID", *columns_to_convert, "audio_path"]
].reset_index(drop=True)

# Make sure all rating columns are floats.
metadata[columns_to_convert] = metadata[columns_to_convert].astype(float)
import numpy as np
from datasets import Audio, ClassLabel
from transformers import ASTFeatureExtractor

def load_audio(row):
    """Load the audio file of one metadata row and build a dataset record.

    Args:
        row: Mapping-like row (e.g. a pandas Series) providing ``GroupID``,
            ``audio_path`` and the eight rating columns.

    Returns:
        dict with keys ``filename`` (``"<GroupID>.wav"``), ``labels`` (rating
        name -> value) and ``audio`` (``path``, 1-D ``array`` and
        ``sampling_rate`` as reported by ``torchaudio.load``).
    """
    label_names = (
        'pleasant', 'chaotic', 'vibrant', 'uneventful',
        'calm', 'annoying', 'eventful', 'monotonous',
    )
    waveform, sr = torchaudio.load(row['audio_path'])

    # torchaudio returns a (channels, frames) tensor by default. Flattening it
    # with reshape(-1) would place the channels one after another for
    # multi-channel files, so average over the channel axis instead. For a
    # single-channel file the mean is the channel itself.
    if waveform.dim() > 1:
        waveform = waveform.mean(dim=0)

    return {
        'filename': f"{row['GroupID']}.wav",
        'labels': {name: row[name] for name in label_names},
        'audio': {
            'path': row['audio_path'],
            'array': waveform.numpy().reshape(-1),
            'sampling_rate': sr
        }
    }

# Step 1: Load every recording exactly once and collect the records as a list
# of dicts. (A preceding metadata.apply(load_audio, axis=1) produced a result
# that was overwritten straight away, so each file was read from disk twice.)
records = [load_audio(row) for _, row in metadata.iterrows()]

# Step 2: Build a DataFrame from the records and wrap it in a datasets.Dataset
new_df = pd.DataFrame(records)
dataset = Dataset.from_pandas(new_df)

# Step 3: Cast the "audio" column to an Audio feature at 16 kHz
dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))

# Notebook-style inspection of the first example
dataset[0]
dataset[0]['audio']['array'].shape
# The old format, for when you would first build a numpy array; that is not needed, see above.
# Checkpoint identifier used to instantiate the feature extractor
pretrained_model = "MIT/ast-finetuned-audioset-10-10-0.4593"
feature_extractor = ASTFeatureExtractor.from_pretrained(pretrained_model)

# Sampling rate the feature extractor reports; passed along on every extractor call
SAMPLING_RATE = feature_extractor.sampling_rate

# First entry of the extractor's model input names; used as the key for the
# extracted features in the preprocessing output
input_names = feature_extractor.model_input_names
model_input_name = input_names[0]
# Preprocessing function
def preprocess_audio(batch):
    """Turn a batch of decoded audio examples into model inputs plus labels.

    Args:
        batch: Mapping of column name -> list of values. The audio examples are
            read from the ``"input_values"`` column when present, otherwise
            from ``"audio"``; each example must expose an ``"array"`` entry.
            ``"labels"`` holds one entry per example.

    Returns:
        dict with the extracted features under ``model_input_name`` and
        ``"labels"`` as a list with one entry per example. Labels stored as
        dicts (name -> value) are converted to a list of floats in the dict's
        own key order; any other label is passed through unchanged.
    """
    # The audio column is only called "input_values" if it was renamed; fall
    # back to "audio" instead of raising KeyError when it was not.
    audio_column = "input_values" if "input_values" in batch else "audio"
    wavs = [audio["array"] for audio in batch[audio_column]]
    inputs = feature_extractor(wavs, sampling_rate=SAMPLING_RATE, return_tensors="pt")

    # A dict per example cannot be stacked into a numeric tensor, so flatten
    # dict labels into plain float vectors.
    labels = [
        [float(value) for value in label.values()] if isinstance(label, dict) else label
        for label in batch["labels"]
    ]
    return {model_input_name: inputs.get(model_input_name), "labels": labels}
# Split into train/test subsets (20 % test, shuffled with a fixed seed).
# For a plain Dataset the split is always needed. Checking isinstance first
# also avoids evaluating `"test" not in dataset` on a Dataset, which has no
# key lookup for `in` and would instead iterate over every row.
# A dict-like container is still checked for an existing "test" key.
if isinstance(dataset, Dataset) or "test" not in dataset:
    dataset = dataset.train_test_split(
        test_size=0.2, shuffle=True, seed=0)
dataset
# Apply the preprocessing on access; only the transform's outputs are returned
dataset["train"].set_transform(preprocess_audio, output_all_columns=False)
dataset["test"].set_transform(preprocess_audio, output_all_columns=False)
import evaluate
from transformers import ASTConfig, ASTForAudioClassification, TrainingArguments, Trainer
# Number of values predicted per clip (one per perceptual attribute).
num_targets = 8

# Load configuration from the pretrained model and declare the task as an
# 8-output regression. Without this the config keeps the checkpoint's label
# count and no problem type, so the loss used for float targets would be
# chosen as for a classification task rather than as a regression loss.
config = ASTConfig.from_pretrained(pretrained_model)
config.num_labels = num_targets
config.problem_type = "regression"

# Load the pretrained weights; the checkpoint's classification head has a
# different output size, hence ignore_mismatched_sizes=True.
model = ASTForAudioClassification.from_pretrained(pretrained_model, config=config, ignore_mismatched_sizes=True)

# Replace the head with a single linear layer producing the 8 outputs and
# initialise it.
model.classifier = nn.Linear(config.hidden_size, num_targets)
model.init_weights()
# Configure training arguments
training_args = TrainingArguments(
    output_dir="./runs/ast_regressor",
    logging_dir="./logs/ast_regressor",
    report_to="tensorboard",
    learning_rate=5e-5,  # Learning rate
    push_to_hub=False,
    num_train_epochs=3,  # Number of epochs
    per_device_train_batch_size=8,  # Batch size
    eval_strategy="epoch",  # Evaluate after each epoch
    save_strategy="epoch",  # Save on the same schedule as evaluation
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="rmse",
    # RMSE is an error: lower is better. Without this flag a metric whose name
    # does not end in "loss" is treated as higher-is-better, so the checkpoint
    # with the largest RMSE would be kept as "best".
    greater_is_better=False,
    logging_strategy="steps",
    logging_steps=20,
    # Keep all columns so the on-the-fly transform still sees its inputs
    remove_unused_columns=False,
)
# Load the "mse" metric from the `evaluate` library (default configuration).
mse = evaluate.load("mse")

def compute_metrics(eval_pred):
    """Compute MSE and RMSE between predictions and targets.

    The error is averaged over every element, so 1-D inputs and 2-D
    ``(n_examples, n_outputs)`` inputs are both supported.

    Args:
        eval_pred: Object with ``predictions`` (array-like, or a tuple whose
            first item is the prediction array) and ``label_ids`` (array-like
            of the same shape).

    Returns:
        dict with float entries ``"mse"`` and ``"rmse"``.

    Raises:
        ValueError: If predictions and labels do not have the same shape.
    """
    predictions = eval_pred.predictions
    if isinstance(predictions, tuple):
        # Some models return extra outputs next to the predictions.
        predictions = predictions[0]

    predictions = np.asarray(predictions, dtype=np.float64)
    references = np.asarray(eval_pred.label_ids, dtype=np.float64)

    # Refuse mismatched shapes instead of letting numpy broadcast them into a
    # meaningless element-wise difference.
    if predictions.shape != references.shape:
        raise ValueError(
            f"Shape mismatch: predictions {predictions.shape} vs labels {references.shape}"
        )

    mse_value = float(np.mean((predictions - references) ** 2))
    rmse_value = float(np.sqrt(mse_value))  # Compute RMSE

    return {"mse": mse_value, "rmse": rmse_value}