In [122]:
import torch
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device} for training.")


Using cpu for training.


In [2]:
import librosa

In [111]:
from transformers import ASTForAudioClassification, ASTFeatureExtractor, ASTConfig
# Modify the classifier layer to match your task
import torch.nn as nn

# Load the feature extractor
model_name = "MIT/ast-finetuned-audioset-10-10-0.4593"
feature_extractor = ASTFeatureExtractor.from_pretrained(model_name)

config = ASTConfig.from_pretrained(model_name)

config.num_labels = 8  # Change the number of labels to match your task

# Load AST model but ignore mismatched sizes
model = ASTForAudioClassification.from_pretrained(
    model_name,
    config=config,
    ignore_mismatched_sizes=True
)

model_input_name = feature_extractor.model_input_names[0]

model.init_weights()
model.classifier = nn.Linear(model.config.hidden_size, 8)  # Replace classifier for 8 outputs
model.init_weights()
model.to(device) 


print("Model loaded successfully with modified classifier!")

Some weights of ASTForAudioClassification were not initialized from the model checkpoint at MIT/ast-finetuned-audioset-10-10-0.4593 and are newly initialized because the shapes did not match:
- classifier.dense.bias: found shape torch.Size([527]) in the checkpoint and torch.Size([8]) in the model instantiated
- classifier.dense.weight: found shape torch.Size([527, 768]) in the checkpoint and torch.Size([8, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded successfully with modified classifier!


In [112]:
model_input_name

'input_values'

In [113]:
import pandas as pd
import numpy as np
import os

# Load metadata CSV (assuming dataset includes a CSV file linking audio to perceptual attributes)
metadata = pd.read_excel(r"C:\Users\pepij\Downloads\noorderplantsoen.xlsx")

# Preview dataset
(metadata.head())

metadata["audio_path"] = metadata["GroupID"].apply(lambda x: r"C:\Users\pepij\OneDrive - Delft University of Technology\THESIS\data\WAV_Groningen_1\WAV_Groningen_1\Noorderplantsoen\NP" + x[2:] + ".wav")

# Keep only rows where the file exists
metadata = metadata[metadata["audio_path"].apply(os.path.exists)]

# Reset index after filtering
metadata.reset_index(drop=True, inplace=True)

In [114]:
metadata = metadata[['GroupID', 'pleasant', 'chaotic', 'vibrant', 'uneventful', 'calm', 'annoying', 'eventful', 'monotonous', 'audio_path']]

columns_to_convert = [
    "pleasant", "chaotic", "vibrant", "uneventful", 
    "calm", "annoying", "eventful", "monotonous"
]

metadata[columns_to_convert] = metadata[columns_to_convert].astype(float)

In [115]:
import soundfile as sf

def preprocess_audio(audio_path):
    waveform, sr = librosa.load(audio_path, sr=16000)  # Resample to 16kHz
    # Explicitly normalize waveform
    waveform = waveform / max(abs(waveform))  # Ensure range [-1,1]
    features = feature_extractor(waveform, sampling_rate=sr, return_tensors="pt")
    return features.input_values.squeeze(0)

# Apply preprocessing to dataset
metadata["input_features"] = metadata["audio_path"].apply(preprocess_audio)


In [116]:
def preprocess_audio(batch):
    # Extract waveform data from dataset
    wavs = [audio["array"] for audio in batch["input_values"]]  

    # Convert waveforms into spectrograms using AST's feature extractor
    inputs = feature_extractor(wavs, sampling_rate=16000, return_tensors="pt", padding=True)

    # Return formatted output with correct labels
    output_batch = {
        "input_values": inputs.input_values,  # Spectrograms as input
        "labels": batch["labels"],  # Keep labels unchanged
    }
    return output_batch


In [117]:
# Rename the audio column if necessary
hf_dataset = hf_dataset.rename_column("input_features", "input_values")

# Apply preprocessing to the dataset
hf_dataset.set_transform(preprocess_audio)


ValueError: Original column name input_features not in the dataset. Current columns in the dataset: ['pleasant', 'chaotic', 'vibrant', 'uneventful', 'calm', 'annoying', 'eventful', 'monotonous', 'input_values']

In [118]:
from torch.utils.data import Dataset, DataLoader

class SoundscapeDataset(Dataset):
    def __init__(self, metadata):
        self.metadata = metadata

    def __len__(self):
        return len(self.metadata)

    def __getitem__(self, idx):
        # Extract features
        features = self.metadata.iloc[idx]["input_features"]

        # Convert labels to float32 (fixes the error)
        labels = self.metadata.iloc[idx][["pleasant", "vibrant", "eventful", "chaotic", 
                                          "annoying", "monotonous", "uneventful", "calm"]].astype(float).values
        labels = torch.tensor(labels, dtype=torch.float32)  # Convert to tensor

        return features, labels


# Create dataset
dataset = SoundscapeDataset(metadata)

# Split into training & validation
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)


In [121]:
dataset.__init__

<bound method SoundscapeDataset.__init__ of <__main__.SoundscapeDataset object at 0x0000023B9EB8ACD0>>

In [8]:
# import torch.optim as optim
# import torch.nn as nn

# criterion = nn.MSELoss()  # Regression loss function
# optimizer = optim.AdamW(model.parameters(), lr=1e-5)  # Use AdamW optimizer


In [9]:
# epochs = 10  # Adjust based on performance

# for epoch in range(epochs):
#     model.train()
#     total_loss = 0

#     for batch in train_loader:
#         inputs, targets = batch
#         inputs, targets = inputs.to(device), targets.to(device)

#         optimizer.zero_grad()
#         outputs = model(inputs).logits  # Forward pass
#         loss = criterion(outputs, targets)  # Compute loss

#         loss.backward()  # Backpropagation
#         optimizer.step()  # Update weights

#         total_loss += loss.item()

#     print(f"Epoch {epoch+1}/{epochs}, Training Loss: {total_loss/len(train_loader)}")


In [85]:
from transformers import TrainingArguments

# Configure training arguments
training_args = TrainingArguments(
    output_dir="./runs/ast_regressor",
    logging_dir="./logs/ast_regressor",
    report_to="tensorboard",
    learning_rate=5e-5,  # Learning rate
    push_to_hub=False,
    num_train_epochs=3,  # Number of epochs
    per_device_train_batch_size=8,  # Batch size
    evaluation_strategy="epoch",  # Evaluate after each epoch
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="rmse",
    logging_strategy="steps",
    logging_steps=20,
    remove_unused_columns=False,
)




In [86]:
import evaluate
import numpy as np

mse = evaluate.load("mse")

def compute_metrics(eval_pred):
    logits = eval_pred.predictions
    labels = eval_pred.label_ids

    mse_value = mse.compute(predictions=logits, references=labels)["mse"]
    rmse_value = np.sqrt(mse_value)  # Compute RMSE

    return {"mse": mse_value, "rmse": rmse_value}


In [89]:
# Convert Subset objects back to DataFrame
train_df = metadata.iloc[train_dataset.indices].reset_index(drop=True)
val_df = metadata.iloc[val_dataset.indices].reset_index(drop=True)

display(train_df.head())

# Ensure input_features is a list instead of a tensor
train_df["input_features"] = train_df["input_features"].apply(lambda x: x.tolist() if isinstance(x, torch.Tensor) else x)
val_df["input_features"] = val_df["input_features"].apply(lambda x: x.tolist() if isinstance(x, torch.Tensor) else x)


Unnamed: 0,GroupID,pleasant,chaotic,vibrant,uneventful,calm,annoying,eventful,monotonous,audio_path,input_features
0,NP102,4.0,1.0,4.0,3.0,4.0,1.0,4.0,1.0,C:\Users\pepij\OneDrive - Delft University of ...,"[[tensor(-0.7477), tensor(-1.0871), tensor(-0...."
1,NP136,4.0,1.0,5.0,2.0,4.0,1.0,4.0,2.0,C:\Users\pepij\OneDrive - Delft University of ...,"[[tensor(-1.1476), tensor(-1.2776), tensor(-0...."
2,NP153,4.0,2.0,5.0,1.0,1.0,1.0,5.0,1.0,C:\Users\pepij\OneDrive - Delft University of ...,"[[tensor(-1.0547), tensor(-1.2776), tensor(-1...."
3,NP146,4.0,4.0,5.0,1.0,4.0,3.0,5.0,1.0,C:\Users\pepij\OneDrive - Delft University of ...,"[[tensor(-0.9345), tensor(-1.2776), tensor(-0...."
4,NP116,5.0,4.0,4.0,1.0,4.0,1.0,4.0,1.0,C:\Users\pepij\OneDrive - Delft University of ...,"[[tensor(-0.9793), tensor(-1.2480), tensor(-0...."


In [90]:
train_df

Unnamed: 0,GroupID,pleasant,chaotic,vibrant,uneventful,calm,annoying,eventful,monotonous,audio_path,input_features
0,NP102,4.0,1.0,4.0,3.0,4.0,1.0,4.0,1.0,C:\Users\pepij\OneDrive - Delft University of ...,"[[-0.7477487325668335, -1.0871058702468872, -0..."
1,NP136,4.0,1.0,5.0,2.0,4.0,1.0,4.0,2.0,C:\Users\pepij\OneDrive - Delft University of ...,"[[-1.1475975513458252, -1.2775938510894775, -0..."
2,NP153,4.0,2.0,5.0,1.0,1.0,1.0,5.0,1.0,C:\Users\pepij\OneDrive - Delft University of ...,"[[-1.0547019243240356, -1.2775938510894775, -1..."
3,NP146,4.0,4.0,5.0,1.0,4.0,3.0,5.0,1.0,C:\Users\pepij\OneDrive - Delft University of ...,"[[-0.9345070123672485, -1.2775938510894775, -0..."
4,NP116,5.0,4.0,4.0,1.0,4.0,1.0,4.0,1.0,C:\Users\pepij\OneDrive - Delft University of ...,"[[-0.9792704582214355, -1.248040795326233, -0...."
...,...,...,...,...,...,...,...,...,...,...,...
57,NP155,4.0,3.0,5.0,1.0,2.0,2.0,4.0,1.0,C:\Users\pepij\OneDrive - Delft University of ...,"[[-0.9310088753700256, -1.273632287979126, -0...."
58,NP140,4.0,4.0,5.0,2.0,4.0,2.0,4.0,1.0,C:\Users\pepij\OneDrive - Delft University of ...,"[[-1.061864972114563, -1.1639467477798462, -0...."
59,NP154,3.0,4.0,5.0,1.0,4.0,1.0,4.0,1.0,C:\Users\pepij\OneDrive - Delft University of ...,"[[-1.0211067199707031, -1.2775938510894775, -0..."
60,NP133,3.0,3.0,4.0,3.0,2.0,2.0,3.0,2.0,C:\Users\pepij\OneDrive - Delft University of ...,"[[-0.9808656573295593, -1.2775938510894775, -0..."


In [91]:
from datasets import Dataset, DatasetDict

# Convert to Hugging Face Dataset format
hf_dataset = DatasetDict({
    "train": Dataset.from_pandas(train_df),
    "test": Dataset.from_pandas(val_df),
})

# Remove unnecessary columns (adjust based on your dataset)
hf_dataset = hf_dataset.remove_columns(["audio_path", "GroupID"])


In [92]:
# Convert `input_features` from lists to PyTorch tensors
def convert_to_tensor(example):
    example["input_features"] = torch.tensor(example["input_features"], dtype=torch.float32)
    return example

hf_dataset = hf_dataset.map(convert_to_tensor)


Map: 100%|██████████| 62/62 [00:04<00:00, 12.81 examples/s]
Map: 100%|██████████| 16/16 [00:01<00:00, 12.53 examples/s]


In [None]:
from torch.utils.data import default_collate

# Custom data collator for AST
def data_collator(features):
    input_values = torch.stack([torch.tensor(f["input_features"], dtype=torch.float32) for f in features])
    labels = torch.stack([torch.tensor([f[col] for col in ["pleasant", "vibrant", "eventful", 
                                                           "chaotic", "annoying", "monotonous", 
                                                           "uneventful", "calm"]], dtype=torch.float32) 
                          for f in features])
    return {"input_values": input_values, "labels": labels}


In [93]:
from transformers import Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=hf_dataset["train"],
    eval_dataset=hf_dataset["test"],
    compute_metrics=compute_metrics,
    # data_collator=data_collator  # Use the corrected collator
)


In [94]:
trainer.train()

TypeError: ASTForAudioClassification.forward() got an unexpected keyword argument 'pleasant'

In [70]:
hf_dataset

DatasetDict({
    train: Dataset({
        features: ['pleasant', 'chaotic', 'vibrant', 'uneventful', 'calm', 'annoying', 'eventful', 'monotonous', 'input_features'],
        num_rows: 62
    })
    test: Dataset({
        features: ['pleasant', 'chaotic', 'vibrant', 'uneventful', 'calm', 'annoying', 'eventful', 'monotonous', 'input_features'],
        num_rows: 16
    })
})

In [75]:
hf_dataset["test"].features['pleasant']

Value(dtype='float64', id=None)

In [77]:
hf_dataset["test"].features['vibrant']

Value(dtype='float64', id=None)