In [1]:
!pip install -q datasets[audio]==2.16.0 transformers==4.37.2 wandb

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cudf 23.8.0 requires cubinlinker, which is not installed.
cudf 23.8.0 requires cupy-cuda11x>=12.0.0, which is not installed.
cudf 23.8.0 requires ptxcompiler, which is not installed.
cuml 23.8.0 requires cupy-cuda11x>=12.0.0, which is not installed.
dask-cudf 23.8.0 requires cupy-cuda11x>=12.0.0, which is not installed.
cudf 23.8.0 requires cuda-python<12.0a0,>=11.7.1, but you have cuda-python 12.3.0 which is incompatible.
cudf 23.8.0 requires pandas<1.6.0dev0,>=1.3, but you have pandas 2.1.4 which is incompatible.
cudf 23.8.0 requires protobuf<5,>=4.21, but you have protobuf 3.20.3 which is incompatible.
cuml 23.8.0 requires dask==2023.7.1, but you have dask 2024.1.0 which is incompatible.
cuml 23.8.0 requires distributed==2023.7.1, but you have distributed 2024.1.0 which is incompatible.
dask-cud

In [2]:
from datasets import load_dataset, Dataset, Audio, DatasetDict
from transformers import AutoFeatureExtractor, ASTModel, ASTForAudioClassification, Trainer, TrainingArguments
from sklearn.model_selection import StratifiedShuffleSplit
import os
import torch
import torchaudio
import numpy as np

2024-02-07 21:27:16.491699: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-07 21:27:16.491813: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-07 21:27:16.627746: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
#os.environ["WANDB_PROJECT"]=
os.environ["WANDB_LOG_MODEL"]="true"
os.environ["WANDB_WATCH"]="false"
#os.environ["WANDB_API_KEY"]= hidden for public

### Creating a label dictionary for the model

In [4]:
def label_dict(path):
    folder_names = sorted(os.listdir(path))

    id2label = {i: label for i, label in enumerate(folder_names)}
    label2id = {label: i for i, label in enumerate(folder_names)}

    return id2label, label2id

model_name = 'MIT/ast-finetuned-audioset-10-10-0.4593'  
data_path = '/kaggle/input/mallaudio-v2/MallProjectAudio_v2'  
num_classes = len(os.listdir(data_path))

id2label, label2id = label_dict(data_path)

### Use stratified shuffle split to ensure that every class is present equally

In [5]:
labels, file_paths = zip(*[(class_name, os.path.join(data_path, class_name, filename))
                           for class_name in os.listdir(data_path)
                           for filename in os.listdir(os.path.join(data_path, class_name))
                           if filename.endswith(".wav")])

sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)  # 80% train, 20% val
train_index, val_index = next(sss.split(file_paths, labels))

train_file_paths = [file_paths[i] for i in train_index]
train_labels = [labels[i] for i in train_index]
val_file_paths = [file_paths[i] for i in val_index]
val_labels = [labels[i] for i in val_index]

### Creating of a DatasetDict. Cast column with Audio to ensure the sample rate is 1600 and audio is processed into arrays

In [6]:
train_dataset = Dataset.from_dict({"audio": train_file_paths, 'label': [label2id[x] for x in train_labels]}).cast_column("audio", Audio(sampling_rate=16000))
val_dataset = Dataset.from_dict({"audio": val_file_paths, 'label': [label2id[x] for x in val_labels]}).cast_column("audio", Audio(sampling_rate=16000))
dataset_dict = DatasetDict({"train": train_dataset, "val": val_dataset})

In [7]:
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['audio', 'label'],
        num_rows: 4324
    })
    val: Dataset({
        features: ['audio', 'label'],
        num_rows: 1082
    })
})

classifier): ASTMLPHead(
    (layernorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dense): Linear(in_features=768, out_features=527, bias=True)
  )

### Custom class for Audio Spectrogram Transformer for own number of classes

In [8]:
from transformers import AutoModel, AutoConfig
import torch.nn as nn

class ASTCustomModel(nn.Module):
    def __init__(self, base_model_name, num_classes):
        super(ASTCustomModel, self).__init__()

        # Load the base model configuration
        config = AutoConfig.from_pretrained(base_model_name)

        # Load the base model
        self.base_model = AutoModel.from_pretrained(base_model_name, config=config)

        # Add your custom classifier head with layer normalization
        self.classifier_head = nn.Sequential(
            nn.LayerNorm(768, eps=1e-12),  # Add layer normalization
            nn.Linear(768, num_classes)
        )

    def forward(self, input_values, labels=None):
        # Forward pass through the base model
        outputs = self.base_model(input_values=input_values)

        # Extract the last hidden states
        last_hidden_states = outputs.last_hidden_state

        # You may use the entire sequence or just the [CLS] token embeddings for classification
        # Here, we use the [CLS] token
        cls_token = last_hidden_states[:, 0, :]

        # Forward pass through the classifier head
        logits = self.classifier_head(cls_token)
        
        loss = None
        if labels is not None:
            loss_fn = nn.CrossEntropyLoss()
            loss = loss_fn(logits, labels)
        
        return loss, logits

In [9]:
model = ASTCustomModel(model_name, num_classes)
feature_extractor = AutoFeatureExtractor.from_pretrained(model_name)

config.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/297 [00:00<?, ?B/s]

In [10]:
model

ASTCustomModel(
  (base_model): ASTModel(
    (embeddings): ASTEmbeddings(
      (patch_embeddings): ASTPatchEmbeddings(
        (projection): Conv2d(1, 768, kernel_size=(16, 16), stride=(10, 10))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ASTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ASTLayer(
          (attention): ASTAttention(
            (attention): ASTSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ASTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ASTIntermediate(
            (dense): Linear(in_features=768, 

### Extract the features (log-Mel spectrogram) and encode the dataset into tensors compatible with the model

In [11]:
def preprocess_function(examples):
    audio_arrays = [x["array"] for x in examples["audio"]]
    inputs = feature_extractor(
        audio_arrays, 
        sampling_rate=feature_extractor.sampling_rate,
        return_tensors='pt'
    )
    return inputs

In [12]:
encoded_dataset = dataset_dict.map(preprocess_function, remove_columns=['audio'], batched=True)
encoded_dataset

Map:   0%|          | 0/4324 [00:00<?, ? examples/s]

Map:   0%|          | 0/1082 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'input_values'],
        num_rows: 4324
    })
    val: Dataset({
        features: ['label', 'input_values'],
        num_rows: 1082
    })
})

In [13]:
training_args = TrainingArguments(
    output_dir='results',
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=4,
    num_train_epochs=15,
    warmup_ratio=0.1,
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    fp16=True,
    save_total_limit=3
)

In [14]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)
    return {'accuracy': acc, 
            'f1': f1, 
            'precision': precision, 
            'recall': recall}

### Custom data collator to apply data augmentations

In [15]:
from transformers import DefaultDataCollator
import torchaudio.transforms as T
import torch

class AudioSpectrogramDataCollator(DefaultDataCollator):
    def __init__(self, freq_mask_time=100, time_mask_num=10):
        super(AudioSpectrogramDataCollator, self).__init__()

        self.freq_mask_time = freq_mask_time
        self.time_mask_num = time_mask_num

    def mask_spec(self, spec):
        # Apply frequency masking
        f_mask = T.FrequencyMasking(freq_mask_param=self.freq_mask_time)
        spec = f_mask(spec)

        # Apply time masking
        t_mask = T.TimeMasking(time_mask_param=self.time_mask_num)
        spec = t_mask(spec)

        return spec

    def collate_batch(self, features):
        batch = super().collate_batch(features)

        # Assume input_values is already a PyTorch tensor (mel spectrogram)
        audio_input = batch["input_values"]

        # Apply data augmentation
        augmented_spec = self.mask_spec(audio_input)

        # Update the batch with augmented mel spectrogram
        batch["input_values"] = augmented_spec

        return batch
    
data_collator = AudioSpectrogramDataCollator(freq_mask_time=100, time_mask_num=10)


In [16]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset['train'],
    eval_dataset=encoded_dataset['val'],
    #tokenizer=feature_extractor,
    compute_metrics=compute_metrics,
    data_collator=data_collator
)

In [17]:
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mluftpro[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: wandb version 0.16.3 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade
[34m[1mwandb[0m: Tracking run with wandb version 0.16.2
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20240207_213005-lk0ami5p[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mpeachy-surf-7[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/luftpro/mall_audio_recognition_AST[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/luftpro/mall_audio_recognition_AST/runs/lk0ami5p[0m


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
0,5.5741,5.438179,0.019409,0.00571,0.005973,0.019409
1,4.0709,3.930888,0.270795,0.234094,0.31447,0.270795
2,2.196,2.495474,0.695933,0.678846,0.753954,0.695933
4,0.3127,0.995134,0.899261,0.896782,0.917678,0.899261
5,0.0951,0.664833,0.926063,0.922616,0.93769,0.926063
6,0.022,0.499493,0.924214,0.921334,0.936103,0.924214
8,0.0463,0.296999,0.941774,0.940018,0.952888,0.941774
9,0.0162,0.264577,0.937153,0.935873,0.94991,0.937153
10,0.0003,0.247182,0.944547,0.943172,0.95518,0.944547
12,0.0045,0.227717,0.942699,0.941345,0.953689,0.942699


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


NameError: name 'wandb' is not defined

In [None]:
trainer.evaluate()

In [None]:
import gc
torch.cuda.empty_cache()
gc.collect()