In [1]:
# Mount Google Drive so the notebook can read the project folder stored there.
from google.colab import drive

drive_mount_point = '/content/drive'
# force_remount=True requests a fresh mount even if one is already present.
drive.mount(drive_mount_point, force_remount=True)

Mounted at /content/drive


# Audio Transformer

Resources:
- https://towardsdatascience.com/audio-deep-learning-made-simple-sound-classification-step-by-step-cebc936bbe5
- https://renumics.com/blog/how-to-fine-tune-the-audio-spectrogram-transformer

In [3]:
import os
import json
import pandas as pd
# Project folder on the mounted Google Drive; every data file lives under it.
data_folder = '/content/drive/MyDrive/DS565_Project/'

# metadata.json: mapping keyed by utterance name (see the lookup further down).
metadata_path = os.path.join(data_folder, 'metadata.json')
with open(metadata_path, 'r') as f_in:
  metadata = json.load(f_in)

# Read the three pre-made splits in a fixed order: train, val, test.
train, val, test = (
    pd.read_csv(os.path.join(data_folder, csv_name))
    for csv_name in ('train.csv', 'val.csv', 'test.csv')
)

# Report the size of the metadata followed by the size of each split.
split_sizes = [len(train), len(val), len(test)]
print(len(metadata), *split_sizes)

# The splits must add up to the number of metadata entries.
assert len(metadata) == sum(split_sizes)

10039 6023 2008 2008


In [4]:
# Preview the first five rows of the training split.
train.head(n=5)

Unnamed: 0,name,emotion
0,Ses04F_script02_2_F009,ang
1,Ses04M_script01_2_F005,ang
2,Ses05F_impro03_F038,xxx
3,Ses04F_script02_2_F021,sad
4,Ses04M_script01_1_M019,fru


# Demo

Run the pipeline end to end on a single dummy example (the first row of the training split).

In [6]:
# Take the first training row as the demo example.
# The columns are selected by label so the result is always [name, emotion],
# even if the CSV was saved with an extra index column (e.g. 'Unnamed: 0')
# in front of them; a positional `:` slice would then put an integer in
# dummy[0] and break the metadata lookup that uses it as a key.
dummy = train.loc[0, ['name', 'emotion']].tolist()
dummy

['Ses04F_script02_2_F009', 'ang']

In [7]:
# The first entry of `dummy` is the utterance name, which keys into `metadata`.
utterance_name = dummy[0]
dummy_meta = metadata[utterance_name]
dummy_meta

['[98.2800 - 102.1400]', 'ang', 'Session4', 'Ses04F_script02_2']

In [8]:
from scipy.io import wavfile
# Root of the IEMOCAP release inside the project folder.
iemocap = os.path.join(data_folder, 'IEMOCAP_full_release')

# File layout: <session>/sentences/wav/<dialog>/<utterance>.wav, where the
# session and dialog folder names are entries 2 and 3 of the metadata record.
session_dir = dummy_meta[2]
dialog_dir = dummy_meta[3]
wav_filename = dummy[0] + '.wav'
dummy_wav = os.path.join(iemocap, session_dir, 'sentences', 'wav', dialog_dir, wav_filename)

# Sanity check that the path resolves to a readable wav file;
# scipy returns a (sample_rate, samples) pair.
wavfile.read(dummy_wav)

(16000, array([ 146,  134,  125, ...,  551,   70, -484], dtype=int16))

## Load audio with torchaudio for use in models

> https://pytorch.org/audio/main/generated/torchaudio.transforms.Spectrogram.html



In [9]:
!pip install torch torchaudio torchvision



In [10]:
import torchaudio

# Decode the demo utterance; returns the waveform tensor and its sampling rate.
waveform, sample_rate = torchaudio.load(dummy_wav, normalize=True)

# Build the mel filterbank from the file's actual sampling rate instead of
# relying on MelSpectrogram's implicit default (16000). The demo file is
# 16 kHz, so the result is unchanged here, but audio at any other rate would
# otherwise get a mis-scaled mel axis without any error.
transform = torchaudio.transforms.MelSpectrogram(sample_rate=sample_rate, n_fft=800)
spectrogram = transform(waveform)
spectrogram.shape

torch.Size([1, 128, 155])

In [11]:
# Size of the waveform once singleton dimensions (the channel axis) are dropped.
waveform.squeeze().size()

torch.Size([61760])

## Demo model predictions
> https://huggingface.co/docs/transformers/en/model_doc/audio-spectrogram-transformer

In [12]:
!pip install transformers
!pip install datasets

Collecting datasets
  Downloading datasets-3.0.2-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.2-py3-none-any.whl (472 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m472.7/472.7 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading 

In [13]:
from transformers import AutoFeatureExtractor, ASTForAudioClassification

# Hugging Face Hub id of the pretrained Audio Spectrogram Transformer checkpoint;
# the feature extractor and the classifier must come from the same checkpoint.
ast_checkpoint = "MIT/ast-finetuned-audioset-10-10-0.4593"

feature_extractor = AutoFeatureExtractor.from_pretrained(ast_checkpoint)
model = ASTForAudioClassification.from_pretrained(ast_checkpoint)

# Display the module tree of the loaded model.
model

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/297 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

ASTForAudioClassification(
  (audio_spectrogram_transformer): ASTModel(
    (embeddings): ASTEmbeddings(
      (patch_embeddings): ASTPatchEmbeddings(
        (projection): Conv2d(1, 768, kernel_size=(16, 16), stride=(10, 10))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ASTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ASTLayer(
          (attention): ASTSdpaAttention(
            (attention): ASTSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ASTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ASTIntermediate(
       

In [14]:
import torch
# Convert the raw waveform (channel axis squeezed away) into model inputs.
# The extractor is given the file's sampling rate so it can check it against
# the rate the checkpoint was trained with.
inputs = feature_extractor(waveform.squeeze(), sampling_rate=sample_rate, return_tensors="pt")

# Inference only: no gradients are needed for the prediction.
with torch.no_grad():
    logits = model(**inputs).logits

# Highest-scoring class id -> human-readable label from the model config.
predicted_class_ids = torch.argmax(logits, dim=-1).item()
predicted_label = model.config.id2label[predicted_class_ids]
print(predicted_label)

# Compute a demo loss against an arbitrary target: the checkpoint's class 0.
# This is one of the pretrained checkpoint's own labels, not an emotion label.
target_label = model.config.id2label[0]
inputs["labels"] = torch.tensor([model.config.label2id[target_label]])

# The loss is only inspected, never back-propagated, so this forward pass
# also runs without building an autograd graph.
with torch.no_grad():
    loss = model(**inputs).loss
round(loss.item(), 2)

Speech


0.15