In [1]:
# so notebook can access the folder
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [13]:
!pip install torch torchaudio
!pip install datasets
!pip install pandas
!pip install transformers
!pip install evaluate
!pip install accelerate



# Finetune using train dataset

Resources:
- https://renumics.com/blog/how-to-fine-tune-the-audio-spectrogram-transformer#4-configure-and-initialize-the-ast-for-fine-tuning
- https://huggingface.co/docs/transformers/en/model_doc/audio-spectrogram-transformer

In [12]:
import os
import json
import pandas as pd
data_folder = '/content/drive/MyDrive/DS565_Project/'
iemocap = os.path.join(data_folder, 'IEMOCAP_full_release')

with open(os.path.join(data_folder, 'metadata.json'), 'r') as f_in:
  metadata = json.load(f_in)

train = pd.read_csv(os.path.join(data_folder, 'train.csv'))
val = pd.read_csv(os.path.join(data_folder, 'val.csv'))
test = pd.read_csv(os.path.join(data_folder, 'test.csv'))

print(
    len(metadata),
    len(train),
    len(val),
    len(test)
)

assert len(metadata) == len(train) + len(val) + len(test)

10039 6023 2008 2008


## Load dataset into current runtime (OUTDATED)

By loading dataset to current runtime, it makes loading the audio files much faster

In [8]:
"""
#THIS TAKES FOREVER
import shutil
import os

sessions = set([val[2] for val in metadata.values()])
for session in sessions:
  print(session)
  if not os.path.exists(session):
    os.makedirs(session)

  session_wav = os.path.join(iemocap, session, 'sentences', 'wav')
  shutil.copytree(session_wav, os.path.join(session, 'sentences', 'wav'))"""


Session1
Session2


KeyboardInterrupt: 

In [10]:
!tar -xzvf /content/drive/MyDrive/DS565_Project/IEMOCAP_full_release.tar.gz

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
./IEMOCAP_full_release/Session1/sentences/ForcedAlignment/Ses01F_script01_2/._Ses01F_script01_2_F002.phseg
./IEMOCAP_full_release/Session1/sentences/ForcedAlignment/Ses01F_script01_2/Ses01F_script01_2_F002.phseg
./IEMOCAP_full_release/Session1/sentences/ForcedAlignment/Ses01F_script01_2/._Ses01F_script01_2_F002.stseg
./IEMOCAP_full_release/Session1/sentences/ForcedAlignment/Ses01F_script01_2/Ses01F_script01_2_F002.stseg
./IEMOCAP_full_release/Session1/sentences/ForcedAlignment/Ses01F_script01_2/._Ses01F_script01_2_F002.syseg
./IEMOCAP_full_release/Session1/sentences/ForcedAlignment/Ses01F_script01_2/Ses01F_script01_2_F002.syseg
./IEMOCAP_full_release/Session1/sentences/ForcedAlignment/Ses01F_script01_2/._Ses01F_script01_2_F002.wdseg
./IEMOCAP_full_release/Session1/sentences/ForcedAlignment/Ses01F_script01_2/Ses01F_script01_2_F002.wdseg
./IEMOCAP_full_release/Session1/sentences/ForcedAlignment/Ses01F_script01_2/._Ses01F_sc

## Load custom dataset as huggingface Dataset Object

Since all the file names are not paths, we get the full paths

In [14]:
def get_wav_path(wav_name:str) -> str:
  wav_metadata = metadata[wav_name]

  session = wav_metadata[2]
  improv_script = wav_metadata[3]

  return os.path.join('/content/IEMOCAP_full_release', session, 'sentences', 'wav', improv_script, wav_name+'.wav')

train['path'] = train['name'].apply(lambda x: (get_wav_path(x)))

#for path in train['path']: assert os.path.exists(path)

train.head()

Unnamed: 0,name,emotion,path
0,Ses02M_impro08_M013,6,/content/IEMOCAP_full_release/Session2/sentenc...
1,Ses01M_impro02_M012,8,/content/IEMOCAP_full_release/Session1/sentenc...
2,Ses04F_script03_2_M020,10,/content/IEMOCAP_full_release/Session4/sentenc...
3,Ses04F_script01_1_F042,0,/content/IEMOCAP_full_release/Session4/sentenc...
4,Ses05M_script03_2_M016,4,/content/IEMOCAP_full_release/Session5/sentenc...


Label Mapping

In [15]:
with open(os.path.join(data_folder, 'label_mapping.json'), 'r') as f_in:
  label_mapping = json.load(f_in)

label_mapping

{'0': 'ang',
 '1': 'dis',
 '2': 'exc',
 '3': 'fea',
 '4': 'fru',
 '5': 'hap',
 '6': 'neu',
 '7': 'oth',
 '8': 'sad',
 '9': 'sur',
 '10': 'xxx'}

## Preprocess Function

In [16]:
def preprocess_data(df: pd.DataFrame):
  data_folder = '/content/drive/MyDrive/DS565_Project/'
  iemocap = os.path.join(data_folder, 'IEMOCAP_full_release')

  df['path'] = df['name'].apply(lambda x: get_wav_path(x))

  return df.loc[:, ['path', 'emotion']]

preprocess_data(train).head()

Unnamed: 0,path,emotion
0,/content/IEMOCAP_full_release/Session2/sentenc...,6
1,/content/IEMOCAP_full_release/Session1/sentenc...,8
2,/content/IEMOCAP_full_release/Session4/sentenc...,10
3,/content/IEMOCAP_full_release/Session4/sentenc...,0
4,/content/IEMOCAP_full_release/Session5/sentenc...,4


## Instantiate a Huggingface Dataset object from Pandas

In [17]:
from datasets import Dataset, Audio, ClassLabel, Features, Value

def dataset_from_pandas(df):
  df = preprocess_data(df)

  # Define class labels
  class_labels = ClassLabel(names=list(label_mapping.values()))

  # Define features with audio and label columns
  features = Features({
      "input_values": Audio(),  # Define the audio feature
      "labels": class_labels  # Assign the class labels
  })

  # Construct the dataset from a dictionary
  dataset = Dataset.from_dict({
      "input_values": df['path'],
      "labels": df['emotion']
  }
      , features=features)

  return dataset

train_dataset = dataset_from_pandas(train)
val_dataset = dataset_from_pandas(val)
test_dataset = dataset_from_pandas(test)

train_dataset[0]['input_values']

{'path': '/content/IEMOCAP_full_release/Session2/sentences/wav/Ses02M_impro08/Ses02M_impro08_M013.wav',
 'array': array([-0.00680542, -0.00628662, -0.00576782, ..., -0.003479  ,
        -0.00460815, -0.00488281]),
 'sampling_rate': 16000}

In [18]:
train_dataset

Dataset({
    features: ['input_values', 'labels'],
    num_rows: 6023
})

In [19]:
val_dataset

Dataset({
    features: ['input_values', 'labels'],
    num_rows: 2008
})

In [20]:
test_dataset

Dataset({
    features: ['input_values', 'labels'],
    num_rows: 2008
})

## Calculate mean and std of training data

- Needed to normalize the feature extractor
- Only used on training data to avoid data leakage

In [21]:
"""import numpy as np
from tqdm import tqdm

mean_list = []
std_list = []

for i in tqdm(range(1000)):
  # only do it for 1000 samples because takes toooooo long
    mean_list.append(np.mean(train_dataset[i]['input_values']['array']))
    std_list.append(np.std(train_dataset[i]['input_values']['array']))

train_mean = np.mean(mean_list)
train_std = np.mean(std_list)

print(train_mean, train_std)"""

train_mean = -2.392645886576044e-05
train_std = 0.025430383037778646

Since it took a long time to run, save here:
```
train_mean = -2.392645886576044e-05
train_std = 0.025430383037778646
```

## Batch Feature Extract data

In [22]:
from transformers import AutoFeatureExtractor
pretrained_model = "MIT/ast-finetuned-audioset-10-10-0.4593"
feature_extractor = AutoFeatureExtractor.from_pretrained(pretrained_model)

feature_extractor.mean = train_mean
feature_extractor.std = train_std
feature_extractor.do_normalize = True
feature_extractor

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/297 [00:00<?, ?B/s]

ASTFeatureExtractor {
  "do_normalize": true,
  "feature_extractor_type": "ASTFeatureExtractor",
  "feature_size": 1,
  "max_length": 1024,
  "mean": -2.392645886576044e-05,
  "num_mel_bins": 128,
  "padding_side": "right",
  "padding_value": 0.0,
  "return_attention_mask": false,
  "sampling_rate": 16000,
  "std": 0.025430383037778646
}

## Setup Config and Initialize Model

In [23]:
from transformers import ASTConfig, ASTForAudioClassification

import torch

# Load configuration from the pretrained model
config = ASTConfig.from_pretrained(pretrained_model)

# Update configuration with the number of labels in our dataset
config.num_labels = len(label_mapping)
config.label2id = {v:k for k,v in label_mapping.items()}
config.id2label = label_mapping

# Initialize the model with the updated configuration
model = ASTForAudioClassification.from_pretrained(pretrained_model, config=config,
                                                  attn_implementation="sdpa",
                                                  #torch_dtype=torch.float16,
                                                  ignore_mismatched_sizes=True)
model.init_weights()

config.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

Some weights of ASTForAudioClassification were not initialized from the model checkpoint at MIT/ast-finetuned-audioset-10-10-0.4593 and are newly initialized because the shapes did not match:
- classifier.dense.bias: found shape torch.Size([527]) in the checkpoint and torch.Size([11]) in the model instantiated
- classifier.dense.weight: found shape torch.Size([527, 768]) in the checkpoint and torch.Size([11, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [24]:
model

ASTForAudioClassification(
  (audio_spectrogram_transformer): ASTModel(
    (embeddings): ASTEmbeddings(
      (patch_embeddings): ASTPatchEmbeddings(
        (projection): Conv2d(1, 768, kernel_size=(16, 16), stride=(10, 10))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ASTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ASTLayer(
          (attention): ASTSdpaAttention(
            (attention): ASTSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ASTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ASTIntermediate(
       

## Training

In [25]:
from transformers import TrainingArguments

# Configure training run with TrainingArguments class
training_args = TrainingArguments(
    output_dir="./runs/ast_classifier",
    logging_dir="./logs/ast_classifier",
    report_to="tensorboard",
    learning_rate=5e-5,  # Learning rate
    push_to_hub=False,
    num_train_epochs=5,  # Number of epochs
    per_device_train_batch_size=8,  # Batch size per device
    eval_strategy="epoch",  # Evaluation strategy
    save_strategy="epoch",
    eval_steps=1,
    save_steps=1,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    logging_strategy="steps",
    logging_steps=20,
)

### Evaluation Metrics

In [26]:
import evaluate
import numpy as np

accuracy = evaluate.load("accuracy")
recall = evaluate.load("recall")
precision = evaluate.load("precision")
f1 = evaluate.load("f1")

AVERAGE = "macro" if config.num_labels > 2 else "binary"

def compute_metrics(eval_pred):
    logits = eval_pred.predictions
    predictions = np.argmax(logits, axis=1)
    metrics = accuracy.compute(predictions=predictions, references=eval_pred.label_ids)
    metrics.update(precision.compute(predictions=predictions, references=eval_pred.label_ids, average=AVERAGE))
    metrics.update(recall.compute(predictions=predictions, references=eval_pred.label_ids, average=AVERAGE))
    metrics.update(f1.compute(predictions=predictions, references=eval_pred.label_ids, average=AVERAGE))
    return metrics

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.36k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.55k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

In [27]:
from transformers import Trainer

# Setup the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

In [35]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"

train_dataset.set_format('torch', device=device)
val_dataset.set_format('torch', device=device)
test_dataset.set_format('torch', device=device)

model = model.to(device)
print(device)

cuda:0


In [36]:
# how to preprocess each batch of data before inputting to model
def batch_feat_extract(batch):
  wavs = [audio["array"] for audio in batch["input_values"]]
  inputs = feature_extractor(wavs, sampling_rate=feature_extractor.sampling_rate, return_tensors="pt")
  output_batch = {feature_extractor.model_input_names[0]: inputs.get(feature_extractor.model_input_names[0]), "labels": list(batch["labels"])}
  return output_batch

train_dataset.set_transform(batch_feat_extract, output_all_columns=False)
val_dataset.set_transform(batch_feat_extract, output_all_columns=False)
test_dataset.set_transform(batch_feat_extract, output_all_columns=False)

In [37]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.8654,1.710314,0.319721,0.20921,0.217176,0.181449
2,1.7466,1.707347,0.308267,0.240615,0.18671,0.147606
3,1.4776,1.680362,0.330677,0.216661,0.245834,0.21195
4,1.7302,1.648569,0.341135,0.237213,0.250533,0.237083
5,1.5079,1.648051,0.353088,0.298268,0.258811,0.252115


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Non-default generation parameters: {'max_length': 1024}
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Non-default generation parameters: {'max_length': 1024}
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Non-default generation parameters: {'max_length': 1024}
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Non-default generation parameters: {'max_length': 1024}
Non-default generation parameters: {'max_length': 1024}
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Non-default generation parameters: {'max_length': 1024}


TrainOutput(global_step=3765, training_loss=1.6445981432242223, metrics={'train_runtime': 1777.8362, 'train_samples_per_second': 16.939, 'train_steps_per_second': 2.118, 'total_flos': 2.041444136217477e+18, 'train_loss': 1.6445981432242223, 'epoch': 5.0})

In [47]:
trainer.save_model("/content/drive/MyDrive/DS565_Project/models/nealson_ast_1")

Non-default generation parameters: {'max_length': 1024}


In [44]:
!zip -r /content/drive/MyDrive/DS565_Project/models ./runs

  adding: runs/ (stored 0%)
  adding: runs/ast_classifier/ (stored 0%)
  adding: runs/ast_classifier/checkpoint-3012/ (stored 0%)
  adding: runs/ast_classifier/checkpoint-3012/config.json (deflated 52%)
  adding: runs/ast_classifier/checkpoint-3012/model.safetensors (deflated 7%)
  adding: runs/ast_classifier/checkpoint-3012/training_args.bin (deflated 52%)
  adding: runs/ast_classifier/checkpoint-3012/optimizer.pt (deflated 7%)
  adding: runs/ast_classifier/checkpoint-3012/rng_state.pth (deflated 25%)
  adding: runs/ast_classifier/checkpoint-3012/trainer_state.json (deflated 79%)
  adding: runs/ast_classifier/checkpoint-3012/scheduler.pt (deflated 55%)
  adding: runs/ast_classifier/checkpoint-753/ (stored 0%)
  adding: runs/ast_classifier/checkpoint-753/config.json (deflated 52%)
  adding: runs/ast_classifier/checkpoint-753/model.safetensors (deflated 6%)
  adding: runs/ast_classifier/checkpoint-753/training_args.bin (deflated 52%)
  adding: runs/ast_classifier/checkpoint-753/optimize