In [None]:
!pip install transformers datasets evaluate

In [None]:
!pip install wandb

In [None]:
!pip install --upgrade datasets

In [None]:
import wandb
# Authenticate with Weights & Biases; the training arguments later set report_to='wandb'.
wandb.login()

In [None]:
# Fine-tune the Audio Spectrogram Transformer (AST) checkpoint on IEMOCAP speech emotion labels

In [None]:
from huggingface_hub import notebook_login

# Log in to the Hugging Face Hub from the notebook; the credentials are needed for
# the authenticated dataset download and for pushing the trained model to the Hub.
notebook_login()

In [1]:
from datasets import load_dataset

# Download the IEMOCAP speech dataset from the Hugging Face Hub (one split per session).
# `token=True` reuses the credentials stored by the Hub login; it replaces the
# deprecated `use_auth_token` argument, which newer `datasets` releases no longer accept.
dataset = load_dataset("minoosh/IEMOCAP_Speech", token=True)

Downloading readme:   0%|          | 0.00/832 [00:00<?, ?B/s]

Downloading and preparing dataset None/None to /root/.cache/huggingface/datasets/minoosh___parquet/minoosh--IEMOCAP_Speech-8dc0d71ba4cb25a9/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data files:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/174M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/164M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/152M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/143M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/155M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/5 [00:00<?, ?it/s]

Generating Session5 split:   0%|          | 0/1241 [00:00<?, ? examples/s]

Generating Session1 split:   0%|          | 0/1085 [00:00<?, ? examples/s]

Generating Session2 split:   0%|          | 0/1023 [00:00<?, ? examples/s]

Generating Session4 split:   0%|          | 0/1031 [00:00<?, ? examples/s]

Generating Session3 split:   0%|          | 0/1151 [00:00<?, ? examples/s]

Dataset parquet downloaded and prepared to /root/.cache/huggingface/datasets/minoosh___parquet/minoosh--IEMOCAP_Speech-8dc0d71ba4cb25a9/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


  0%|          | 0/5 [00:00<?, ?it/s]

In [2]:
# Display the DatasetDict to inspect the available splits, their features and row counts.
dataset

DatasetDict({
    Session5: Dataset({
        features: ['audio', 'emotion'],
        num_rows: 1241
    })
    Session1: Dataset({
        features: ['audio', 'emotion'],
        num_rows: 1085
    })
    Session2: Dataset({
        features: ['audio', 'emotion'],
        num_rows: 1023
    })
    Session4: Dataset({
        features: ['audio', 'emotion'],
        num_rows: 1031
    })
    Session3: Dataset({
        features: ['audio', 'emotion'],
        num_rows: 1151
    })
})

In [3]:
# Peek at a single example of the Session1 split (audio path/array/sampling rate and emotion id).
dataset['Session1'][1]

{'audio': {'path': 'Ses01F_impro01_F001.wav',
  'array': array([ 0.00094604, -0.00094604, -0.0007019 , ..., -0.00045776,
         -0.00033569, -0.00128174]),
  'sampling_rate': 16000},
 'emotion': 2}

In [4]:
import evaluate

# Accuracy metric consumed by compute_metrics. `datasets.load_metric` is deprecated
# (and removed from newer `datasets` releases); the `evaluate` package installed in
# the first cell provides the replacement with the same `.compute(...)` interface.
metric = evaluate.load("accuracy")

  This is separate from the ipykernel package so we can avoid doing imports until


Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

In [5]:
# Build the label <-> id lookup tables from the class names of the "emotion" feature.
# Ids are stored as strings (e.g. "2"), not integers.
labels = dataset["Session1"].features["emotion"].names
label2id = {name: str(idx) for idx, name in enumerate(labels)}
id2label = {str(idx): name for idx, name in enumerate(labels)}

# Sanity check: show the class name stored under id "2".
id2label["2"]

'neu'

In [6]:
# Hugging Face Hub checkpoint to start from; used for both the feature extractor and the model.
model_checkpoint = "MIT/ast-finetuned-audioset-10-10-0.4593"

In [7]:
from transformers import AutoFeatureExtractor

# Load the feature extractor that belongs to the chosen checkpoint.
feature_extractor = AutoFeatureExtractor.from_pretrained(model_checkpoint)
# Trailing expression: display the extractor's configuration.
feature_extractor

Downloading (…)rocessor_config.json:   0%|          | 0.00/297 [00:00<?, ?B/s]

ASTFeatureExtractor {
  "do_normalize": true,
  "feature_extractor_type": "ASTFeatureExtractor",
  "feature_size": 1,
  "max_length": 1024,
  "mean": -4.2677393,
  "num_mel_bins": 128,
  "padding_side": "right",
  "padding_value": 0.0,
  "return_attention_mask": false,
  "sampling_rate": 16000,
  "std": 4.5689974
}

In [8]:
# Maximum clip length in seconds; preprocess_function multiplies it by the sampling
# rate to derive the `max_length` it passes to the feature extractor.
max_duration = 10.0

In [9]:
def preprocess_function(examples):
    """Turn a batch of raw audio examples into model inputs.

    Args:
        examples: Batch as supplied by a batched `map`; `examples["audio"]` is a
            sequence of dicts, each holding the waveform under the "array" key.

    Returns:
        The output of `feature_extractor` for the batch of waveforms.
    """
    target_rate = feature_extractor.sampling_rate
    # Longest clip to keep, expressed in samples (samples-per-second * seconds).
    max_samples = int(target_rate * max_duration)
    waveforms = [clip["array"] for clip in examples["audio"]]
    # NOTE(review): confirm that this feature extractor actually honours the
    # `max_length` / `truncation` arguments rather than using its own configured length.
    return feature_extractor(
        waveforms,
        sampling_rate=target_rate,
        max_length=max_samples,
        truncation=True,
    )

In [10]:
# Run preprocess_function over every split in batches and drop the raw "audio" column.
encoded_ds = dataset.map(preprocess_function, remove_columns=["audio"], batched=True)
# Trailing expression: display the resulting splits and features.
encoded_ds

Map:   0%|          | 0/1241 [00:00<?, ? examples/s]

Map:   0%|          | 0/1085 [00:00<?, ? examples/s]

Map:   0%|          | 0/1023 [00:00<?, ? examples/s]

Map:   0%|          | 0/1031 [00:00<?, ? examples/s]

Map:   0%|          | 0/1151 [00:00<?, ? examples/s]

DatasetDict({
    Session5: Dataset({
        features: ['emotion', 'input_values'],
        num_rows: 1241
    })
    Session1: Dataset({
        features: ['emotion', 'input_values'],
        num_rows: 1085
    })
    Session2: Dataset({
        features: ['emotion', 'input_values'],
        num_rows: 1023
    })
    Session4: Dataset({
        features: ['emotion', 'input_values'],
        num_rows: 1031
    })
    Session3: Dataset({
        features: ['emotion', 'input_values'],
        num_rows: 1151
    })
})

In [11]:
# Rename the target column from "emotion" to "label".
# NOTE(review): this rebinds `encoded_ds`, so the cell is not idempotent — re-running it
# without re-running the map cell will presumably fail because "emotion" no longer exists.
encoded_ds = encoded_ds.rename_column("emotion", "label")

In [12]:
from transformers import AutoModelForAudioClassification, TrainingArguments, Trainer

# One output class per entry in the label mapping.
num_labels = len(id2label)
# Load the pretrained checkpoint with a classification head sized for our labels.
# ignore_mismatched_sizes=True lets loading proceed even though the checkpoint's
# classifier head has a different shape; that head is then newly initialised and
# must be trained before the model is usable for prediction.
model = AutoModelForAudioClassification.from_pretrained(
    model_checkpoint, 
    num_labels=num_labels,
    label2id=label2id,
    id2label=id2label,
    ignore_mismatched_sizes=True,
)

Downloading (…)lve/main/config.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/346M [00:00<?, ?B/s]

Some weights of ASTForAudioClassification were not initialized from the model checkpoint at MIT/ast-finetuned-audioset-10-10-0.4593 and are newly initialized because the shapes did not match:
- classifier.dense.weight: found shape torch.Size([527, 768]) in the checkpoint and torch.Size([4, 768]) in the model instantiated
- classifier.dense.bias: found shape torch.Size([527]) in the checkpoint and torch.Size([4]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
# Training configuration for the fine-tuning run.
# The output directory name is derived from the last path component of the checkpoint id.
model_name = model_checkpoint.rsplit("/", 1)[-1]
batch_size = 4
# NOTE(review): `evaluation_strategy` has been renamed in newer transformers releases;
# confirm the argument name against the installed version.
args = TrainingArguments(
    output_dir=f"{model_name}-finetuned-ie",
    # Schedule: evaluate and checkpoint once per epoch.
    num_train_epochs=30,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # Batching: 4 samples per device step, gradients accumulated over 4 steps
    # (16 samples per device per optimizer update).
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=4,
    # Optimisation hyper-parameters.
    learning_rate=3e-5,
    weight_decay=0.01,
    warmup_ratio=0.1,
    # Model selection: reload the checkpoint with the best accuracy when training ends.
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    # Logging and publishing.
    logging_steps=10,
    report_to='wandb',
    run_name='finetune_AST_on_IEMOCAP_speech',
    push_to_hub=True,
)

In [14]:
import numpy as np

def compute_metrics(eval_pred):
    """Score a set of predictions with the module-level `metric`.

    Args:
        eval_pred: Object exposing `predictions` (per-class scores, classes along
            axis 1) and `label_ids` (the reference labels).

    Returns:
        Whatever `metric.compute` returns for the predicted vs. reference labels.
    """
    class_scores = eval_pred.predictions
    # The predicted class is the index of the highest score in each row.
    predicted_ids = np.argmax(class_scores, axis=1)
    return metric.compute(
        predictions=predicted_ids,
        references=eval_pred.label_ids,
    )

In [15]:
import torch

# Sessions 1-3 are concatenated into the training set; the remaining sessions are
# kept out of it.
train_sessions = ("Session1", "Session2", "Session3")
train_dataset = torch.utils.data.ConcatDataset(
    [encoded_ds[session] for session in train_sessions]
)

In [16]:
# Assemble the Trainer: Sessions 1-3 for training, Session4 for the per-epoch evaluation.
# The feature extractor is passed in the `tokenizer` slot.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=encoded_ds["Session4"],
    tokenizer=feature_extractor,
    compute_metrics=compute_metrics,
)

Cloning https://huggingface.co/minoosh/ast-finetuned-audioset-10-10-0.4593-finetuned-ie into local empty directory.


Download file pytorch_model.bin:   0%|          | 8.00k/329M [00:00<?, ?B/s]

Download file training_args.bin: 100%|##########| 3.56k/3.56k [00:00<?, ?B/s]

Clean file training_args.bin:  28%|##8       | 1.00k/3.56k [00:00<?, ?B/s]

Clean file pytorch_model.bin:   0%|          | 1.00k/329M [00:00<?, ?B/s]

In [17]:
import torch
# Release unused cached GPU memory held by PyTorch before starting training.
torch.cuda.empty_cache()

In [18]:
# Start fine-tuning from scratch with the configured arguments.
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mminooshayan97[0m ([33mminoosh[0m). Use [1m`wandb login --relogin`[0m to force relogin




Epoch,Training Loss,Validation Loss,Accuracy
1,0.986,0.991613,0.557711
2,0.7885,1.146606,0.543162




KeyboardInterrupt: 

In [19]:
# Continue the interrupted run from the most recent checkpoint in the output directory.
trainer.train(resume_from_checkpoint=True )

0it [00:00, ?it/s]



Epoch,Training Loss,Validation Loss,Accuracy
3,0.666,0.955674,0.612997
4,0.3633,1.253167,0.592629
5,0.2045,1.695223,0.584869
6,0.0829,1.907955,0.580989
7,0.0544,2.169765,0.563531
8,0.0203,2.023317,0.625606
9,0.0223,2.275176,0.618817
10,0.0043,2.751239,0.600388
11,0.0203,2.934447,0.589719
12,0.0484,2.676682,0.623666




KeyboardInterrupt: 

In [20]:
# Resume once more from the most recent checkpoint after the second interruption.
trainer.train(resume_from_checkpoint=True )

0it [00:00, ?it/s]



Epoch,Training Loss,Validation Loss,Accuracy
14,0.0079,3.225941,0.585839
15,0.0003,2.843218,0.617847
16,0.0001,3.257566,0.589719
17,0.0,3.078962,0.606208
18,0.0,3.146841,0.600388
18,0.0,3.069843,0.607575




KeyboardInterrupt: 

In [21]:
# Evaluate on Session5, which was used neither for training nor for per-epoch evaluation.
# NOTE(review): the preceding training run was interrupted, so this presumably scores the
# weights as of the interruption rather than the best checkpoint — confirm before reporting.
trainer.evaluate(encoded_ds['Session5'])

{'eval_loss': 3.069843053817749, 'eval_accuracy': 0.6075745366639806}

In [22]:
# Upload the current model and training artefacts to the Hugging Face Hub repository.
trainer.push_to_hub()

Upload file pytorch_model.bin:   0%|          | 1.00/329M [00:00<?, ?B/s]

To https://huggingface.co/minoosh/ast-finetuned-audioset-10-10-0.4593-finetuned-ie
   39bdea9..bb696ff  main -> main

To https://huggingface.co/minoosh/ast-finetuned-audioset-10-10-0.4593-finetuned-ie
   bb696ff..fc54f91  main -> main



'https://huggingface.co/minoosh/ast-finetuned-audioset-10-10-0.4593-finetuned-ie/commit/bb696fffbbe1646072c6b7042677973970ba088f'