In [1]:
!pip install transformers datasets evaluate

[0m

In [2]:
!pip install wandb

[0m

In [3]:
import wandb
# Authenticate with Weights & Biases up front; the training arguments below set `report_to` to wandb.
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mminooshayan97[0m ([33mminoosh[0m). Use [1m`wandb login --relogin`[0m to force relogin


True

In [4]:
!pip install --upgrade datasets

[0m

In [5]:
# Log in to the Hugging Face Hub interactively (renders a login widget in the notebook).
# The dataset load below asks for an auth token and the Trainer is configured with push_to_hub=True.
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [6]:
from datasets import load_dataset

# `token=True` reuses the credentials stored by the Hub login above. It replaces the
# `use_auth_token` argument, which newer `datasets` releases deprecated and later removed
# (this notebook explicitly upgrades `datasets`, so the newer spelling is the safe one).
dataset = load_dataset("minoosh/IEMOCAP_Speech", token=True)

  0%|          | 0/5 [00:00<?, ?it/s]

In [7]:
import evaluate

# `datasets.load_metric` is deprecated and absent from newer `datasets` releases; the
# `evaluate` package (installed in the first cell of this notebook) is its replacement.
# The loaded object keeps the same `.compute(predictions=..., references=...)` interface.
metric = evaluate.load("accuracy")

  This is separate from the ipykernel package so we can avoid doing imports until


In [8]:
# Class names come from the "emotion" feature of the dataset; a name's position is its class id.
labels = dataset["Session1"].features["emotion"].names

# Both directions of the mapping store the class id as a string.
label2id = {name: str(idx) for idx, name in enumerate(labels)}
id2label = {str(idx): name for idx, name in enumerate(labels)}

id2label["2"]

'neu'

In [9]:
import random
from IPython.display import Audio, display

# Sanity check: print the label and shape of five randomly drawn Session1 examples and play them.
session1 = dataset["Session1"]
for _ in range(5):
    example = session1[random.randrange(len(session1))]
    audio = example["audio"]

    print(f'Label: {id2label[str(example["emotion"])]}')
    print(f'Shape: {audio["array"].shape}, sampling rate: {audio["sampling_rate"]}')
    player = Audio(audio["array"], rate=audio["sampling_rate"])
    display(player)
    print()

Label: neu
Shape: (106520,), sampling rate: 16000



Label: hap
Shape: (106199,), sampling rate: 16000



Label: sad
Shape: (40960,), sampling rate: 16000



Label: hap
Shape: (141520,), sampling rate: 16000



Label: neu
Shape: (99240,), sampling rate: 16000





In [10]:
# Hub id of the pretrained checkpoint; used below to load both the feature extractor and the model.
model_checkpoint = "facebook/hubert-base-ls960"

In [11]:
from transformers import AutoFeatureExtractor

# Load the preprocessing configuration published with the checkpoint and display it.
# Its `sampling_rate` is reused by `preprocess_function` below.
feature_extractor = AutoFeatureExtractor.from_pretrained(model_checkpoint)
feature_extractor

Wav2Vec2FeatureExtractor {
  "do_normalize": true,
  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
  "feature_size": 1,
  "padding_side": "right",
  "padding_value": 0,
  "return_attention_mask": false,
  "sampling_rate": 16000
}

In [12]:
# Longest clip kept per example, in seconds: preprocessing multiplies this by the feature
# extractor's sampling rate to get the truncation length in samples.
max_duration = 10.0

In [13]:
def preprocess_function(examples):
    """Run the feature extractor over a batch of audio examples.

    Args:
        examples: batch mapping whose "audio" entry is a sequence of items, each
            providing the raw waveform under the "array" key.

    Returns:
        The feature extractor's output for the batch, with every waveform
        truncated to at most ``max_duration`` seconds' worth of samples.
    """
    target_rate = feature_extractor.sampling_rate
    waveforms = [clip["array"] for clip in examples["audio"]]
    return feature_extractor(
        waveforms,
        sampling_rate=target_rate,
        # Truncation length is expressed in samples, not seconds.
        max_length=int(target_rate * max_duration),
        truncation=True,
    )

In [14]:
# Apply the preprocessing to every split in batches. The raw "audio" column is dropped,
# leaving the 'emotion' and 'input_values' columns in each split.
encoded_dataset = dataset.map(preprocess_function, remove_columns=["audio"], batched=True)
encoded_dataset

DatasetDict({
    Session4: Dataset({
        features: ['emotion', 'input_values'],
        num_rows: 1031
    })
    Session2: Dataset({
        features: ['emotion', 'input_values'],
        num_rows: 1023
    })
    Session1: Dataset({
        features: ['emotion', 'input_values'],
        num_rows: 1085
    })
    Session3: Dataset({
        features: ['emotion', 'input_values'],
        num_rows: 1151
    })
    Session5: Dataset({
        features: ['emotion', 'input_values'],
        num_rows: 1241
    })
})

In [15]:
# Expose the target column under the name "label" for training.
# The guard keeps this cell re-runnable: once the rename has happened there is no
# "emotion" column left, and calling rename_column again would raise.
if any("emotion" in columns for columns in encoded_dataset.column_names.values()):
    encoded_dataset = encoded_dataset.rename_column("emotion", "label")

In [16]:
from transformers import AutoModelForAudioClassification, TrainingArguments, Trainer

# One output class per emotion label.
num_labels = len(id2label)

# Load the pretrained encoder with an audio-classification head sized for our labels.
# The head weights are not part of the checkpoint, so they start freshly initialised
# (hence the "newly initialized" warning this cell prints).
model = AutoModelForAudioClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
    num_labels=num_labels,
)

Some weights of HubertForSequenceClassification were not initialized from the model checkpoint at facebook/hubert-base-ls960 and are newly initialized: ['projector.bias', 'classifier.bias', 'projector.weight', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
# The output directory is named after the checkpoint id with its organisation prefix removed.
model_name = model_checkpoint.rsplit("/", 1)[-1]
batch_size = 8

args = TrainingArguments(
    output_dir=f"{model_name}-finetuned-ie",
    # Optimisation schedule.
    num_train_epochs=15,
    learning_rate=3e-5,
    weight_decay=0.01,
    warmup_ratio=0.1,
    # Batching: gradients are accumulated over 4 steps, so the effective training batch per
    # device is 4x the per-device batch size.
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=4,
    # Evaluate and checkpoint once per epoch, then restore the checkpoint with the best accuracy.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    # Logging and publishing.
    logging_steps=10,
    report_to="wandb",
    run_name="finetune_hubert_on_IEMOCAP_speech",
    push_to_hub=True,
)

In [18]:
import numpy as np

def compute_metrics(eval_pred):
    """Score predictions with the loaded accuracy metric.

    Args:
        eval_pred: object exposing ``predictions`` (per-class scores; the
            predicted class is the argmax along axis 1) and ``label_ids``
            (the reference class ids).

    Returns:
        The result of ``metric.compute`` for the predicted class ids versus
        the reference ids.
    """
    scores, references = eval_pred.predictions, eval_pred.label_ids
    predicted_ids = np.argmax(scores, axis=1)
    return metric.compute(predictions=predicted_ids, references=references)

In [19]:
import torch
from datasets import concatenate_datasets

# Train on Sessions 1-3. `concatenate_datasets` returns a regular `datasets.Dataset`
# (the same type as the evaluation split passed to the Trainer) instead of wrapping the
# splits in a generic torch `ConcatDataset`, so the Trainer treats both splits alike.
train_dataset = concatenate_datasets(
    [encoded_dataset[session] for session in ("Session1", "Session2", "Session3")]
)

In [20]:
# Sessions 1-3 are used for training; Session4 is the per-epoch evaluation split.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=encoded_dataset["Session4"],
    compute_metrics=compute_metrics,
    # Passing the feature extractor here makes the Trainer save it next to each checkpoint.
    tokenizer=feature_extractor,
)

/kaggle/working/hubert-base-ls960-finetuned-ie is already a clone of https://huggingface.co/minoosh/hubert-base-ls960-finetuned-ie. Make sure you pull the latest changes with `repo.git_pull()`.


In [21]:
import torch
# Ask PyTorch to release cached, currently unused GPU memory before training starts.
torch.cuda.empty_cache()

In [22]:
# Fine-tune the model; per the training arguments, evaluation and checkpointing happen once per epoch.
results = trainer.train()

***** Running training *****
  Num examples = 3259
  Num Epochs = 15
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 4
  Total optimization steps = 765
  Number of trainable parameters = 94569604
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"




Epoch,Training Loss,Validation Loss,Accuracy
1,1.3145,1.288036,0.364694
2,1.137,1.149062,0.43162
3,1.0227,0.972427,0.582929
4,0.9822,0.987347,0.5645
5,0.9084,1.002899,0.5742
6,0.8217,1.027328,0.588749
7,0.779,0.977422,0.612027
8,0.7444,1.033557,0.620757
9,0.6894,0.99249,0.613967
10,0.6486,1.073283,0.604268


***** Running Evaluation *****
  Num examples = 1031
  Batch size = 16
Saving model checkpoint to hubert-base-ls960-finetuned-ie/checkpoint-51
Configuration saved in hubert-base-ls960-finetuned-ie/checkpoint-51/config.json
Model weights saved in hubert-base-ls960-finetuned-ie/checkpoint-51/pytorch_model.bin
Feature extractor saved in hubert-base-ls960-finetuned-ie/checkpoint-51/preprocessor_config.json
Feature extractor saved in hubert-base-ls960-finetuned-ie/preprocessor_config.json
***** Running Evaluation *****
  Num examples = 1031
  Batch size = 16
Saving model checkpoint to hubert-base-ls960-finetuned-ie/checkpoint-102
Configuration saved in hubert-base-ls960-finetuned-ie/checkpoint-102/config.json
Model weights saved in hubert-base-ls960-finetuned-ie/checkpoint-102/pytorch_model.bin
Feature extractor saved in hubert-base-ls960-finetuned-ie/checkpoint-102/preprocessor_config.json
Feature extractor saved in hubert-base-ls960-finetuned-ie/preprocessor_config.json
***** Running Eval

In [24]:
# Final score on Session5, which is used neither for training (Sessions 1-3) nor for the
# per-epoch evaluation (Session4).
# NOTE(review): with load_best_model_at_end=True the Trainer is expected to have reloaded the
# best-accuracy checkpoint by this point — confirm against the training log.
trainer.evaluate(encoded_dataset['Session5'])

***** Running Evaluation *****
  Num examples = 1241
  Batch size = 16


{'eval_loss': 1.0674233436584473,
 'eval_accuracy': 0.6172441579371475,
 'eval_runtime': 73.8471,
 'eval_samples_per_second': 16.805,
 'eval_steps_per_second': 1.056,
 'epoch': 15.0}

In [26]:
# Save the model, its configuration and the feature extractor, then push them to the Hub repository.
trainer.push_to_hub()

Saving model checkpoint to hubert-base-ls960-finetuned-ie
Configuration saved in hubert-base-ls960-finetuned-ie/config.json
Model weights saved in hubert-base-ls960-finetuned-ie/pytorch_model.bin
Feature extractor saved in hubert-base-ls960-finetuned-ie/preprocessor_config.json
remote: Scanning LFS files of refs/heads/main for validity...        
remote: LFS file scan complete.        
To https://huggingface.co/minoosh/hubert-base-ls960-finetuned-ie
   58b390d..c1dca0f  main -> main

Dropping the following result as it does not have all the necessary fields:
{'metrics': [{'name': 'Accuracy', 'type': 'accuracy', 'value': 0.6129970902036858}]}
To https://huggingface.co/minoosh/hubert-base-ls960-finetuned-ie
   c1dca0f..727eaba  main -> main



'https://huggingface.co/minoosh/hubert-base-ls960-finetuned-ie/commit/c1dca0f5b210bcfa33ad543461c34b5ca01cbd41'