In [1]:
!pip install transformers datasets evaluate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
!pip install wandb

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
!pip install --upgrade datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [4]:
# Authenticate with Weights & Biases up front; the training configuration further
# down sets `report_to='wandb'`, so the run is logged there.
import wandb
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mminooshayan97[0m ([33mminoosh[0m). Use [1m`wandb login --relogin`[0m to force relogin


True

In [5]:
# Fine-tune wav2vec2 for emotion classification on the shEMO dataset

In [6]:
from huggingface_hub import notebook_login

# Log in to the Hugging Face Hub: the dataset is loaded with authentication and the
# fine-tuned model is pushed to the Hub at the end of the notebook.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [7]:
from datasets import load_dataset

# Load all splits of the shEMO dataset from the Hub.
# `token=True` reuses the credentials stored by `notebook_login()`. It replaces the
# deprecated `use_auth_token` argument, which newer `datasets` releases (the notebook
# upgrades `datasets` to the latest version) no longer accept.
dataset = load_dataset("minoosh/shEMO", token=True)



  0%|          | 0/3 [00:00<?, ?it/s]

In [8]:
dataset

DatasetDict({
    test: Dataset({
        features: ['audio', 'emotion'],
        num_rows: 300
    })
    valid: Dataset({
        features: ['audio', 'emotion'],
        num_rows: 300
    })
    train: Dataset({
        features: ['audio', 'emotion'],
        num_rows: 2400
    })
})

In [9]:
dataset['valid']

Dataset({
    features: ['audio', 'emotion'],
    num_rows: 300
})

In [10]:
import evaluate

# `datasets.load_metric` is deprecated and has been removed from recent `datasets`
# releases. The `evaluate` package (installed in the first cell) is its replacement
# and is called the same way: `metric.compute(predictions=..., references=...)`.
metric = evaluate.load("accuracy")

  metric = load_metric("accuracy")


In [11]:
# Two-way lookup between emotion class names and their positions in the
# dataset's "emotion" feature. Ids are kept as strings on both sides.
labels = dataset["train"].features["emotion"].names
label2id = {name: str(idx) for idx, name in enumerate(labels)}
id2label = {str(idx): name for idx, name in enumerate(labels)}

# Quick sanity check of one entry.
id2label["2"]

'N'

In [12]:
# Hub id of the pretrained checkpoint; both the feature extractor and the
# classification model below are loaded from it.
model_checkpoint = "facebook/wav2vec2-base"

In [13]:
from transformers import AutoFeatureExtractor

# Load the feature extractor that belongs to the checkpoint; it turns raw waveforms
# into model inputs in `preprocess_function`. The bare expression on the last line
# displays its configuration for inspection.
feature_extractor = AutoFeatureExtractor.from_pretrained(model_checkpoint)
feature_extractor



Wav2Vec2FeatureExtractor {
  "do_normalize": true,
  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
  "feature_size": 1,
  "padding_side": "right",
  "padding_value": 0.0,
  "return_attention_mask": false,
  "sampling_rate": 16000
}

In [14]:
# Maximum clip length in seconds. `preprocess_function` multiplies it by the
# extractor's sampling rate to get the truncation length in samples.
max_duration = 5.0

In [30]:
def preprocess_function(examples):
    """Convert a batch of audio examples into feature-extractor outputs.

    Args:
        examples: A batch whose ``"audio"`` entry is a sequence of dicts, each
            carrying the raw waveform under the ``"array"`` key.

    Returns:
        Whatever ``feature_extractor`` returns for the batch, with every
        waveform truncated to at most ``max_duration`` seconds' worth of samples.

    Note:
        The waveforms are passed on as being sampled at
        ``feature_extractor.sampling_rate``; each example's own sampling rate is
        not inspected here. NOTE(review): assumes the dataset audio is already at
        that rate -- confirm, or resample the column beforehand.
    """
    waveforms = [sample["array"] for sample in examples["audio"]]
    target_rate = feature_extractor.sampling_rate
    # Longest clip kept, expressed in samples rather than seconds.
    max_samples = int(target_rate * max_duration)
    return feature_extractor(
        waveforms,
        sampling_rate=target_rate,
        max_length=max_samples,
        truncation=True,
    )

In [16]:
# Run `preprocess_function` over every split in batches and drop the raw "audio"
# column, so each split keeps only the "emotion" label and the extracted inputs.
encoded_ds = dataset.map(preprocess_function, remove_columns=["audio"], batched=True)
encoded_ds



DatasetDict({
    test: Dataset({
        features: ['emotion', 'input_values'],
        num_rows: 300
    })
    valid: Dataset({
        features: ['emotion', 'input_values'],
        num_rows: 300
    })
    train: Dataset({
        features: ['emotion', 'input_values'],
        num_rows: 2400
    })
})

In [17]:
# Rename the target column from "emotion" to "label".
# NOTE(review): presumably needed because the Trainer's data collator looks for a
# "label"/"labels" key -- confirm against the transformers docs.
# This cell rebinds `encoded_ds`, so it is not idempotent: re-running it on the
# already-renamed dataset is expected to fail because "emotion" no longer exists.
encoded_ds = encoded_ds.rename_column("emotion", "label")

In [18]:
from transformers import AutoModelForAudioClassification, TrainingArguments, Trainer, set_seed

# The classification head (projector/classifier weights) is not part of the
# pretrained checkpoint and is randomly initialised when the model is created.
# That happens before the Trainer exists, so seed the RNGs explicitly here to make
# the initial weights -- and therefore the run -- reproducible.
# 42 is the default seed of TrainingArguments.
set_seed(42)

# One output unit per emotion class; the label maps are stored in the model config
# so predictions can be translated back to class names.
num_labels = len(id2label)
model = AutoModelForAudioClassification.from_pretrained(
    model_checkpoint,
    num_labels=num_labels,
    label2id=label2id,
    id2label=id2label,
)

Some weights of the model checkpoint at facebook/wav2vec2-base were not used when initializing Wav2Vec2ForSequenceClassification: ['quantizer.weight_proj.weight', 'project_hid.weight', 'project_hid.bias', 'quantizer.codevectors', 'project_q.weight', 'quantizer.weight_proj.bias', 'project_q.bias']
- This IS expected if you are initializing Wav2Vec2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector

In [19]:
%env WANDB_PROJECT = shEMO_SER

env: WANDB_PROJECT=shEMO_SER


In [20]:
model_name = "wav2vec2"
batch_size = 8

args = TrainingArguments(
    output_dir=f"{model_name}-finetuned-on-shEMO",
    # Evaluate and checkpoint once per epoch, and keep the checkpoint with the
    # best validation accuracy as the final model.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    # Optimisation schedule. With 4 accumulation steps, each optimiser update
    # sees batch_size * 4 = 32 examples per device.
    learning_rate=3e-5,
    weight_decay=0.01,
    warmup_ratio=0.1,
    num_train_epochs=20,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=4,
    # Logging and publishing.
    logging_steps=10,
    report_to="wandb",
    run_name="finetune_wav2vec2_on_shEMO",
    push_to_hub=True,
)

In [21]:
import numpy as np

def compute_metrics(eval_pred):
    """Score a set of predictions with the notebook-level ``metric``.

    Args:
        eval_pred: Object exposing ``predictions`` (per-class scores, with the
            classes along axis 1) and ``label_ids`` (the reference labels).

    Returns:
        The value returned by ``metric.compute`` for the arg-max class of each
        row compared against ``label_ids``.
    """
    class_scores = eval_pred.predictions
    # The predicted class is the index of the highest score in each row.
    predicted_ids = np.argmax(class_scores, axis=1)
    return metric.compute(predictions=predicted_ids, references=eval_pred.label_ids)

In [22]:
# Tie together the model, the training configuration, the train/validation splits
# and the accuracy callback.
# NOTE(review): the feature extractor is passed in the `tokenizer` slot, presumably
# so the Trainer uses it to pad batches and saves it with the checkpoints -- confirm.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=encoded_ds["train"],
    eval_dataset=encoded_ds["valid"],
    tokenizer=feature_extractor,
    compute_metrics=compute_metrics,
)

/content/wav2vec2-finetuned-on-shEMO is already a clone of https://huggingface.co/minoosh/wav2vec2-finetuned-on-shEMO. Make sure you pull the latest changes with `repo.git_pull()`.


In [23]:
import torch
# Ask PyTorch to release cached GPU memory it is not currently using before
# training starts.
torch.cuda.empty_cache()

In [24]:
# Start fine-tuning. Evaluation on the validation split runs once per epoch, as
# configured in `args`.
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy
1,1.4581,1.440455,0.576667
2,1.0505,0.979674,0.71
3,0.9486,0.844488,0.74
4,0.7795,0.901511,0.686667
5,0.6058,0.74162,0.776667
6,0.5169,0.756497,0.78
7,0.4251,0.642198,0.82
8,0.3567,0.52836,0.836667
9,0.2806,0.650589,0.803333
10,0.2108,0.647696,0.833333


TrainOutput(global_step=1500, training_loss=0.3780524861216545, metrics={'train_runtime': 6666.2545, 'train_samples_per_second': 7.2, 'train_steps_per_second': 0.225, 'total_flos': 2.1475813401980895e+18, 'train_loss': 0.3780524861216545, 'epoch': 20.0})

In [25]:
# Alternative to the training call above: uncomment to continue an interrupted run
# from a saved checkpoint instead of starting over.
#trainer.train(resume_from_checkpoint=True )

In [28]:
# Score the fine-tuned model on the test split, which this notebook uses neither
# for training nor for per-epoch evaluation.
trainer.evaluate(encoded_ds['test'])

{'eval_loss': 1.0217950344085693,
 'eval_accuracy': 0.8,
 'eval_runtime': 15.5917,
 'eval_samples_per_second': 19.241,
 'eval_steps_per_second': 2.437,
 'epoch': 20.0}

{'eval_loss': 1.0217950344085693,
 'eval_accuracy': 0.8,
 'eval_runtime': 15.7069,
 'eval_samples_per_second': 19.1,
 'eval_steps_per_second': 2.419,
 'epoch': 20.0}

In [31]:
# Re-score on the validation split (the one used for per-epoch evaluation) for
# comparison with the test result.
trainer.evaluate(encoded_ds['valid'])

{'eval_loss': 0.6910018920898438,
 'eval_accuracy': 0.8666666666666667,
 'eval_runtime': 15.2951,
 'eval_samples_per_second': 19.614,
 'eval_steps_per_second': 2.484,
 'epoch': 20.0}

In [29]:
# Upload the trained model to the Hugging Face Hub repository tied to this Trainer.
trainer.push_to_hub()

To https://huggingface.co/minoosh/wav2vec2-finetuned-on-shEMO
   7850293..a9ed066  main -> main

   7850293..a9ed066  main -> main

To https://huggingface.co/minoosh/wav2vec2-finetuned-on-shEMO
   a9ed066..b8ff3c4  main -> main

   a9ed066..b8ff3c4  main -> main



'https://huggingface.co/minoosh/wav2vec2-finetuned-on-shEMO/commit/a9ed0665078c7ae7b19f0d925f2b50099713d5a2'