In [None]:
# !pip install datasets transformers huggingface_hub librosa soundfile ipywidgets wandb gradio evaluate accelerate -U

In [1]:
import wandb

# Authenticate this kernel with Weights & Biases up front, before any run is created.
wandb.login()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mmeghanadh27[0m ([33mbtp_sa[0m). Use [1m`wandb login --relogin`[0m to force relogin


True

In [2]:
# setup wandb environment variables
%env WANDB_ENTITY=meghanadh27/btp_sa
%env WANDB_PROJECT=finetune_distilhubert_iemocap_audio

env: WANDB_ENTITY=meghanadh27/btp_sa
env: WANDB_PROJECT=finetune_distilhubert_iemocap_audio


In [3]:
from datasets import load_dataset, concatenate_datasets, DatasetDict

# The Hub dataset is published as five per-session splits (session1..session5).
# Fetch them all, then merge them in session order into one flat dataset.
session_names = [f"session{number}" for number in range(1, 6)]
sessions = load_dataset("Zahra99/IEMOCAP_Audio")
dataset = concatenate_datasets([sessions[name] for name in session_names])

In [4]:
# Inspect the label schema to see the class names behind the integer labels.
dataset.features["label"] 

ClassLabel(names=['ang', 'hap', 'neu', 'sad'], id=None)

In [5]:
# Fixed seed so the split, and every metric computed on it, is reproducible
# across kernel restarts.
SPLIT_SEED = 42

# 80% train, 20% held out for validation + test
train_test_dataset = dataset.train_test_split(test_size=0.2, seed=SPLIT_SEED)
# Split the held-out 20% in half: 10% validation, 10% test
test_valid = train_test_dataset["test"].train_test_split(test_size=0.5, seed=SPLIT_SEED)
# Gather the three splits into a single DatasetDict
train_test_valid_dataset = DatasetDict({
    "train": train_test_dataset["train"],
    "validation": test_valid["train"],
    "test": test_valid["test"],
})

In [6]:
# Display the three splits with their features and row counts.
train_test_valid_dataset

DatasetDict({
    train: Dataset({
        features: ['audio', 'label'],
        num_rows: 4424
    })
    validation: Dataset({
        features: ['audio', 'label'],
        num_rows: 553
    })
    test: Dataset({
        features: ['audio', 'label'],
        num_rows: 554
    })
})

In [7]:
# Peek at one raw training example: the decoded audio dict plus its integer label.
train_test_valid_dataset["train"][0]

{'audio': {'path': 'Ses01F_script02_2_F035.wav',
  'array': array([-0.00274658, -0.00366211, -0.00289917, ...,  0.00262451,
          0.0027771 ,  0.00286865]),
  'sampling_rate': 16000},
 'label': 3}

In [8]:
# Bound int2str of the label feature: turns an integer class id into its label name.
id2label_fn = train_test_valid_dataset["train"].features["label"].int2str
# Sanity check: decode the label of the first training example.
id2label_fn(train_test_valid_dataset["train"][0]["label"])

'sad'

In [9]:
import gradio as gr


def generate_audio():
    """Pick one random training clip for the Gradio preview.

    Returns:
        A pair ``((sampling_rate, samples), label_name)``: the first element is
        the ``(rate, array)`` tuple built from the example's ``audio`` dict, the
        second is the example's label decoded with ``id2label_fn``.
    """
    import random

    train_split = train_test_valid_dataset["train"]
    # Draw a single random row directly; shuffling the entire split just to
    # read element 0 rebuilds an index permutation on every call.
    example = train_split[random.randrange(len(train_split))]
    audio = example["audio"]
    return (
        audio["sampling_rate"],
        audio["array"],
    ), id2label_fn(example["label"])


# Small listening gallery: four random training clips, each captioned with its label.
with gr.Blocks() as demo:
    with gr.Column():
        for _ in range(4):
            audio, label = generate_audio()
            gr.Audio(audio, label=label)

# Launch without debug=True: debug mode blocks the kernel until the cell is
# interrupted by hand, which prevents "Restart & Run All" from reaching the
# training cells below.
demo.launch()



Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.


Keyboard interruption in main thread... closing server.




In [10]:
from transformers import AutoFeatureExtractor

# Checkpoint used for both the feature extractor and, later, the classifier.
model_id = "ntu-spml/distilhubert"
# do_normalize=True requests normalized input values and return_attention_mask=True
# requests an attention mask; the sanity-check cell further down confirms both
# (keys include 'attention_mask', variance of input_values is 1.0).
feature_extractor = AutoFeatureExtractor.from_pretrained(
    model_id, do_normalize=True, return_attention_mask=True
)

In [11]:
# Sampling rate the feature extractor is configured for; reused when casting the audio column.
sampling_rate = feature_extractor.sampling_rate
sampling_rate

16000

In [12]:
from datasets import Audio

# Cast the audio column to the extractor's sampling rate so every clip is served
# at the rate the model expects. NOTE: this rebinds train_test_valid_dataset.
train_test_valid_dataset = train_test_valid_dataset.cast_column("audio", Audio(sampling_rate=sampling_rate))

In [13]:
# Re-inspect the first training example after the cast to confirm its sampling_rate.
train_test_valid_dataset["train"][0]

{'audio': {'path': 'Ses01F_script02_2_F035.wav',
  'array': array([-0.00274658, -0.00366211, -0.00289917, ...,  0.00262451,
          0.0027771 ,  0.00286865]),
  'sampling_rate': 16000},
 'label': 3}

In [14]:
import numpy as np

# Statistics of the raw waveform of the first training clip, before feature extraction.
sample = train_test_valid_dataset["train"][0]["audio"]
waveform = sample["array"]

print(f"Mean: {np.mean(waveform):.3}, Variance: {np.var(waveform):.3}")

Mean: -2.59e-05, Variance: 0.000796


In [15]:
# Push the same clip through the feature extractor and compare its statistics
# with the raw-waveform numbers printed above.
inputs = feature_extractor(sample["array"], sampling_rate=sample["sampling_rate"])
print(f"inputs keys: {list(inputs.keys())}")

input_values = inputs["input_values"]
print(f"Mean: {np.mean(input_values):.3}, Variance: {np.var(input_values):.3}")

inputs keys: ['input_values', 'attention_mask']
Mean: -3.57e-09, Variance: 1.0


In [16]:
# Longest clip length kept, in seconds; longer clips are truncated during preprocessing.
max_duration = 30.0


def preprocess_function(examples):
    """Turn a batch of decoded audio examples into model inputs.

    Args:
        examples: A batch whose ``"audio"`` entry is a list of dicts, each
            carrying the waveform under ``"array"``.

    Returns:
        The feature extractor's output for the batch, truncated to at most
        ``max_duration`` seconds of samples and including the attention mask.
    """
    target_rate = feature_extractor.sampling_rate
    waveforms = [clip["array"] for clip in examples["audio"]]
    return feature_extractor(
        waveforms,
        sampling_rate=target_rate,
        max_length=int(target_rate * max_duration),
        truncation=True,
        return_attention_mask=True,
    )

In [17]:
# Run preprocess_function over every split in batches of 100 and drop the raw
# "audio" column, so only the label and the extractor outputs remain.
dataset_encoded = train_test_valid_dataset.map(
    preprocess_function,
    remove_columns=["audio"],
    batched=True,
    batch_size=100,
    num_proc=1,
)
dataset_encoded

Map:   0%|          | 0/4424 [00:00<?, ? examples/s]

Map:   0%|          | 0/553 [00:00<?, ? examples/s]

Map:   0%|          | 0/554 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'input_values', 'attention_mask'],
        num_rows: 4424
    })
    validation: Dataset({
        features: ['label', 'input_values', 'attention_mask'],
        num_rows: 553
    })
    test: Dataset({
        features: ['label', 'input_values', 'attention_mask'],
        num_rows: 554
    })
})

In [18]:
# Label maps stored in the model config.
label_names = dataset_encoded["train"].features["label"].names

# id2label keeps its string keys ("0", "1", ...), exactly as before.
id2label = {str(index): name for index, name in enumerate(label_names)}
# label2id maps a label name to its *integer* index. Inverting id2label would
# store string ids ("0", "1", ...) in the config instead of ints.
label2id = {name: index for index, name in enumerate(label_names)}

id2label["1"]

'hap'

In [19]:
from transformers import AutoModelForAudioClassification

# One output class per entry in the label map.
num_labels = len(id2label)

# Load the pretrained checkpoint with an audio-classification head sized for our
# labels. The classifier/projector weights are newly initialized (see the warning
# in the output), so the model must be fine-tuned before use.
model = AutoModelForAudioClassification.from_pretrained(
    model_id,
    num_labels=num_labels,
    label2id=label2id,
    id2label=id2label,
)

Some weights of HubertForSequenceClassification were not initialized from the model checkpoint at ntu-spml/distilhubert and are newly initialized: ['classifier.bias', 'classifier.weight', 'encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
from huggingface_hub import notebook_login

# Interactive Hugging Face Hub login widget; the training arguments below set
# push_to_hub=True, so a Hub login is done first.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [24]:
from transformers import TrainingArguments

# Run hyper-parameters
model_name = model_id.rsplit("/", 1)[-1]
batch_size = 8
gradient_accumulation_steps = 1
num_train_epochs = 10

# Local checkpoint directory for this run.
output_dir = f"finetune_{model_name}_iemocap_audio"

training_args = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=True,
    # --- optimisation ---
    learning_rate=5e-5,
    warmup_ratio=0.1,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    # --- evaluate and checkpoint once per epoch, keep the best by accuracy ---
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    # --- logging / publishing ---
    logging_steps=40,
    # use_mps_device=True,
    push_to_hub=True,
)

In [25]:
import evaluate
import numpy as np
from datasets import load_metric

# Load the metrics once, up front, through the `evaluate` library (already
# imported above) rather than the deprecated `datasets.load_metric`, and rather
# than reloading them on every evaluation pass.
_accuracy_metric = evaluate.load("accuracy")
_f1_metric = evaluate.load("f1")


def compute_metrics(eval_preds):
    """Compute accuracy and weighted F1 for one evaluation pass.

    Args:
        eval_preds: A ``(logits, labels)`` pair; the predicted class of each
            example is the argmax of its logits over the last axis.

    Returns:
        ``{"accuracy": ..., "f1": ...}``. The ``"accuracy"`` key is the one
        selected by ``metric_for_best_model`` in the training arguments.
    """
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    accuracy = _accuracy_metric.compute(predictions=predictions, references=labels)["accuracy"]
    # Weighted average so each class contributes in proportion to its support.
    f1 = _f1_metric.compute(predictions=predictions, references=labels, average="weighted")["f1"]
    return {"accuracy": accuracy, "f1": f1}

In [26]:
from transformers import Trainer, EarlyStoppingCallback

# Wire model, arguments, data and metrics together. The feature extractor is
# handed over through the `tokenizer` argument (presumably so it is used for
# batching and saved with the checkpoints -- TODO confirm for the installed
# transformers version). Early stopping is configured with a patience of 3.
trainer = Trainer(
    model,
    training_args,
    train_dataset=dataset_encoded["train"],
    eval_dataset=dataset_encoded["validation"],
    tokenizer=feature_extractor,
    compute_metrics=compute_metrics,
    callbacks = [EarlyStoppingCallback(early_stopping_patience=3 )],
)

# Start fine-tuning. NOTE(review): the recorded output below ends in a
# KeyboardInterrupt, so this notebook does not show a completed training run.
trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
# Evaluate on the Trainer's default eval_dataset (the validation split passed at construction).
trainer.evaluate()

In [None]:
# Final evaluation on the held-out test split. metric_key_prefix="test" reports
# the results as test_* instead of the default eval_*, so they cannot be confused
# with (or overwrite) the validation metrics in the logs and in W&B.
trainer.evaluate(eval_dataset=dataset_encoded["test"], metric_key_prefix="test")