In [1]:
# Locale codes, one per line for easier scanning and diffing.
languages = [
    "cs-CZ",
    "de-DE",
    "en-AU",
    "en-GB",
    "en-US",
    "es-ES",
    "fr-FR",
    "it-IT",
    "ko-KR",
    "nl-NL",
    "pl-PL",
    "pt-PT",
    "ru-RU",
    "zh-CN",
]

In [2]:
%%capture
!pip install --upgrade datasets
!pip install --upgrade transformers
!pip install --upgrade torchaudio
!pip install --upgrade accelerate
!pip install evaluate

In [3]:
from huggingface_hub import notebook_login

# Authenticate against the Hugging Face Hub; the call renders a login widget
# (see the VBox output below). Presumably required because the training cell
# sets push_to_hub=True -- confirm if the notebook is run without uploading.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
from datasets import load_dataset

# Load the "all" configuration of MINDS-14 (train split) for multi-lingual
# fine-tuning, then display the dataset summary.
minds_14 = load_dataset(
    path="PolyAI/minds14",
    name="all",
    split="train",
)
minds_14

Generating train split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['path', 'audio', 'transcription', 'english_transcription', 'intent_class', 'lang_id'],
    num_rows: 8168
})

In [5]:
# Hold out 20% of the rows for evaluation. A fixed seed keeps the split (and
# therefore the reported accuracy) reproducible across kernel restarts.
minds = minds_14.train_test_split(test_size=0.2, seed=42)
minds

DatasetDict({
    train: Dataset({
        features: ['path', 'audio', 'transcription', 'english_transcription', 'intent_class', 'lang_id'],
        num_rows: 6534
    })
    test: Dataset({
        features: ['path', 'audio', 'transcription', 'english_transcription', 'intent_class', 'lang_id'],
        num_rows: 1634
    })
})

In [6]:
# Keep only the audio and the language id. Only columns that are still present
# are dropped, so re-running this cell does not fail on already-removed columns.
unused_columns = ["path", "transcription", "english_transcription", "intent_class"]
columns_to_drop = [name for name in unused_columns if name in minds["train"].column_names]
if columns_to_drop:
    minds = minds.remove_columns(columns_to_drop)
minds["train"][0]

{'audio': {'path': '/root/.cache/huggingface/datasets/downloads/extracted/28aa727f91fee90575c34956bab09d1716cfaf460c6afcba86a10f04a7d58b83/pt-PT~LATEST_TRANSACTIONS/CA0d875f8c4422ed896e18d84f199edb48_1.wav',
  'array': array([ 0.00024414,  0.00024414,  0.        , ..., -0.00268555,
         -0.00366211,  0.00549316]),
  'sampling_rate': 8000},
 'lang_id': 11}

In [7]:
# Build both directions of the label mapping from the names of the "lang_id"
# feature. Ids are stored as strings, matching the position of each name.
labels = minds["train"].features["lang_id"].names
label2id = {name: str(idx) for idx, name in enumerate(labels)}
id2label = {str(idx): name for idx, name in enumerate(labels)}

In [8]:
from transformers import AutoFeatureExtractor

# Feature extractor belonging to the pretrained checkpoint that is fine-tuned below.
checkpoint = "facebook/wav2vec2-xls-r-300m"
feature_extractor = AutoFeatureExtractor.from_pretrained(checkpoint)

preprocessor_config.json:   0%|          | 0.00/212 [00:00<?, ?B/s]

In [9]:
from datasets import Audio

# Re-declare the audio column at 16 kHz (the recorded outputs show the sample
# going from sampling_rate 8000 to 16000), then inspect the first training row.
audio_16khz = Audio(sampling_rate=16_000)
minds = minds.cast_column("audio", audio_16khz)
minds["train"][0]

{'audio': {'path': '/root/.cache/huggingface/datasets/downloads/extracted/28aa727f91fee90575c34956bab09d1716cfaf460c6afcba86a10f04a7d58b83/pt-PT~LATEST_TRANSACTIONS/CA0d875f8c4422ed896e18d84f199edb48_1.wav',
  'array': array([0.00024588, 0.0003087 , 0.00024219, ..., 0.00082102, 0.00416982,
         0.00458909]),
  'sampling_rate': 16000},
 'lang_id': 11}

In [10]:
def preprocess_function(examples):
    """Run the module-level ``feature_extractor`` over a batch of examples.

    Args:
        examples: Batch whose ``"audio"`` entry is iterable and yields items
            with an ``"array"`` key (the raw waveform).

    Returns:
        Whatever ``feature_extractor`` returns for the batch, called with
        ``truncation=True`` and ``max_length=16000``.
    """
    waveforms = []
    for clip in examples["audio"]:
        waveforms.append(clip["array"])

    return feature_extractor(
        waveforms,
        sampling_rate=feature_extractor.sampling_rate,
        max_length=16000,
        truncation=True,
    )

In [11]:
# Featurize every split in batches and drop the raw "audio" column afterwards.
encoded_minds = minds.map(
    preprocess_function,
    batched=True,
    remove_columns="audio",
)
# Expose the target under the column name "label" instead of "lang_id".
encoded_minds = encoded_minds.rename_column("lang_id", "label")

Map:   0%|          | 0/6534 [00:00<?, ? examples/s]

Map:   0%|          | 0/1634 [00:00<?, ? examples/s]

In [12]:
import evaluate

# Accuracy metric object; compute_metrics calls its .compute() method.
accuracy = evaluate.load(path="accuracy")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [13]:
import numpy as np

def compute_metrics(eval_pred):
    """Score a batch of predictions with the module-level ``accuracy`` metric.

    Args:
        eval_pred: Object exposing ``predictions`` (per-class scores) and
            ``label_ids`` (reference class ids).

    Returns:
        The result of ``accuracy.compute`` on the arg-max class of each row.
    """
    scores = eval_pred.predictions
    references = eval_pred.label_ids
    # Pick the highest-scoring class along axis 1 for every example.
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

In [14]:
from transformers import AutoModelForAudioClassification, TrainingArguments, Trainer

# Size the classification head from the label mapping and pass both mapping
# directions along so they are stored with the model configuration.
num_labels = len(id2label)
label_config = dict(num_labels=num_labels, label2id=label2id, id2label=id2label)
model = AutoModelForAudioClassification.from_pretrained(
    "facebook/wav2vec2-xls-r-300m",
    **label_config,
)

config.json:   0%|          | 0.00/1.57k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.27G [00:00<?, ?B/s]

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-xls-r-300m and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
!pip install accelerate -U

Installing collected packages: accelerate
Successfully installed accelerate-0.27.2


In [23]:
!pip install git+https://github.com/huggingface/accelerate

Installing collected packages: accelerate
  Attempting uninstall: accelerate
    Found existing installation: accelerate 0.27.2
    Uninstalling accelerate-0.27.2:
      Successfully uninstalled accelerate-0.27.2
Successfully installed accelerate-0.28.0.dev0


In [None]:
from transformers import TrainingArguments, Trainer

# Fine-tuning configuration.
# - output_dir is also the name of the Hub repository the model is pushed to
#   (push_to_hub=True and no hub_model_id is given), so it has to match the
#   "my_awesome_minds_model" repository that the inference cells load.
# - Effective train batch size per device is 32 * 4 = 128 because of gradient
#   accumulation.
# - Evaluation and checkpointing both happen once per epoch, and the checkpoint
#   with the best accuracy is restored at the end of training.
training_args = TrainingArguments(
    output_dir="my_awesome_minds_model",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=32,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=32,
    num_train_epochs=10,
    warmup_ratio=0.1,
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    push_to_hub=True,
)

# The feature extractor is handed to the Trainer through its `tokenizer` slot.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_minds["train"],
    eval_dataset=encoded_minds["test"],
    tokenizer=feature_extractor,
    compute_metrics=compute_metrics,
)

trainer.train()

Epoch,Training Loss,Validation Loss


In [None]:
from datasets import load_dataset, Audio

# Reload the corpus for inference and declare the audio column at 16 kHz.
# NOTE(review): this draws from the same "train" split the model was fine-tuned
# on, so the sampled clip may have been seen during training.
dataset = load_dataset("PolyAI/minds14", "all", split="train")
dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))
sampling_rate = dataset.features["audio"].sampling_rate
# Get a random audio file
index = int(np.random.randint(len(dataset)))
sample = dataset[index]  # fetch the row once instead of once per field
audio_file = sample["audio"]["path"]
# Keep the language *name* rather than the integer class id, so it can be read
# side by side with the label name the model predicts.
actual_label = dataset.features["lang_id"].int2str(sample["lang_id"])

In [None]:
from transformers import pipeline

# Audio-classification pipeline backed by the fine-tuned checkpoint on the Hub,
# applied to the path of the selected audio file.
classifier = pipeline(
    task="audio-classification",
    model="mahdouch9165/my_awesome_minds_model",
)
classifier(audio_file)

In [None]:
from transformers import AutoFeatureExtractor

feature_extractor = AutoFeatureExtractor.from_pretrained("mahdouch9165/my_awesome_minds_model")
# Featurize the randomly selected clip (dataset[index]) rather than dataset[0],
# so the manual prediction is for the same clip that audio_file and
# actual_label refer to.
inputs = feature_extractor(dataset[index]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pt")

In [None]:
# torch must be imported in this cell: torch.no_grad() is used below, and on a
# fresh kernel no earlier cell imports it.
import torch
from transformers import AutoModelForAudioClassification

model = AutoModelForAudioClassification.from_pretrained("mahdouch9165/my_awesome_minds_model")
# Forward pass without gradient tracking; only the logits are needed.
with torch.no_grad():
    logits = model(**inputs).logits

In [None]:
import torch

# Index of the highest logit -> label name from the model configuration.
predicted_class_ids = logits.argmax().item()
predicted_label = model.config.id2label[predicted_class_ids]
# Report prediction and reference on consecutive lines.
print(
    f"Predicted label: {predicted_label}",
    f"Actual label: {actual_label}",
    sep="\n",
)