In [None]:
! pip install tensorflow
# ! pip install torch

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import random
import logging # allows writing status messages to a file or output streams

import numpy as np 
import tensorflow as tf 
from tensorflow import keras
from tensorflow.keras import layers

# Fix the random seed up front so that repeated runs are comparable.
tf.keras.utils.set_random_seed(42)

# Keep the notebook output readable: TensorFlow only reports errors.
tf.get_logger().setLevel(logging.ERROR)

In [None]:
# ---- Notebook configuration ----

# Audio input
# Length, in seconds, of each audio clip fed to wav2vec2.
MAX_DURATION = 1
# Number of audio samples recorded per second.
SAMPLING_RATE = 16000
# Maximum number of samples per clip (1 s x 16 kHz = 16000).
MAX_SEQ_LENGTH = SAMPLING_RATE * MAX_DURATION

# Model
# Pre-trained checkpoint to fine-tune.
MODEL_CHECKPOINT = "facebook/wav2vec2-base-960h"
# Width of the encoder output (768 for wav2vec2-base).
HIDDEN_DIM = 768
# Number of encoder output frames assumed per clip; used as the mask length
# when pooling the encoder output.
MAX_FRAMES = 49
# Number of target classes.
NUM_CLASSES = 10

# Training
# Batch size used for training and evaluation.
BATCH_SIZE = 32
# Maximum number of training epochs.
MAX_EPOCHS = 2

In [None]:
! pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from datasets import load_dataset
# Load the "ks" configuration of the "superb" dataset; the result is a
# DatasetDict with train / validation / test splits (displayed in the next cell).
speech_command_v1 = load_dataset("superb", "ks")



  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
speech_command_v1

DatasetDict({
    train: Dataset({
        features: ['file', 'audio', 'label'],
        num_rows: 51094
    })
    validation: Dataset({
        features: ['file', 'audio', 'label'],
        num_rows: 6798
    })
    test: Dataset({
        features: ['file', 'audio', 'label'],
        num_rows: 3081
    })
})

In [None]:
# Carve a 65/35 train/test split out of the original *train* split, stratified
# by label so both parts keep the same class balance.
speech_command_v1 = speech_command_v1["train"].train_test_split(
    train_size=0.65, test_size=0.35, stratify_by_column="label"
)

# Drop the "_silence_" and "_unknown_" classes from both splits.
# NOTE: the ids must be compared with a membership test. Writing
# `label != (silence_id and unknown_id)` only compares against the last id,
# because `a and b` evaluates to `b` whenever `a` is truthy.
label_names = speech_command_v1["train"].features["label"].names
excluded_label_ids = {
    label_names.index("_silence_"),
    label_names.index("_unknown_"),
}
speech_command_v1 = speech_command_v1.filter(
    lambda example: example["label"] not in excluded_label_ids
)

# Trim every split to a whole number of batches, so that no batch is ever
# smaller than BATCH_SIZE.
for split_name in ("train", "test"):
    usable_rows = (len(speech_command_v1[split_name]) // BATCH_SIZE) * BATCH_SIZE
    speech_command_v1[split_name] = speech_command_v1[split_name].select(
        range(usable_rows)
    )



In [None]:
speech_command_v1

DatasetDict({
    train: Dataset({
        features: ['file', 'audio', 'label'],
        num_rows: 12032
    })
    test: Dataset({
        features: ['file', 'audio', 'label'],
        num_rows: 6464
    })
})

In [None]:
# Class names, in label-id order, as declared by the dataset's "label" feature.
labels = speech_command_v1["train"].features["label"].names

# Two lookup tables between class names and their ids (ids stored as strings).
label2id = {name: str(index) for index, name in enumerate(labels)}
id2label = {str(index): name for index, name in enumerate(labels)}

print(id2label)

{'0': 'yes', '1': 'no', '2': 'up', '3': 'down', '4': 'left', '5': 'right', '6': 'on', '7': 'off', '8': 'stop', '9': 'go', '10': '_silence_', '11': '_unknown_'}


Pre-processing

In [None]:
! pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Import the necessary libraries

from transformers import AutoFeatureExtractor

# Load the feature extractor that belongs to MODEL_CHECKPOINT.
# return_attention_mask=True asks it to emit an "attention_mask" alongside
# "input_values"; the classification layer below reads that mask to tell
# real audio from padding.
feature_extractor = AutoFeatureExtractor.from_pretrained(
    MODEL_CHECKPOINT, return_attention_mask=True
)

def preprocess_function(examples):
    """Convert a batch of raw audio examples into fixed-length model inputs.

    Args:
        examples: a batch from the dataset (as passed by ``Dataset.map`` with
            ``batched=True``); ``examples["audio"]`` is a list of dicts whose
            ``"array"`` entry holds the waveform.

    Returns:
        The feature extractor's output for the batch. Every clip is truncated
        or padded to exactly ``MAX_SEQ_LENGTH`` samples.
    """
    audio_arrays = [audio["array"] for audio in examples["audio"]]
    return feature_extractor(
        audio_arrays,
        sampling_rate=feature_extractor.sampling_rate,
        max_length=MAX_SEQ_LENGTH,
        truncation=True,
        # Pad to MAX_SEQ_LENGTH rather than to the longest clip of the current
        # batch: `map` processes the data in chunks, and "longest in chunk"
        # padding could produce different widths per chunk, none of which is
        # guaranteed to match the model's fixed input length.
        padding="max_length",
    )

# Run the feature extractor over both splits in batches. The raw "audio" and
# "file" columns are dropped afterwards: they are not needed for training.
processed_speech_commands_v1 = speech_command_v1.map(
    preprocess_function,
    batched=True,
    remove_columns=["audio", "file"],
)

# Shuffle each split with a fixed seed, then load it entirely as a dict of
# NumPy arrays.
train, test = (
    processed_speech_commands_v1[split].shuffle(seed=42).with_format("numpy")[:]
    for split in ("train", "test")
)




Wav2Vec2.0 / Classification Head

In [None]:
from transformers import TFWav2Vec2Model
# Model Definition

def mean_pool(hidden_states, feature_lengths):
    """Average the encoder output over time, ignoring padded frames.

    Args:
        hidden_states: encoder output; expected layout is
            (batch, frames, hidden) since axis 1 is the one averaged over.
        feature_lengths: per-example number of valid (non-padding) frames,
            one value per batch row.

    Returns:
        A (batch, hidden) tensor: for each example, the mean of its first
        ``feature_lengths[i]`` frames. Rows with zero valid frames yield 0
        instead of NaN.
    """
    # True for frames that come from real audio, False for padding. The mask
    # length follows the actual frame count of `hidden_states`, so the
    # function does not depend on a fixed batch size or frame count.
    valid_mask = tf.sequence_mask(
        feature_lengths, maxlen=tf.shape(hidden_states)[1], dtype=tf.dtypes.bool
    )

    # Zero out the padded frames; the (batch, frames, 1) mask broadcasts over
    # the hidden dimension.
    masked_states = tf.where(tf.expand_dims(valid_mask, -1), hidden_states, 0.0)

    # Sum over time and divide by the number of valid frames per example.
    summed_states = tf.math.reduce_sum(masked_states, axis=1)
    valid_counts = tf.math.reduce_sum(
        tf.cast(valid_mask, dtype=hidden_states.dtype), axis=1, keepdims=True
    )
    return tf.math.divide_no_nan(summed_states, valid_counts)

class TFWav2Vec2ForAudioClassification(layers.Layer):
    """
    Wav2Vec2 encoder followed by mean pooling, dropout and a softmax
    classification head, wrapped as a single Keras layer for end-to-end training.

    Args:
        model_checkpoint: checkpoint identifier passed to
            ``TFWav2Vec2Model.from_pretrained``.
        num_classes: number of output classes (units of the final Dense layer).
    """

    def __init__(self, model_checkpoint, num_classes):
        super().__init__()
        # Pre-trained Wav2Vec2 encoder without any task head. The weights are
        # converted from the PyTorch checkpoint (from_pt=True) and
        # SpecAugment masking is turned off.
        self.wav2vec2 = TFWav2Vec2Model.from_pretrained(
            model_checkpoint, apply_spec_augment=False, from_pt=True
        )
        # Fallback pooling used when no attention mask is supplied.
        self.pooling = layers.GlobalAveragePooling1D()

        # Dropout applied to the pooled representation before classification.
        self.intermediate_layer_dropout = layers.Dropout(0.5)

        # Classification head; outputs class probabilities (softmax).
        self.final_layer = layers.Dense(num_classes, activation="softmax")

    def call(self, inputs):
        """Classify a batch of audio clips.

        Args:
            inputs: mapping with key ``"input_values"`` (the waveforms) and,
                optionally, ``"attention_mask"`` marking real samples vs padding.

        Returns:
            The softmax output of the classification head, one row per example.
        """
        # The first element of the encoder output is the last layer's hidden states.
        hidden_states = self.wav2vec2(inputs["input_values"])[0]

        # Use .get so that a missing mask falls through to plain average
        # pooling instead of raising KeyError.
        attention_mask = inputs.get("attention_mask")

        if tf.is_tensor(attention_mask):
            # Attention mask available: mean-pool only the frames that come
            # from real (unpadded) audio.
            # Length of each clip in samples = sum of its attention mask.
            audio_lengths = tf.math.reduce_sum(attention_mask, axis=-1)

            # Number of encoder output frames corresponding to each clip length.
            feature_lengths = self.wav2vec2.wav2vec2._get_feat_extract_output_lengths(
                audio_lengths
            )
            pooled_state = mean_pool(hidden_states, feature_lengths)
        else:
            # No attention mask: mean-pool over all output frames.
            pooled_state = self.pooling(hidden_states)

        intermediate_state = self.intermediate_layer_dropout(pooled_state)
        final_state = self.final_layer(intermediate_state)

        return final_state

In [None]:
def build_model():
    """Build and compile the Wav2Vec2 audio classification model.

    Returns:
        A compiled ``tf.keras.Model`` that takes a dict with
        ``"input_values"`` and ``"attention_mask"`` (each of length
        MAX_SEQ_LENGTH per example) and outputs NUM_CLASSES class probabilities.
    """
    # Model inputs. `shape` must be a tuple: "(MAX_SEQ_LENGTH)" without the
    # trailing comma is just an int.
    inputs = {
        "input_values": tf.keras.Input(shape=(MAX_SEQ_LENGTH,), dtype="float32"),
        "attention_mask": tf.keras.Input(shape=(MAX_SEQ_LENGTH,), dtype="float32"),
    }

    # Wav2Vec2 encoder + classification head, initialised from the
    # pre-trained checkpoint and applied to the symbolic inputs.
    outputs = TFWav2Vec2ForAudioClassification(MODEL_CHECKPOINT, NUM_CLASSES)(inputs)

    model = tf.keras.Model(inputs, outputs)

    # The head already applies softmax, so the loss receives probabilities
    # (from_logits=False) together with integer class labels.
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)
    optimizer = keras.optimizers.Adam(learning_rate=1e-5)

    model.compile(loss=loss, optimizer=optimizer, metrics=["accuracy"])
    return model

model = build_model()


TFWav2Vec2Model has backpropagation operations that are NOT supported on CPU. If you wish to train/fine-tine this model, you need a GPU or a TPU
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFWav2Vec2Model: ['lm_head.bias', 'lm_head.weight']
- This IS expected if you are initializing TFWav2Vec2Model from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFWav2Vec2Model from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFWav2Vec2Model were not initialized from the PyTorch model and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inf

In [None]:
# Model inputs = every column of the split except the target "label" column.
train_x = {column: train[column] for column in train if column != "label"}
test_x = {column: test[column] for column in test if column != "label"}

*Before training the model, we separate each split into the model inputs (the independent variables) and the target labels (the dependent variable).*

In [None]:
# Train the model; the held-out test split doubles as validation data.
# The epoch count comes from the MAX_EPOCHS constant instead of a duplicated
# literal, and the returned History is kept so the training curves can be
# inspected afterwards.
history = model.fit(
    train_x,
    train["label"],
    validation_data=(test_x, test["label"]),
    batch_size=BATCH_SIZE,
    epochs=MAX_EPOCHS,
)

- Predict the classes for the audio samples in the test set using the `model.predict` method (it returns class probabilities; take the arg-max to obtain the predicted class).
- Epochs: for the best training results, increase the number of epochs.

### Epoch
- Another way to define the number of epochs is the number of complete passes the training dataset makes through the learning algorithm.
- One epoch means that each sample in the training dataset has had an opportunity to update the internal model parameters once.

In [None]:
# Predict class probabilities for the test inputs. The batch size is passed
# explicitly so that prediction uses the same batch size as training rather
# than relying on the Keras default happening to match BATCH_SIZE.
preds = model.predict(test_x, batch_size=BATCH_SIZE)