In [30]:
import datasets
import transformers 
import librosa #load audio files: soundfile package
import jiwer #evaluate fine tuned model using WER metric

In [31]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

Prepare Data, Tokenizer and Feature Extractor

In [32]:
# load the dataset and look at its struture

from datasets import load_dataset, load_metric

timit = load_dataset("timit_asr", data_dir="/home/ix502iv/Documents/Datasets/timit_large")
print(timit)

Using custom data configuration default-43b510b3628aa686
Found cached dataset timit_asr (/home/ix502iv/.cache/huggingface/datasets/timit_asr/default-43b510b3628aa686/0.0.0/43f9448dd5db58e95ee48a277f466481b151f112ea53e27f8173784da9254fb2)


  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['file', 'audio', 'text', 'phonetic_detail', 'word_detail', 'dialect_region', 'sentence_type', 'speaker_id', 'id'],
        num_rows: 4620
    })
    test: Dataset({
        features: ['file', 'audio', 'text', 'phonetic_detail', 'word_detail', 'dialect_region', 'sentence_type', 'speaker_id', 'id'],
        num_rows: 1680
    })
})


In [33]:
#drop some of the columns : keeps the notebook general

timit = timit.remove_columns(["phonetic_detail","word_detail", "dialect_region","id",
"sentence_type", "speaker_id"])

Transcription of the datasets

In [34]:
from cgitb import html
from datasets import ClassLabel
import random
import pandas as pd
import IPython.display 
from IPython.display import display, HTML

# a function to display some ransdom samples of datasets

def show_random_elements(dataset, num_examples=10):
    assert num_examples <= len(dataset), "can't pick more elements than there are in the dataset."
    picks = []
    for _ in range(num_examples):
        pick = random.randint(0, len(dataset)-1)
        while pick in picks:
            pick = random.randint(0, len(dataset)-1)
        picks.append(pick)

    df = pd.DataFrame(dataset[picks])
    display(HTML(df.to_html()))

show_random_elements(timit["train"].remove_columns(["file","audio"]))

Unnamed: 0,text
0,Spherical gifts are difficult to wrap.
1,Or maybe you just don't feel like a cigar?
2,Don't ask me to carry an oily rag like that.
3,Regular attendance is seldom required.
4,You came well equipped to die.
5,"Then came coconuts, eggs, and rice wine."
6,These curves were derived by an analysis of extensive skywave measurement data.
7,"Thank you, she said, dusting herself off."
8,Toss a die until an ace appears.
9,"She saw me and sat down beside me, three feet away."


In [35]:
#normalizing the text only to have lowercase, and removing the special chars
import re
chars_to_ignore_regex = '[\,\?\!\-\;\:\"]'

def remove_special_chars(batch):
    batch["text"] = re.sub(chars_to_ignore_regex, '', batch["text"]).lower()
    return batch

timit = timit.map(remove_special_chars)

show_random_elements(timit["train"].remove_columns(["file","audio"]))

Loading cached processed dataset at /home/ix502iv/.cache/huggingface/datasets/timit_asr/default-43b510b3628aa686/0.0.0/43f9448dd5db58e95ee48a277f466481b151f112ea53e27f8173784da9254fb2/cache-ee36d4da987ee493.arrow
Loading cached processed dataset at /home/ix502iv/.cache/huggingface/datasets/timit_asr/default-43b510b3628aa686/0.0.0/43f9448dd5db58e95ee48a277f466481b151f112ea53e27f8173784da9254fb2/cache-4654aad01660285c.arrow


Unnamed: 0,text
0,the rose corsage smelled sweet.
1,the bacteria formed typical activated sludge floc.
2,or maybe you just don't feel like a cigar
3,while waiting for chipper she crisscrossed the square many times.
4,don't ask me to carry an oily rag like that.
5,his suresure was enthusiastic this time.
6,that stinging vapor was caused by chloride vaporization.
7,remember to allow identical twins to enter freely.
8,we're not drunkards she said.
9,rich looked for spotted hyenas and jaguars on the safari.


In [36]:
#mapping function to concatenate all transcriptions inot one long transcription
#transform the entire into a set f chars

def extract_all_chars(batch): #pass the batch: acess all transcriptions at once
    all_text = " ".join(batch["text"])
    vocab = list(set(all_text))
    return {"vocab": [vocab], "all_text": [all_text]}

vocabs = timit.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True,
remove_columns=timit.column_names["train"])

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [37]:
#create a union of all distict letters in the training dataset
# convert the resulting list inot an enumearated dictionary

vocab_list = list(set(vocabs["train"]["vocab"][0]) | set(vocabs["test"]["vocab"][0]))

vocab_dict = {v: k for k, v in enumerate(vocab_list)}
vocab_dict

{'r': 0,
 'i': 1,
 'a': 2,
 '.': 3,
 "'": 4,
 'b': 5,
 ' ': 6,
 'e': 7,
 'n': 8,
 'x': 9,
 'w': 10,
 'm': 11,
 'f': 12,
 'v': 13,
 'u': 14,
 'j': 15,
 'c': 16,
 'q': 17,
 's': 18,
 'g': 19,
 'k': 20,
 'p': 21,
 'h': 22,
 'l': 23,
 'd': 24,
 't': 25,
 'y': 26,
 'z': 27,
 'o': 28}

In [38]:
vocab_dict["|"] = vocab_dict[" "] #give the spce character a more visible definition
del vocab_dict[" "]

In [39]:
#add a padding token correspoding to CTC's blank token

vocab_dict["[UNK]"] = len(vocab_dict)
vocab_dict["[PAD"] = len(vocab_dict)
print(len(vocab_dict))

31


we have a vocabulary list that consists of 31 tokens, therefore, the linear layer that we will
add on top of the pretrained wav2vec2 checkpoint will have an output dimension of 31

In [40]:
#save the vocabas a json file

import json
with open('vocab.json','w') as vocab_file:
    json.dump(vocab_dict, vocab_file)

In [41]:
# we use the json file to instantiate an object of the Wav2Vec2CTCTokenizer class

from transformers import Wav2Vec2CTCTokenizer
tokenizer = Wav2Vec2CTCTokenizer("./vocab.json", unk_token="[UNK]", pad_token="[PAD]",
                word_delimiter_token="|")


In [42]:
# to re-use the just created tokenizer with the finetuned model of the notebook
repo_name = "wav2vec2-base-timit-demo-vscode"
tokenizer.push_to_hub(repo_name)

CommitInfo(commit_url='https://huggingface.co/ix502iv/wav2vec2-base-timit-demo-vscode/commit/6e1580a69cd80686c5e31a5f6e1e29d4dbcc8511', commit_message='Upload tokenizer', commit_description='', oid='6e1580a69cd80686c5e31a5f6e1e29d4dbcc8511', pr_url=None, pr_revision=None, pr_num=None)

 Create Wav2Vec2 Feature Extractor 

In [43]:
from transformers import Wav2Vec2FeatureExtractor
feature_extractor = Wav2Vec2FeatureExtractor(
    feature_size=1, sampling_rate=16000,
padding_value=0.0, do_normalize=True, return_attention_mask=False
)

In [44]:
#wrap the exractor and the tokenizer into a single processor
# so that only one needs a model and a processor object

from transformers import Wav2Vec2Processor
processor = Wav2Vec2Processor(
    feature_extractor=feature_extractor,
    tokenizer=tokenizer
    )

In [45]:
#printing the shape of the speech input, its transcription and the corresponding sampling rate
import numpy as np

rand_int = random.randint(0, len(timit["train"]))

print("Target Text: ", timit["train"][rand_int]["text"])
print("Input array shape: ", np.asarray(timit["train"][rand_int]["audio"]["array"]).shape)
print("Sampling Rate: ", timit["train"][rand_int]["audio"]["sampling_rate"])

Target Text:  move the garbage nearer to the large window.
Input array shape:  (42701,)
Sampling Rate:  16000


from the result above:
    the target text is normalized
    the sampling rate in all corresponds to 16000

Make the Data ready for our machine learning model

In [46]:
# we make use of the map function to resample the entire dataset
# extract the input_values from the loaded audio file
# wav...processor -> normalizes the data

def prepare_dataset(batch):
    audio = batch["audio"]

    #batched output is "un-batched" -> ensures mapping is done correctly
    batch["input_values"] = processor(audio["array"], sampling_rate=audio["sampling_rate"]).input_values[0]

    with processor.as_target_processor():
        batch["labels"] = processor(batch["text"]).input_ids
    return batch

In [47]:
# applying data preparation function to all examples
timit = timit.map(prepare_dataset, remove_columns=timit.column_names["train"], num_proc=4)

     

Loading cached processed dataset at /home/ix502iv/.cache/huggingface/datasets/timit_asr/default-43b510b3628aa686/0.0.0/43f9448dd5db58e95ee48a277f466481b151f112ea53e27f8173784da9254fb2/cache-6723818cb8a1b27a.arrow


 

Loading cached processed dataset at /home/ix502iv/.cache/huggingface/datasets/timit_asr/default-43b510b3628aa686/0.0.0/43f9448dd5db58e95ee48a277f466481b151f112ea53e27f8173784da9254fb2/cache-26bbb6a60ea2d22d.arrow


 

Loading cached processed dataset at /home/ix502iv/.cache/huggingface/datasets/timit_asr/default-43b510b3628aa686/0.0.0/43f9448dd5db58e95ee48a277f466481b151f112ea53e27f8173784da9254fb2/cache-a914c28a7cb5e8de.arrow


 

Loading cached processed dataset at /home/ix502iv/.cache/huggingface/datasets/timit_asr/default-43b510b3628aa686/0.0.0/43f9448dd5db58e95ee48a277f466481b151f112ea53e27f8173784da9254fb2/cache-d6ef3a84d8d3b871.arrow


     

Loading cached processed dataset at /home/ix502iv/.cache/huggingface/datasets/timit_asr/default-43b510b3628aa686/0.0.0/43f9448dd5db58e95ee48a277f466481b151f112ea53e27f8173784da9254fb2/cache-a744bef9a8be6adb.arrow


 

Loading cached processed dataset at /home/ix502iv/.cache/huggingface/datasets/timit_asr/default-43b510b3628aa686/0.0.0/43f9448dd5db58e95ee48a277f466481b151f112ea53e27f8173784da9254fb2/cache-694599b29af747f7.arrow


 

Loading cached processed dataset at /home/ix502iv/.cache/huggingface/datasets/timit_asr/default-43b510b3628aa686/0.0.0/43f9448dd5db58e95ee48a277f466481b151f112ea53e27f8173784da9254fb2/cache-19899890d8949e43.arrow


 

Loading cached processed dataset at /home/ix502iv/.cache/huggingface/datasets/timit_asr/default-43b510b3628aa686/0.0.0/43f9448dd5db58e95ee48a277f466481b151f112ea53e27f8173784da9254fb2/cache-a7dc281d1dd0541d.arrow


Training and Evaluation

In [48]:
# define a data colletor -> collect and combine
# setup and monitor an evaluation metric
# load a pretrained checkpoint
# define the training configuration

import torch

from dataclasses import dataclass,field
from typing import Any, Dict, List, Optional, Union


@dataclass
class DataCollatorCTCWithPadding:
    """
    Data collator dynamically pads the inputs received.

    Args:
        processor (:class: '~transformers.Wav2Vec2Processor')
        the processor used on the data
    """

    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True
    max_length: Optional[int] = None
    max_length_labels: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None


    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lengths and need different
        # typs of padding methods

        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of = self.pad_to_multiple_of,
            return_tensors="pt",
        )

        with self.processor.as_target_processor():
            labels_batch = self.processor.pad(
                label_features,
                padding=self.padding,
                max_length=self.max_length_labels,
                pad_to_multiple_of=self.pad_to_multiple_of_labels,
                return_tensors="pt",
            )

        #replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        batch["labels"] = labels

        return batch


#initialzing the data collator
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)
# defining the evaluation metric
wer_metric = load_metric("wer")

In [49]:
def compute_metrics(pred):
    pred_logits = pred.predictions
    pred_ids = np.argmax(pred_logits, axis=-1)

    pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id

    pred_str = processor.batch_decode(pred_ids)
    #since we do not want ot group tokens when computing the metrics
    label_str = processor.batch_decode(pred.label_ids, group_token=False)
    wer = wer_metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

In [50]:
#load the pretrained wav2vec2 checkpoint

from transformers import Wav2Vec2ForCTC

model = Wav2Vec2ForCTC.from_pretrained(
    "facebook/wav2vec2-base",
    ctc_loss_reduction = "mean",
    pad_token_id=processor.tokenizer.pad_token_id
)

Some weights of the model checkpoint at facebook/wav2vec2-base were not used when initializing Wav2Vec2ForCTC: ['quantizer.weight_proj.weight', 'project_q.bias', 'project_hid.bias', 'project_q.weight', 'quantizer.weight_proj.bias', 'quantizer.codevectors', 'project_hid.weight']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['lm_head.weight', 'lm_head.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predicti

In [54]:
#set the params of feature extraction to False _ since the first part of the wav2vec2 cnn has been
# sufficiently trained

model.freeze_feature_extractor()
#model.freeze_feature_encoder()



In [55]:
from transformers import TrainingArguments

training_args = TrainingArguments(
  output_dir=repo_name,
  group_by_length=True,
  per_device_train_batch_size=32,
  evaluation_strategy="steps",
  num_train_epochs=30,
  gradient_checkpointing=True, 
  save_steps=500,
  eval_steps=500,
  logging_steps=500,
  learning_rate=1e-4,
  weight_decay=0.005,
  warmup_steps=1000,
  save_total_limit=2,
)

In [60]:
import transformers
from transformers import Trainer

trainer = Trainer(
    model = model,
    data_collator=data_collator,
    args = training_args,
    compute_metrics=compute_metrics,
    train_dataset=timit["train"],
    eval_dataset=timit["test"],
    tokenizer=processor.feature_extractor,
)

In [61]:
trainer.train()

***** Running training *****
  Num examples = 4620
  Num Epochs = 30
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 4350


  0%|          | 0/4350 [00:00<?, ?it/s]



: 

: 