In [1]:
import datasets
import transformers 
import librosa #load audio files: soundfile package
import jiwer #evaluate fine tuned model using WER metric

In [2]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

Prepare Data, Tokenizer and Feature Extractor

In [23]:
# load the dataset and look at its struture

from datasets import load_dataset, load_metric

timit = load_dataset("timit_asr", data_dir="/home/ix502iv/Documents/Datasets/timit_large")
print(timit)

Using custom data configuration default-43b510b3628aa686
Found cached dataset timit_asr (/home/ix502iv/.cache/huggingface/datasets/timit_asr/default-43b510b3628aa686/0.0.0/43f9448dd5db58e95ee48a277f466481b151f112ea53e27f8173784da9254fb2)


  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['file', 'audio', 'text', 'phonetic_detail', 'word_detail', 'dialect_region', 'sentence_type', 'speaker_id', 'id'],
        num_rows: 4620
    })
    test: Dataset({
        features: ['file', 'audio', 'text', 'phonetic_detail', 'word_detail', 'dialect_region', 'sentence_type', 'speaker_id', 'id'],
        num_rows: 1680
    })
})


In [4]:
#drop some of the columns : keeps the notebook general

timit = timit.remove_columns(["phonetic_detail","word_detail", "dialect_region","id",
"sentence_type", "speaker_id"])

Transcription of the datasets

In [5]:
from cgitb import html
from datasets import ClassLabel
import random
import pandas as pd
import IPython.display 
from IPython.display import display, HTML

# a function to display some ransdom samples of datasets

def show_random_elements(dataset, num_examples=10):
    assert num_examples <= len(dataset), "can't pick more elements than there are in the dataset."
    picks = []
    for _ in range(num_examples):
        pick = random.randint(0, len(dataset)-1)
        while pick in picks:
            pick = random.randint(0, len(dataset)-1)
        picks.append(pick)

    df = pd.DataFrame(dataset[picks])
    display(HTML(df.to_html()))

show_random_elements(timit["train"].remove_columns(["file","audio"]))

Unnamed: 0,text
0,Bright sunshine shimmers on the ocean.
1,"He drove sensual patterns off, carefully shaving his long upper lip."
2,Don't ask me to carry an oily rag like that.
3,Don't ask me to carry an oily rag like that.
4,She had your dark suit in greasy wash water all year.
5,We saw eight tiny icicles below our roof.
6,Rich purchased several signed lithographs.
7,It's fun to roast marshmallows on a gas burner.
8,Don't ask me to carry an oily rag like that.
9,Bury those uniforms so they won't be found.


In [6]:
#normalizing the text only to have lowercase, and removing the special chars
import re
chars_to_ignore_regex = '[\,\?\!\-\;\:\"]'

def remove_special_chars(batch):
    batch["text"] = re.sub(chars_to_ignore_regex, '', batch["text"]).lower()
    return batch

timit = timit.map(remove_special_chars)

show_random_elements(timit["train"].remove_columns(["file","audio"]))

Loading cached processed dataset at /home/ix502iv/.cache/huggingface/datasets/timit_asr/default-43b510b3628aa686/0.0.0/43f9448dd5db58e95ee48a277f466481b151f112ea53e27f8173784da9254fb2/cache-ee36d4da987ee493.arrow
Loading cached processed dataset at /home/ix502iv/.cache/huggingface/datasets/timit_asr/default-43b510b3628aa686/0.0.0/43f9448dd5db58e95ee48a277f466481b151f112ea53e27f8173784da9254fb2/cache-4654aad01660285c.arrow


Unnamed: 0,text
0,who took the kayak down the bayou
1,homemade sauerkraut is served once a week.
2,those answers will be straightforward if you think them through carefully first.
3,critical equipment needs proper maintenance.
4,movies never have enough villains.
5,his head flopped back.
6,there was no hint of a violent struggle now.
7,the preschooler couldn't verbalize her feelings about the emergency conditions.
8,doctors prescribe drugs too freely.
9,his sudden departure shocked the cast.


In [7]:
#mapping function to concatenate all transcriptions inot one long transcription
#transform the entire into a set f chars

def extract_all_chars(batch): #pass the batch: acess all transcriptions at once
    all_text = " ".join(batch["text"])
    vocab = list(set(all_text))
    return {"vocab": [vocab], "all_text": [all_text]}

vocabs = timit.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True,
remove_columns=timit.column_names["train"])

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [8]:
#create a union of all distict letters in the training dataset
# convert the resulting list inot an enumearated dictionary

vocab_list = list(set(vocabs["train"]["vocab"][0]) | set(vocabs["test"]["vocab"][0]))

vocab_dict = {v: k for k, v in enumerate(vocab_list)}
vocab_dict

{'g': 0,
 'i': 1,
 'x': 2,
 'h': 3,
 'd': 4,
 ' ': 5,
 'l': 6,
 's': 7,
 'z': 8,
 'n': 9,
 'q': 10,
 'a': 11,
 'e': 12,
 'o': 13,
 'm': 14,
 'f': 15,
 'p': 16,
 'b': 17,
 '.': 18,
 'r': 19,
 'y': 20,
 'w': 21,
 'k': 22,
 'c': 23,
 "'": 24,
 'v': 25,
 'u': 26,
 't': 27,
 'j': 28}

In [9]:
vocab_dict["|"] = vocab_dict[" "] #give the spce character a more visible definition
del vocab_dict[" "]

In [10]:
#add a padding token correspoding to CTC's blank token

vocab_dict["[UNK]"] = len(vocab_dict)
vocab_dict["[PAD"] = len(vocab_dict)
print(len(vocab_dict))

31


we have a vocabulary list that consists of 31 tokens, therefore, the linear layer that we will
add on top of the pretrained wav2vec2 checkpoint will have an output dimension of 31

In [11]:
#save the vocabas a json file

import json
with open('vocab.json','w') as vocab_file:
    json.dump(vocab_dict, vocab_file)

In [12]:
# we use the json file to instantiate an object of the Wav2Vec2CTCTokenizer class

from transformers import Wav2Vec2CTCTokenizer
tokenizer = Wav2Vec2CTCTokenizer("./vocab.json", unk_token="[UNK]", pad_token="[PAD]",
                word_delimiter_token="|")


In [13]:
# to re-use the just created tokenizer with the finetuned model of the notebook
repo_name = "wav2vec2-base-timit-demo-vscode"
tokenizer.push_to_hub(repo_name)

CommitInfo(commit_url='https://huggingface.co/ix502iv/wav2vec2-base-timit-demo-vscode/commit/ddad78f1f1ca0968e2476ec322626bf36c6485dc', commit_message='Upload tokenizer', commit_description='', oid='ddad78f1f1ca0968e2476ec322626bf36c6485dc', pr_url=None, pr_revision=None, pr_num=None)

 Create Wav2Vec2 Feature Extractor 

In [14]:
from transformers import Wav2Vec2FeatureExtractor
feature_extractor = Wav2Vec2FeatureExtractor(
    feature_size=1, sampling_rate=16000,
padding_value=0.0, do_normalize=True, return_attention_mask=False
)

In [15]:
#wrap the exractor and the tokenizer into a single processor
# so that only one needs a model and a processor object

from transformers import Wav2Vec2Processor
processor = Wav2Vec2Processor(
    feature_extractor=feature_extractor,
    tokenizer=tokenizer
    )