In [117]:
!pip install --upgrade pip
!pip install --upgrade datasets[audio] transformers jiwer accelerate evaluate  tensorboard gradio

[0m

In [118]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [119]:
import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

**DATA loading**

In [120]:
from datasets import load_dataset

train_dataset = load_dataset("mozilla-foundation/common_voice_16_0", "te", split="train",use_auth_token=True)
validation_dataset = load_dataset("mozilla-foundation/common_voice_16_0", "te", split="validation",use_auth_token=True)
test_dataset = load_dataset("mozilla-foundation/common_voice_16_0", "te", split="test",use_auth_token=True)

train_dataset, validation_dataset, test_dataset

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


(Dataset({
     features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment', 'variant'],
     num_rows: 39
 }),
 Dataset({
     features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment', 'variant'],
     num_rows: 25
 }),
 Dataset({
     features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment', 'variant'],
     num_rows: 27
 }))

In [121]:
common_voice_train = train_dataset.remove_columns(["client_id", "up_votes", "down_votes", "age", "gender", "accent", "locale", "segment", "variant"])
common_voice_validation = validation_dataset.remove_columns(["client_id", "up_votes", "down_votes", "age", "gender", "accent", "locale", "segment", "variant"])
common_voice_test = test_dataset.remove_columns(["client_id", "up_votes", "down_votes", "age", "gender", "accent", "locale", "segment", "variant"])

common_voice_train, common_voice_validation, common_voice_test

(Dataset({
     features: ['path', 'audio', 'sentence'],
     num_rows: 39
 }),
 Dataset({
     features: ['path', 'audio', 'sentence'],
     num_rows: 25
 }),
 Dataset({
     features: ['path', 'audio', 'sentence'],
     num_rows: 27
 }))

**Tokenization**

In [122]:
import re
chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\“\%\‘\”\�]'

def remove_special_characters(batch):
    batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower() + " "
    return batch

In [123]:
common_voice_train = common_voice_train.map(remove_special_characters)
common_voice_val = common_voice_validation.map(remove_special_characters)
common_voice_test = common_voice_test.map(remove_special_characters)

common_voice_train, common_voice_val, common_voice_test

(Dataset({
     features: ['path', 'audio', 'sentence'],
     num_rows: 39
 }),
 Dataset({
     features: ['path', 'audio', 'sentence'],
     num_rows: 25
 }),
 Dataset({
     features: ['path', 'audio', 'sentence'],
     num_rows: 27
 }))

In [124]:
def extract_all_chars(batch):
  all_text = " ".join(batch["sentence"])
  vocab = list(set(all_text))
  return {"vocab": [vocab], "all_text": [all_text]}

In [125]:
vocab_train = common_voice_train.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=common_voice_train.column_names)
vocab_val = common_voice_val.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=common_voice_val.column_names)

vocab_train, vocab_val

Map:   0%|          | 0/39 [00:00<?, ? examples/s]

Map:   0%|          | 0/25 [00:00<?, ? examples/s]

(Dataset({
     features: ['vocab', 'all_text'],
     num_rows: 1
 }),
 Dataset({
     features: ['vocab', 'all_text'],
     num_rows: 1
 }))

In [126]:
vocab_list = list(set(vocab_train["vocab"][0]) | set(vocab_val["vocab"][0]))

print(len(vocab_list), vocab_list)

51 ['ి', 'ల', 'వ', 'బ', 'ు', 'ె', 'ౌ', 'మ', 'క', 'ీ', 'శ', 'ట', 'ణ', 'య', '్', 'ై', 'భ', ' ', 'ూ', 'ఏ', 'ఇ', 'ఆ', 'గ', 'ప', 'ర', 'ఘ', 'డ', 'ొ', 'ే', 'ఒ', 'ఈ', 'ఉ', 'త', 'ఫ', 'ో', 'స', 'ధ', 'థ', 'జ', 'చ', 'ఎ', 'ఖ', 'ా', 'న', 'ద', 'అ', 'ం', 'ష', 'హ', 'ృ', 'ళ']


In [13]:
vocab_dict = {v: k for k, v in enumerate(vocab_list)}

vocab_dict["|"] = vocab_dict[" "]
del vocab_dict[" "]
# len(vocab_dict), vocab_dict

In [128]:
vocab_dict["[UNK]"] = len(vocab_dict)
vocab_dict["[PAD]"] = len(vocab_dict)
len(vocab_dict), vocab_dict

(53,
 {'ి': 0,
  'ల': 1,
  'వ': 2,
  'బ': 3,
  'ు': 4,
  'ె': 5,
  'ౌ': 6,
  'మ': 7,
  'క': 8,
  'ీ': 9,
  'శ': 10,
  'ట': 11,
  'ణ': 12,
  'య': 13,
  '్': 14,
  'ై': 15,
  'భ': 16,
  'ూ': 18,
  'ఏ': 19,
  'ఇ': 20,
  'ఆ': 21,
  'గ': 22,
  'ప': 23,
  'ర': 24,
  'ఘ': 25,
  'డ': 26,
  'ొ': 27,
  'ే': 28,
  'ఒ': 29,
  'ఈ': 30,
  'ఉ': 31,
  'త': 32,
  'ఫ': 33,
  'ో': 34,
  'స': 35,
  'ధ': 36,
  'థ': 37,
  'జ': 38,
  'చ': 39,
  'ఎ': 40,
  'ఖ': 41,
  'ా': 42,
  'న': 43,
  'ద': 44,
  'అ': 45,
  'ం': 46,
  'ష': 47,
  'హ': 48,
  'ృ': 49,
  'ళ': 50,
  '|': 17,
  '[UNK]': 51,
  '[PAD]': 52})

In [35]:
import json
with open('/content/drive/MyDrive/DATA/telugu_data/vocab.json', 'w') as f:
    json.dump(vocab_dict, f)

In [52]:
from transformers import Wav2Vec2CTCTokenizer

tokenizer = Wav2Vec2CTCTokenizer("/content/drive/MyDrive/DATA/telugu_data/vocab.json", unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|")

In [53]:
from transformers import Wav2Vec2Processor
from transformers import Wav2Vec2FeatureExtractor


feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1, sampling_rate=16000, padding_value=0.0, do_normalize=True, return_attention_mask=False)

processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

In [54]:
repo_name = "/content/drive/MyDrive/Models/wav2vec2-large-comon-voice"

In [56]:
from transformers import Wav2Vec2ForCTC

model_path = "facebook/wav2vec2-large-xlsr-53"

model = Wav2Vec2ForCTC.from_pretrained(
    model_path,
    attention_dropout=0.1,
    hidden_dropout=0.1,
    feat_proj_dropout=0.0,
    mask_time_prob=0.05,
    layerdrop=0.1,
    ctc_loss_reduction="mean",
    pad_token_id=processor.tokenizer.pad_token_id,
    vocab_size=len(vocab_dict)
)
model

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-xlsr-53 and are newly initialized: ['lm_head.bias', 'lm_head.weight', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Wav2Vec2ForCTC(
  (wav2vec2): Wav2Vec2Model(
    (feature_extractor): Wav2Vec2FeatureEncoder(
      (conv_layers): ModuleList(
        (0): Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (1-4): 4 x Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (5-6): 2 x Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
      )
    )
    (feature_projection): Wav2Vec2FeatureProjection(
      (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (projec

**Resampling**

In [129]:
from datasets import Audio

common_voice_train = common_voice_train.cast_column("audio", Audio(sampling_rate=16000))
common_voice_validation = common_voice_validation.cast_column("audio", Audio(sampling_rate=16000))
common_voice_test = common_voice_test.cast_column("audio", Audio(sampling_rate=16000))


common_voice_train, common_voice_validation, common_voice_test

(Dataset({
     features: ['path', 'audio', 'sentence'],
     num_rows: 39
 }),
 Dataset({
     features: ['path', 'audio', 'sentence'],
     num_rows: 25
 }),
 Dataset({
     features: ['path', 'audio', 'sentence'],
     num_rows: 27
 }))

In [130]:
common_voice_train[0], common_voice_validation[0], common_voice_test[0]

({'path': '/root/.cache/huggingface/datasets/downloads/extracted/c073aaafe00182259194e9c3a3b0e39885c14d234aa28d480035630a02d4ecc9/te_train_0/common_voice_te_38821716.mp3',
  'audio': {'path': '/root/.cache/huggingface/datasets/downloads/extracted/c073aaafe00182259194e9c3a3b0e39885c14d234aa28d480035630a02d4ecc9/te_train_0/common_voice_te_38821716.mp3',
   'array': array([ 2.54658516e-11,  9.45874490e-11,  8.00355338e-11, ...,
          -1.11347967e-04, -1.15115618e-04, -7.19922391e-05]),
   'sampling_rate': 16000},
  'sentence': 'గురువారం నాకు ఏ పని లేదు '},
 {'path': '/root/.cache/huggingface/datasets/downloads/extracted/287d613bd459b0958070309306be90f8e48f8c7b507512353183f8f8ab67cb12/te_dev_0/common_voice_te_39104590.mp3',
  'audio': {'path': '/root/.cache/huggingface/datasets/downloads/extracted/287d613bd459b0958070309306be90f8e48f8c7b507512353183f8f8ab67cb12/te_dev_0/common_voice_te_39104590.mp3',
   'array': array([ 6.50521303e-19, -1.08420217e-18, -2.71050543e-18, ...,
           

**Prepare dataset**

In [131]:
def prepare_dataset(batch):
    audio = batch["audio"]

    # batched output is "un-batched"
    batch["input_values"] = processor(audio["array"], sampling_rate=audio["sampling_rate"]).input_values[0]
    batch["input_length"] = len(batch["input_values"])

    with processor.as_target_processor():
        batch["labels"] = processor(batch["sentence"]).input_ids
    return batch

In [132]:
train_data = common_voice_train.map(prepare_dataset, num_proc=4, remove_columns=common_voice_train.column_names)

val_data = common_voice_validation.map(prepare_dataset, num_proc=4, remove_columns=common_voice_train.column_names)
train_data, val_data

  self.pid = os.fork()


Map (num_proc=4):   0%|          | 0/39 [00:00<?, ? examples/s]

  self.pid = os.fork()


Map (num_proc=4):   0%|          | 0/25 [00:00<?, ? examples/s]



(Dataset({
     features: ['input_values', 'input_length', 'labels'],
     num_rows: 39
 }),
 Dataset({
     features: ['input_values', 'input_length', 'labels'],
     num_rows: 25
 }))

**Training**

In [94]:
repo_name = "/content/drive/MyDrive/Models/wav2vec2-large-comon-voice_200"

In [96]:
import torch

from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

@dataclass
class DataCollatorCTCWithPadding:
    """
    Data collator that will dynamically pad the inputs received.
    Args:
        processor (:class:`~transformers.Wav2Vec2Processor`)
            The processor used for proccessing the data.
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
            among:
            * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
              sequence if provided).
            * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
              maximum acceptable input length for the model if that argument is not provided.
            * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
              different lengths).
        max_length (:obj:`int`, `optional`):
            Maximum length of the ``input_values`` of the returned list and optionally padding length (see above).
        max_length_labels (:obj:`int`, `optional`):
            Maximum length of the ``labels`` returned list and optionally padding length (see above).
        pad_to_multiple_of (:obj:`int`, `optional`):
            If set will pad the sequence to a multiple of the provided value.
            This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
            7.5 (Volta).
    """

    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True
    max_length: Optional[int] = None
    max_length_labels: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lenghts and need
        # different padding methods
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )
        with self.processor.as_target_processor():
            labels_batch = self.processor.pad(
                label_features,
                padding=self.padding,
                max_length=self.max_length_labels,
                pad_to_multiple_of=self.pad_to_multiple_of_labels,
                return_tensors="pt",
            )

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        batch["labels"] = labels

        return batch

In [97]:
from datasets import load_metric
import numpy as np

wer_metric = load_metric("wer", trust_remote_code=True)

data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

In [98]:
def compute_metrics(pred):
    pred_logits = pred.predictions
    pred_ids = np.argmax(pred_logits, axis=-1)

    pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id

    pred_str = processor.batch_decode(pred_ids)
    # we do not want to group tokens when computing the metrics
    label_str = processor.batch_decode(pred.label_ids, group_tokens=False)

    wer = wer_metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

In [99]:
model.freeze_feature_extractor()



In [100]:
from transformers import TrainingArguments

training_args = TrainingArguments(
  output_dir=repo_name,
  group_by_length=True,
  per_device_train_batch_size=4,
  evaluation_strategy="steps",
  num_train_epochs=200,
  # fp16=False,
  gradient_checkpointing=True,
  save_steps=500,
  eval_steps=500,
  logging_steps=500,
  learning_rate=1e-4,
  weight_decay=0.005,
  warmup_steps=1000,
  save_total_limit=2)

In [101]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_data,
    eval_dataset=val_data,
    tokenizer=processor.feature_extractor)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [102]:
#200 epochs
trainer.train()



Step,Training Loss,Validation Loss,Wer
500,0.6096,2.000157,1.04902
1000,0.241,2.386142,1.058824
1500,0.1131,2.587018,1.029412
2000,0.065,2.375171,1.019608




TrainOutput(global_step=2000, training_loss=0.25720303535461425, metrics={'train_runtime': 2168.2642, 'train_samples_per_second': 3.597, 'train_steps_per_second': 0.922, 'total_flos': 1.1329099015118573e+18, 'train_loss': 0.25720303535461425, 'epoch': 200.0})

In [103]:
model.save_pretrained(repo_name)
processor.save_pretrained(repo_name)

[]

In [82]:
#100 epochs
trainer.train()



Step,Training Loss,Validation Loss,Wer
500,3.3,3.533207,1.0
1000,2.4079,1.964025,1.0




TrainOutput(global_step=1000, training_loss=2.853957275390625, metrics={'train_runtime': 1034.527, 'train_samples_per_second': 3.77, 'train_steps_per_second': 0.967, 'total_flos': 5.661720064973952e+17, 'train_loss': 2.853957275390625, 'epoch': 100.0})

In [83]:
model.save_pretrained(repo_name)
processor.save_pretrained(repo_name)

[]

**Inference**

In [133]:
fine_model = Wav2Vec2ForCTC.from_pretrained("/content/drive/MyDrive/Models/wav2vec2-large-comon-voice_200/")

fine_processor = Wav2Vec2Processor.from_pretrained("/content/drive/MyDrive/Models/wav2vec2-large-comon-voice_200/")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [134]:
fine_model

Wav2Vec2ForCTC(
  (wav2vec2): Wav2Vec2Model(
    (feature_extractor): Wav2Vec2FeatureEncoder(
      (conv_layers): ModuleList(
        (0): Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (1-4): 4 x Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (5-6): 2 x Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
      )
    )
    (feature_projection): Wav2Vec2FeatureProjection(
      (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (projec

In [135]:
common_voice_test

Dataset({
    features: ['path', 'audio', 'sentence'],
    num_rows: 27
})

In [136]:
common_voice_test[0]

{'path': '/root/.cache/huggingface/datasets/downloads/extracted/775ea32a99d0ea9e5a1dfc95a089ac370150db9ba2986f68391ffe4b63a83ddb/te_test_0/common_voice_te_39111379.mp3',
 'audio': {'path': '/root/.cache/huggingface/datasets/downloads/extracted/775ea32a99d0ea9e5a1dfc95a089ac370150db9ba2986f68391ffe4b63a83ddb/te_test_0/common_voice_te_39111379.mp3',
  'array': array([0.00000000e+00, 7.56699592e-10, 8.14907253e-10, ...,
         9.90435365e-06, 2.74122840e-06, 3.23535551e-06]),
  'sampling_rate': 16000},
 'sentence': 'కలసపాడు బ్రిడ్జిపై నుంచి నీరు ప్రవహించింది '}

In [137]:
common_voice_test[1]

{'path': '/root/.cache/huggingface/datasets/downloads/extracted/775ea32a99d0ea9e5a1dfc95a089ac370150db9ba2986f68391ffe4b63a83ddb/te_test_0/common_voice_te_39111372.mp3',
 'audio': {'path': '/root/.cache/huggingface/datasets/downloads/extracted/775ea32a99d0ea9e5a1dfc95a089ac370150db9ba2986f68391ffe4b63a83ddb/te_test_0/common_voice_te_39111372.mp3',
  'array': array([0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
         1.02688391e-05, 1.17817272e-05, 3.56075475e-06]),
  'sampling_rate': 16000},
 'sentence': 'మార్కెట్ యార్డులోని గోదాములో భద్రపరిచిన మిర్చి బస్తాలు అగ్ని ప్రమాదంలో కాలిపోవడంతో చాల నష్టం వాటిల్లింది '}

In [138]:
from transformers import AutoModelForCTC, Wav2Vec2Processor, pipeline
import torch, torchaudio


asr = pipeline("automatic-speech-recognition",
               model=fine_model,
               tokenizer=fine_processor.tokenizer,
               feature_extractor=fine_processor.feature_extractor,
               max_new_tokens=128,
               chunk_length_s=15,
               batch_size=4,
               device=device
               )

In [140]:
TARGET_SAMPLING_RATE = 16000
audio, sampling_rate = torchaudio.load("/root/.cache/huggingface/datasets/downloads/extracted/775ea32a99d0ea9e5a1dfc95a089ac370150db9ba2986f68391ffe4b63a83ddb/te_test_0/common_voice_te_39111379.mp3")
audio = torchaudio.functional.resample(audio, sampling_rate, TARGET_SAMPLING_RATE)[0]
asr(audio.numpy())

{'text': 'కలసు పోడు బరుచ్ెిపయనించి నీరుప్వించింది'}

In [141]:
TARGET_SAMPLING_RATE = 16000
audio, sampling_rate = torchaudio.load("/root/.cache/huggingface/datasets/downloads/extracted/775ea32a99d0ea9e5a1dfc95a089ac370150db9ba2986f68391ffe4b63a83ddb/te_test_0/common_voice_te_39111372.mp3")
audio = torchaudio.functional.resample(audio, sampling_rate, TARGET_SAMPLING_RATE)[0]
asr(audio.numpy())

{'text': 'మార్కట్టి ాడులోను గోదముల వద్రపరచిన మిచివస్తలోద్నిత్మతనుకలరు్పడంతు చెరన ష్టం్నర్టిలింది'}