In [1]:
from datasets import load_dataset
import matplotlib.pyplot as plt
import numpy as np
from IPython.display import Audio
from scipy.signal import resample
import torch
from tqdm import tqdm
from torch.utils.data import DataLoader


from transformers import WhisperTokenizer
from transformers import WhisperFeatureExtractor
from transformers import WhisperForConditionalGeneration

import evaluate
import re

from datasets import load_dataset,  Audio
wer  = evaluate.load('wer')


def down_sample_audio(audio_original, original_sample_rate):
    target_sample_rate = 16000

    # Calculate the number of samples for the target sample rate
    num_samples = int(len(audio_original) * target_sample_rate / original_sample_rate)

    # Resample the audio array to the target sample rate
    downsampled_audio = resample(audio_original, num_samples)

    return downsampled_audio

In [2]:
# tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-small",language='bengali',task='translate')
# feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-small",language='bengali',task='translate')
# model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small").to('cuda')

In [3]:
asr_dataset_train = load_dataset("Mohan-diffuser/odia-english-ASR", split="train+test")
asr_dataset_test = load_dataset("Mohan-diffuser/odia-english-ASR", split="validation")

In [4]:
chars_to_remove_regex = '[\,\?\.\!\-\;\:\"\“\%\‘\”\�\'\»\«]'
non_latin_characters_to_remove = '[^a-z0-9 ]'
def remove_special_characters(batch):
    # remove special characters
    batch['eng_translation'] = re.sub(chars_to_remove_regex, '', batch['eng_translation']).lower()
    batch['eng_translation'] = re.sub(non_latin_characters_to_remove, '', batch['eng_translation']).lower()

    return batch


In [5]:
asr_dataset_train = asr_dataset_train.map(remove_special_characters)
asr_dataset_test = asr_dataset_test.map(remove_special_characters)

In [6]:
def extract_all_chars(batch):
  all_text = " ".join(batch["eng_translation"])
  vocab = list(set(all_text))
  return {"vocab": [vocab], "all_text": [all_text]}

vocab_train = asr_dataset_train.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=asr_dataset_train.column_names)
vocab_test = asr_dataset_test.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=asr_dataset_test.column_names)


Map:   0%|          | 0/1964 [00:00<?, ? examples/s]

Map:   0%|          | 0/392 [00:00<?, ? examples/s]

In [7]:
vocab_list = list(set(vocab_train["vocab"][0]) | set(vocab_test["vocab"][0]))
vocab_dict = {v: k for k, v in enumerate(sorted(vocab_list))}
vocab_dict

{' ': 0,
 '0': 1,
 '1': 2,
 '2': 3,
 '3': 4,
 '4': 5,
 '5': 6,
 '6': 7,
 '7': 8,
 '8': 9,
 '9': 10,
 'a': 11,
 'b': 12,
 'c': 13,
 'd': 14,
 'e': 15,
 'f': 16,
 'g': 17,
 'h': 18,
 'i': 19,
 'j': 20,
 'k': 21,
 'l': 22,
 'm': 23,
 'n': 24,
 'o': 25,
 'p': 26,
 'q': 27,
 'r': 28,
 's': 29,
 't': 30,
 'u': 31,
 'v': 32,
 'w': 33,
 'x': 34,
 'y': 35,
 'z': 36}

In [8]:
'''To make it clearer that " " has its own token class, we give it a more visible character |. 
In addition, we also add an "unknown" token so that the model can later deal with characters 
not encountered in Common Voice's training set.'''

vocab_dict["|"] = vocab_dict[" "]
del vocab_dict[" "]

In [9]:
vocab_dict["[UNK]"] = len(vocab_dict)
vocab_dict["[PAD]"] = len(vocab_dict)
len(vocab_dict)

39

In [10]:
import json
with open('vocab.json', 'w') as vocab_file:
    json.dump(vocab_dict, vocab_file)

In [11]:
from transformers import Wav2Vec2CTCTokenizer

tokenizer = Wav2Vec2CTCTokenizer.from_pretrained("./", unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|")

In [12]:
repo_name = "w2v-bert-odia-to-eng"
tokenizer.push_to_hub(repo_name)

CommitInfo(commit_url='https://huggingface.co/Mohan-diffuser/w2v-bert-odia-to-eng/commit/c2fe9cc17e1adf7aa9e8537594d38d53e814144f', commit_message='Upload tokenizer', commit_description='', oid='c2fe9cc17e1adf7aa9e8537594d38d53e814144f', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Mohan-diffuser/w2v-bert-odia-to-eng', endpoint='https://huggingface.co', repo_type='model', repo_id='Mohan-diffuser/w2v-bert-odia-to-eng'), pr_revision=None, pr_num=None)

In [13]:
from transformers import SeamlessM4TFeatureExtractor

feature_extractor = SeamlessM4TFeatureExtractor.from_pretrained("facebook/w2v-bert-2.0")


In [14]:
from transformers import Wav2Vec2BertProcessor

processor = Wav2Vec2BertProcessor(feature_extractor=feature_extractor, tokenizer=tokenizer)
processor.push_to_hub(repo_name)


CommitInfo(commit_url='https://huggingface.co/Mohan-diffuser/w2v-bert-odia-to-eng/commit/f203af551f55f430a69f7b57d47b22b75e241c34', commit_message='Upload processor', commit_description='', oid='f203af551f55f430a69f7b57d47b22b75e241c34', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Mohan-diffuser/w2v-bert-odia-to-eng', endpoint='https://huggingface.co', repo_type='model', repo_id='Mohan-diffuser/w2v-bert-odia-to-eng'), pr_revision=None, pr_num=None)

In [15]:
asr_dataset_train = asr_dataset_train.cast_column("audio", Audio(sampling_rate=16_000))
asr_dataset_test = asr_dataset_test.cast_column("audio", Audio(sampling_rate=16_000))


In [16]:
for sample in asr_dataset_train:
    if sample['audio']['sampling_rate'] != 16000:
        print(sample['audio']['sampling_rate'])

In [17]:
import IPython.display as ipd
import numpy as np
import random

rand_int = random.randint(0, len(asr_dataset_train)-1)

print(asr_dataset_train[rand_int]["eng_translation"])
ipd.Audio(data=asr_dataset_train[rand_int]["audio"]["array"], autoplay=True, rate=16000)

this contradicts earlier reports which stated that cancelling the elections would have been against the constitution


In [18]:
def prepare_dataset(batch):
    audio = batch["audio"]
    batch["input_features"] = processor(audio["array"], sampling_rate=audio["sampling_rate"]).input_features[0]
    batch["input_length"] = len(batch["input_features"])

    batch["labels"] = processor(text=batch["eng_translation"]).input_ids
    return batch

In [19]:
asr_dataset_train = asr_dataset_train.map(prepare_dataset, remove_columns=asr_dataset_train.column_names)
asr_dataset_test = asr_dataset_test.map(prepare_dataset, remove_columns=asr_dataset_test.column_names)

In [20]:
import torch

from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

@dataclass
class DataCollatorCTCWithPadding:

    processor: Wav2Vec2BertProcessor
    padding: Union[bool, str] = True

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lenghts and need
        # different padding methods
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            return_tensors="pt",
        )

        labels_batch = self.processor.pad(
            labels=label_features,
            padding=self.padding,
            return_tensors="pt",
        )
        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        batch["labels"] = labels

        return batch

data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)


In [21]:
import evaluate

wer_metric  = evaluate.load('wer')

In [22]:
def compute_metrics(pred):
    pred_logits = pred.predictions
    pred_ids = np.argmax(pred_logits, axis=-1)

    pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id

    pred_str = processor.batch_decode(pred_ids)
    # we do not want to group tokens when computing the metrics
    label_str = processor.batch_decode(pred.label_ids, group_tokens=False)

    wer = wer_metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}


In [23]:
from transformers import Wav2Vec2BertForCTC

model = Wav2Vec2BertForCTC.from_pretrained(
    "facebook/w2v-bert-2.0",
    attention_dropout=0.0,
    hidden_dropout=0.0,
    feat_proj_dropout=0.0,
    mask_time_prob=0.0,
    layerdrop=0.0,
    ctc_loss_reduction="mean",
    add_adapter=True,
    pad_token_id=processor.tokenizer.pad_token_id,
    vocab_size=len(processor.tokenizer),
)


Some weights of Wav2Vec2BertForCTC were not initialized from the model checkpoint at facebook/w2v-bert-2.0 and are newly initialized: ['lm_head.bias', 'lm_head.weight', 'wav2vec2_bert.adapter.layers.0.ffn.intermediate_dense.bias', 'wav2vec2_bert.adapter.layers.0.ffn.intermediate_dense.weight', 'wav2vec2_bert.adapter.layers.0.ffn.output_dense.bias', 'wav2vec2_bert.adapter.layers.0.ffn.output_dense.weight', 'wav2vec2_bert.adapter.layers.0.ffn_layer_norm.bias', 'wav2vec2_bert.adapter.layers.0.ffn_layer_norm.weight', 'wav2vec2_bert.adapter.layers.0.residual_conv.bias', 'wav2vec2_bert.adapter.layers.0.residual_conv.weight', 'wav2vec2_bert.adapter.layers.0.residual_layer_norm.bias', 'wav2vec2_bert.adapter.layers.0.residual_layer_norm.weight', 'wav2vec2_bert.adapter.layers.0.self_attn.linear_k.bias', 'wav2vec2_bert.adapter.layers.0.self_attn.linear_k.weight', 'wav2vec2_bert.adapter.layers.0.self_attn.linear_out.bias', 'wav2vec2_bert.adapter.layers.0.self_attn.linear_out.weight', 'wav2vec2_ber

In [24]:
from transformers import TrainingArguments

training_args = TrainingArguments(
  output_dir=repo_name,
  group_by_length=True,
  per_device_train_batch_size=4,
  gradient_accumulation_steps=4,
  eval_strategy="steps",
  num_train_epochs=10,
  gradient_checkpointing=True,
  fp16=True,
  save_steps=300,
  eval_steps=300,
  logging_steps=300,
  learning_rate=5e-5,
  warmup_steps=500,
  save_total_limit=2,
  push_to_hub=True,
)


In [25]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=asr_dataset_train,
    eval_dataset=asr_dataset_test,
    tokenizer=processor.feature_extractor,
)


  trainer = Trainer(


In [26]:
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mmohandash96[0m ([33mmohandash96-gsegsgsg[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss,Wer
300,3.2559,2.727211,1.023348
600,2.7219,2.710443,1.208216
900,2.6883,2.694229,1.18002


KeyboardInterrupt: 

In [27]:
trainer.push_to_hub()

model.safetensors:   0%|          | 0.00/2.42G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Mohan-diffuser/w2v-bert-odia-to-eng/commit/0325c4d919e467b78e7b3ffde5a3fd389a507fcd', commit_message='End of training', commit_description='', oid='0325c4d919e467b78e7b3ffde5a3fd389a507fcd', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Mohan-diffuser/w2v-bert-odia-to-eng', endpoint='https://huggingface.co', repo_type='model', repo_id='Mohan-diffuser/w2v-bert-odia-to-eng'), pr_revision=None, pr_num=None)

# Evaluation

In [33]:
model = Wav2Vec2BertForCTC.from_pretrained(repo_name).to("cuda")
processor = Wav2Vec2BertProcessor.from_pretrained(repo_name)

In [34]:
sample = asr_dataset_test[0]
input_features = torch.tensor(sample["input_features"]).to("cuda").unsqueeze(0)

with torch.no_grad():
    logits = model(input_features).logits

pred_ids = torch.argmax(logits, dim=-1)[0]

In [35]:
print(processor.decode(pred_ids))
print(processor.decode(sample['labels']).lower())

e a a a e o a a a a a a e a o e a a a o e a a o e a e
animals such as elephants and girafes have a tendency to come up to take a closer lok at cars and standard equipment that loks god
