In [1]:
import torch

# Report which device this session will run on.
if not torch.cuda.is_available():
    print("❌ GPU is not available. Using CPU instead.")
else:
    print("✅ GPU is available!")
    print(f"Using GPU: {torch.cuda.get_device_name(0)}")

✅ GPU is available!
Using GPU: NVIDIA L40S


In [2]:
%%capture
!pip install transformers
!pip install librosa
!pip install jiwer
!pip install evaluate
!pip install wandb
!pip install numpy==1.23.5
!pip install scipy==1.11.4
!pip install librosa==0.10.1
!pip install numba==0.58.1
!pip install datasets>=2.14.0
!pip install accelerate>=0.26.0
!pip install typing_extensions --upgrade

In [3]:
%%capture
# Upgrade torch, transformers and accelerate to their latest releases.
# NOTE(review): this runs after the pinned installs in the previous cell and may
# pull in newer versions of shared dependencies — confirm the pins still hold.
!pip install --upgrade torch transformers accelerate

In [4]:
# !pip install huggingface_hub --quiet

# SECURITY: an access token must never be written into the notebook. The token
# that used to be hard-coded in this cell has to be treated as leaked and
# revoked at https://huggingface.co/settings/tokens.
import os
from getpass import getpass

from huggingface_hub import login

# Prefer the HF_TOKEN environment variable; otherwise ask interactively with a
# non-echoing prompt. The secret is passed straight to login() so it is not
# kept in a notebook variable or saved in the cell source/output.
login(token=os.environ.get("HF_TOKEN") or getpass("Hugging Face token: "))

In [5]:
%%capture
!apt install git-lfs

In [6]:
import shutil

# Inspect the filesystem mounted at "/" (where the container usually runs).
# NOTE from the original author: use /nvme and not /
total, used, free = shutil.disk_usage("/")

# Each figure is reported in whole GiB (floor division by 2**30).
print(
    f"Total Space: {total // (2**30)} GB",
    f"Used Space:  {used // (2**30)} GB",
    f"Free Space:  {free // (2**30)} GB",
    sep="\n",
)

Total Space: 548 GB
Used Space:  93 GB
Free Space:  427 GB


In [7]:
from datasets import load_dataset, concatenate_datasets, Audio

# Load the dataset and bind its train / validation splits in one step.
ds = load_dataset("kaarthu2003/SlrCvVoicesTtsDataset")
train_dataset, val_dataset = ds["train"], ds["validation"]

In [8]:
# Split sizes first ...
print(
    f"Train size: {len(train_dataset)}",
    f"Validation size: {len(val_dataset)}",
    sep="\n",
)

# ... then one raw training record as a sanity check.
print("\nSample example:", train_dataset[0], sep="\n")

Train size: 15811
Validation size: 1610

Sample example:
{'audio': <datasets.features._torchcodec.AudioDecoder object at 0x7980e375ec60>, 'sentence': 'దాగుడుమూతల ఆట వల్ల'}


In [9]:
from datasets import ClassLabel
import random
import pandas as pd
from IPython.display import display, HTML

def show_random_elements(dataset, num_examples=10):
    """Display `num_examples` distinct, randomly chosen rows of `dataset` as an HTML table.

    Args:
        dataset: Object supporting `len()` and indexing with a list of integer
            row positions; the indexed result is handed to `pd.DataFrame`.
        num_examples: How many distinct rows to show.

    Raises:
        AssertionError: If `num_examples` is larger than `len(dataset)`.
    """
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    # random.sample draws without replacement, which replaces the hand-rolled
    # loop that kept re-drawing whenever it hit an index it had already picked.
    picks = random.sample(range(len(dataset)), num_examples)

    df = pd.DataFrame(dataset[picks])
    display(HTML(df.to_html()))

# Preview random transcripts; the audio column is dropped for this view only
# (the result of remove_columns is not assigned back to train_dataset).
show_random_elements(train_dataset.remove_columns(["audio"]))

Unnamed: 0,sentence
0,చాలామంది ఆటల కంటే
1,విజయవాడ బస్ స్టాండ్
2,పాఠాలు వినలేడు
3,చింతపండు కాల్ కిలో
4,దీనిలో ఏడు కాండాలున్నాయి
5,వాళ్ళ ఇంటి దగ్గరకి
6,పెద్ద పెద్ద బండలను తొలిచి ధ్వజస్తంభం ఆకృతులుగా మలుస్తారు
7,తెల్లగా ఉన్నాయి మేడం
8,చిన్న పద్యమప్ప జెప్పలేడు
9,ఓకే ఓకే చెప్పండి


In [10]:
# Characters that are stripped from every transcript before the vocabulary is
# built. The entries are later joined into one regex character class, so each
# entry is expected to be a single character.
telugu_special_unwanted_characters = [
    'ఁ',  # Chandrabindu
    'ౄ',  # Vocalic RR
    'ౢ',  # Vocalic L
    'ౣ',  # Vocalic LL
    'ౠ',  # Long Vocalic RR
    'ఽ',  # Avagraha
    '౦', '౧', '౨', '౩', '౪', '౫', '౬', '౭', '౮', '౯',  # Telugu digits
    'ఀ',  # Telugu Sign Combining Candrabindu Above
    'ౘ',  # Originally annotated "Letter TTHA". NOTE(review): glyph looks like U+0C58 TELUGU LETTER TSA — confirm
    'ౙ',  # Originally annotated "Letter DDA". NOTE(review): glyph looks like U+0C59 TELUGU LETTER DZA — confirm
    'ౚ',  # Originally annotated "Letter RHA". NOTE(review): glyph looks like U+0C5A TELUGU LETTER RRRA — confirm
    '౷',  # Originally annotated "Vedic Tone". NOTE(review): glyph looks like U+0C77 TELUGU SIGN SIDDHAM — confirm
    '‘', '’', '“', '”', '%', '.', ';', '-', ',', '/', '\\', '_', '&',  # Common punctuation
    'G', 'P', 'S', 'e', 'l', 'n', 'r', 't', '\u200c', '\n'  # Stray Latin letters, zero-width non-joiner (U+200C) and newline that are unwanted in the dataset
]

In [11]:
import re
# One character class matching any single unwanted character. re.escape
# neutralises entries that are special inside a class (e.g. '-' and '\').
chars_to_remove_regex = f'[{re.escape("".join(telugu_special_unwanted_characters))}]'

def remove_special_characters(batch):
    """Delete every character matched by `chars_to_remove_regex` from the transcript.

    Args:
        batch: A single example with a string under the "sentence" key.

    Returns:
        The same `batch` object, with "sentence" overwritten by the cleaned text.
    """
    cleaned = re.sub(chars_to_remove_regex, '', batch["sentence"])
    batch["sentence"] = cleaned
    return batch

In [12]:
# Clean the transcripts of both splits (train first, then validation).
train_dataset, val_dataset = (
    split.map(remove_special_characters)
    for split in (train_dataset, val_dataset)
)

In [13]:
# Spot-check random transcripts again, now that the unwanted characters were removed.
show_random_elements(train_dataset.remove_columns(["audio"]))

Unnamed: 0,sentence
0,పాటగాడికి పెద్ద సమస్యే పట్టుకున్నది కొయ్యకత్తితో మనిషితల ఎలా నరకటం
1,కలము పట్టగానె కవిశేఖరుడు గాడు
2,త్యాంక్ యూ మ్యామ్
3,ప్రిన్సిపాల్ ఒక మొమెంటోతో ప్రవేశించారు
4,పరిసరాలను శుభ్రంగా ఉంచుచున్నారు
5,సమాజానికి వివిధ రకాలుగా
6,హలో నమస్తే మ్యాడమ్
7,ఫెసిలిటీస్ ఏమేమి ఉంటాయండి
8,మంచిర్యాల జోగులాంబ గద్వాల్
9,వర్గ వివరణ పేజీలో క్రింది వివరణ ఉండాలి


In [14]:
def extract_all_chars(batch):
    """Concatenate all sentences of a batch and collect the distinct characters.

    Args:
        batch: Batched examples; `batch["sentence"]` is a list of strings.

    Returns:
        A dict with two single-element lists: "vocab" holds the list of unique
        characters, "all_text" holds the sentences joined by single spaces.
    """
    joined = " ".join(batch["sentence"])
    unique_chars = [*{*joined}]
    return {"vocab": [unique_chars], "all_text": [joined]}

# batch_size=-1 hands the whole split to extract_all_chars as one batch, so
# each result contains exactly one row with that split's character set.
vocab_train = train_dataset.map(
    extract_all_chars,
    batched=True,
    batch_size=-1,
    keep_in_memory=True,
    remove_columns=train_dataset.column_names,
)
vocab_test = val_dataset.map(
    extract_all_chars,
    batched=True,
    batch_size=-1,
    keep_in_memory=True,
    remove_columns=val_dataset.column_names,
)

Map:   0%|          | 0/15811 [00:00<?, ? examples/s]

Map:   0%|          | 0/1610 [00:00<?, ? examples/s]

In [15]:
# Merge the character sets of both splits ...
vocab_list = list(set(vocab_train["vocab"][0]).union(set(vocab_test["vocab"][0])))

# ... and number the characters in sorted order: character -> integer id.
vocab_dict = {char: idx for idx, char in enumerate(sorted(vocab_list))}

In [16]:
# Re-key the space character as "|", the tokenizer's word delimiter, keeping
# its id. The guard makes the cell safe to re-run: once the space entry has
# been moved there is nothing left to do, instead of raising KeyError.
if " " in vocab_dict:
    vocab_dict["|"] = vocab_dict.pop(" ")

In [17]:
# Append the unknown and padding tokens at the next free ids.
# setdefault keeps the cell idempotent: on a re-run the tokens keep their ids
# instead of both being reassigned to the same, colliding value.
vocab_dict.setdefault("[UNK]", len(vocab_dict))
vocab_dict.setdefault("[PAD]", len(vocab_dict))
len(vocab_dict)

69

In [18]:
import json
# Persist the vocabulary as vocab.json in the working directory, which is the
# directory ("./") the tokenizer is loaded from in the next cell.
with open('vocab.json', 'w') as vocab_file:
    json.dump(vocab_dict, vocab_file)

In [19]:
from transformers import Wav2Vec2CTCTokenizer

# Build the CTC tokenizer from the vocab.json in the current directory; the
# special tokens match the entries added to the vocabulary.
tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(
    "./",
    word_delimiter_token="|",
    unk_token="[UNK]",
    pad_token="[PAD]",
    clean_up_tokenization_spaces=False,
)

In [20]:
# Repository name used for the tokenizer push and as the training output directory.
repo_name = "wav2vec2-IEEEAccess-FinalRun-4Datasets"

In [21]:
# Upload the tokenizer files to the Hub repository named above.
tokenizer.push_to_hub(repo_name)

No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/kaarthu2003/wav2vec2-IEEEAccess-FinalRun-4Datasets/commit/d1c103c560214000c1c0fa67f60d0c9d847abb8b', commit_message='Upload tokenizer', commit_description='', oid='d1c103c560214000c1c0fa67f60d0c9d847abb8b', pr_url=None, repo_url=RepoUrl('https://huggingface.co/kaarthu2003/wav2vec2-IEEEAccess-FinalRun-4Datasets', endpoint='https://huggingface.co', repo_type='model', repo_id='kaarthu2003/wav2vec2-IEEEAccess-FinalRun-4Datasets'), pr_revision=None, pr_num=None)

In [22]:
from transformers import Wav2Vec2FeatureExtractor

# Feature extractor for mono input at 16 kHz: inputs are normalised, padded
# with 0.0, and no attention mask is returned.
feature_extractor = Wav2Vec2FeatureExtractor(
    sampling_rate=16000,
    feature_size=1,
    do_normalize=True,
    padding_value=0.0,
    return_attention_mask=False,
)

In [23]:
from transformers import Wav2Vec2Processor

# Bundle the feature extractor (audio side) and tokenizer (text side) into one processor.
processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

In [24]:
# Inspect the audio entry of the first training example (shown as the cell output).
train_dataset[0]["audio"]

<datasets.features._torchcodec.AudioDecoder at 0x79808c2ee5a0>

In [25]:
from datasets import Audio
# Re-declare the audio column of both splits with a 16 kHz sampling rate,
# matching the rate configured on the feature extractor.
train_dataset, val_dataset = (
    train_dataset.cast_column("audio", Audio(sampling_rate=16_000)),
    val_dataset.cast_column("audio", Audio(sampling_rate=16_000)),
)

In [26]:
# random.randint is inclusive on BOTH ends, so randint(0, len(ds)) can return
# len(ds) itself and raise IndexError; randrange excludes the upper bound.
rand_int = random.randrange(len(train_dataset))

# Fetch (and decode) the example once instead of once per print.
sample = train_dataset[rand_int]

print("Target text:", sample["sentence"])
print("Input array shape:", sample["audio"]["array"].shape)
print("Sampling rate:", sample["audio"]["sampling_rate"])

Target text: మరికొద్దిసేపటికల్లా కోతులరూపంలో కవలసోదరులు తిరుగుతున్న చోటికి గెడ్డపువాడు వచ్చి ప్రత్యక్షమయ్యాడు
Input array shape: (155689,)
Sampling rate: 16000


In [27]:
def prepare_dataset(batch):
    """Turn one raw example into model inputs and CTC label ids.

    Args:
        batch: A single example with an "audio" entry (providing "array" and
            "sampling_rate") and a "sentence" string.

    Returns:
        The same `batch`, extended with:
            "input_values": first (only) row of the processor's output,
            "input_length": length of that row,
            "labels": token ids of the sentence.
    """
    clip = batch["audio"]
    encoded = processor(clip["array"], sampling_rate=clip["sampling_rate"])

    # The processor returns a batch of one; unwrap it.
    waveform = encoded.input_values[0]
    batch["input_values"] = waveform
    batch["input_length"] = len(waveform)

    batch["labels"] = processor(text=batch["sentence"]).input_ids
    return batch

In [28]:
# Featurise both splits with 4 worker processes; the original columns are
# dropped so only what prepare_dataset adds remains.
train_dataset, val_dataset = (
    split.map(prepare_dataset, remove_columns=split.column_names, num_proc=4)
    for split in (train_dataset, val_dataset)
)

In [29]:
import torch

from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

@dataclass
class DataCollatorCTCWithPadding:
    """
    Data collator that dynamically pads the inputs and the labels of a batch.

    Inputs and labels have different lengths and are padded separately: inputs
    through the processor, labels through the processor's tokenizer.

    Args:
        processor (:class:`~transformers.Wav2Vec2Processor`)
            The processor used for processing the data.
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
            among:
            * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
              sequence if provided).
            * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
              maximum acceptable input length for the model if that argument is not provided.
            * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
              different lengths).
    """

    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Pad a list of examples into one batch of tensors.

        Args:
            features: Examples, each providing "input_values" and "labels".

        Returns:
            The padded input batch with an extra "labels" tensor in which
            every padded position is set to -100.
        """
        # split inputs and labels since they have to be of different lengths and need
        # different padding methods
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            return_tensors="pt",
        )
        # Pad the labels with the tokenizer directly. The deprecated
        # `as_target_processor()` context manager only redirected
        # `processor.pad` to the tokenizer, so this is the same operation
        # without relying on the deprecated API.
        labels_batch = self.processor.tokenizer.pad(
            label_features,
            padding=self.padding,
            return_tensors="pt",
        )

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        batch["labels"] = labels

        return batch

In [30]:
# Collator instance handed to the Trainer; padding=True pads to the longest sequence in each batch.
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

In [31]:
import evaluate

# Metrics loaded by name through `evaluate`: "wer" and "cer".
# Both are used by compute_metrics during training and by the final evaluation cells.
wer_metric = evaluate.load("wer")
cer_metric = evaluate.load("cer")

In [32]:
import numpy as np

def compute_metrics(pred):
    """Compute WER and CER for one evaluation pass.

    Args:
        pred: Object exposing `predictions` (logits) and `label_ids`
            (label ids in which ignored positions are marked with -100).

    Returns:
        Dict with the keys "wer" and "cer".
    """
    pred_ids = np.argmax(pred.predictions, axis=-1)

    # Swap the -100 markers for the pad token id so the labels can be decoded.
    # np.where builds a new array, so the caller's `pred.label_ids` is left
    # untouched instead of being overwritten in place.
    label_ids = np.where(
        pred.label_ids == -100, processor.tokenizer.pad_token_id, pred.label_ids
    )

    pred_str = processor.batch_decode(pred_ids)
    # References are decoded with group_tokens=False so that repeated ids are kept.
    label_str = processor.batch_decode(label_ids, group_tokens=False)

    wer = wer_metric.compute(predictions=pred_str, references=label_str)
    cer = cer_metric.compute(predictions=pred_str, references=label_str)
    return {"wer": wer, "cer": cer}

In [33]:
from transformers import Wav2Vec2ForCTC

# Load the pretrained checkpoint with a CTC head sized to this tokenizer.
model = Wav2Vec2ForCTC.from_pretrained(
    'facebook/wav2vec2-large-xlsr-53',
    # output layer / loss wiring, tied to the tokenizer built above
    vocab_size=len(processor.tokenizer),
    pad_token_id=processor.tokenizer.pad_token_id,
    ctc_loss_reduction="mean",
    # every dropout-style setting is switched off; only mask_time_prob is non-zero
    attention_dropout=0.0,
    hidden_dropout=0.0,
    feat_proj_dropout=0.0,
    layerdrop=0.0,
    mask_time_prob=0.05,
)

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-xlsr-53 and are newly initialized: ['lm_head.bias', 'lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [34]:
# Call the model's helper for freezing its feature encoder before training starts.
model.freeze_feature_encoder()

In [35]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    # output location, Hub upload and experiment tracking
    output_dir=repo_name,
    push_to_hub=True,
    report_to="wandb",
    # batching
    per_device_train_batch_size=16,
    gradient_accumulation_steps=2,
    group_by_length=True,
    # optimisation schedule
    num_train_epochs=20,
    learning_rate=3e-4,
    warmup_ratio=0.1,
    # precision / memory
    fp16=True,
    gradient_checkpointing=True,
    # evaluation, checkpointing and logging cadence
    eval_strategy="steps",
    eval_steps=1600,
    save_steps=1600,
    save_total_limit=2,
    logging_steps=400,
)

In [36]:
from torch.optim import RMSprop
# Custom optimizer handed to the Trainer below. The learning rate is read from
# training_args (3e-4) instead of being repeated as a literal, so the two
# values cannot drift apart.
optimizer = RMSprop(model.parameters(), lr=training_args.learning_rate, alpha=0.99, eps=1e-8)

from transformers import Trainer
trainer = Trainer(
    # what to train and how
    model=model,
    args=training_args,
    # Custom RMSprop optimizer; None for the scheduler slot leaves the
    # scheduler choice to the Trainer.
    optimizers=(optimizer, None),
    # data
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    processing_class=processor.feature_extractor,
    # evaluation
    compute_metrics=compute_metrics,
)

In [37]:
# Run the fine-tuning; the recorded run below took 9900 steps over 20 epochs.
trainer.train()

[34m[1mwandb[0m: (1) Create a W&B account
[34m[1mwandb[0m: (2) Use an existing W&B account
[34m[1mwandb[0m: (3) Don't visualize my results
[34m[1mwandb[0m: Enter your choice:

  3


[34m[1mwandb[0m: You chose "Don't visualize my results"




Step,Training Loss,Validation Loss,Wer,Cer
1600,0.993,0.467869,0.498056,0.118765
3200,0.6605,0.422302,0.385551,0.092312
4800,0.4628,0.403441,0.369873,0.085128
6400,0.3398,0.422895,0.335758,0.078917
8000,0.2502,0.496672,0.32886,0.077148
9600,0.1988,0.511337,0.314311,0.074122




TrainOutput(global_step=9900, training_loss=0.7719459988372495, metrics={'train_runtime': 7700.0757, 'train_samples_per_second': 41.067, 'train_steps_per_second': 1.286, 'total_flos': 2.928353847999346e+19, 'train_loss': 0.7719459988372495, 'epoch': 20.0})

In [38]:
# Upload the training result to the Hub repository configured in training_args.
trainer.push_to_hub()

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

CommitInfo(commit_url='https://huggingface.co/kaarthu2003/wav2vec2-IEEEAccess-FinalRun-4Datasets/commit/0c3bf74845fb51fab886972953d30a7cebd21419', commit_message='End of training', commit_description='', oid='0c3bf74845fb51fab886972953d30a7cebd21419', pr_url=None, repo_url=RepoUrl('https://huggingface.co/kaarthu2003/wav2vec2-IEEEAccess-FinalRun-4Datasets', endpoint='https://huggingface.co', repo_type='model', repo_id='kaarthu2003/wav2vec2-IEEEAccess-FinalRun-4Datasets'), pr_revision=None, pr_num=None)

In [39]:
from transformers import AutoProcessor, AutoModelForCTC

# Reload processor and model from the pushed repository. This rebinds the
# `processor` and `model` names that were used during training.
processor, model = (
    AutoProcessor.from_pretrained("kaarthu2003/wav2vec2-IEEEAccess-FinalRun-4Datasets"),
    AutoModelForCTC.from_pretrained("kaarthu2003/wav2vec2-IEEEAccess-FinalRun-4Datasets"),
)

preprocessor_config.json:   0%|          | 0.00/215 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

vocab.json:   0%|          | 0.00/886 [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/30.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/96.0 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.26G [00:00<?, ?B/s]

In [44]:
import torch

# Move the model to the GPU when one is available, otherwise keep it on the
# CPU, so the evaluation cell also runs on machines without CUDA.
model.to("cuda" if torch.cuda.is_available() else "cpu")

def map_to_result(batch):
    """Transcribe one prepared example and decode its reference text.

    Args:
        batch: A single example with "input_values" and "labels" (label ids).

    Returns:
        The same `batch`, extended with:
            "pred_str": transcript decoded from the arg-max of the model logits,
            "text": reference transcript decoded from "labels".
    """
    with torch.no_grad():
        # Create the input on whatever device the model currently lives on,
        # rather than assuming "cuda", and add a batch dimension of size 1.
        input_values = torch.tensor(batch["input_values"], device=model.device).unsqueeze(0)
        logits = model(input_values).logits

    pred_ids = torch.argmax(logits, dim=-1)
    batch["pred_str"] = processor.batch_decode(pred_ids)[0]
    # group_tokens=False keeps repeated ids when decoding the reference.
    batch["text"] = processor.decode(batch["labels"], group_tokens=False)

    return batch

# Transcribe every validation example; only "pred_str" and "text" are kept.
results = val_dataset.map(map_to_result, remove_columns=val_dataset.column_names)

Map:   0%|          | 0/1610 [00:00<?, ? examples/s]

In [45]:
# Word error rate of the predictions against the decoded references.
# NOTE(review): `results` is built from `val_dataset`, so despite the "Test"
# label this is a validation-set score.
print("Test WER: {:.3f}".format(wer_metric.compute(predictions=results["pred_str"], references=results["text"])))

Test WER: 0.310


In [42]:
import evaluate
# NOTE(review): `cer_metric` is already created with this same call earlier in
# the notebook. This cell's execution counter (42) is lower than that of the
# inference cell before it (44), so it looks like a leftover from out-of-order
# execution — confirm whether it can be removed.
cer_metric = evaluate.load("cer")

In [43]:
# Character error rate of the predictions against the decoded references.
# NOTE(review): `results` is built from `val_dataset`, so despite the "Test"
# label this is a validation-set score.
print("Test CER: {:.3f}".format(cer_metric.compute(predictions=results["pred_str"], references=results["text"])))

Test CER: 0.074
