In [1]:
# !git lfs install
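# NOTE(review): this notebook trains on ../data/telugu_asr_corpus/, but the clone below
# fetches the Malayalam corpus; confirm which dataset repository is intended here.
# !git clone https://huggingface.co/datasets/parambharat/malayalam_asr_corpus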

# !add-apt-repository -y ppa:jonathonf/ffmpeg-4
# !apt update
# !apt install -y ffmpeg

# !pip uninstall -y transformers datasets 
# !pip install audiomentations
# !pip install git+https://github.com/huggingface/datasets
# !pip install git+https://github.com/huggingface/transformers
# !pip install librosa soundfile
# !pip install "evaluate>=0.3.0"
# !pip install jiwer
# !pip install more-itertools
# !pip install wandb
# !pip install bitsandbytes
# !pip install "bokeh<2.5"
# !pip install "holoviews[recommended]"

In [2]:
# Set the Weights & Biases environment variables for this kernel before wandb is used.
%set_env WANDB_LOG_MODEL=True
%set_env WANDB_WATCH=all
%set_env WANDB_NOTEBOOK_NAME=whisper_small_te.ipynb

env: WANDB_LOG_MODEL=True
env: WANDB_WATCH=all
env: WANDB_NOTEBOOK_NAME=whisper_small_te.ipynb


In [3]:
from datasets import Dataset, IterableDatasetDict, load_dataset, interleave_datasets, Audio 
import evaluate

import torch
import string
from dataclasses import dataclass
from typing import Any, Dict, List, Union

from transformers import WhisperForConditionalGeneration
from transformers import WhisperProcessor
from transformers import Seq2SeqTrainingArguments
from transformers import Seq2SeqTrainer
from transformers import WhisperTokenizer
from transformers import WhisperFeatureExtractor
import wandb
from IPython.display import clear_output
from audiomentations import Compose, AddGaussianNoise, TimeStretch, PitchShift, Shift
import numpy as np
from huggingface_hub import notebook_login
from transformers import TrainerCallback
from transformers.integrations import WandbCallback
from transformers.trainer_pt_utils import IterableDatasetShard
from torch.utils.data import IterableDataset
from datasets import load_dataset, Audio
from pathlib import Path
import numpy as np
import holoviews as hv
import panel as pn
import tempfile
from bokeh.resources import INLINE
hv.extension("bokeh", logo=False)

from io import StringIO
import pandas as pd
import warnings
import jiwer
warnings.filterwarnings('ignore')

clear_output()
torch.cuda.is_available()

True

In [4]:
# Re-attach to a specific, hard-coded W&B run id rather than starting a fresh run.
# NOTE(review): with resume="must" this presumably fails when run "28l7467b" does not
# exist for the logged-in account; change `id`/`resume` to start a new experiment.
run = wandb.init(project="whisper_finetuning", job_type="fine-tuning", group="small-te", resume="must", id="28l7467b")

[34m[1mwandb[0m: Currently logged in as: [33mparambharat[0m. Use [1m`wandb login --relogin`[0m to force relogin


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016668596299981193, max=1.0…

In [5]:
# Download the latest model artifact of that run; `artifact_dir` is the local directory
# the model weights are later loaded from.
artifact = run.use_artifact('parambharat/whisper_finetuning/model-28l7467b:latest', type='model')
artifact_dir = artifact.download()

[34m[1mwandb[0m: Downloading large artifact model-28l7467b:latest, 924.00MB. 10 files... 
[34m[1mwandb[0m:   10 of 10 files downloaded.  
Done. 0:0:6.1


In [8]:
def load_data_splits(is_streaming=True, stopping_strategy="all_exhausted"):
    """Load the training split and the FLEURS Telugu test split.

    Args:
        is_streaming: Whether the splits are loaded in streaming mode. This now
            applies to both splits; previously the test split was always streamed
            regardless of this flag.
        stopping_strategy: Currently unused; kept so existing callers keep working.

    Returns:
        A dict with a "train" and a "test" dataset. The test split has its
        "transcription" column renamed to "sentence", every column other than
        "audio" and "sentence" removed, and its audio cast to 16 kHz.
    """
    train_split = load_dataset("../data/telugu_asr_corpus/", split="train", streaming=is_streaming)

    test_split = load_dataset("google/fleurs", "te_in", split="test", streaming=is_streaming)
    # align the test schema with the training data: a "sentence" text column plus "audio"
    test_split = test_split.rename_column("transcription", "sentence")
    unused_columns = [col for col in test_split.features.keys() if col not in ["audio", "sentence"]]
    test_split = test_split.remove_columns(unused_columns)
    test_split = test_split.cast_column("audio", Audio(sampling_rate=16000))

    return {"train": train_split, "test": test_split}

In [9]:
# Build the train/test splits with the defaults (streaming mode).
dataset_dict = load_data_splits()

In [10]:
# Waveform augmentation pipeline used on the training audio: additive Gaussian noise,
# time stretching (output length allowed to change) and pitch shifting, each configured
# with probability p=0.3.
augment_waveform = Compose([
    AddGaussianNoise(min_amplitude=0.005, max_amplitude=0.015, p=0.3),
    TimeStretch(min_rate=0.8, max_rate=1.25, p=0.3, leave_length_unchanged=False),
    PitchShift(min_semitones=-4, max_semitones=4, p=0.3)
    ,])

def augment_dataset(batch):
    """Run the waveform augmentation pipeline on one example.

    The waveform stored under batch["audio"]["array"] is replaced by its augmented
    version (the example is modified in place) and the same example is returned.
    The augmentation is always invoked with a sample rate of 16000.
    """
    audio_entry = batch["audio"]
    audio_entry["array"] = augment_waveform(samples=audio_entry["array"], sample_rate=16000)
    return batch


# Apply the augmentation to the training split only; the test split is left untouched.
dataset_dict["train"] = dataset_dict["train"].map(augment_dataset)

In [11]:
# Pre-processing components, all taken from the "openai/whisper-small" checkpoint.
# `feature_extractor` and `tokenizer` are used when preparing examples; `processor`
# is the object handed to the data collator, the metric function and the trainer.
feature_extractor = WhisperFeatureExtractor.from_pretrained(
    "openai/whisper-small"
)
# Tokenizer configured for Telugu transcription with a maximum length of 225.
tokenizer = WhisperTokenizer.from_pretrained(
    "openai/whisper-small", 
     language="Telugu",
     task="transcribe",
     model_max_length=225
)
# Processor loaded with the same language/task/length settings as the tokenizer above.
processor = WhisperProcessor.from_pretrained(
    "openai/whisper-small",
     language="Telugu", 
     task="transcribe",
     model_max_length=225
)

In [12]:
def fix_sentence(sentence):
    """Clean up a transcript before it is tokenized.

    Steps:
      1. If the whole sentence is wrapped in double quotes, drop that pair of quotes.
      2. Make sure the sentence ends with ".", "?" or "!" (a "." is appended otherwise).
      3. Remove every ASCII punctuation character (``string.punctuation``) from the
         text, except for the final terminator.

    Args:
        sentence: The raw transcript string.

    Returns:
        The cleaned transcript. An input that is empty, or that becomes empty once
        the wrapping quotes are removed (e.g. '""'), yields an empty string instead
        of raising ``IndexError``.
    """
    transcription = sentence

    if transcription.startswith('"') and transcription.endswith('"'):
        # quotation marks wrapping the whole sentence do not affect the transcription
        transcription = transcription[1:-1]

    if not transcription:
        # nothing left to terminate or strip; indexing [-1] below would fail
        return transcription

    if transcription[-1] not in [".", "?", "!"]:
        # append a full-stop to sentences that do not end in punctuation
        transcription = transcription + "."

    # strip punctuation from everything but the final terminator
    body, terminator = transcription[:-1], transcription[-1]
    return body.translate(str.maketrans('', '', string.punctuation)) + terminator
    
def prepare_dataset(examples):
    """Turn one raw example into model inputs and target token ids.

    Adds two entries to the example and returns it:
      * "input_features": the first element of the feature extractor's
        ``input_features`` for the example's waveform (called with sampling_rate=16000).
      * "labels": token ids of the cleaned "sentence", truncated to 225 tokens.
    """
    waveform = examples["audio"]["array"]
    extracted = feature_extractor(waveform, sampling_rate=16000)
    examples["input_features"] = extracted.input_features[0]

    # clean the transcript, then encode it to label ids
    cleaned_text = fix_sentence(examples["sentence"])
    encoded = tokenizer(cleaned_text, max_length=225, truncation=True)
    examples["labels"] = encoded.input_ids
    return examples

In [13]:
def filter_empty_strings(sentence):
    """Return True when the sentence has at least two characters, False otherwise."""
    return len(sentence) >= 2

In [14]:
# Drop examples whose "sentence" is shorter than two characters, in every split.
# Note: this rebinds the entries of dataset_dict, so re-running the cell stacks the filter again.
for k in dataset_dict:
    dataset_dict[k] = dataset_dict[k].filter(filter_empty_strings, input_columns=["sentence"])

In [15]:
# Compute "input_features" and "labels" for every example of every split and have the
# datasets return torch-formatted values.
for k in dataset_dict:
    dataset_dict[k] = dataset_dict[k].map(
        prepare_dataset,).with_format("torch")

In [16]:
# Shuffle the training split using a buffer of 500 examples.
# NOTE(review): no seed is passed, so the order is presumably not reproducible across runs.
dataset_dict["train"] = dataset_dict["train"].shuffle(buffer_size=500)

In [17]:
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Collate prepared examples into a padded batch for sequence-to-sequence training.

    Audio features and label token ids are padded separately, through the processor's
    feature extractor and tokenizer respectively, because they have different lengths
    and need different padding. Padded label positions are set to -100.
    """

    # object exposing `.feature_extractor` and `.tokenizer`
    processor: Any

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Build one batch from a list of examples holding "input_features" and "labels".

        Returns the padded audio batch with an extra "labels" entry.
        """
        # audio side: hand the features to the feature extractor, which returns torch tensors
        audio_inputs = []
        for feature in features:
            audio_inputs.append({"input_features": feature["input_features"]})
        batch = self.processor.feature_extractor.pad(audio_inputs, return_tensors="pt")

        # label side: collect the token id sequences and pad them with the tokenizer
        tokenizer = self.processor.tokenizer
        token_inputs = []
        for feature in features:
            token_ids = tokenizer.truncate_sequences(feature["labels"])[0]
            token_inputs.append({"input_ids": token_ids})
        padded_labels = tokenizer.pad(token_inputs, return_tensors="pt",)

        # positions that are padding (attention mask != 1) become -100
        padding_positions = padded_labels.attention_mask.ne(1)
        labels = padded_labels["input_ids"].masked_fill(padding_positions, -100)

        # when every sequence starts with the bos token, drop that first column
        every_row_starts_with_bos = (labels[:, 0] == tokenizer.bos_token_id).all().cpu().item()
        if every_row_starts_with_bos:
            labels = labels[:, 1:]

        batch["labels"] = labels
        return batch

In [18]:
# Collator instance used by the trainer; it pads through the processor created earlier.
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

In [19]:
# Word error rate metric used by compute_metrics.
metric = evaluate.load("wer")

# evaluate with the 'normalised' WER: this flag is forwarded as `normalize=` when
# predictions and references are decoded in compute_metrics
do_normalize_eval = True


def compute_metrics(pred):
    """Compute the word error rate, in percent, for a set of predictions.

    Args:
        pred: Object exposing ``predictions`` (generated token ids) and
            ``label_ids`` (reference token ids, with -100 at ignored positions).
            Note that ``label_ids`` is modified in place: -100 is replaced by the
            tokenizer's pad token id so the sequences can be decoded.

    Returns:
        ``{"wer": <WER * 100>}``. Pairs whose decoded reference is an empty string
        are left out, since an empty reference makes the WER computation fail.
    """
    pred_ids = pred.predictions
    label_ids = pred.label_ids

    # replace -100 with the pad_token_id
    label_ids[label_ids == -100] = processor.tokenizer.pad_token_id

    # we do not want to group tokens when computing the metrics
    pred_str = processor.tokenizer.batch_decode(pred_ids, skip_special_tokens=True, normalize=do_normalize_eval)
    label_str = processor.tokenizer.batch_decode(label_ids, skip_special_tokens=True, normalize=do_normalize_eval)

    # keep only the pairs that have a non-empty reference
    scored_pairs = [(hyp, ref) for hyp, ref in zip(pred_str, label_str) if len(ref) > 0]
    pred_str = [hyp for hyp, _ in scored_pairs]
    label_str = [ref for _, ref in scored_pairs]

    wer = 100 * metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

In [20]:
# Load the model weights from the downloaded W&B artifact directory (not from the hub).
model = WhisperForConditionalGeneration.from_pretrained(artifact_dir, use_cache=False)
# Clear the forced decoder ids and the suppressed-token list stored in the model config.
model.config.forced_decoder_ids = None
model.config.suppress_tokens = []
# Keep the decoder cache disabled (it is also passed as use_cache=False at load time).
model.config.use_cache = False

In [21]:
# trainer callback to reinitialise and reshuffle the streamable datasets at the beginning of each epoch
class ShuffleCallback(TrainerCallback):
    """Advance the epoch of an iterable training dataset when a new epoch begins."""

    def on_epoch_begin(self, args, state, control, train_dataloader, **kwargs):
        """Call ``set_epoch`` with the next epoch number on plain iterable datasets."""
        dataset = train_dataloader.dataset
        if isinstance(dataset, IterableDatasetShard):
            # set_epoch() is handled by the Trainer
            return
        if isinstance(dataset, IterableDataset):
            dataset.set_epoch(dataset._epoch + 1)
            


In [22]:
def load_samples_dataset(dataset, num_samples=100):
    """Materialise the first ``num_samples`` items of an iterable dataset.

    Args:
        dataset: Any iterable of examples (e.g. a streaming dataset).
        num_samples: Maximum number of items to take. Values of zero or below
            yield an empty dataset; previously they caused the whole input to be
            consumed because the stop condition was never met.

    Returns:
        A ``Dataset`` built from the collected items.
    """
    from itertools import islice

    # islice stops after num_samples items without pulling any further element
    samples = list(islice(dataset, max(num_samples, 0)))
    return Dataset.from_list(samples)

def compute_spectrograms(example):
    """Compute the unpadded feature-extractor output for one example's waveform.

    Returns a dict with a single "spectrogram" entry holding the first element of
    the extractor's ``input_features`` (called with sampling_rate=16000 and
    padding="do_not_pad").
    """
    extracted = feature_extractor(
        example["audio"]["array"], sampling_rate=16000, padding="do_not_pad"
    )
    return {"spectrogram": extracted.input_features[0]}


def record_to_html(sample_record):
    """Render one sample as an audio player linked to its spectrogram and waveform plots.

    Args:
        sample_record: Dict with an "audio" entry (holding "array" and
            "sampling_rate") and a "spectrogram" entry. An optional "length" entry
            is used as the audio duration; when it is missing or falsy it is
            computed as ``len(array) / sampling_rate`` and stored back.

    Returns:
        The same dict, with "length" filled in and "audio_with_spec" set to a
        ``StringIO`` that the combined panel layout was saved into.
    """
    audio_array = np.array(sample_record["audio"]["array"])
    audio_sr = sample_record["audio"]["sampling_rate"]

    if not sample_record.get("length"):
        sample_record["length"] = len(audio_array) / audio_sr
    audio_duration = sample_record["length"]
    audio_spectrogram = np.array(sample_record["spectrogram"])

    bounds = (0, 0, audio_duration, audio_spectrogram.max())

    # clip to [-1, 1] first so that out-of-range samples saturate instead of
    # overflowing when converted to 16-bit integers
    waveform_int = np.int16(np.clip(audio_array, -1.0, 1.0) * 32767)

    hv_audio = pn.pane.Audio(waveform_int, sample_rate=audio_sr, name='Audio', throttle=500)

    # hidden slider acting as the shared playback position
    slider = pn.widgets.FloatSlider(end=audio_duration, visible=False, step=0.001)
    line_audio = hv.VLine(0).opts(color='black')
    line_spec = hv.VLine(0).opts(color='red')

    # link the slider to the player's time and to the vertical marker of each plot
    slider.jslink(hv_audio, value='time', bidirectional=True)
    slider.jslink(line_audio, value='glyph.location')
    slider.jslink(line_spec, value='glyph.location')

    time = np.linspace(0, audio_duration, num=len(audio_array))
    line_plot_hv = hv.Curve(
        (time, audio_array), ["Time (s)", "amplitude"]).opts(
        width=500, height=150, axiswise=True) * line_audio

    hv_spec_gram = hv.Image(
        audio_spectrogram, bounds=(bounds), kdims=["Time (s)", "Frequency (hz)"]).opts(
        width=500, height=150, labelled=[], axiswise=True, color_levels=512) * line_spec

    combined = pn.Row(hv_audio, hv_spec_gram, line_plot_hv, slider)
    audio_html = StringIO()
    combined.save(audio_html)
    sample_record["audio_with_spec"] = audio_html
    return sample_record


def dataset_to_records(dataset):
    """Build a DataFrame with one row per sample of ``dataset``.

    Each row holds the rendered sample wrapped in ``wandb.Html`` ("audio_with_spec"),
    the reference "sentence" and the audio "length".
    """
    rows = []
    for item in dataset:
        rendered = record_to_html(item)
        rows.append(
            {
                "audio_with_spec": wandb.Html(rendered["audio_with_spec"]),
                "sentence": rendered["sentence"],
                "length": rendered["length"],
            }
        )
    return pd.DataFrame(rows)
    
def decode_predictions(trainer, predictions):
    """Decode predicted token ids to text with the trainer's tokenizer.

    Args:
        trainer: Object exposing ``tokenizer.batch_decode``.
        predictions: Object exposing the predicted ids as ``predictions``.

    Returns:
        Whatever ``batch_decode`` returns for those ids with special tokens skipped.
    """
    return trainer.tokenizer.batch_decode(predictions.predictions, skip_special_tokens=True)


def compute_measures(predictions, labels):
    """Compute per-sample jiwer measures for paired predictions and references.

    Each label is passed to jiwer as the reference and the matching prediction as
    the hypothesis. Returns a DataFrame with the columns "wer", "hits",
    "substitutions", "deletions" and "insertions", one row per pair.
    """
    columns = ["wer", "hits", "substitutions", "deletions", "insertions"]
    per_sample = []
    for hypothesis, reference in zip(predictions, labels):
        per_sample.append(jiwer.compute_measures(reference, hypothesis))
    return pd.DataFrame(per_sample)[columns]

class WandbProgressResultsCallback(WandbCallback):
    """W&B callback that logs a table of sample predictions and uploads model artifacts.

    On every log event the callback runs prediction on a fixed sample dataset, and
    logs a table with the rendered audio, reference sentence, prediction and jiwer
    measures. On every save it stores the model as a W&B artifact.
    """

    def __init__(self, trainer, sample_dataset):
        """Keep the trainer and sample dataset, and pre-render the samples once.

        Args:
            trainer: The trainer used for prediction, decoding and saving the model.
            sample_dataset: Dataset of samples to predict on at each log event.
        """
        super().__init__()
        self.trainer = trainer
        self.sample_dataset = sample_dataset
        self.records_df = dataset_to_records(sample_dataset)

    def on_log(self, args, state, control, model=None, logs=None, **kwargs):
        """Run the parent logging, then log a table of predictions for the samples."""
        super().on_log(args, state, control, model, logs)
        if self._wandb is None:
            # same guard as on_save: nothing to log to
            return
        # use the trainer given to this callback rather than a global named `trainer`
        predictions = self.trainer.predict(self.sample_dataset)
        predictions = decode_predictions(self.trainer, predictions)
        measures_df = compute_measures(predictions, self.records_df["sentence"].tolist())
        records_df = pd.concat([self.records_df, measures_df], axis=1)
        records_df["prediction"] = predictions
        records_df["step"] = state.global_step
        records_table = self._wandb.Table(dataframe=records_df)
        self._wandb.log({"sample_predictions": records_table})

    def on_save(self, args, state, control, model=None, tokenizer=None, **kwargs):
        """Save the model to a temporary directory and log its files as a W&B artifact."""
        # local import: `numbers` is needed below and is not imported at the top of the notebook
        import numbers

        if self._wandb is None:
            return
        if self._log_model and self._initialized and state.is_world_process_zero:
            with tempfile.TemporaryDirectory() as temp_dir:
                self.trainer.save_model(temp_dir)
                # artifact metadata: numeric run-summary values, or the best metric and
                # total flos when the best model is loaded at the end
                metadata = (
                    {
                        k: v
                        for k, v in dict(self._wandb.summary).items()
                        if isinstance(v, numbers.Number) and not k.startswith("_")
                    }
                    if not args.load_best_model_at_end
                    else {
                        f"eval/{args.metric_for_best_model}": state.best_metric,
                        "train/total_floss": state.total_flos,
                    }
                )
                artifact = self._wandb.Artifact(
                    name=f"model-{self._wandb.run.id}",
                    type="model", metadata=metadata)
                # copy every regular file of the saved model into the artifact
                for f in Path(temp_dir).glob("*"):
                    if f.is_file():
                        with artifact.new_file(f.name, mode="wb") as fa:
                            fa.write(f.read_bytes())
                self._wandb.run.log_artifact(artifact)

In [23]:
# Training configuration: 5000 optimisation steps with evaluation and checkpointing
# every 500 steps and logging every 250. The best checkpoint is selected by the lowest
# "wer". report_to is "none"; the W&B callback is attached to the trainer separately.
training_args = Seq2SeqTrainingArguments(
    output_dir="../models/whisper-small-te",  # change to a repo name of your choice
    per_device_train_batch_size=64,
    gradient_accumulation_steps=1,  # increase by 2x for every 2x decrease in batch size
    learning_rate=1e-5,
    save_total_limit=4,
    warmup_steps=500,
    max_steps=5000,
    gradient_checkpointing=True,
    fp16=True,
#     fp16_full_eval=True,
    optim="adamw_bnb_8bit",
    evaluation_strategy="steps",
    per_device_eval_batch_size=32,
    predict_with_generate=True,
    generation_max_length=225,
    save_steps=500,
    eval_steps=500,
    logging_steps=250,
    report_to="none",
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
    hub_strategy="checkpoint",
    push_to_hub=True,
    remove_unused_columns=False, 
    ignore_data_skip=True
)


In [24]:
# Take the first 100 test examples (the default num_samples) into an in-memory dataset
# and add a "spectrogram" column to each of them.
samples_dataset = load_samples_dataset(dataset_dict["test"]).map(compute_spectrograms)

  0%|          | 0/100 [00:00<?, ?ex/s]

In [25]:
# Build the trainer. Evaluation runs on `samples_dataset` (the 100-example sample of the
# test split), so the reported WER refers to that subset. The processor is passed in
# the `tokenizer` slot.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=dataset_dict["train"],
    eval_dataset=samples_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=processor,
    callbacks=[ShuffleCallback()],
    
)

Cloning https://huggingface.co/parambharat/whisper-small-te into local empty directory.


Download file pytorch_model.bin:   0%|          | 529/922M [00:00<?, ?B/s]

Download file training_args.bin: 100%|##########| 3.56k/3.56k [00:00<?, ?B/s]

Clean file training_args.bin:  28%|##8       | 1.00k/3.56k [00:00<?, ?B/s]

Clean file pytorch_model.bin:   0%|          | 1.00k/922M [00:00<?, ?B/s]

max_steps is given, it will override any value given in num_train_epochs
Using cuda_amp half precision backend


In [26]:
# Create the W&B progress callback; its constructor renders every sample of
# samples_dataset up front. The cell output produced while doing so is cleared afterwards.
progress_callback = WandbProgressResultsCallback(trainer, samples_dataset)
clear_output()

In [27]:
# Register the callback after construction, since it needs the trainer instance itself.
trainer.add_callback(progress_callback)

In [28]:
# Write the starting model and the processor files into the output directory before training.
model.save_pretrained(training_args.output_dir)
processor.save_pretrained(training_args.output_dir)

Configuration saved in ../models/whisper-small-te/config.json
Model weights saved in ../models/whisper-small-te/pytorch_model.bin
Feature extractor saved in ../models/whisper-small-te/preprocessor_config.json
tokenizer config file saved in ../models/whisper-small-te/tokenizer_config.json
Special tokens file saved in ../models/whisper-small-te/special_tokens_map.json
added tokens file saved in ../models/whisper-small-te/added_tokens.json


In [29]:
# Run the fine-tuning loop with the configuration above.
trainer.train()

***** Running training *****
  Num examples = 320000
  Num Epochs = 9223372036854775807
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 5000
  Number of trainable parameters = 241734912
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Step,Training Loss,Validation Loss,Wer
500,0.1637,0.209195,42.940604
1000,0.1459,0.202507,35.929893
1500,0.1348,0.199042,35.491723
2000,0.1309,0.197353,33.739046
2500,0.1253,0.197447,34.031159
3000,0.1209,0.19088,32.473223
3500,0.1139,0.189888,31.791626
4000,0.1043,0.18682,31.64557
4500,0.0996,0.18743,31.694255
5000,0.1002,0.186259,31.64557


***** Running Prediction *****
  Num examples = 100
  Batch size = 32
***** Running Prediction *****
  Num examples = 100
  Batch size = 32
***** Running Evaluation *****
  Num examples = 100
  Batch size = 32
***** Running Prediction *****
  Num examples = 100
  Batch size = 32
Saving model checkpoint to ../models/whisper-small-te/checkpoint-500
Configuration saved in ../models/whisper-small-te/checkpoint-500/config.json
Model weights saved in ../models/whisper-small-te/checkpoint-500/pytorch_model.bin
Feature extractor saved in ../models/whisper-small-te/checkpoint-500/preprocessor_config.json
tokenizer config file saved in ../models/whisper-small-te/checkpoint-500/tokenizer_config.json
Special tokens file saved in ../models/whisper-small-te/checkpoint-500/special_tokens_map.json
added tokens file saved in ../models/whisper-small-te/checkpoint-500/added_tokens.json
Feature extractor saved in ../models/whisper-small-te/preprocessor_config.json
tokenizer config file saved in ../models/

Upload file pytorch_model.bin:   0%|          | 32.0k/922M [00:00<?, ?B/s]

Upload file last-checkpoint/optimizer.pt:   0%|          | 32.0k/700M [00:00<?, ?B/s]

remote: Scanning LFS files for validity, may be slow...        
remote: LFS file scan complete.        
To https://huggingface.co/parambharat/whisper-small-te
   fef40b5..806fb01  main -> main

Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Automatic Speech Recognition', 'type': 'automatic-speech-recognition'}}
To https://huggingface.co/parambharat/whisper-small-te
   806fb01..512faf1  main -> main

***** Running Prediction *****
  Num examples = 100
  Batch size = 32
***** Running Prediction *****
  Num examples = 100
  Batch size = 32
***** Running Evaluation *****
  Num examples = 100
  Batch size = 32
***** Running Prediction *****
  Num examples = 100
  Batch size = 32
Saving model checkpoint to ../models/whisper-small-te/checkpoint-1000
Configuration saved in ../models/whisper-small-te/checkpoint-1000/config.json
Model weights saved in ../models/whisper-small-te/checkpoint-1000/pytorch_model.bin
Feature extractor saved in ../models/

Upload file pytorch_model.bin:   0%|          | 32.0k/922M [00:00<?, ?B/s]

Upload file last-checkpoint/optimizer.pt:   0%|          | 32.0k/700M [00:00<?, ?B/s]

remote: Scanning LFS files for validity, may be slow...        
remote: LFS file scan complete.        
To https://huggingface.co/parambharat/whisper-small-te
   512faf1..e44e351  main -> main

Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Automatic Speech Recognition', 'type': 'automatic-speech-recognition'}}
To https://huggingface.co/parambharat/whisper-small-te
   e44e351..515815b  main -> main

***** Running Prediction *****
  Num examples = 100
  Batch size = 32
***** Running Prediction *****
  Num examples = 100
  Batch size = 32
***** Running Evaluation *****
  Num examples = 100
  Batch size = 32
***** Running Prediction *****
  Num examples = 100
  Batch size = 32
Saving model checkpoint to ../models/whisper-small-te/checkpoint-1500
Configuration saved in ../models/whisper-small-te/checkpoint-1500/config.json
Model weights saved in ../models/whisper-small-te/checkpoint-1500/pytorch_model.bin
Feature extractor saved in ../models/

Upload file pytorch_model.bin:   0%|          | 32.0k/922M [00:00<?, ?B/s]

remote: Scanning LFS files for validity, may be slow...        
remote: LFS file scan complete.        
To https://huggingface.co/parambharat/whisper-small-te
   515815b..93cc0f6  main -> main

Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Automatic Speech Recognition', 'type': 'automatic-speech-recognition'}}
To https://huggingface.co/parambharat/whisper-small-te
   93cc0f6..bfdee18  main -> main

***** Running Prediction *****
  Num examples = 100
  Batch size = 32
***** Running Prediction *****
  Num examples = 100
  Batch size = 32
***** Running Evaluation *****
  Num examples = 100
  Batch size = 32
***** Running Prediction *****
  Num examples = 100
  Batch size = 32
Saving model checkpoint to ../models/whisper-small-te/checkpoint-2000
Configuration saved in ../models/whisper-small-te/checkpoint-2000/config.json
Model weights saved in ../models/whisper-small-te/checkpoint-2000/pytorch_model.bin
Feature extractor saved in ../models/

TrainOutput(global_step=5000, training_loss=0.12562131462097167, metrics={'train_runtime': 70959.925, 'train_samples_per_second': 4.51, 'train_steps_per_second': 0.07, 'total_flos': 9.23444422483968e+19, 'train_loss': 0.12562131462097167, 'epoch': 1.35})

In [30]:
# Metadata passed to trainer.push_to_hub when the final model is pushed.
kwargs = {
    "language": "te",
    "model_name": "Whisper Small Te - Bharat Ramanathan",  # a 'pretty' name for your model
    "finetuned_from": "openai/whisper-small",
    "tasks": "automatic-speech-recognition",
    "tags": "whisper-event",
}


In [31]:
# Push the final model to the hub together with the metadata defined in `kwargs`.
trainer.push_to_hub(**kwargs)

Saving model checkpoint to ../models/whisper-small-te
Configuration saved in ../models/whisper-small-te/config.json
Model weights saved in ../models/whisper-small-te/pytorch_model.bin
Feature extractor saved in ../models/whisper-small-te/preprocessor_config.json
tokenizer config file saved in ../models/whisper-small-te/tokenizer_config.json
Special tokens file saved in ../models/whisper-small-te/special_tokens_map.json
added tokens file saved in ../models/whisper-small-te/added_tokens.json
Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Automatic Speech Recognition', 'type': 'automatic-speech-recognition'}, 'metrics': [{'name': 'Wer', 'type': 'wer', 'value': 31.645569620253166}]}
To https://huggingface.co/parambharat/whisper-small-te
   6ccc5b7..135797e  main -> main



In [32]:
# Close the W&B run opened at the top of the notebook.
wandb.finish()

VBox(children=(Label(value='9712.423 MB of 9712.423 MB uploaded (482.814 MB deduped)\r'), FloatProgress(value=…

0,1
eval/loss,█▆▅▄▄▂▂▁▁▁
eval/runtime,▁▃▁▄▂▁▇▂█▂
eval/samples_per_second,█▆█▅▇█▂▇▁▇
eval/steps_per_second,█▁█▁▁█▁█▁▁
eval/wer,█▄▃▂▂▂▁▁▁▁
train/epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▆▆▇▇▇▇▇▇████
train/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇█████
train/learning_rate,▅██▇▇▆▆▆▅▅▅▄▄▃▃▃▂▂▁▁
train/loss,██▇▆▅▅▅▄▄▄▃▃▃▃▂▂▁▁▁▁
train/total_flos,▁

0,1
eval/loss,0.18626
eval/runtime,163.3261
eval/samples_per_second,0.612
eval/steps_per_second,0.024
eval/wer,31.64557
train/epoch,1.35
train/global_step,5000.0
train/learning_rate,0.0
train/loss,0.1002
train/total_flos,9.23444422483968e+19
