In [1]:
# %%capture

# !add-apt-repository -y ppa:jonathonf/ffmpeg-4
# !apt update
# !apt install -y ffmpeg

# !pip uninstall -y transformers datasets 
# !pip install audiomentations
# !pip install git+https://github.com/huggingface/datasets
# !pip install git+https://github.com/huggingface/transformers
# !pip install librosa soundfile
# !pip install "evaluate>=0.3.0"
# !pip install jiwer
# !pip install gradio
# !pip install more-itertools
# !pip install wandb
# !pip install bitsandbytes

In [2]:
# Weights & Biases settings exported as environment variables; this cell runs
# before the cell that imports wandb and the transformers W&B integration.
%set_env WANDB_LOG_MODEL=True
%set_env WANDB_WATCH=all
%set_env WANDB_NOTEBOOK_NAME=whisper_base_ta

env: WANDB_LOG_MODEL=True
env: WANDB_WATCH=all
env: WANDB_NOTEBOOK_NAME=whisper_base_ta


In [3]:
import itertools
import numbers
import string
import tempfile
import warnings
from dataclasses import dataclass
from io import StringIO
from pathlib import Path
from typing import Any, Dict, List, Union

import evaluate
import holoviews as hv
import jiwer
import numpy as np
import pandas as pd
import panel as pn
import torch
import wandb
from audiomentations import Compose, AddGaussianNoise, TimeStretch, PitchShift, Shift
from bokeh.resources import INLINE
from datasets import Dataset, IterableDatasetDict, load_dataset, interleave_datasets, Audio
from datasets import load_dataset, Audio
from huggingface_hub import notebook_login
from IPython.display import clear_output
from torch.utils.data import IterableDataset
from transformers import WhisperForConditionalGeneration
from transformers import WhisperProcessor
from transformers import Seq2SeqTrainingArguments
from transformers import Seq2SeqTrainer
from transformers import WhisperTokenizer
from transformers import WhisperFeatureExtractor
from transformers import TrainerCallback
from transformers.integrations import WandbCallback
from transformers.trainer_pt_utils import IterableDatasetShard
# Silence every Python warning for the rest of the session.
warnings.filterwarnings('ignore')
# Activate the Bokeh backend for HoloViews, without the logo banner.
hv.extension("bokeh", logo=False)

# Wipe the output produced by the imports above, then show whether CUDA is usable.
clear_output()
torch.cuda.is_available()

True

In [4]:
# wandb.login()

In [5]:
# notebook_login()

In [6]:
# run = wandb.init(project="whisper_finetuning", job_type="fine-tuning", group="tiny" )
# run = wandb.init(project="whisper_finetuning", job_type="fine-tuning", group="tiny", resume="must", id="2k10w4qq" )

[34m[1mwandb[0m: Currently logged in as: [33mparambharat[0m. Use [1m`wandb login --relogin`[0m to force relogin


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01666903588338755, max=1.0)…

In [7]:
def load_data_splits(is_streaming=True, stopping_strategy="all_exhausted"):
    """Load the local Tamil ASR corpus with all of its splits.

    Args:
        is_streaming: Forwarded to ``load_dataset`` as its ``streaming`` argument.
        stopping_strategy: Kept for backward compatibility only. Nothing is
            interleaved in this function, so the value is not used.

    Returns:
        The object returned by ``load_dataset`` for ``"datasets/tamil_asr_corpus/"``.
        Later cells index it by split name (``"train"``, ``"test"``).
    """
    return load_dataset("datasets/tamil_asr_corpus/", streaming=is_streaming)

In [8]:
# Load the corpus with the defaults of load_data_splits (streaming enabled).
dataset_dict = load_data_splits()

In [9]:
# Waveform augmentation pipeline: Gaussian noise, time stretching and pitch
# shifting, each configured with p=0.2 (audiomentations' per-transform
# probability of being applied). TimeStretch is allowed to change the clip length.
augment_waveform = Compose([
    AddGaussianNoise(min_amplitude=0.005, max_amplitude=0.015, p=0.2),
    TimeStretch(min_rate=0.8, max_rate=1.25, p=0.2, leave_length_unchanged=False),
    PitchShift(min_semitones=-4, max_semitones=4, p=0.2)
    ,])

def augment_dataset(batch):
    """Run the ``augment_waveform`` pipeline on one example's audio.

    Args:
        batch: A single example whose ``"audio"`` entry is a mapping holding the
            waveform under ``"array"`` and, normally, its rate under
            ``"sampling_rate"``.

    Returns:
        The same example, with ``batch["audio"]["array"]`` replaced by the
        augmented waveform.
    """
    audio = batch["audio"]
    # Augment at the rate the sample was actually decoded with, so pitch and
    # time transforms are not mis-scaled; fall back to Whisper's 16 kHz when
    # the example does not carry a rate.
    sample_rate = audio.get("sampling_rate", 16000)

    # apply augmentation
    audio["array"] = augment_waveform(samples=audio["array"], sample_rate=sample_rate)

    return batch


# Augment the training split only; the other splits keep their original audio.
dataset_dict["train"] = dataset_dict["train"].map(augment_dataset)

In [10]:
# Feature extractor, tokenizer and processor are all loaded from the same
# "openai/whisper-tiny" checkpoint.
feature_extractor = WhisperFeatureExtractor.from_pretrained(
    "openai/whisper-tiny"
)
# Tokenizer set up for Tamil transcription; model_max_length=225 matches the
# max_length used when prepare_dataset tokenises the labels.
tokenizer = WhisperTokenizer.from_pretrained(
    "openai/whisper-tiny", 
     language="Tamil",
     task="transcribe",
     model_max_length=225
)
# Processor created with the same language/task settings; it is what the data
# collator, compute_metrics and the Trainer receive further down.
processor = WhisperProcessor.from_pretrained(
    "openai/whisper-tiny",
     language="Tamil", 
     task="transcribe",
     model_max_length=225
)

In [11]:
def fix_sentence(sentence):
    """Clean a transcript before it is tokenised.

    Steps, in order:
      1. If the text both starts and ends with ``"``, drop that outer pair.
      2. If the text does not end in ``.``, ``?`` or ``!``, append ``.``.
      3. Remove every ``string.punctuation`` (ASCII) character from everything
         except the final character.

    Args:
        sentence: The raw transcript string.

    Returns:
        The cleaned transcript. If nothing is left after step 1 (input was
        ``""``, ``'"'`` or ``'""'``), the empty string is returned instead of
        raising ``IndexError``.
    """
    transcription = sentence

    if transcription.startswith('"') and transcription.endswith('"'):
        # we can remove trailing quotation marks as they do not affect the transcription
        transcription = transcription[1:-1]

    if not transcription:
        # Nothing left to punctuate; the [-1] lookups below would fail on "".
        return transcription

    if transcription[-1] not in [".", "?", "!"]:
        # append a full-stop to sentences that do not end in punctuation
        transcription = transcription + "."

    # Strip ASCII punctuation from the body but keep the sentence terminator.
    body, terminator = transcription[:-1], transcription[-1]
    return body.translate(str.maketrans('', '', string.punctuation)) + terminator
    
def prepare_dataset(examples):
    """Add model inputs and labels to a single example.

    Writes two keys onto ``examples`` and returns it:
      * ``"input_features"``: the first entry of the feature extractor's output
        for ``examples["audio"]["array"]``, computed at a 16 kHz sampling rate.
      * ``"labels"``: token ids of the cleaned ``examples["sentence"]``,
        truncated to at most 225 tokens.
    """
    waveform = examples["audio"]["array"]

    # log-Mel input features for the waveform
    extracted = feature_extractor(waveform, sampling_rate=16000)
    examples["input_features"] = extracted.input_features[0]

    # normalise the transcript, then encode it to label ids
    cleaned_text = fix_sentence(examples["sentence"])
    encoded = tokenizer(cleaned_text, max_length=225, truncation=True)
    examples["labels"] = encoded.input_ids

    return examples

In [12]:
# Run prepare_dataset over every split and request torch-formatted outputs.
for k in dataset_dict:
    dataset_dict[k] = dataset_dict[k].map(
        prepare_dataset,).with_format("torch")

In [13]:
# Shuffle the training split through a 500-example buffer (no seed is passed).
dataset_dict["train"] = dataset_dict["train"].shuffle(buffer_size=500)

In [14]:
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Collate examples into a batch of padded audio features and labels.

    ``processor`` must expose ``feature_extractor.pad`` and a ``tokenizer`` with
    ``truncate_sequences``, ``pad`` and ``bos_token_id``.
    """

    processor: Any

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Build one batch from a list of examples.

        Each example must provide ``"input_features"`` and ``"labels"``. The
        returned batch is the feature extractor's padded output with an extra
        ``"labels"`` tensor in which padded positions are set to -100.
        """
        tokenizer = self.processor.tokenizer

        # Audio inputs and labels have different lengths and padding rules, so
        # they are padded separately. Audio first:
        audio_inputs = []
        for example in features:
            audio_inputs.append({"input_features": example["input_features"]})
        batch = self.processor.feature_extractor.pad(audio_inputs, return_tensors="pt")

        # Then the tokenised label sequences.
        label_inputs = []
        for example in features:
            token_ids = tokenizer.truncate_sequences(example["labels"])[0]
            label_inputs.append({"input_ids": token_ids})
        padded_labels = tokenizer.pad(label_inputs, return_tensors="pt")

        # Positions outside the attention mask are padding: set them to -100
        # so the loss ignores them.
        is_padding = padded_labels.attention_mask.ne(1)
        labels = padded_labels["input_ids"].masked_fill(is_padding, -100)

        # If every sequence already starts with the BOS token, drop that column
        # because it gets added again later.
        starts_with_bos = labels[:, 0] == tokenizer.bos_token_id
        if starts_with_bos.all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels
        return batch

In [15]:
# Collator instance handed to the Trainer; it pads with the shared processor.
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

In [16]:
# Word-error-rate metric, loaded through the `evaluate` library.
metric = evaluate.load("wer")

# evaluate with the 'normalised' WER
# (forwarded as `normalize=` to batch_decode inside compute_metrics)
do_normalize_eval = True


def compute_metrics(pred):
    """Compute the word error rate (as a percentage) for a prediction object.

    Args:
        pred: Object exposing ``predictions`` (generated token ids) and
            ``label_ids`` (reference token ids, with -100 marking ignored
            positions).

    Returns:
        ``{"wer": <float>}`` where the value is ``100 *`` the metric's WER.
    """
    pred_ids = pred.predictions

    # replace -100 with the pad_token_id, on a copy: the caller's label array is
    # left untouched so it still carries its -100 markers after evaluation.
    label_ids = np.where(
        pred.label_ids == -100, processor.tokenizer.pad_token_id, pred.label_ids
    )

    # we do not want to group tokens when computing the metrics
    pred_str = processor.tokenizer.batch_decode(pred_ids, skip_special_tokens=True, normalize=do_normalize_eval)
    label_str = processor.tokenizer.batch_decode(label_ids, skip_special_tokens=True, normalize=do_normalize_eval)

    wer = 100 * metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

In [17]:
# Load the pretrained Whisper-tiny checkpoint that will be fine-tuned.
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny", use_cache=False)
# Clear the forced decoder prompt ids and the suppressed-token list in the config.
model.config.forced_decoder_ids = None
model.config.suppress_tokens = []
# Keep the decoder cache off (presumably because the training arguments enable
# gradient checkpointing -- confirm).
model.config.use_cache = False

In [18]:
# trainer callback to reinitialise and reshuffle the streamable datasets at the beginning of each epoch
class ShuffleCallback(TrainerCallback):
    """Advance the epoch of an iterable training dataset when an epoch starts."""

    def on_epoch_begin(self, args, state, control, train_dataloader, **kwargs):
        """Call ``set_epoch`` with the next epoch number on plain iterable datasets."""
        dataset = train_dataloader.dataset
        if isinstance(dataset, IterableDatasetShard):
            # set_epoch() is handled by the Trainer
            return
        if isinstance(dataset, IterableDataset):
            next_epoch = dataset._epoch + 1
            dataset.set_epoch(next_epoch)
            


In [19]:
def load_samples_dataset(dataset, num_samples=10):
    """Collect the first ``num_samples`` items of ``dataset`` into a ``Dataset``.

    Args:
        dataset: Any iterable of examples (for instance a streaming split).
        num_samples: Maximum number of leading examples to keep. ``0`` gives an
            empty result instead of consuming the whole iterable.

    Returns:
        A ``Dataset`` built with ``Dataset.from_list`` from the collected items.

    Raises:
        ValueError: If ``num_samples`` is negative (raised by ``itertools.islice``).
    """
    # islice stops after num_samples items without pulling an extra one, and
    # never runs unbounded when num_samples is 0.
    samples = list(itertools.islice(dataset, num_samples))
    return Dataset.from_list(samples)

def compute_spectrograms(example):
    """Return ``{"spectrogram": ...}`` for one example's waveform.

    The features come from ``feature_extractor`` called on
    ``example["audio"]["array"]`` at 16 kHz with ``padding="do_not_pad"``;
    only the first entry of ``input_features`` is kept.
    """
    extracted = feature_extractor(
        example["audio"]["array"],
        sampling_rate=16000,
        padding="do_not_pad",
    )
    return {"spectrogram": extracted.input_features[0]}


def record_to_html(sample_record):
    """Render one sample as an HTML snippet with an audio player and two plots.

    Reads ``sample_record["audio"]`` (``"array"``, ``"sampling_rate"``),
    ``sample_record["length"]`` and ``sample_record["spectrogram"]``. A hidden
    slider is linked to the player's time and to a vertical marker on each plot.

    Returns:
        A ``StringIO`` that the combined Panel layout was saved into.
    """
    audio_info = sample_record["audio"]
    samples = np.array(audio_info["array"])
    sample_rate = audio_info["sampling_rate"]
    duration = sample_record["length"]
    spectrogram = np.array(sample_record["spectrogram"])

    # Audio player, fed with the waveform scaled to 16-bit integers.
    pcm16 = np.int16(samples * 32767)
    hv_audio = pn.pane.Audio(pcm16, sample_rate=sample_rate, name='Audio', throttle=500)

    # Hidden slider driving the playback position and both cursor lines.
    slider = pn.widgets.FloatSlider(end=duration, visible=False, step=0.001)
    cursor_wave = hv.VLine(0).opts(color='black')
    cursor_spec = hv.VLine(0).opts(color='red')

    slider.jslink(hv_audio, value='time', bidirectional=True)
    slider.jslink(cursor_wave, value='glyph.location')
    slider.jslink(cursor_spec, value='glyph.location')

    # Waveform plot over a time axis running from 0 to the clip length.
    time_axis = np.linspace(0, duration, num=len(samples))
    wave_curve = hv.Curve((time_axis, samples), ["Time (s)", "amplitude"])
    wave_plot = wave_curve.opts(width=30, height=10, axiswise=True) * cursor_wave

    # Spectrogram image spanning (0, 0) to (duration, max spectrogram value).
    extent = (0, 0, duration, spectrogram.max())
    spec_image = hv.Image(
        spectrogram, bounds=extent, kdims=["Time (s)", "Frequency (hz)"])
    spec_plot = spec_image.opts(
        width=30, height=10, labelled=[], axiswise=True, color_levels=512) * cursor_spec

    layout = pn.Row(hv_audio, spec_plot, wave_plot, slider)
    html_buffer = StringIO()
    layout.save(html_buffer)
    return html_buffer


def dataset_to_records(dataset):
    """Build a DataFrame with one row per item of ``dataset``.

    Columns: ``"audio_with_spec"`` (a ``wandb.Html`` wrapping the output of
    ``record_to_html``), ``"sentence"`` and ``"length"``.
    """
    rows = [
        {
            "audio_with_spec": wandb.Html(record_to_html(item)),
            "sentence": item["sentence"],
            "length": item["length"],
        }
        for item in dataset
    ]
    return pd.DataFrame(rows)
    
def decode_predictions(trainer, predictions):
    """Decode ``predictions.predictions`` into strings.

    Uses ``trainer.tokenizer.batch_decode`` with ``skip_special_tokens=True``
    and returns its result unchanged.
    """
    return trainer.tokenizer.batch_decode(
        predictions.predictions,
        skip_special_tokens=True,
    )


def compute_measures(predictions, labels):
    """Compute per-sample word-level error measures.

    Args:
        predictions: Hypothesis strings produced by the model.
        labels: Reference strings, aligned one-to-one with ``predictions``.

    Returns:
        A DataFrame with one row per pair and the columns ``"wer"``, ``"hits"``,
        ``"substitutions"``, ``"deletions"`` and ``"insertions"``.
    """
    # jiwer takes the reference first and the hypothesis second. Passing them
    # the other way round measures WER against the prediction's length and
    # swaps the insertion and deletion counts.
    measures = [
        jiwer.compute_measures(reference, hypothesis)
        for hypothesis, reference in zip(predictions, labels)
    ]
    measures_df = pd.DataFrame(measures)[["wer", "hits", "substitutions", "deletions", "insertions"]]
    return measures_df

class WandbProgressResultsCallback(WandbCallback):
    """W&B callback that also logs predictions on a fixed set of samples.

    On every log event it runs the trainer on ``sample_dataset`` and logs a
    table with the decoded predictions and per-sample error measures. On every
    save it uploads the saved model files as a W&B artifact.
    """

    def __init__(self, trainer, sample_dataset):
        """Store the trainer and samples and pre-render the static table columns.

        Args:
            trainer: The trainer used for ``predict`` and ``save_model``.
            sample_dataset: Examples to predict on at each log event. They are
                converted once with ``dataset_to_records``.
        """
        super().__init__()
        self.trainer = trainer
        self.sample_dataset = sample_dataset
        self.records_df = dataset_to_records(sample_dataset)

    def on_log(self, args, state, control, model=None, logs=None, **kwargs):
        """Log the regular metrics, then a ``sample_predictions`` table."""
        super().on_log(args, state, control, model, logs)
        if self._wandb is None:
            # Same guard as on_save: nothing to log to.
            return
        # Use the trainer this callback was built with, not whatever global
        # named `trainer` happens to exist in the notebook.
        predictions = self.trainer.predict(self.sample_dataset)
        predictions = decode_predictions(self.trainer, predictions)
        measures_df = compute_measures(predictions, self.records_df["sentence"].tolist())
        records_df = pd.concat([self.records_df, measures_df], axis=1)
        records_df["prediction"] = predictions
        records_df["step"] = state.global_step
        records_table = self._wandb.Table(dataframe=records_df)
        self._wandb.log({"sample_predictions": records_table})

    def on_save(self, args, state, control, model=None, tokenizer=None, **kwargs):
        """Upload the current model files as a ``model-<run id>`` artifact.

        Does nothing unless model logging is enabled, the callback is
        initialised and this is the world-zero process.
        """
        if self._wandb is None:
            return
        if self._log_model and self._initialized and state.is_world_process_zero:
            with tempfile.TemporaryDirectory() as temp_dir:
                self.trainer.save_model(temp_dir)
                # Artifact metadata: every numeric, non-underscore summary value,
                # or only the best metric and total FLOs when the best model is
                # reloaded at the end of training.
                metadata = (
                    {
                        k: v
                        for k, v in dict(self._wandb.summary).items()
                        if isinstance(v, numbers.Number) and not k.startswith("_")
                    }
                    if not args.load_best_model_at_end
                    else {
                        f"eval/{args.metric_for_best_model}": state.best_metric,
                        "train/total_floss": state.total_flos,
                    }
                )
                artifact = self._wandb.Artifact(
                    name=f"model-{self._wandb.run.id}",
                    type="model", metadata=metadata)
                # Copy each top-level file of the saved model into the artifact.
                for f in Path(temp_dir).glob("*"):
                    if f.is_file():
                        with artifact.new_file(f.name, mode="wb") as fa:
                            fa.write(f.read_bytes())
                self._wandb.run.log_artifact(artifact)

In [20]:
# Training configuration. 32 examples per device x 2 accumulation steps gives an
# effective batch of 64 per device; evaluation and checkpointing both happen
# every 1000 steps, and the checkpoint with the lowest WER is reloaded at the end.
training_args = Seq2SeqTrainingArguments(
    output_dir="./whisper-tiny-ta",  # change to a repo name of your choice
    per_device_train_batch_size=32,
    gradient_accumulation_steps=2,  # increase by 2x for every 2x decrease in batch size
    learning_rate=1e-5,
    save_total_limit=4,
    warmup_steps=1000,
    max_steps=10000,
    gradient_checkpointing=True,
    fp16=True,
#     fp16_full_eval=True,
    optim="adamw_bnb_8bit",
    evaluation_strategy="steps",
    per_device_eval_batch_size=16,
    predict_with_generate=True,
    generation_max_length=225,
    save_steps=1000,
    eval_steps=1000,
    logging_steps=250,
    # No built-in reporter: W&B logging is done by the WandbProgressResultsCallback
    # that is added to the trainer explicitly in a later cell.
    report_to="none",
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,  # lower WER is better
    push_to_hub=True,
    # Keep all columns: the data collator picks the keys it needs itself.
    remove_unused_columns=False, 
    ignore_data_skip=True
)


In [21]:
# Take the first 10 test examples (the load_samples_dataset default) and attach
# a "spectrogram" column to each of them.
samples_dataset = load_samples_dataset(dataset_dict["test"]).map(compute_spectrograms)

  0%|          | 0/10 [00:00<?, ?ex/s]

In [22]:
# Build the trainer. Note that the full processor (not only its tokenizer) is
# passed as `tokenizer`; ShuffleCallback advances the dataset epoch each epoch.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=dataset_dict["train"],
    eval_dataset=dataset_dict["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=processor,
    callbacks=[ShuffleCallback()],
    
)

/home/mugan/Documents/harvard/cscie89/final_project/./whisper-tiny-ta is already a clone of https://huggingface.co/parambharat/whisper-tiny-ta. Make sure you pull the latest changes with `repo.git_pull()`.
max_steps is given, it will override any value given in num_train_epochs
Using cuda_amp half precision backend


In [23]:

# The callback needs the trainer itself, so it is created after the trainer.
progress_callback = WandbProgressResultsCallback(trainer, samples_dataset)



In [24]:
# Register the W&B progress callback on the already-built trainer.
trainer.add_callback(progress_callback)

In [25]:
# model.save_pretrained(training_args.output_dir)
# processor.save_pretrained(training_args.output_dir)

In [26]:
# from time import time
# start_time = time()
# for i, item in enumerate(dataset_dict["train"]):
#     features = item["input_features"]
#     labels = item["labels"]
#     print(i, f"{time()-start_time: 0.3f}s")
#     if i == 10:
#         break 

In [None]:
# Resume training from a checkpoint already present in output_dir.
# NOTE(review): this presumably fails on a fresh run with no checkpoint -- confirm.
trainer.train(resume_from_checkpoint=True)

Loading model from ./whisper-tiny-ta/checkpoint-5000.
***** Running training *****
  Num examples = 640000
  Num Epochs = 9223372036854775807
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 2
  Total optimization steps = 10000
  Number of trainable parameters = 37760640
  Continuing training from checkpoint, will skip to saved global_step
  Continuing training from epoch 0
  Continuing training from global step 5000
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Step,Training Loss,Validation Loss,Wer
6000,0.3492,0.328315,31.696575
7000,0.3229,0.321119,31.133873
8000,0.3193,0.313771,30.516119


***** Running Prediction *****
  Num examples = 10
  Batch size = 16
***** Running Prediction *****
  Num examples = 10
  Batch size = 16
***** Running Prediction *****
  Num examples = 10
  Batch size = 16
***** Running Prediction *****
  Num examples = 10
  Batch size = 16
***** Running Evaluation *****
  Num examples: Unknown
  Batch size = 16
***** Running Prediction *****
  Num examples = 10
  Batch size = 16
Saving model checkpoint to ./whisper-tiny-ta/checkpoint-6000
Configuration saved in ./whisper-tiny-ta/checkpoint-6000/config.json
Model weights saved in ./whisper-tiny-ta/checkpoint-6000/pytorch_model.bin
Feature extractor saved in ./whisper-tiny-ta/checkpoint-6000/preprocessor_config.json
tokenizer config file saved in ./whisper-tiny-ta/checkpoint-6000/tokenizer_config.json
Special tokens file saved in ./whisper-tiny-ta/checkpoint-6000/special_tokens_map.json
added tokens file saved in ./whisper-tiny-ta/checkpoint-6000/added_tokens.json
Feature extractor saved in ./whisper-t

In [None]:
# Metadata passed to trainer.push_to_hub in the next cell.
# NOTE(review): the dataset fields name Common Voice 11.0, while the data above is
# loaded from the local "datasets/tamil_asr_corpus/" directory -- confirm they match.
kwargs = {
    "dataset_tags": "mozilla-foundation/common_voice_11_0",
    "dataset": "Common Voice 11.0",  # a 'pretty' name for the training dataset
    "language": "ta",
    "model_name": "Whisper Tiny Ta - Bharat Ramanathan",  # a 'pretty' name for your model
    "finetuned_from": "openai/whisper-tiny",
    "tasks": "automatic-speech-recognition",
    "tags": "whisper-event",
#     "training_logs": "https://wandb.ai/parambharat/whisper_finetuning/runs/2k10w4qq"
}


In [None]:
# Push to the Hub, passing the metadata dictionary defined in the previous cell.
trainer.push_to_hub(**kwargs)