In [1]:
# !git lfs install
# !git clone https://huggingface.co/datasets/parambharat/malayalam_asr_corpus

# !add-apt-repository -y ppa:jonathonf/ffmpeg-4
# !apt update
# !apt install -y ffmpeg

# !pip uninstall -y transformers datasets 
# !pip install audiomentations
# !pip install git+https://github.com/huggingface/datasets
# !pip install git+https://github.com/huggingface/transformers
# !pip install librosa soundfile
# !pip install "evaluate>=0.3.0"
# !pip install jiwer
# !pip install more-itertools
# !pip install wandb
# !pip install bitsandbytes
# !pip install "holoviews[recommended]"

In [2]:
# Weights & Biases settings for this kernel session (each %set_env echoes the value it sets).
# NOTE(review): WANDB_LOG_MODEL / WANDB_WATCH are presumably picked up by the transformers
# W&B integration (model-artifact upload, gradient/parameter watching) — confirm.
# NOTE(review): the notebook name says "small" while this notebook fine-tunes whisper-tiny.
%set_env WANDB_LOG_MODEL=True
%set_env WANDB_WATCH=all
%set_env WANDB_NOTEBOOK_NAME=whisper_small_south_indic.ipynb

env: WANDB_LOG_MODEL=True
env: WANDB_WATCH=all
env: WANDB_NOTEBOOK_NAME=whisper_small_south_indic.ipynb


In [3]:
from datasets import Dataset, IterableDatasetDict, load_dataset, interleave_datasets, Audio 
import evaluate

import torch
import string
from dataclasses import dataclass
from typing import Any, Dict, List, Union

from transformers import WhisperForConditionalGeneration
from transformers import WhisperProcessor
from transformers import Seq2SeqTrainingArguments
from transformers import Seq2SeqTrainer
from transformers import WhisperTokenizer
from transformers import WhisperFeatureExtractor
import wandb
from IPython.display import clear_output
from audiomentations import Compose, AddGaussianNoise, TimeStretch, PitchShift, Shift
import numpy as np
from huggingface_hub import notebook_login
from transformers import TrainerCallback
from transformers.integrations import WandbCallback
from transformers.trainer_pt_utils import IterableDatasetShard
from torch.utils.data import IterableDataset
from datasets import load_dataset, Audio
from pathlib import Path
import numpy as np
import holoviews as hv
import panel as pn
import tempfile
from bokeh.resources import INLINE
hv.extension("bokeh", logo=False)

from io import StringIO
import pandas as pd
import warnings
import jiwer
warnings.filterwarnings('ignore')

clear_output()
torch.cuda.is_available()

True

In [4]:
# Re-attach to the existing W&B run with the hard-coded id below instead of creating a new one.
# NOTE(review): with resume="must" this presumably fails if that run id does not exist — confirm.
run = wandb.init(project="whisper_finetuning", job_type="fine-tuning", group="tiny-south-indic", resume="must", id="1mm64ukk")

[34m[1mwandb[0m: Currently logged in as: [33mparambharat[0m. Use [1m`wandb login --relogin`[0m to force relogin


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.0166756246171038, max=1.0))…

In [5]:
# artifact = run.use_artifact('parambharat/whisper_finetuning/model-1mm64ukk:latest', type='model')
# artifact_dir = artifact.download()

In [6]:
# Base checkpoint used for the feature extractor, the per-language tokenizers and the processor.
# The model weights themselves are loaded from "parambharat/whisper-tiny-south-indic" further down.
MODEL_NAME = "openai/whisper-tiny"

In [7]:
@dataclass
class Lang:
    """A target language: its display name and its short language id."""

    name: str
    lang_id: str


# One constant per supported language; `lang_id` is the key used for the per-language tokenizers.
tamil = Lang("Tamil", "ta")
telugu = Lang("Telugu", "te")
kannada = Lang("Kannada", "kn")
malayalam = Lang("Malayalam", "ml")

languages = [tamil, telugu, kannada, malayalam]

In [8]:
# Log-Mel feature extractor shared by every language.
feature_extractor = WhisperFeatureExtractor.from_pretrained(
    MODEL_NAME
)
# One tokenizer per language, keyed by `lang_id`; `prepare_dataset` looks up the tokenizer
# for the language of the corpus it is encoding.
lang_tokenizers = {
    lang.lang_id: WhisperTokenizer.from_pretrained(
        MODEL_NAME, 
        language=lang.name,
        task="transcribe",
        model_max_length=225
    )
    for lang in languages}

# Processor created without a `language` argument; its feature extractor / tokenizer are used
# by the data collator for padding and by `compute_metrics` for decoding.
processor = WhisperProcessor.from_pretrained(
        MODEL_NAME,
        task="transcribe",
        model_max_length=225
    )
# Hide the loading output of the calls above.
clear_output()

In [9]:
# Waveform augmentation pipeline: noise, time-stretch and pitch-shift, each configured with p=0.3.
augment_waveform = Compose(
    [
        AddGaussianNoise(min_amplitude=0.005, max_amplitude=0.015, p=0.3),
        TimeStretch(min_rate=0.8, max_rate=1.25, p=0.3, leave_length_unchanged=False),
        PitchShift(min_semitones=-4, max_semitones=4, p=0.3),
    ]
)


def augment_dataset(batch):
    """Run the augmentation pipeline on one example's waveform.

    Reads ``batch["audio"]["array"]``, passes it through ``augment_waveform`` at a
    fixed sample rate of 16000 and writes the result back to the same key.
    Returns the same ``batch`` object.
    """
    waveform = batch["audio"]["array"]
    batch["audio"]["array"] = augment_waveform(samples=waveform, sample_rate=16000)
    return batch


def fix_sentence(sentence):
    """Normalise a transcript before tokenisation.

    Steps, in order:
      1. Strip one pair of enclosing double quotes, if present.
      2. Append a full stop when the text does not already end in ``.``, ``?`` or ``!``.
      3. Remove every ASCII punctuation character (``string.punctuation``) from
         everything except the final character.

    Args:
        sentence: Raw transcript string.

    Returns:
        The cleaned transcript. An input that is empty, or empty once the
        enclosing quotes are stripped (e.g. ``'""'``), is returned as ``""``
        instead of raising ``IndexError``.
    """
    transcription = sentence

    if transcription.startswith('"') and transcription.endswith('"'):
        # enclosing quotation marks do not affect the transcription
        transcription = transcription[1:-1]

    if not transcription:
        # Nothing left to punctuate; indexing [-1] below would fail on "".
        return transcription

    if transcription[-1] not in (".", "?", "!"):
        # append a full stop to sentences that do not end in punctuation
        transcription = transcription + "."

    # Keep the terminal punctuation mark, drop ASCII punctuation everywhere else.
    body, terminator = transcription[:-1], transcription[-1]
    return body.translate(str.maketrans('', '', string.punctuation)) + terminator
    
def prepare_dataset(examples,lang):
    """Turn one raw example into model inputs.

    Adds two keys to ``examples`` and returns it:
      * ``input_features``: first entry of the feature extractor output for the
        waveform, flattened with ``ravel()``.
      * ``labels``: token ids of the cleaned sentence, produced by the tokenizer
        registered for ``lang`` (truncated to 225 tokens).
    """
    waveform = examples["audio"]["array"]
    extracted = feature_extractor(waveform, sampling_rate=16000)
    examples["input_features"] = extracted.input_features[0].ravel()

    cleaned_sentence = fix_sentence(examples["sentence"])

    # encode the target text with the tokenizer of this language
    tokenizer = lang_tokenizers[lang]
    encoded = tokenizer(cleaned_sentence, max_length=225, truncation=True)
    examples["labels"] = encoded.input_ids
    return examples

def filter_empty_strings(sentence):
    """Return True for sentences of at least two characters, False otherwise."""
    return len(sentence) >= 2

In [10]:
from functools import partial
def preprocess_dataset(dataset_dict, lang=None):
    """Apply the full preprocessing chain to every split of ``dataset_dict``.

    For each split: cast the ``audio`` column to 16 kHz and drop examples whose
    ``sentence`` fails ``filter_empty_strings``. The ``train`` split (when
    present) is then augmented with ``augment_dataset``. Finally every split is
    mapped through ``prepare_dataset`` bound to ``lang``.

    Filtering happens before augmentation so that examples which are going to
    be discarded are not augmented first; the filter only looks at ``sentence``,
    which augmentation does not touch, so the surviving examples are the same.

    Args:
        dataset_dict: Mapping of split name to dataset; modified in place.
        lang: Language id used to pick the tokenizer in ``prepare_dataset``.

    Returns:
        The same ``dataset_dict`` with its splits replaced by the processed ones.
    """
    for split in dataset_dict:
        resampled = dataset_dict[split].cast_column("audio", Audio(sampling_rate=16000))
        dataset_dict[split] = resampled.filter(filter_empty_strings, input_columns=["sentence"])

    # Only the training split is augmented; a dict without "train" is left as is.
    if "train" in dataset_dict:
        dataset_dict["train"] = dataset_dict["train"].map(augment_dataset)

    prepare_dataset_fn = partial(prepare_dataset, lang=lang)
    for split in dataset_dict:
        dataset_dict[split] = dataset_dict[split].map(prepare_dataset_fn)
    return dataset_dict

In [11]:
def load_telugu_dataset(is_streaming=True, stopping_strategy="all_exhausted"):
    """Build the preprocessed Telugu train/test splits.

    Train comes from the local ``../data/telugu_asr_corpus/`` (streamed according
    to ``is_streaming``); test is the FLEURS ``te_in`` test split, always streamed,
    reduced to the ``audio`` and ``sentence`` columns.
    ``stopping_strategy`` is accepted but not used by this function.
    """
    train_split = load_dataset("../data/telugu_asr_corpus/", split="train", streaming=is_streaming)

    test_split = load_dataset("google/fleurs", "te_in", split="test", streaming=True)
    test_split = test_split.rename_column("transcription", "sentence")
    extra_columns = [
        column for column in test_split.features.keys() if column not in ("audio", "sentence")
    ]
    test_split = test_split.remove_columns(extra_columns)

    return preprocess_dataset({"train": train_split, "test": test_split}, lang="te")


def load_kannada_dataset(is_streaming=True, stopping_strategy="all_exhausted"):
    """Build the preprocessed Kannada train/test splits.

    Train comes from the local ``../data/kannada_asr_corpus/`` (streamed according
    to ``is_streaming``); test is the FLEURS ``kn_in`` test split, always streamed,
    reduced to the ``audio`` and ``sentence`` columns.
    ``stopping_strategy`` is accepted but not used by this function.
    """
    train_split = load_dataset("../data/kannada_asr_corpus/", split="train", streaming=is_streaming)

    test_split = load_dataset("google/fleurs", "kn_in", split="test", streaming=True)
    test_split = test_split.rename_column("transcription", "sentence")
    extra_columns = [
        column for column in test_split.features.keys() if column not in ("audio", "sentence")
    ]
    test_split = test_split.remove_columns(extra_columns)

    return preprocess_dataset({"train": train_split, "test": test_split}, lang="kn")


def load_tamil_dataset(is_streaming=True, stopping_strategy="all_exhausted"):
    """Build the preprocessed Tamil splits from ``../data/tamil_asr_corpus/``.

    The test split is reduced to the ``audio`` and ``sentence`` columns before
    preprocessing. ``stopping_strategy`` is accepted but not used by this function.
    """
    splits = load_dataset("../data/tamil_asr_corpus/", streaming=is_streaming)

    test_split = splits["test"]
    extra_columns = [
        column for column in test_split.features.keys() if column not in ("audio", "sentence")
    ]
    splits["test"] = test_split.remove_columns(extra_columns)

    return preprocess_dataset(splits, lang="ta")


def load_malayalam_dataset(is_streaming=True, stopping_strategy="all_exhausted"):
    """Build the preprocessed Malayalam splits from ``../data/malayalam_asr_corpus/``.

    The test split is reduced to the ``audio`` and ``sentence`` columns before
    preprocessing. ``stopping_strategy`` is accepted but not used by this function.
    """
    splits = load_dataset("../data/malayalam_asr_corpus/", streaming=is_streaming)

    test_split = splits["test"]
    extra_columns = [
        column for column in test_split.features.keys() if column not in ("audio", "sentence")
    ]
    splits["test"] = test_split.remove_columns(extra_columns)

    return preprocess_dataset(splits, lang="ml")



In [12]:
def load_dataset_splits(is_streaming=True, stopping_strategy="all_exhausted"):
    """Interleave the four language corpora into one train and one test stream.

    Each language is loaded with its own loader, the per-language splits are
    combined with ``interleave_datasets`` (in the order Telugu, Kannada, Tamil,
    Malayalam) using ``stopping_strategy``, and every combined split is shuffled
    with a buffer of 1000 and set to the "torch" format.

    Returns:
        An ``IterableDatasetDict`` with ``train`` and ``test`` entries.
    """
    per_language = [
        load_telugu_dataset(is_streaming, stopping_strategy),
        load_kannada_dataset(is_streaming, stopping_strategy),
        load_tamil_dataset(is_streaming, stopping_strategy),
        load_malayalam_dataset(is_streaming, stopping_strategy),
    ]

    combined = IterableDatasetDict()
    for split in ("train", "test"):
        combined[split] = interleave_datasets(
            [language_splits[split] for language_splits in per_language],
            stopping_strategy=stopping_strategy,
        )

    for split in combined:
        shuffled = combined[split].shuffle(buffer_size=1000)
        combined[split] = shuffled.with_format("torch")

    return combined

In [13]:
dataset_dict = load_dataset_splits(stopping_strategy="first_exhausted")

In [14]:
# for i, item in enumerate(dataset_dict["train"]):
#     print(i, item["sentence"])
#     if i == 10:
#         break

In [15]:
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Collate examples with ``input_features`` and ``labels`` into one padded batch.

    Audio features and label ids are padded separately (via the processor's
    feature extractor and tokenizer respectively); padded label positions are
    replaced by -100.
    """

    processor: Any

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        extractor = self.processor.feature_extractor
        tokenizer = self.processor.tokenizer

        # Audio side: restore the flattened features to (80, 3000) and let the
        # feature extractor build the tensor batch.
        audio_inputs = []
        for feature in features:
            mel = np.array(feature["input_features"]).reshape(80, 3000)
            audio_inputs.append({"input_features": mel.tolist()})
        batch = extractor.pad(audio_inputs, return_tensors="pt")

        # Label side: collect the token id sequences and pad them to a common length.
        label_inputs = [
            {"input_ids": tokenizer.truncate_sequences(feature["labels"])[0]}
            for feature in features
        ]
        padded_labels = tokenizer.pad(label_inputs, return_tensors="pt")

        # Padding positions become -100.
        is_padding = padded_labels.attention_mask.ne(1)
        labels = padded_labels["input_ids"].masked_fill(is_padding, -100)

        # Drop the first column when every sequence starts with the tokenizer's bos token.
        all_start_with_bos = (labels[:, 0] == tokenizer.bos_token_id).all().cpu().item()
        if all_start_with_bos:
            labels = labels[:, 1:]

        batch["labels"] = labels
        return batch

In [16]:
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

In [17]:
metric = evaluate.load("wer")

# evaluate with the 'normalised' WER
do_normalize_eval = True


def compute_metrics(pred):
    """Compute the word error rate (as a percentage) for a prediction object.

    ``pred.label_ids`` is modified in place: every -100 is replaced by the
    tokenizer's pad token id so the labels can be decoded. Predictions and labels
    are decoded with special tokens skipped and ``normalize=do_normalize_eval``.

    Returns:
        ``{"wer": <100 * metric value>}``.
    """
    tokenizer = processor.tokenizer
    predicted_ids = pred.predictions
    reference_ids = pred.label_ids

    # -100 marks ignored positions; swap in the pad token id before decoding
    reference_ids[reference_ids == -100] = tokenizer.pad_token_id

    hypotheses = tokenizer.batch_decode(
        predicted_ids, skip_special_tokens=True, normalize=do_normalize_eval
    )
    references = tokenizer.batch_decode(
        reference_ids, skip_special_tokens=True, normalize=do_normalize_eval
    )

    return {"wer": 100 * metric.compute(predictions=hypotheses, references=references)}

In [18]:
# Start from the previously fine-tuned checkpoint rather than from MODEL_NAME.
model = WhisperForConditionalGeneration.from_pretrained("parambharat/whisper-tiny-south-indic", use_cache=False)
# Clear any forced decoder ids and suppressed tokens stored in the loaded config.
model.config.forced_decoder_ids = None
model.config.suppress_tokens = []
# NOTE(review): use_cache is disabled here as well as in from_pretrained above, presumably
# because training_args enables gradient_checkpointing — confirm.
model.config.use_cache = False

Downloading:   0%|          | 0.00/1.02k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/151M [00:00<?, ?B/s]

In [19]:
# trainer callback to reinitialise and reshuffle the streamable datasets at the beginning of each epoch
class ShuffleCallback(TrainerCallback):
    """Bump the epoch of a streaming training dataset when a new epoch starts."""

    def on_epoch_begin(self, args, state, control, train_dataloader, **kwargs):
        dataset = train_dataloader.dataset
        if isinstance(dataset, IterableDatasetShard):
            return  # set_epoch() is handled by the Trainer
        if isinstance(dataset, IterableDataset):
            # advance the dataset's own epoch counter by one
            dataset.set_epoch(dataset._epoch + 1)
            


In [20]:
def load_samples_dataset(dataset, num_samples=100):
    """Materialise the first ``num_samples`` items of ``dataset`` as a ``Dataset``.

    Args:
        dataset: Any iterable of examples (e.g. a streaming dataset).
        num_samples: Maximum number of examples to take. Values of zero or less
            take nothing; previously they never hit the stop condition and the
            whole (possibly unbounded) stream was consumed.

    Returns:
        A ``Dataset`` built from the collected examples with ``Dataset.from_list``.
    """
    from itertools import islice  # stdlib; imported locally so this cell stands alone

    # islice stops after exactly `num_samples` items without pulling an extra one.
    samples = list(islice(dataset, max(num_samples, 0)))
    return Dataset.from_list(samples)

def compute_spectrograms(example):
    """Return ``{"spectrogram": ...}`` for one example.

    The waveform in ``example["audio"]["array"]`` is run through the feature
    extractor with ``padding="do_not_pad"``; the first entry of the resulting
    ``input_features`` is returned.
    """
    extracted = feature_extractor(
        example["audio"]["array"], sampling_rate=16000, padding="do_not_pad"
    )
    return {"spectrogram": extracted.input_features[0]}


def record_to_html(sample_record):
    """Render one sample as an HTML widget (audio player + spectrogram + waveform).

    Reads ``sample_record["audio"]`` (``array`` and ``sampling_rate``),
    ``sample_record["spectrogram"]`` and, optionally, ``sample_record["length"]``.
    Writes ``sample_record["length"]`` when it was missing or falsy, and stores the
    rendered HTML as a ``StringIO`` under ``sample_record["audio_with_spec"]``.

    Returns:
        The same ``sample_record`` dict, mutated as described above.
    """
    audio_array = np.array(sample_record["audio"]["array"])
    audio_sr = sample_record["audio"]["sampling_rate"]
    # Use the stored duration when available, otherwise derive it from the
    # number of samples and the sampling rate and store it on the record.
    length = sample_record.get("length")
    if length:
        audio_duration = length
    else:
        sample_record["length"] = audio_duration = len(audio_array)/audio_sr
    # Re-reads the value set/confirmed just above (same value in both branches).
    audio_duration = sample_record["length"]
    audio_spectrogram = np.array(sample_record["spectrogram"])

    # Extent handed to hv.Image below.
    # NOTE(review): the last entry is the spectrogram's maximum *value*, not a
    # frequency — confirm this is the intended extent for the "Frequency (hz)" axis.
    bounds = (0,0, audio_duration, audio_spectrogram.max())

    # Scale to 16-bit integers for the audio pane.
    # assumes samples lie in [-1, 1] — TODO confirm (larger values would overflow int16)
    waveform_int = np.int16(audio_array * 32767)

    
    
    hv_audio = pn.pane.Audio(waveform_int, sample_rate=audio_sr, name='Audio', throttle=500)
    
    # Hidden slider used as the shared value linked to the player and both cursors.
    slider = pn.widgets.FloatSlider(end=audio_duration, visible=False, step=0.001)
    line_audio = hv.VLine(0).opts(color='black')
    line_spec = hv.VLine(0).opts(color='red')
    
    
    # Link slider <-> player time (both directions) and slider -> cursor positions.
    slider.jslink(hv_audio, value='time', bidirectional=True)
    slider.jslink(line_audio, value='glyph.location')
    slider.jslink(line_spec, value='glyph.location')
    
    # Waveform plot over a time axis spanning the clip duration, with its cursor overlaid.
    time = np.linspace(0, audio_duration, num=len(audio_array))
    line_plot_hv = hv.Curve(
        (time, audio_array), ["Time (s)", "amplitude"]).opts(
        width=500, height=150, axiswise=True) * line_audio
    
    # Spectrogram image with its cursor overlaid.
    hv_spec_gram = hv.Image(
        audio_spectrogram, bounds=(bounds), kdims=["Time (s)", "Frequency (hz)"]).opts(
        width=500, height=150, labelled=[], axiswise=True, color_levels=512)* line_spec
    
    
    # Lay everything out in one row and save the layout into an in-memory text buffer.
    combined = pn.Row(hv_audio, hv_spec_gram, line_plot_hv, slider)
    audio_html = StringIO()
    combined.save(audio_html)
    # The StringIO object itself (not its string content) is stored on the record.
    sample_record["audio_with_spec"] = audio_html
    return sample_record


def dataset_to_records(dataset):
    """Build a DataFrame with one row per sample of ``dataset``.

    Each row holds the rendered widget wrapped in ``wandb.Html``
    (``audio_with_spec``), the reference ``sentence`` and the clip ``length``.
    """
    rows = []
    for item in dataset:
        rendered = record_to_html(item)
        rows.append(
            {
                "audio_with_spec": wandb.Html(rendered["audio_with_spec"]),
                "sentence": rendered["sentence"],
                "length": rendered["length"],
            }
        )
    return pd.DataFrame(rows)
    
def decode_predictions(trainer, predictions):
    """Decode ``predictions.predictions`` to text with ``trainer.tokenizer``.

    Special tokens are skipped. Returns whatever ``batch_decode`` returns.
    """
    return trainer.tokenizer.batch_decode(predictions.predictions, skip_special_tokens=True)


def compute_measures(predictions, labels):
    """Per-sample jiwer measures for paired predictions and reference labels.

    Returns a DataFrame with the columns ``wer``, ``hits``, ``substitutions``,
    ``deletions`` and ``insertions``, one row per (prediction, label) pair.
    """
    columns = ["wer", "hits", "substitutions", "deletions", "insertions"]
    per_sample = []
    for hypothesis, reference in zip(predictions, labels):
        # jiwer is called with the reference first, then the hypothesis
        per_sample.append(jiwer.compute_measures(reference, hypothesis))
    return pd.DataFrame(per_sample)[columns]

class WandbProgressResultsCallback(WandbCallback):
    """W&B callback that also logs a table of sample predictions and model artifacts.

    On every logging event the callback runs ``predict`` over a fixed sample
    dataset, decodes the predictions, computes per-sample jiwer measures and logs
    them together with the pre-rendered audio widgets as a ``wandb.Table``.
    On every checkpoint save it uploads the current model files as a W&B artifact.

    Args:
        trainer: The trainer used for ``predict`` and ``save_model``.
        sample_dataset: Dataset whose rows are rendered once (in ``__init__``) and
            predicted on each log event.
    """

    def __init__(self, trainer, sample_dataset): 
        super().__init__()
        self.trainer = trainer
        self.sample_dataset = sample_dataset
        # Rendering the widgets is done once here and reused for every table.
        self.records_df = dataset_to_records(sample_dataset)
        
    def on_log(self, args, state, control, model=None, logs=None, **kwargs):
        """Log the regular metrics, then a table of predictions for the sample dataset."""
        super().on_log(args, state, control, model, logs)
        if self._wandb is None:
            # Same guard as on_save: nothing to log to.
            return
        # Use the trainer handed to the constructor rather than a notebook global.
        predictions = self.trainer.predict(self.sample_dataset)
        predictions = decode_predictions(self.trainer, predictions)
        measures_df = compute_measures(predictions, self.records_df["sentence"].tolist())
        records_df = pd.concat([self.records_df, measures_df], axis=1)
        records_df["prediction"] = predictions
        records_df["step"] = state.global_step
        records_table = self._wandb.Table(dataframe=records_df)
        self._wandb.log({"sample_predictions": records_table})
        
    def on_save(self, args, state, control, model=None, tokenizer=None, **kwargs):
        """Upload the current model files as a ``model-<run id>`` W&B artifact."""
        # `numbers` is not imported at the top of the notebook; without this import the
        # summary-metadata branch below raises NameError when load_best_model_at_end is False.
        import numbers

        if self._wandb is None:
            return
        if self._log_model and self._initialized and state.is_world_process_zero:
            with tempfile.TemporaryDirectory() as temp_dir:
                self.trainer.save_model(temp_dir)
                # Metadata: numeric, non-private summary values, or the best metric
                # and total FLOs when the best model is loaded at the end.
                metadata = (
                    {
                        k: v
                        for k, v in dict(self._wandb.summary).items()
                        if isinstance(v, numbers.Number) and not k.startswith("_")
                    }
                    if not args.load_best_model_at_end
                    else {
                        f"eval/{args.metric_for_best_model}": state.best_metric,
                        "train/total_floss": state.total_flos,
                    }
                )
                artifact = self._wandb.Artifact(
                    name=f"model-{self._wandb.run.id}",
                    type="model", metadata=metadata)
                # Copy every regular file of the saved model into the artifact.
                for f in Path(temp_dir).glob("*"):
                    if f.is_file():
                        with artifact.new_file(f.name, mode="wb") as fa:
                            fa.write(f.read_bytes())
                self._wandb.run.log_artifact(artifact)

In [21]:
# Training configuration. With batch size 32 and 2 accumulation steps the run log reports
# a total train batch size of 64.
training_args = Seq2SeqTrainingArguments(
    output_dir="../models/whisper-tiny-south-indic",  # change to a repo name of your choice
    per_device_train_batch_size=32,
    gradient_accumulation_steps=2,  # increase by 2x for every 2x decrease in batch size
    learning_rate=1e-5,
    save_total_limit=4,
    warmup_steps=500,
    max_steps=5000,  # the run log notes that max_steps overrides num_train_epochs
    gradient_checkpointing=True,
    fp16=True,
#     fp16_full_eval=True,
    optim="adamw_bnb_8bit",
    evaluation_strategy="steps",
    per_device_eval_batch_size=16,
    predict_with_generate=True,
    generation_max_length=225,  # same value as the tokenizers' model_max_length
    save_steps=500,
    eval_steps=500,
    logging_steps=250,
    # Built-in reporting is off; W&B logging is done by the callback added with
    # trainer.add_callback in a later cell.
    report_to="none",
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,  # lower WER is better
    hub_strategy="checkpoint",
    push_to_hub=True,
    # NOTE(review): presumably kept False so the columns read by the data collator
    # survive on the streaming dataset — confirm.
    remove_unused_columns=False, 
    # NOTE(review): presumably avoids replaying the stream up to the saved step when
    # resuming from a checkpoint — confirm.
    ignore_data_skip=True
)


In [22]:
# Take the first 100 examples (the default) of the shuffled test stream and add a spectrogram
# column; this subset serves as the Trainer's eval_dataset and as the W&B sample table.
samples_dataset = load_samples_dataset(dataset_dict["test"]).map(compute_spectrograms)

  0%|          | 0/100 [00:00<?, ?ex/s]

In [24]:
# Trainer over the streaming train split. Evaluation runs on the 100-example
# `samples_dataset`, not on the full test stream, so the reported WER covers that subset only.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=dataset_dict["train"],
    eval_dataset=samples_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=processor,  # the processor is passed in the tokenizer slot
    callbacks=[ShuffleCallback()],   
)

/home/mugan/Documents/harvard/whisper-finetuning/notebooks/../models/whisper-tiny-south-indic is already a clone of https://huggingface.co/parambharat/whisper-tiny-south-indic. Make sure you pull the latest changes with `repo.git_pull()`.
max_steps is given, it will override any value given in num_train_epochs
Using cuda_amp half precision backend


In [25]:
progress_callback = WandbProgressResultsCallback(trainer, samples_dataset)
clear_output()

In [26]:
trainer.add_callback(progress_callback)

In [27]:
model.save_pretrained(training_args.output_dir)
processor.save_pretrained(training_args.output_dir)

Configuration saved in ../models/whisper-tiny-south-indic/config.json
Model weights saved in ../models/whisper-tiny-south-indic/pytorch_model.bin
Feature extractor saved in ../models/whisper-tiny-south-indic/preprocessor_config.json
tokenizer config file saved in ../models/whisper-tiny-south-indic/tokenizer_config.json
Special tokens file saved in ../models/whisper-tiny-south-indic/special_tokens_map.json
added tokens file saved in ../models/whisper-tiny-south-indic/added_tokens.json


In [None]:
# Resume from a checkpoint found in training_args.output_dir (the log below shows
# checkpoint-500 being loaded). NOTE(review): presumably the most recent one — confirm.
trainer.train(resume_from_checkpoint=True)

Loading model from ../models/whisper-tiny-south-indic/checkpoint-500.
***** Running training *****
  Num examples = 320000
  Num Epochs = 9223372036854775807
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 2
  Total optimization steps = 5000
  Number of trainable parameters = 37760640
  Continuing training from checkpoint, will skip to saved global_step
  Continuing training from epoch 0
  Continuing training from global step 500


Step,Training Loss,Validation Loss,Wer
1000,0.8741,0.555286,81.127733
1500,0.6063,0.417921,79.631761
2000,0.532,0.378456,71.173763
2500,0.4943,0.361282,71.576525
3000,0.4653,0.351451,70.080552


***** Running Prediction *****
  Num examples = 100
  Batch size = 16
***** Running Prediction *****
  Num examples = 100
  Batch size = 16
***** Running Evaluation *****
  Num examples = 100
  Batch size = 16
***** Running Prediction *****
  Num examples = 100
  Batch size = 16
Saving model checkpoint to ../models/whisper-tiny-south-indic/checkpoint-1000
Configuration saved in ../models/whisper-tiny-south-indic/checkpoint-1000/config.json
Model weights saved in ../models/whisper-tiny-south-indic/checkpoint-1000/pytorch_model.bin
Feature extractor saved in ../models/whisper-tiny-south-indic/checkpoint-1000/preprocessor_config.json
tokenizer config file saved in ../models/whisper-tiny-south-indic/checkpoint-1000/tokenizer_config.json
Special tokens file saved in ../models/whisper-tiny-south-indic/checkpoint-1000/special_tokens_map.json
added tokens file saved in ../models/whisper-tiny-south-indic/checkpoint-1000/added_tokens.json
Feature extractor saved in ../models/whisper-tiny-south-i

Upload file last-checkpoint/optimizer.pt:   0%|          | 32.0k/191M [00:00<?, ?B/s]

Upload file pytorch_model.bin:   0%|          | 32.0k/144M [00:00<?, ?B/s]

remote: Scanning LFS files for validity, may be slow...        
remote: LFS file scan complete.        
To https://huggingface.co/parambharat/whisper-tiny-south-indic
   9d49dd6..63f3a01  main -> main

Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Automatic Speech Recognition', 'type': 'automatic-speech-recognition'}}
To https://huggingface.co/parambharat/whisper-tiny-south-indic
   63f3a01..e6caa0a  main -> main

***** Running Prediction *****
  Num examples = 100
  Batch size = 16
***** Running Prediction *****
  Num examples = 100
  Batch size = 16
***** Running Evaluation *****
  Num examples = 100
  Batch size = 16
***** Running Prediction *****
  Num examples = 100
  Batch size = 16
Saving model checkpoint to ../models/whisper-tiny-south-indic/checkpoint-1500
Configuration saved in ../models/whisper-tiny-south-indic/checkpoint-1500/config.json
Model weights saved in ../models/whisper-tiny-south-indic/checkpoint-1500/pytorch_model.b

Upload file last-checkpoint/optimizer.pt:   0%|          | 32.0k/191M [00:00<?, ?B/s]

Upload file last-checkpoint/scaler.pt: 100%|##########| 557/557 [00:00<?, ?B/s]

Upload file last-checkpoint/rng_state.pth: 100%|##########| 14.3k/14.3k [00:00<?, ?B/s]

Upload file pytorch_model.bin:   0%|          | 32.0k/144M [00:00<?, ?B/s]

Upload file last-checkpoint/scheduler.pt: 100%|##########| 627/627 [00:00<?, ?B/s]

remote: Scanning LFS files for validity, may be slow...        
remote: LFS file scan complete.        
To https://huggingface.co/parambharat/whisper-tiny-south-indic
   e6caa0a..9b0aed9  main -> main

Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Automatic Speech Recognition', 'type': 'automatic-speech-recognition'}}
To https://huggingface.co/parambharat/whisper-tiny-south-indic
   9b0aed9..ed0b9c8  main -> main

***** Running Prediction *****
  Num examples = 100
  Batch size = 16
***** Running Prediction *****
  Num examples = 100
  Batch size = 16
***** Running Evaluation *****
  Num examples = 100
  Batch size = 16
***** Running Prediction *****
  Num examples = 100
  Batch size = 16
Saving model checkpoint to ../models/whisper-tiny-south-indic/checkpoint-2000
Configuration saved in ../models/whisper-tiny-south-indic/checkpoint-2000/config.json
Model weights saved in ../models/whisper-tiny-south-indic/checkpoint-2000/pytorch_model.b

Upload file last-checkpoint/optimizer.pt:   0%|          | 32.0k/191M [00:00<?, ?B/s]

Upload file pytorch_model.bin:   0%|          | 32.0k/144M [00:00<?, ?B/s]

remote: Scanning LFS files for validity, may be slow...        
remote: LFS file scan complete.        
To https://huggingface.co/parambharat/whisper-tiny-south-indic
   ed0b9c8..6beb496  main -> main

Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Automatic Speech Recognition', 'type': 'automatic-speech-recognition'}}
To https://huggingface.co/parambharat/whisper-tiny-south-indic
   6beb496..71634c7  main -> main

***** Running Prediction *****
  Num examples = 100
  Batch size = 16
***** Running Prediction *****
  Num examples = 100
  Batch size = 16
***** Running Evaluation *****
  Num examples = 100
  Batch size = 16
***** Running Prediction *****
  Num examples = 100
  Batch size = 16
Saving model checkpoint to ../models/whisper-tiny-south-indic/checkpoint-2500
Configuration saved in ../models/whisper-tiny-south-indic/checkpoint-2500/config.json
Model weights saved in ../models/whisper-tiny-south-indic/checkpoint-2500/pytorch_model.b

Upload file last-checkpoint/optimizer.pt:   0%|          | 32.0k/191M [00:00<?, ?B/s]

remote: Scanning LFS files for validity, may be slow...        
remote: LFS file scan complete.        
To https://huggingface.co/parambharat/whisper-tiny-south-indic
   71634c7..1df9b55  main -> main

Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Automatic Speech Recognition', 'type': 'automatic-speech-recognition'}}
To https://huggingface.co/parambharat/whisper-tiny-south-indic
   1df9b55..238a561  main -> main

***** Running Prediction *****
  Num examples = 100
  Batch size = 16
***** Running Prediction *****
  Num examples = 100
  Batch size = 16
***** Running Evaluation *****
  Num examples = 100
  Batch size = 16
***** Running Prediction *****
  Num examples = 100
  Batch size = 16
Saving model checkpoint to ../models/whisper-tiny-south-indic/checkpoint-3000
Configuration saved in ../models/whisper-tiny-south-indic/checkpoint-3000/config.json
Model weights saved in ../models/whisper-tiny-south-indic/checkpoint-3000/pytorch_model.b

In [30]:
trainer.save_model()

Saving model checkpoint to ../models/whisper-tiny-south-indic
Configuration saved in ../models/whisper-tiny-south-indic/config.json
Model weights saved in ../models/whisper-tiny-south-indic/pytorch_model.bin
Feature extractor saved in ../models/whisper-tiny-south-indic/preprocessor_config.json
tokenizer config file saved in ../models/whisper-tiny-south-indic/tokenizer_config.json
Special tokens file saved in ../models/whisper-tiny-south-indic/special_tokens_map.json
added tokens file saved in ../models/whisper-tiny-south-indic/added_tokens.json
Saving model checkpoint to ../models/whisper-tiny-south-indic
Configuration saved in ../models/whisper-tiny-south-indic/config.json
Model weights saved in ../models/whisper-tiny-south-indic/pytorch_model.bin
Feature extractor saved in ../models/whisper-tiny-south-indic/preprocessor_config.json
tokenizer config file saved in ../models/whisper-tiny-south-indic/tokenizer_config.json
Special tokens file saved in ../models/whisper-tiny-south-indic/sp

Upload file pytorch_model.bin:   0%|          | 32.0k/144M [00:00<?, ?B/s]

remote: Scanning LFS files for validity, may be slow...        
remote: LFS file scan complete.        
To https://huggingface.co/parambharat/whisper-tiny-south-indic
   9656d37..679c3b4  main -> main

Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Automatic Speech Recognition', 'type': 'automatic-speech-recognition'}}


In [None]:
wandb.log_artifact?

In [None]:
wandb.log_artifact(training_args.output_dir, "whisper-tiny-south-indic", type="model")

In [None]:
wandb.finish()

In [34]:
# Model-card metadata forwarded to trainer.push_to_hub(**kwargs) in the next cell.
kwargs = {
    "language": ["ta", "te", "ml", "kn"],
    "model_name": "Whisper Tiny South Indic - Bharat Ramanathan",  # a 'pretty' name for your model
    "finetuned_from": "openai/whisper-tiny",
    "tasks": "automatic-speech-recognition",
    "tags": "whisper-event",
}


In [35]:
trainer.push_to_hub(**kwargs)

Saving model checkpoint to ../models/whisper-tiny-south-indic
Configuration saved in ../models/whisper-tiny-south-indic/config.json
Model weights saved in ../models/whisper-tiny-south-indic/pytorch_model.bin
Feature extractor saved in ../models/whisper-tiny-south-indic/preprocessor_config.json
tokenizer config file saved in ../models/whisper-tiny-south-indic/tokenizer_config.json
Special tokens file saved in ../models/whisper-tiny-south-indic/special_tokens_map.json
added tokens file saved in ../models/whisper-tiny-south-indic/added_tokens.json
Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Automatic Speech Recognition', 'type': 'automatic-speech-recognition'}}
To https://huggingface.co/parambharat/whisper-tiny-south-indic
   679c3b4..60822d9  main -> main



In [36]:
wandb.finish()

VBox(children=(Label(value='1070.525 MB of 1070.525 MB uploaded (347.522 MB deduped)\r'), FloatProgress(value=…

0,1
eval/loss,█▃▂▁▁
eval/runtime,▁▂▆▇█
eval/samples_per_second,█▇▃▂▁
eval/steps_per_second,██▃▁▁
eval/wer,█▇▂▂▁
train/epoch,▁▂▂▂▃▃▃▄▄▆▆▆▇▇▇█
train/global_step,▁▁▂▂▂▂▂▂▃▃▃▃▄▄▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇███
train/learning_rate,█▇▇▆▅▄▄▃▂▂▁
train/loss,█▅▃▃▂▂▂▁▁▁▁

0,1
eval/loss,0.35145
eval/runtime,66.8197
eval/samples_per_second,1.497
eval/steps_per_second,0.105
eval/wer,70.08055
train/epoch,6.05
train/global_step,3250.0
train/learning_rate,0.0
train/loss,0.4514
