In [25]:
import json

from sbb_project import consts
from sbb_project.training.utils import reproducibility

In [26]:
reproducibility(42)

In [27]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [28]:
from datasets import load_dataset, DatasetDict, Audio

sbbdata = DatasetDict()

In [29]:
sbbdata = load_dataset('json',
                       data_files = {
                       'train': str(consts.MANIFEST_DIR.joinpath(consts.MANIFEST_FILE.format('train'))),
                       'test': str(consts.MANIFEST_DIR.joinpath(consts.MANIFEST_FILE.format('test'))),
                       'val': str(consts.MANIFEST_DIR.joinpath(consts.MANIFEST_FILE.format('val')))
                       })

Using custom data configuration default-af5cc293c562f5a7


Downloading and preparing dataset json/default to /home/user/.cache/huggingface/datasets/json/default-af5cc293c562f5a7/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

   

Extracting data files #0:   0%|          | 0/1 [00:00<?, ?obj/s]

Extracting data files #1:   0%|          | 0/1 [00:00<?, ?obj/s]

Extracting data files #2:   0%|          | 0/1 [00:00<?, ?obj/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating val split: 0 examples [00:00, ? examples/s]

Dataset json downloaded and prepared to /home/user/.cache/huggingface/datasets/json/default-af5cc293c562f5a7/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [30]:
sbbdata = sbbdata.rename_column('audio_filepath', 'audio')
sbbdata = sbbdata.rename_column('text', 'sentence')

In [31]:
sbbdata = sbbdata.cast_column('audio', Audio())

In [32]:
sbbdata = sbbdata.remove_columns('duration')

In [33]:
print(sbbdata['train'][0])

{'audio': {'path': '/home/user/code/sbb_asr/data/sbb_exchange/all_samples/audios/9dae9654-d72f-4b0c-9212-f2dc8e58f1ad.wav', 'array': array([ 0.        ,  0.        ,  0.        , ..., -0.00015259,
       -0.00030518, -0.00027466], dtype=float32), 'sampling_rate': 16000}, 'sentence': 'rangierfahrt von eins ins gleis eins drei antworten'}


In [34]:
print(sbbdata)

DatasetDict({
    train: Dataset({
        features: ['audio', 'sentence'],
        num_rows: 270
    })
    test: Dataset({
        features: ['audio', 'sentence'],
        num_rows: 34
    })
    val: Dataset({
        features: ['audio', 'sentence'],
        num_rows: 34
    })
})


In [35]:
#sbbdata.push_to_hub('marccgrau/sbbdata')

In [36]:
from transformers import WhisperFeatureExtractor

feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-small")

In [37]:
from transformers import WhisperTokenizer

tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-small", language="German", task="transcribe")

In [38]:
input_str = sbbdata["train"][0]["sentence"]
labels = tokenizer(input_str).input_ids
decoded_with_special = tokenizer.decode(labels, skip_special_tokens=False)
decoded_str = tokenizer.decode(labels, skip_special_tokens=True)

print(f"Input:                 {input_str}")
print(f"Decoded w/ special:    {decoded_with_special}")
print(f"Decoded w/out special: {decoded_str}")
print(f"Are equal:             {input_str == decoded_str}")


Input:                 rangierfahrt von eins ins gleis eins drei antworten
Decoded w/ special:    <|startoftranscript|><|de|><|transcribe|><|notimestamps|>rangierfahrt von eins ins gleis eins drei antworten<|endoftext|>
Decoded w/out special: rangierfahrt von eins ins gleis eins drei antworten
Are equal:             True


In [39]:
from transformers import WhisperProcessor

processor = WhisperProcessor.from_pretrained("openai/whisper-small", language="German", task="transcribe")

In [40]:
def prepare_dataset(batch):
    # load and resample audio data from 48 to 16kHz
    audio = batch["audio"]

    # compute log-Mel input features from input audio array 
    batch["input_features"] = feature_extractor(audio["array"], sampling_rate=audio["sampling_rate"]).input_features[0]

    # encode target text to label ids 
    batch["labels"] = tokenizer(batch["sentence"]).input_ids
    return batch

In [41]:
sbbdata = sbbdata.map(prepare_dataset, remove_columns=sbbdata.column_names["train"], num_proc=4)


      

#0:   0%|          | 0/68 [00:00<?, ?ex/s]

#3:   0%|          | 0/67 [00:00<?, ?ex/s]

  

#1:   0%|          | 0/68 [00:00<?, ?ex/s]

#2:   0%|          | 0/67 [00:00<?, ?ex/s]

      

#0:   0%|          | 0/9 [00:00<?, ?ex/s]

#1:   0%|          | 0/9 [00:00<?, ?ex/s]

 

#2:   0%|          | 0/8 [00:00<?, ?ex/s]

 

#3:   0%|          | 0/8 [00:00<?, ?ex/s]

     

#1:   0%|          | 0/9 [00:00<?, ?ex/s]

  

#0:   0%|          | 0/9 [00:00<?, ?ex/s]

#3:   0%|          | 0/8 [00:00<?, ?ex/s]

 

#2:   0%|          | 0/8 [00:00<?, ?ex/s]

In [42]:
import torch

from dataclasses import dataclass
from typing import Any, Dict, List, Union

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lengths and need different padding methods
        # first treat the audio inputs by simply returning torch tensors
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        # get the tokenized label sequences
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        # pad the labels to max length
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        # if bos token is appended in previous tokenization step,
        # cut bos token here as it's append later anyways
        if (labels[:, 0] == self.processor.tokenizer.bos_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels

        return batch


In [43]:
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

In [44]:
import evaluate

metric = evaluate.load("wer")

In [45]:
def compute_metrics(pred):
    pred_ids = pred.predictions
    label_ids = pred.label_ids

    # replace -100 with the pad_token_id
    label_ids[label_ids == -100] = tokenizer.pad_token_id

    # we do not want to group tokens when computing the metrics
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    wer = 100 * metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

In [46]:
from transformers import WhisperForConditionalGeneration

model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")

In [47]:
model.config.forced_decoder_ids = None
model.config.suppress_tokens = []

In [48]:
tokenizer.push_to_hub("marccgrau/whisper-small-init")

CommitInfo(commit_url='https://huggingface.co/marccgrau/whisper-small-init/commit/44061fd70b13c9fff009852623a300569f1082da', commit_message='Upload tokenizer', commit_description='', oid='44061fd70b13c9fff009852623a300569f1082da', pr_url=None, pr_revision=None, pr_num=None)

In [23]:
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir="./whisper-small-init", 
    per_device_train_batch_size=8,
    gradient_accumulation_steps=1,  # increase by 2x for every 2x decrease in batch size
    learning_rate=1e-5,
    warmup_steps=100,
    max_steps=500, # overrides number of epochs
    #num_train_epochs=100,
    gradient_checkpointing=True,
    fp16=True,
    evaluation_strategy="steps",
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    generation_max_length=225,
    save_steps=100,
    eval_steps=50,
    logging_steps=25,
    report_to=["wandb"],
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
    push_to_hub=True,
)

In [24]:
from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=sbbdata["train"],
    eval_dataset=sbbdata["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=processor.feature_extractor,
)

/home/user/code/sbb_asr/notebooks/trainings/./whisper-small-init is already a clone of https://huggingface.co/marccgrau/whisper-small-init. Make sure you pull the latest changes with `repo.git_pull()`.
max_steps is given, it will override any value given in num_train_epochs
Using cuda_amp half precision backend


In [25]:
trainer.train()

***** Running training *****
  Num examples = 156
  Num Epochs = 100
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 500
  Number of trainable parameters = 241734912
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mmarccgrau[0m ([33munisg-ds-nlp[0m). Use [1m`wandb login --relogin`[0m to force relogin


`use_cache = True` is incompatible with gradient checkpointing. Setting `use_cache = False`...
`use_cache = True` is incompatible with gradient checkpointing. Setting `use_cache = False`...
`use_cache = True` is incompatible with gradient checkpointing. Setting `use_cache = False`...
`use_cache = True` is incompatible with gradient checkpointing. Setting `use_cache = False`...


Step,Training Loss,Validation Loss,Wer
50,0.8659,0.611944,6.493506
100,0.2183,0.072674,5.194805
150,0.0002,0.016822,0.865801
200,0.0001,0.015863,0.865801
250,0.0,0.015518,0.865801
300,0.0,0.015403,0.865801
350,0.0,0.015239,0.865801
400,0.0,0.015148,0.865801
450,0.0,0.015099,0.865801
500,0.0,0.015087,0.865801


`use_cache = True` is incompatible with gradient checkpointing. Setting `use_cache = False`...
`use_cache = True` is incompatible with gradient checkpointing. Setting `use_cache = False`...
`use_cache = True` is incompatible with gradient checkpointing. Setting `use_cache = False`...
`use_cache = True` is incompatible with gradient checkpointing. Setting `use_cache = False`...
`use_cache = True` is incompatible with gradient checkpointing. Setting `use_cache = False`...
`use_cache = True` is incompatible with gradient checkpointing. Setting `use_cache = False`...
`use_cache = True` is incompatible with gradient checkpointing. Setting `use_cache = False`...
`use_cache = True` is incompatible with gradient checkpointing. Setting `use_cache = False`...
`use_cache = True` is incompatible with gradient checkpointing. Setting `use_cache = False`...
`use_cache = True` is incompatible with gradient checkpointing. Setting `use_cache = False`...
`use_cache = True` is incompatible with gradient c

TrainOutput(global_step=500, training_loss=0.21527875149075407, metrics={'train_runtime': 6853.2434, 'train_samples_per_second': 2.335, 'train_steps_per_second': 0.073, 'total_flos': 4.501932244992e+18, 'train_loss': 0.21527875149075407, 'epoch': 100.0})

In [26]:
kwargs = {
    "dataset_tags": "marccgrau/sbbdata",
    "dataset": "SBB Dataset 29.11.2022",  # a 'pretty' name for the training dataset
    "dataset_args": "config: German, split: train, test, val",
    "language": "ge",
    "model_name": "Whisper Small German SBB",  # a 'pretty' name for your model
    "finetuned_from": "openai/whisper-small",
    "tasks": "automatic-speech-recognition",
    "tags": "sbb-asr",
}

In [27]:
trainer.push_to_hub(**kwargs)

Saving model checkpoint to ./whisper-small-init
Configuration saved in ./whisper-small-init/config.json
Model weights saved in ./whisper-small-init/pytorch_model.bin
Feature extractor saved in ./whisper-small-init/preprocessor_config.json
Several commits (2) will be pushed upstream.
The progress bars may be unreliable.
remote: Scanning LFS files for validity, may be slow...        
remote: LFS file scan complete.        
To https://huggingface.co/marccgrau/whisper-small-init
   13ac9f0..1d694a8  main -> main

To https://huggingface.co/marccgrau/whisper-small-init
   1d694a8..2f85122  main -> main



'https://huggingface.co/marccgrau/whisper-small-init/commit/1d694a88e1c4de4d444384f17435bf9df7a65b9e'