## Fine Tune Testing

In [None]:
import torch
# Hub identifiers of the base checkpoint and of the fine-tuning corpus.
mdl = 'openai/whisper-large-v2'
dts = 'Jzuluaga/atcosim_corpus'

# Output directory: "./<model>-<dataset>", keeping only the part after the last "/".
opd = ''.join(['./', mdl.split('/')[-1], '-', dts.split('/')[-1]])

# Use the GPU when CUDA is available, otherwise the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Echo the run configuration.
for template, value in (
    ('Training Model : {}', mdl),
    ('On Dataset     : {}', dts),
    ('Output Dir.    : {}', opd),
    ('Device         : {}', device),
):
    print(template.format(value))

Training Model : openai/whisper-large-v2
On Dataset     : Jzuluaga/atcosim_corpus
Output Dir.    : ./whisper-large-v2-atcosim_corpus
Device         : cuda


### Initializing Hugging Face

In [None]:
import os

# Describe a single-process "distributed" setup: rank 0 in a world of size 1.
# NOTE(review): presumably read by the DeepSpeed / torch.distributed initialisation
# that the trainer performs later - confirm.
os.environ.update({
    "MASTER_ADDR": "localhost",
    "MASTER_PORT": "9994",  # modify if RuntimeError: Address already in use
    "RANK": "0",
    "LOCAL_RANK": "0",
    "WORLD_SIZE": "1",
})

In [None]:
from huggingface_hub import notebook_login
# Show the interactive Hugging Face login widget. The training arguments further
# down set push_to_hub=True, so a logged-in session is needed for the upload.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

### Load Dataset

In [None]:
from datasets import load_dataset, DatasetDict

# Load the "train" and "test" splits of the corpus and keep them together in
# one DatasetDict (insertion order: train, then test).
dataset = DatasetDict({
    split_name: load_dataset(dts, split=split_name)
    for split_name in ("train", "test")
})
print(dataset)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


DatasetDict({
    train: Dataset({
        features: ['id', 'audio', 'text', 'segment_start_time', 'segment_end_time', 'duration'],
        num_rows: 7638
    })
    test: Dataset({
        features: ['id', 'audio', 'text', 'segment_start_time', 'segment_end_time', 'duration'],
        num_rows: 1901
    })
})


### Import pretrained Whisper models

In [None]:
from datasets import Audio
from transformers import WhisperFeatureExtractor, WhisperProcessor, WhisperTokenizer

# Feature extractor that turns raw audio into the model's input features.
feature_extractor = WhisperFeatureExtractor.from_pretrained(mdl)

# Tokenizer and combined processor, both configured for English transcription.
tokenizer = WhisperTokenizer.from_pretrained(mdl, language="English", task="transcribe")
processor = WhisperProcessor.from_pretrained(mdl, language="English", task="transcribe")

# Decode the audio column at 16 kHz; other sampling rates are incompatible
# with the feature extractor.
dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))

# Display the (still unprocessed) splits.
dataset

loading configuration file preprocessor_config.json from cache at /root/.cache/huggingface/hub/models--openai--whisper-large-v2/snapshots/ae4642769ce2ad8fc292556ccea8e901f1530655/preprocessor_config.json
Feature extractor WhisperFeatureExtractor {
  "chunk_length": 30,
  "feature_extractor_type": "WhisperFeatureExtractor",
  "feature_size": 80,
  "hop_length": 160,
  "n_fft": 400,
  "n_samples": 480000,
  "nb_max_frames": 3000,
  "padding_side": "right",
  "padding_value": 0.0,
  "processor_class": "WhisperProcessor",
  "return_attention_mask": false,
  "sampling_rate": 16000
}

loading file vocab.json from cache at /root/.cache/huggingface/hub/models--openai--whisper-large-v2/snapshots/ae4642769ce2ad8fc292556ccea8e901f1530655/vocab.json
loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--openai--whisper-large-v2/snapshots/ae4642769ce2ad8fc292556ccea8e901f1530655/tokenizer.json
loading file merges.txt from cache at /root/.cache/huggingface/hub/models--openai-

DatasetDict({
    train: Dataset({
        features: ['id', 'audio', 'text', 'segment_start_time', 'segment_end_time', 'duration'],
        num_rows: 7638
    })
    test: Dataset({
        features: ['id', 'audio', 'text', 'segment_start_time', 'segment_end_time', 'duration'],
        num_rows: 1901
    })
})

In [None]:
def prepare_dataset(batch):
    """Turn one raw example into model inputs and target token ids.

    Reads ``batch["audio"]`` (its ``"array"`` and ``"sampling_rate"`` entries) and
    ``batch["text"]``; adds ``batch["input_features"]`` and ``batch["labels"]``
    and returns the same mapping.
    """
    # Accessing the audio entry yields the decoded waveform and its sampling rate.
    waveform = batch["audio"]

    # Input features for this single example (first item of the extractor output).
    extracted = feature_extractor(waveform["array"], sampling_rate=waveform["sampling_rate"])
    batch["input_features"] = extracted.input_features[0]

    # Target transcript encoded as token ids.
    encoded_text = tokenizer(batch["text"])
    batch["labels"] = encoded_text.input_ids
    return batch

# Smoke-test subset: keep only the first N_TRAIN_SMOKE training examples and the
# first N_TEST_SMOKE test examples so the whole pipeline runs quickly.
N_TRAIN_SMOKE = 40
N_TEST_SMOKE = 10

# Clamp to the split size so a smaller dataset does not raise an index error.
dataset["train"] = dataset["train"].select(range(min(N_TRAIN_SMOKE, len(dataset["train"]))))
dataset["test"] = dataset["test"].select(range(min(N_TEST_SMOKE, len(dataset["test"]))))

# num_proc has to be one, because os.fork is incompatible with multithreaded code
dataset = dataset.map(prepare_dataset, remove_columns=dataset.column_names["train"], num_proc=1)

# dataset should now contain 'input_features' and 'labels'
dataset

Map:   0%|          | 0/40 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_features', 'labels'],
        num_rows: 40
    })
    test: Dataset({
        features: ['input_features', 'labels'],
        num_rows: 10
    })
})

In [None]:
import torch

from dataclasses import dataclass
from typing import Any, Dict, List, Union

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Collate pre-processed speech examples into one padded batch.

    Every incoming feature is a mapping with an ``"input_features"`` entry and a
    ``"labels"`` entry (token ids). Inputs and labels are padded separately,
    padded label positions are replaced by -100, and a leading start token shared
    by all label rows is removed.

    The original check compared the first label token with the tokenizer's
    ``bos_token_id`` only. For this checkpoint the model config reports
    ``bos_token_id`` 50257 but ``decoder_start_token_id`` 50258, so the start
    token was never removed. The decoder-start id is now checked as well.
    """

    # Object exposing ``feature_extractor.pad`` and ``tokenizer.pad``.
    processor: Any
    # Id of the decoder start token. When None, the id of "<|startoftranscript|>"
    # is looked up in the tokenizer, so existing ``processor=``-only calls keep working.
    decoder_start_token_id: Union[int, None] = None

    def _leading_ids_to_strip(self):
        """Return the token ids that must not remain as the first label column."""
        tokenizer = self.processor.tokenizer
        candidates = [getattr(tokenizer, "bos_token_id", None)]

        start_id = self.decoder_start_token_id
        if start_id is None:
            lookup = getattr(tokenizer, "convert_tokens_to_ids", None)
            if callable(lookup):
                resolved = lookup("<|startoftranscript|>")
                # A tokenizer without that token answers with its unknown-token id;
                # ignore that answer instead of stripping on it.
                if resolved != getattr(tokenizer, "unk_token_id", None):
                    start_id = resolved
        candidates.append(start_id)

        # De-duplicate while keeping order; drop anything that is not a usable id.
        return [token_id for token_id in dict.fromkeys(candidates) if isinstance(token_id, int)]

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Pad ``features`` and return a batch with ``input_features`` and ``labels``."""
        # Inputs and labels have different lengths and need different padding,
        # so they are handled separately. Audio inputs first.
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        # Pad the tokenized label sequences to the longest one in the batch.
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # Replace padding with -100 so those positions are ignored by the loss.
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        # If every row starts with the same start token, cut it here because it
        # is added again later.
        first_column = labels[:, 0]
        if any((first_column == token_id).all().cpu().item() for token_id in self._leading_ids_to_strip()):
            labels = labels[:, 1:]

        batch["labels"] = labels

        return batch

# Collator instance later passed to the trainer; it pads with the processor's
# feature extractor and tokenizer.
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

In [None]:
import evaluate

# Word-error-rate metric used for evaluation.
metric = evaluate.load("wer")


def compute_metrics(pred):
    """Return ``{"wer": <percentage>}`` for one evaluation pass.

    ``pred.predictions`` holds the generated token ids and ``pred.label_ids``
    the reference token ids. Positions equal to -100 in the references are
    overwritten in place with the tokenizer's pad token id before decoding.
    """
    predicted_ids, reference_ids = pred.predictions, pred.label_ids

    # -100 marks ignored label positions; restore the pad id so they can be decoded.
    reference_ids[reference_ids == -100] = tokenizer.pad_token_id

    # Decode hypotheses first, then references, dropping special tokens.
    hypotheses, references = (
        tokenizer.batch_decode(token_ids, skip_special_tokens=True)
        for token_ids in (predicted_ids, reference_ids)
    )

    # The metric returns a fraction; report it as a percentage.
    return {"wer": 100 * metric.compute(predictions=hypotheses, references=references)}

In [None]:
import json

# DeepSpeed configuration, assembled section by section.
# NOTE(review): the "auto" placeholders are presumably filled in from the training
# arguments by the Trainer's DeepSpeed integration - confirm.

# Mixed-precision (fp16) loss-scaling settings.
fp16_settings = {
    "enabled": "auto",
    "loss_scale": 0,
    "loss_scale_window": 1000,
    "initial_scale_power": 16,
    "hysteresis": 2,
    "min_loss_scale": 1,
}

# AdamW optimizer.
optimizer_settings = {
    "type": "AdamW",
    "params": {
        "lr": "auto",
        "betas": "auto",
        "eps": "auto",
        "weight_decay": "auto",
    },
}

# Warm-up followed by decay of the learning rate.
scheduler_settings = {
    "type": "WarmupDecayLR",
    "params": {
        "last_batch_iteration": -1,
        "total_num_steps": "auto",
        "warmup_min_lr": "auto",
        "warmup_max_lr": "auto",
        "warmup_num_steps": "auto",
    },
}

# ZeRO stage 2 with the optimizer offloaded to (pinned) CPU memory.
zero_settings = {
    "stage": 2,
    "offload_optimizer": {
        "device": "cpu",
        "pin_memory": True,
    },
    "allgather_partitions": True,
    "allgather_bucket_size": 2e8,
    "overlap_comm": True,
    "reduce_scatter": True,
    "reduce_bucket_size": 2e8,
    "contiguous_gradients": True,
}

deepspeed_config = {
    "fp16": fp16_settings,
    "optimizer": optimizer_settings,
    "scheduler": scheduler_settings,
    "zero_optimization": zero_settings,
    "gradient_accumulation_steps": "auto",
    "gradient_clipping": "auto",
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",
}

# Write the configuration to disk; the training arguments reference it by path.
deepspeed_config_path = 'deepspeed_config.json'
with open(deepspeed_config_path, 'w') as config_file:
    json.dump(deepspeed_config, config_file)

In [None]:
from transformers import Seq2SeqTrainingArguments

# Training arguments for the smoke-test run, grouped by purpose.
# NOTE(review): the recorded run of this configuration reports 40 examples and
# 5 total optimization steps, so warmup_steps=250, eval_steps=250 and
# save_steps=1000 are never reached (no evaluation row appears in the log).
# Consider scaling them down for this test - values are left unchanged here.
training_args = Seq2SeqTrainingArguments(
    # Where results go.
    output_dir=opd,
    push_to_hub=True,
    report_to=["tensorboard"],
    logging_steps=25,
    # Optimisation schedule.
    num_train_epochs=5,
    max_steps=-1,
    per_device_train_batch_size=500,
    # Effective batch = per-device batch * accumulation steps * number of GPUs.
    gradient_accumulation_steps=1,
    learning_rate=1e-5,
    warmup_steps=250,
    # Memory and precision.
    gradient_checkpointing=True,
    fp16=True,
    deepspeed=deepspeed_config_path,
    # Evaluation (generation-based, scored with WER).
    evaluation_strategy="steps",
    eval_steps=250,
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    generation_max_length=225,
    # Checkpointing; the best model is the one with the lowest WER.
    save_steps=1000,
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
)

Found safetensors installation, but --save_safetensors=False. Safetensors should be a preferred weights saving format due to security and performance reasons. If your model cannot be saved by safetensors please feel free to open an issue at https://github.com/huggingface/safetensors!
PyTorch: setting up devices


In [None]:
from transformers import WhisperForConditionalGeneration

# Load the pretrained checkpoint.
model = WhisperForConditionalGeneration.from_pretrained(mdl)

# Clear the checkpoint's forced decoder ids and its token-suppression list.
for option, cleared_value in (("forced_decoder_ids", None), ("suppress_tokens", [])):
    setattr(model.config, option, cleared_value)

# Move the weights to the selected device; as the cell's last expression this
# also displays the module tree.
model.to(device)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--openai--whisper-large-v2/snapshots/ae4642769ce2ad8fc292556ccea8e901f1530655/config.json
Model config WhisperConfig {
  "_name_or_path": "openai/whisper-large-v2",
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "apply_spec_augment": false,
  "architectures": [
    "WhisperForConditionalGeneration"
  ],
  "attention_dropout": 0.0,
  "begin_suppress_tokens": [
    220,
    50257
  ],
  "bos_token_id": 50257,
  "classifier_proj_size": 256,
  "d_model": 1280,
  "decoder_attention_heads": 20,
  "decoder_ffn_dim": 5120,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 32,
  "decoder_start_token_id": 50258,
  "dropout": 0.0,
  "encoder_attention_heads": 20,
  "encoder_ffn_dim": 5120,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 32,
  "eos_token_id": 50257,
  "forced_decoder_ids": [
    [
      1,
      50259
    ],
    [
      2,
      50359
    ],
    [
      3,
      50363
    ]
  ],


WhisperForConditionalGeneration(
  (model): WhisperModel(
    (encoder): WhisperEncoder(
      (conv1): Conv1d(80, 1280, kernel_size=(3,), stride=(1,), padding=(1,))
      (conv2): Conv1d(1280, 1280, kernel_size=(3,), stride=(2,), padding=(1,))
      (embed_positions): Embedding(1500, 1280)
      (layers): ModuleList(
        (0-31): 32 x WhisperEncoderLayer(
          (self_attn): WhisperAttention(
            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)
            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)
            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)
            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1280, out_features=5120, bias=True)
          (fc2): Linear(in_features=5120, out_features=1280, bias=Tru

In [None]:
from transformers import Seq2SeqTrainer

# Wire model, data, collation and metric together.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    # Only the feature extractor is registered here; the recorded log shows it
    # being saved next to each checkpoint.
    tokenizer=processor.feature_extractor,
)

# Release cached, currently unused CUDA memory before training starts.
torch.cuda.empty_cache()

For more details, please read https://huggingface.co/docs/huggingface_hub/concepts/git_vs_http.
/content/./whisper-large-v2-atcosim_corpus is already a clone of https://huggingface.co/daisyyedda/whisper-large-v2-atcosim_corpus. Make sure you pull the latest changes with `repo.git_pull()`.


In [None]:
import transformers

# Verbose library logging, then train and save the final model locally.
transformers.logging.set_verbosity_info()
trainer.train()
trainer.save_model(opd)

# Model-card metadata for the Hub upload.
kwargs = {
    "dataset_tags": dts.split('/')[-1],
    "dataset": "ATCOSIM - CORPUS",
    "dataset_args": "config: en, split: train",
    "language": "en",
    "model_name": "Whisper Large v2 - ATCOSIM - CORPUS",
    "finetuned_from": mdl,
    "tasks": "automatic-speech-recognition",
    "tags": "hf-asr-leaderboard",
}

# Pass the metadata along; previously `kwargs` was built but never used, so the
# upload went out without it.
trainer.push_to_hub(**kwargs)

[2024-06-02 01:21:12,314] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed info: version=0.14.2, git-hash=unknown, git-branch=unknown
[2024-06-02 01:21:12,446] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False
[2024-06-02 01:21:14,125] [INFO] [logging.py:96:log_dist] [Rank 0] Using DeepSpeed Optimizer param name adamw as basic optimizer
[2024-06-02 01:21:14,127] [INFO] [logging.py:96:log_dist] [Rank 0] Removing param_group that has no 'params' in the basic Optimizer
[2024-06-02 01:21:14,374] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Basic Optimizer = DeepSpeedCPUAdam
[2024-06-02 01:21:14,375] [INFO] [utils.py:56:is_zero_supported_optimizer] Checking ZeRO support for optimizer=DeepSpeedCPUAdam type=<class 'deepspeed.ops.adam.cpu_adam.DeepSpeedCPUAdam'>
[2024-06-02 01:21:14,376] [INFO] [logging.py:96:log_dist] [Rank 0] Creating torch.float16 ZeRO stage 2 optimizer
[2024-06-02 01:21:14,378] [INFO] [stage_1_and_2.py:148:__init__] Reduce bucket size

***** Running training *****
  Num examples = 40
  Num Epochs = 5
  Instantaneous batch size per device = 500
  Total train batch size (w. parallel, distributed & accumulation) = 500
  Gradient Accumulation steps = 1
  Total optimization steps = 5
  Number of trainable parameters = 1,543,304,960


[2024-06-02 01:21:43,427] [INFO] [loss_scaler.py:190:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, but hysteresis is 2. Reducing hysteresis to 1


Step,Training Loss,Validation Loss


[2024-06-02 01:21:54,617] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768
[2024-06-02 01:22:20,195] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 32768, reducing to 16384




Training completed. Do not forget to share your model on huggingface.co/models =)


Saving model checkpoint to ./whisper-large-v2-atcosim_corpus
Configuration saved in ./whisper-large-v2-atcosim_corpus/config.json
Configuration saved in ./whisper-large-v2-atcosim_corpus/generation_config.json
Model weights saved in ./whisper-large-v2-atcosim_corpus/pytorch_model.bin
Feature extractor saved in ./whisper-large-v2-atcosim_corpus/preprocessor_config.json
Saving model checkpoint to ./whisper-large-v2-atcosim_corpus
Configuration saved in ./whisper-large-v2-atcosim_corpus/config.json
Configuration saved in ./whisper-large-v2-atcosim_corpus/generation_config.json
Model weights saved in ./whisper-large-v2-atcosim_corpus/pytorch_model.bin
Feature extractor saved in ./whisper-large-v2-atcosim_corpus/preprocessor_config.json


Upload file pytorch_model.bin:   0%|          | 1.00/3.00G [00:00<?, ?B/s]

Upload file runs/Jun01_23-26-02_e4b5d0195e08/events.out.tfevents.1717284454.e4b5d0195e08.17891.0:   0%|       …

Upload file runs/Jun02_00-33-35_e4b5d0195e08/events.out.tfevents.1717288587.e4b5d0195e08.17891.1:   0%|       …

Upload file runs/Jun02_01-17-16_e4b5d0195e08/events.out.tfevents.1717291168.e4b5d0195e08.50161.0:   0%|       …

Upload file runs/Jun02_01-08-47_e4b5d0195e08/events.out.tfevents.1717290582.e4b5d0195e08.41917.1:   0%|       …

Upload file runs/Jun02_00-45-33_e4b5d0195e08/events.out.tfevents.1717289225.e4b5d0195e08.41917.0:   0%|       …

Upload file training_args.bin:   0%|          | 1.00/5.99k [00:00<?, ?B/s]

Upload file runs/Jun02_01-20-44_e4b5d0195e08/events.out.tfevents.1717291292.e4b5d0195e08.50161.1:   0%|       …

To https://huggingface.co/daisyyedda/whisper-large-v2-atcosim_corpus
   b8f653b..8c2339f  main -> main

   b8f653b..8c2339f  main -> main

Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Automatic Speech Recognition', 'type': 'automatic-speech-recognition'}}
To https://huggingface.co/daisyyedda/whisper-large-v2-atcosim_corpus
   8c2339f..aa300d3  main -> main

   8c2339f..aa300d3  main -> main

Saving model checkpoint to ./whisper-large-v2-atcosim_corpus
Configuration saved in ./whisper-large-v2-atcosim_corpus/config.json
Configuration saved in ./whisper-large-v2-atcosim_corpus/generation_config.json
Model weights saved in ./whisper-large-v2-atcosim_corpus/pytorch_model.bin
Feature extractor saved in ./whisper-large-v2-atcosim_corpus/preprocessor_config.json
Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Automatic Speech Recognition', 'type': 'automatic-speech-recognition'}}


In [None]:
# Hub identifiers of the base checkpoint and of the fine-tuning corpus.
mdl = 'openai/whisper-large-v2'
dts = 'Jzuluaga/atcosim_corpus'

# Output directory: "./<model>-<dataset>", keeping only the part after the last "/".
opd = ''.join(['./', mdl.split('/')[-1], '-', dts.split('/')[-1]])

# Echo the run configuration.
for template, value in (
    ('Training Model : {}', mdl),
    ('On Dataset     : {}', dts),
    ('Output Dir.    : {}', opd),
):
    print(template.format(value))

Training Model : openai/whisper-large-v2
On Dataset     : Jzuluaga/atcosim_corpus
Output Dir.    : ./whisper-large-v2-atcosim_corpus


### Initializing Hugging Face

In [None]:
import os

# Describe a single-process "distributed" setup: rank 0 in a world of size 1.
# NOTE(review): presumably read by the DeepSpeed / torch.distributed initialisation
# that the trainer performs later - confirm.
os.environ.update({
    "MASTER_ADDR": "localhost",
    "MASTER_PORT": "9994",  # modify if RuntimeError: Address already in use
    "RANK": "0",
    "LOCAL_RANK": "0",
    "WORLD_SIZE": "1",
})

In [None]:
from huggingface_hub import notebook_login
# Show the interactive Hugging Face login widget. The training arguments further
# down set push_to_hub=True, so a logged-in session is needed for the upload.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

### Load Dataset

In [None]:
!pip install datasets
from datasets import load_dataset, DatasetDict

# Load the "train" and "test" splits of the corpus and keep them together in
# one DatasetDict (insertion order: train, then test).
dataset = DatasetDict({
    split_name: load_dataset(dts, split=split_name)
    for split_name in ("train", "test")
})
print(dataset)



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/6.03k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/488M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/468M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/495M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/474M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/470M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7638 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1901 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'audio', 'text', 'segment_start_time', 'segment_end_time', 'duration'],
        num_rows: 7638
    })
    test: Dataset({
        features: ['id', 'audio', 'text', 'segment_start_time', 'segment_end_time', 'duration'],
        num_rows: 1901
    })
})


### Import pretrained Whisper models

In [None]:
from datasets import Audio
from transformers import WhisperFeatureExtractor, WhisperProcessor, WhisperTokenizer

# Feature extractor that turns raw audio into the model's input features.
feature_extractor = WhisperFeatureExtractor.from_pretrained(mdl)

# Tokenizer and combined processor, both configured for English transcription.
tokenizer = WhisperTokenizer.from_pretrained(mdl, language="English", task="transcribe")
processor = WhisperProcessor.from_pretrained(mdl, language="English", task="transcribe")

# Decode the audio column at 16 kHz; other sampling rates are incompatible
# with the feature extractor.
dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))

# Display the (still unprocessed) splits.
dataset

preprocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/283k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/836k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.48M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.19k [00:00<?, ?B/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'audio', 'text', 'segment_start_time', 'segment_end_time', 'duration'],
        num_rows: 7638
    })
    test: Dataset({
        features: ['id', 'audio', 'text', 'segment_start_time', 'segment_end_time', 'duration'],
        num_rows: 1901
    })
})

In [None]:
def prepare_dataset(batch):
    """Turn one raw example into model inputs and target token ids.

    Reads ``batch["audio"]`` (its ``"array"`` and ``"sampling_rate"`` entries) and
    ``batch["text"]``; adds ``batch["input_features"]`` and ``batch["labels"]``
    and returns the same mapping.
    """
    # Accessing the audio entry yields the decoded waveform and its sampling rate.
    waveform = batch["audio"]

    # Input features for this single example (first item of the extractor output).
    extracted = feature_extractor(waveform["array"], sampling_rate=waveform["sampling_rate"])
    batch["input_features"] = extracted.input_features[0]

    # Target transcript encoded as token ids.
    encoded_text = tokenizer(batch["text"])
    batch["labels"] = encoded_text.input_ids
    return batch

# Columns of the raw dataset; all of them are dropped once the features exist.
raw_columns = dataset.column_names["train"]

# num_proc has to be one, because os.fork is incompatible with multithreaded code
dataset = dataset.map(prepare_dataset, remove_columns=raw_columns, num_proc=1)

# dataset should now contain 'input_features' and 'labels'
dataset

Map:   0%|          | 0/7638 [00:00<?, ? examples/s]

Map:   0%|          | 0/1901 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_features', 'labels'],
        num_rows: 7638
    })
    test: Dataset({
        features: ['input_features', 'labels'],
        num_rows: 1901
    })
})

In [None]:
import torch

from dataclasses import dataclass
from typing import Any, Dict, List, Union

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Collate pre-processed speech examples into one padded batch.

    Every incoming feature is a mapping with an ``"input_features"`` entry and a
    ``"labels"`` entry (token ids). Inputs and labels are padded separately,
    padded label positions are replaced by -100, and a leading start token shared
    by all label rows is removed.

    The original check compared the first label token with the tokenizer's
    ``bos_token_id`` only. The decoder-start token id (given explicitly or looked
    up as "<|startoftranscript|>") is now checked as well, so that token does not
    stay in the labels.
    NOTE(review): assumes the tokenizer's bos id differs from the decoder start
    id for this checkpoint - confirm against the model config.
    """

    # Object exposing ``feature_extractor.pad`` and ``tokenizer.pad``.
    processor: Any
    # Id of the decoder start token. When None, the id of "<|startoftranscript|>"
    # is looked up in the tokenizer, so existing ``processor=``-only calls keep working.
    decoder_start_token_id: Union[int, None] = None

    def _leading_ids_to_strip(self):
        """Return the token ids that must not remain as the first label column."""
        tokenizer = self.processor.tokenizer
        candidates = [getattr(tokenizer, "bos_token_id", None)]

        start_id = self.decoder_start_token_id
        if start_id is None:
            lookup = getattr(tokenizer, "convert_tokens_to_ids", None)
            if callable(lookup):
                resolved = lookup("<|startoftranscript|>")
                # A tokenizer without that token answers with its unknown-token id;
                # ignore that answer instead of stripping on it.
                if resolved != getattr(tokenizer, "unk_token_id", None):
                    start_id = resolved
        candidates.append(start_id)

        # De-duplicate while keeping order; drop anything that is not a usable id.
        return [token_id for token_id in dict.fromkeys(candidates) if isinstance(token_id, int)]

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Pad ``features`` and return a batch with ``input_features`` and ``labels``."""
        # Inputs and labels have different lengths and need different padding,
        # so they are handled separately. Audio inputs first.
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        # Pad the tokenized label sequences to the longest one in the batch.
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # Replace padding with -100 so those positions are ignored by the loss.
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        # If every row starts with the same start token, cut it here because it
        # is added again later.
        first_column = labels[:, 0]
        if any((first_column == token_id).all().cpu().item() for token_id in self._leading_ids_to_strip()):
            labels = labels[:, 1:]

        batch["labels"] = labels

        return batch

# Collator instance later passed to the trainer; it pads with the processor's
# feature extractor and tokenizer.
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

In [None]:
!pip install jiwer
!pip install evaluate
import evaluate

# Word-error-rate metric used for evaluation.
metric = evaluate.load("wer")


def compute_metrics(pred):
    """Return ``{"wer": <percentage>}`` for one evaluation pass.

    ``pred.predictions`` holds the generated token ids and ``pred.label_ids``
    the reference token ids. Positions equal to -100 in the references are
    overwritten in place with the tokenizer's pad token id before decoding.
    """
    predicted_ids, reference_ids = pred.predictions, pred.label_ids

    # -100 marks ignored label positions; restore the pad id so they can be decoded.
    reference_ids[reference_ids == -100] = tokenizer.pad_token_id

    # Decode hypotheses first, then references, dropping special tokens.
    hypotheses, references = (
        tokenizer.batch_decode(token_ids, skip_special_tokens=True)
        for token_ids in (predicted_ids, reference_ids)
    )

    # The metric returns a fraction; report it as a percentage.
    return {"wer": 100 * metric.compute(predictions=hypotheses, references=references)}



In [None]:
from google.colab import files
# Ask for a file upload through the Colab widget; the recorded run uploaded
# ds_config.json, which the following cells read and pass to the trainer.
uploaded = files.upload()

Saving ds_config.json to ds_config.json


In [None]:
import json

# Name of the uploaded DeepSpeed configuration (assumed to be ds_config.json).
file_name = 'ds_config.json'

# Parse the file into ds_config; the training arguments below are given the
# path (file_name), not the parsed dictionary.
with open(file_name, 'r') as config_file:
    ds_config = json.load(config_file)

In [None]:
!pip install transformers==4.31.0 accelerate==0.30.1
!pip3 install deepspeed

from transformers import Seq2SeqTrainingArguments

# Training arguments for the full run, grouped by purpose.
training_args = Seq2SeqTrainingArguments(
    # Where results go. With push_to_hub=True the output directory is also the
    # local clone of the Hub repository, so pick a repo name of your choice.
    output_dir=opd,
    push_to_hub=True,
    report_to=["tensorboard"],
    logging_steps=25,
    # Optimisation schedule (step-based).
    max_steps=12644,
    per_device_train_batch_size=16,
    # Effective batch = per-device batch * accumulation steps * number of GPUs;
    # raise the accumulation steps if a larger effective batch (e.g. 64) is wanted.
    gradient_accumulation_steps=1,
    learning_rate=1e-5,
    warmup_steps=250,
    # Memory and precision.
    gradient_checkpointing=True,
    fp16=True,
    deepspeed=file_name,
    # Evaluation (generation-based, scored with WER).
    evaluation_strategy="steps",
    eval_steps=250,
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    generation_max_length=225,
    # Checkpointing; the best model is the one with the lowest WER.
    save_steps=1000,
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
)



In [None]:
from transformers import WhisperForConditionalGeneration

# Load the pretrained checkpoint.
model = WhisperForConditionalGeneration.from_pretrained(mdl)

# Clear the checkpoint's forced decoder ids and its token-suppression list.
for option, cleared_value in (("forced_decoder_ids", None), ("suppress_tokens", [])):
    setattr(model.config, option, cleared_value)



generation_config.json:   0%|          | 0.00/4.29k [00:00<?, ?B/s]

In [None]:
from transformers import Seq2SeqTrainer

# Wire model, data, collation and metric together.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    # Only the feature extractor is registered with the trainer here.
    tokenizer=processor.feature_extractor,
)

# Release cached, currently unused CUDA memory before training starts.
torch.cuda.empty_cache()

For more details, please read https://huggingface.co/docs/huggingface_hub/concepts/git_vs_http.
Cloning https://huggingface.co/daisyyedda/whisper-large-v2-atcosim_corpus into local empty directory.


In [None]:
import transformers

# NOTE(review): the saved output of this cell is
# "NameError: name 'DummyOptim' is not defined". Presumably this comes from
# installing/re-pinning transformers, accelerate and deepspeed in an earlier
# cell after those libraries were already imported in the running kernel;
# installing everything first and restarting the runtime is the likely remedy -
# TODO confirm.

# Verbose library logging, then train and save the final model locally.
transformers.logging.set_verbosity_info()
trainer.train()
trainer.save_model(opd)

# Model-card metadata for the Hub upload.
kwargs = {
    "dataset_tags": dts.split('/')[-1],
    "dataset": "ATCOSIM - CORPUS",
    "dataset_args": "config: en, split: train",
    "language": "en",
    "model_name": "Whisper Large v2 - ATCOSIM - CORPUS",
    "finetuned_from": mdl,
    "tasks": "automatic-speech-recognition",
    "tags": "hf-asr-leaderboard",
}

# Pass the metadata along; previously `kwargs` was built but never used, so the
# upload went out without it.
trainer.push_to_hub(**kwargs)

NameError: name 'DummyOptim' is not defined