# Reproduce `ZeroDivisionError` in `NotebookProgressBar`

I think the key ingredients to reproducing this error are
- small model
- small batches of data / small sequence length
- frequent evaluation and logging

In [1]:
import json
import os
import time
from pathlib import Path
from typing import Dict, List, Optional, Sequence, Union

import transformers
from datasets import Dataset

## `CharacterTokenizer`

Create a character-level tokenizer based on the [character tokenizer](https://raw.githubusercontent.com/dariush-bahrami/character-tokenizer/master/charactertokenizer/core.py) for BERT (which was itself inspired by the [CANINE](https://arxiv.org/abs/2103.06874) [tokenizer](https://github.com/huggingface/transformers/blob/main/src/transformers/models/canine/tokenization_canine.py)) and on the [T5 tokenizer](https://github.com/huggingface/transformers/blob/main/src/transformers/models/t5/tokenization_t5.py).

In [2]:
class CharacterTokenizer(transformers.tokenization_utils.PreTrainedTokenizer):
    """Character-level tokenizer: every character of the input text is one token.

    Ids 0, 1 and 2 are reserved for `<pad>`, `</s>` and `<unk>`; the supplied characters are
    numbered from 3 in the order given. Encoded sequences have the form `X </s>` (or
    `A </s> B </s>` for pairs).
    """

    def __init__(self, characters: Sequence[str], model_max_length: int, **kwargs):
        """Character tokenizer for Hugging Face transformers.

        Args:
            characters (Sequence[str]): List of desired characters. Any character which
                is not included in this list will be replaced by a special token <unk>.

            model_max_length (int): Model maximum sequence length.
        """
        self.characters = characters
        self.model_max_length = model_max_length
        pad_token = "<pad>"
        unk_token = "<unk>"
        eos_token = "</s>"

        # Build the lookup tables first so they are available by the time the parent
        # constructor runs.
        self._vocab_str_to_int = {
            pad_token: 0,
            eos_token: 1,
            unk_token: 2,
            **{ch: i + 3 for i, ch in enumerate(characters)},
        }
        self._vocab_int_to_str = {v: k for k, v in self._vocab_str_to_int.items()}

        # Fallbacks used for out-of-vocabulary tokens / ids, derived from the table
        # instead of repeating the literal id at each call site.
        self._fallback_token = unk_token
        self._fallback_id = self._vocab_str_to_int[unk_token]

        super().__init__(
            eos_token=eos_token,
            pad_token=pad_token,
            unk_token=unk_token,
            add_prefix_space=False,
            model_max_length=model_max_length,
            **kwargs,
        )

    @property
    def vocab_size(self) -> int:
        """Number of entries in the vocabulary (special tokens included)."""
        return len(self._vocab_str_to_int)

    def get_vocab(self):
        """Return a copy of the token -> id mapping, so callers cannot mutate internal state."""
        return dict(self._vocab_str_to_int)

    def _tokenize(self, text: str) -> List[str]:
        """Split `text` into its individual characters."""
        return list(text)

    def _convert_token_to_id(self, token: str) -> int:
        """Map a token to its id; unknown tokens map to the id of `<unk>`."""
        return self._vocab_str_to_int.get(token, self._fallback_id)

    def _convert_id_to_token(self, index: int) -> str:
        """Map an id to its token; ids outside the vocabulary map to `<unk>` instead of raising."""
        return self._vocab_int_to_str.get(index, self._fallback_token)

    def convert_tokens_to_string(self, tokens):
        """Concatenate tokens back into a single string."""
        return "".join(tokens)

    def _add_eos_if_not_present(self, token_ids: List[int]) -> List[int]:
        """Do not add eos again if user already added it."""
        if len(token_ids) > 0 and token_ids[-1] == self.eos_token_id:
            return token_ids
        else:
            return token_ids + [self.eos_token_id]

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
        adding special tokens. A sequence has the following format:

        - single sequence: `X </s>`
        - pair of sequences: `A </s> B </s>`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        """
        token_ids_0 = self._add_eos_if_not_present(token_ids_0)
        if token_ids_1 is None:
            return token_ids_0
        else:
            token_ids_1 = self._add_eos_if_not_present(token_ids_1)
            return token_ids_0 + token_ids_1

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        # normal case: some special tokens
        if token_ids_1 is None:
            return ([0] * len(token_ids_0)) + [1]
        return ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. T5 does not make
        use of token type ids, therefore a list of zeros is returned.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of zeros.
        """
        eos = [self.eos_token_id]

        if token_ids_1 is None:
            return len(token_ids_0 + eos) * [0]
        return len(token_ids_0 + eos + token_ids_1 + eos) * [0]

    def get_config(self) -> Dict:
        """Return the JSON-serialisable settings needed to rebuild this tokenizer."""
        return {
            "char_ords": [ord(ch) for ch in self.characters],
            "model_max_length": self.model_max_length,
        }

    @classmethod
    def from_config(cls, config: Dict) -> "CharacterTokenizer":
        """Rebuild a tokenizer from a dict produced by `get_config`."""
        cfg = {}
        cfg["characters"] = [chr(i) for i in config["char_ords"]]
        cfg["model_max_length"] = config["model_max_length"]
        return cls(**cfg)

    def save_pretrained(self, save_directory: Union[str, os.PathLike], **kwargs):
        """Write `tokenizer_config.json` into `save_directory`, creating the directory if needed.

        Extra keyword arguments are accepted and ignored.
        """
        target_dir = Path(save_directory)
        # Saving into a directory that does not exist yet would otherwise fail on open().
        target_dir.mkdir(parents=True, exist_ok=True)
        cfg_file = target_dir / "tokenizer_config.json"
        cfg = self.get_config()
        with open(cfg_file, "w", encoding="utf-8") as f:
            json.dump(cfg, f, indent=4)

    @classmethod
    def from_pretrained(cls, save_directory: Union[str, os.PathLike], **kwargs):
        """Load a tokenizer from the `tokenizer_config.json` written by `save_pretrained`.

        Extra keyword arguments are accepted and ignored.
        """
        cfg_file = Path(save_directory) / "tokenizer_config.json"
        with open(cfg_file, encoding="utf-8") as f:
            cfg = json.load(f)
        return cls.from_config(cfg)

In [3]:
# Vocabulary: the uppercase letters 'A'..'Z' followed by a hyphen.
characters = [*map(chr, range(ord('A'), ord('Z') + 1)), '-']
print(characters)
tokenizer = CharacterTokenizer(characters, model_max_length=128)

['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '-']


In [4]:
# Very small T5 configuration: the aim is roughly 1m parameters, versus the
# 60 million parameters of T5-Small. The trailing comment on each entry appears to
# quote the corresponding T5-Small setting for comparison.
# TODO: not sure if these numbers give us the best model for the param budget ...
model_config = transformers.models.t5.configuration_t5.T5Config(
    **{
        "vocab_size": tokenizer.vocab_size,
        "decoder_start_token_id": tokenizer.pad_token_id,
        "d_model": 128,                        # "d_model": 512,
        "d_kv": 16,                            # "d_kv": 64,
        "d_ff": 512,                           # "d_ff": 2048,
        "num_layers": 3,                       # "num_layers": 6,
        "num_heads": 4,                        # "num_heads": 8,
        "relative_attention_num_buckets": 8,   # "relative_attention_num_buckets": 32,
        "relative_attention_max_distance": 32, # relative_attention_max_distance (`int`, *optional*, defaults to 128):
    }
)

In [5]:
# Toy spelling-correction corpus: eight (source, target) word pairs, aligned by
# position and repeated 999 times so that one epoch has a few hundred steps.
data = {
    'source': 999 * [
        'EXCEPT', 'EXERCISE', 'EXHILARATION', 'EXISTENCE',
        'EXISTENCE', 'EXPEDIENT', 'EXPLICITLY', 'EXTENSIONS',
    ],
    'target': 999 * [
        'EXCPT', 'EXCERCISE', 'EXHILERATION', 'EGSISTENCE',
        'EXISTANCE', 'EXSPIDIENT', 'EXPLEKIYLY', 'EXTIONS',
    ],
}

In [6]:
# Hold out 10% of the rows for evaluation. The shuffle is seeded so that
# Restart & Run All produces the same train/test split every time.
dataset = Dataset.from_dict(data).train_test_split(test_size=0.1, shuffle=True, seed=42)
train_dataset = dataset['train']
val_dataset = dataset['test']
dataset

DatasetDict({
    train: Dataset({
        features: ['source', 'target'],
        num_rows: 7192
    })
    test: Dataset({
        features: ['source', 'target'],
        num_rows: 800
    })
})

In [7]:
# No explicit length limits; presumably the tokenizer then falls back to its own
# `model_max_length` when truncating -- TODO confirm.
max_input_length = None # 128??
max_target_length = None # 128?

def preprocess_function(examples):
    """Tokenize a batch of examples for seq2seq training.

    Reads the 'source' and 'target' columns of `examples`, tokenizes both with the
    notebook-level `tokenizer`, and returns the encoded sources with the target
    input ids attached under the "labels" key.
    """
    encoded_sources = tokenizer(examples['source'], truncation=True, max_length=max_input_length)
    encoded_targets = tokenizer(examples['target'], truncation=True, max_length=max_target_length)
    encoded_sources["labels"] = encoded_targets["input_ids"]
    return encoded_sources

In [8]:
# Tokenize both splits in batches; the raw text columns are dropped so only the
# encoded fields returned by `preprocess_function` remain.
tokenized_dataset = dataset.map(
    function=preprocess_function,
    remove_columns=['source', 'target'],
    batched=True,
)

Map:   0%|          | 0/7192 [00:00<?, ? examples/s]

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

In [9]:
finetune_model_name = 'models/spelling-corrector/quick1'
args = transformers.Seq2SeqTrainingArguments(
    output_dir=finetune_model_name,
    # optimisation
    optim="adamw_torch",
    learning_rate=3e-3,
    weight_decay=0.01,
    fp16=True,
    num_train_epochs=1,
    # small batches
    per_device_train_batch_size=13,
    per_device_eval_batch_size=13,
    # frequent evaluation and logging: evaluate every 100 steps, log every 50
    evaluation_strategy='steps',
    eval_steps=100,
    logging_steps=50,
    predict_with_generate=True,
    # checkpointing
    save_strategy="epoch",
    save_total_limit=3,
    push_to_hub=False,
)

In [10]:
# Train a freshly initialised model with the stock notebook progress bar.
# This run is the reproduction: it is expected to stop with a ZeroDivisionError.
model = transformers.AutoModelForSeq2SeqLM.from_config(model_config)
seq2seq_collator = transformers.DataCollatorForSeq2Seq(tokenizer, model=model)
trainer = transformers.Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=seq2seq_collator,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'],
    tokenizer=tokenizer,
)
trainer.train()

Step,Training Loss,Validation Loss
100,0.109,0.029047


ZeroDivisionError: float division by zero

# The problem &uarr;

# The fix &darr;

In [11]:
from transformers.utils.notebook import format_time

def _update_bar(self, value, comment=None):
    spaced_value = " " * (len(str(self.total)) - len(str(value))) + str(value)
    if self.elapsed_time is None:
        self.label = f"[{spaced_value}/{self.total} : < :"
    elif self.predicted_remaining is None:
        self.label = f"[{spaced_value}/{self.total} {format_time(self.elapsed_time)}"
    else:
        self.label = (
            f"[{spaced_value}/{self.total} {format_time(self.elapsed_time)} <"
            f" {format_time(self.predicted_remaining)}"
        )
        if self.average_time_per_item == 0:
            self.label += ", +inf it/s"
        else:
            self.label += f", {1/self.average_time_per_item:.2f} it/s"
    self.label += "]" if self.comment is None or len(self.comment) == 0 else f", {self.comment}]"
    self.display()
    
# Monkeypatch: every NotebookProgressBar now renders its label with `_update_bar`.
transformers.utils.notebook.NotebookProgressBar.update_bar = _update_bar

old
```
self.label += f", {1/self.average_time_per_item:.2f} it/s"
```
new
```
if self.average_time_per_item == 0:
    self.label += ", +inf it/s"
else:
    self.label += f", {1/self.average_time_per_item:.2f} it/s"
```

In [12]:
def _update(self, value: int, force_update: bool = False, comment: str = None):
        """
        The main method to update the progress bar to `value`.

        Args:
            value (`int`):
                The value to use. Must be between 0 and `total`.
            force_update (`bool`, *optional*, defaults to `False`):
                Whether or not to force and update of the internal state and display (by default, the bar will wait for
                `value` to reach the value it predicted corresponds to a time of more than the `update_every` attribute
                since the last update to avoid adding boilerplate).
            comment (`str`, *optional*):
                A comment to add on the left of the progress bar.
        """
        self.value = value
        if comment is not None:
            self.comment = comment
        if self.last_value is None:
            self.start_time = self.last_time = time.time()
            self.start_value = self.last_value = value
            self.elapsed_time = self.predicted_remaining = None
            self.first_calls = self.warmup
            self.wait_for = 1
            self.update_bar(value)
        elif value <= self.last_value and not force_update:
            return
        elif force_update or self.first_calls > 0 or value >= min(self.last_value + self.wait_for, self.total):
            if self.first_calls > 0:
                self.first_calls -= 1
            current_time = time.time()
            self.elapsed_time = current_time - self.start_time
            # We could have value = self.start_value if the update is called twixe with the same start value.
            if value > self.start_value:
                self.average_time_per_item = self.elapsed_time / (value - self.start_value)
            else:
                self.average_time_per_item = None
            if value >= self.total:
                value = self.total
                self.predicted_remaining = None
                if not self.leave:
                    self.close()
            elif self.average_time_per_item is not None:
                self.predicted_remaining = self.average_time_per_item * (self.total - value)
            self.update_bar(value)
            self.last_value = value
            self.last_time = current_time
            if self.average_time_per_item is None or self.average_time_per_item == 0:
                self.wait_for = 1
            else:
                self.wait_for = max(int(self.update_every / self.average_time_per_item), 1)

# Monkeypatch: every NotebookProgressBar now advances its state with `_update`.
transformers.utils.notebook.NotebookProgressBar.update = _update

old
```
if self.average_time_per_item is None:
```
new
```
if self.average_time_per_item is None or self.average_time_per_item == 0:
```

In [13]:
# Same training setup as the failing run earlier in the notebook, rebuilt with a
# fresh model now that NotebookProgressBar has been patched.
model = transformers.AutoModelForSeq2SeqLM.from_config(model_config)
seq2seq_collator = transformers.DataCollatorForSeq2Seq(tokenizer, model=model)
trainer = transformers.Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=seq2seq_collator,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'],
    tokenizer=tokenizer,
)
trainer.train()

Step,Training Loss,Validation Loss
100,0.1001,0.048945
200,0.0539,0.035581
300,0.0421,0.024348
400,0.0306,0.018407
500,0.025,0.017823


TrainOutput(global_step=554, training_loss=0.11943072145165949, metrics={'train_runtime': 18.678, 'train_samples_per_second': 385.053, 'train_steps_per_second': 29.661, 'total_flos': 593675777280.0, 'train_loss': 0.11943072145165949, 'epoch': 1.0})