# Rough notebook to see if we can train a small model to make spelling mistakes

## TODO

- why are we using a character level tokenizer
- are the spelling mistakes the model makes any good ...

In [1]:
import pandas as pd
import transformers, json
from pathlib import Path
from datasets import Dataset
from typing import Dict, List, Optional, Sequence, Union
import time, random

## `CharacterTokenizer`

Create a character level tokenizer based on [character tokenizer](https://raw.githubusercontent.com/dariush-bahrami/character-tokenizer/master/charactertokenizer/core.py) for BERT (which was inspired by the [CANINE](https://arxiv.org/abs/2103.06874) [tokenizer](https://github.com/huggingface/transformers/blob/main/src/transformers/models/canine/tokenization_canine.py)) and the [t5 tokenizer](https://github.com/huggingface/transformers/blob/main/src/transformers/models/t5/tokenization_t5.py)

TODO:
- not sure we need model_max_length
- test all of the save, get, from ... functions

In [2]:
class CharacterTokenizer(transformers.tokenization_utils.PreTrainedTokenizer):
    def __init__(self, characters: Sequence[str], model_max_length: int, **kwargs):
        """Character tokenizer for Hugging Face transformers.

        Args:
            characters (Sequence[str]): List of desired characters. Any character which
                is not included in this list will be replaced by a special token <unk>.

            model_max_length (int): Model maximum sequence length.
        """
        self.characters = characters
        self.model_max_length = model_max_length
        pad_token = "<pad>"
        unk_token = "<unk>"
        eos_token = "</s>"
        for token in [pad_token, unk_token, eos_token]:
            transformers.tokenization_utils.AddedToken(token)
        
        self._vocab_str_to_int = {
            pad_token: 0,
            eos_token: 1,
            unk_token: 2,
            **{ch: i + 3 for i, ch in enumerate(characters)},
        }
        self._vocab_int_to_str = {v: k for k, v in self._vocab_str_to_int.items()}
        
        super().__init__(
            eos_token=eos_token,
            pad_token=pad_token,
            unk_token=unk_token,
            add_prefix_space=False,
            model_max_length=model_max_length,
            **kwargs,
        )

    @property
    def vocab_size(self) -> int:
        return len(self._vocab_str_to_int)

    def get_vocab(self):
        return self._vocab_str_to_int
    
    def _tokenize(self, text: str) -> List[str]:
        return list(text)

    def _convert_token_to_id(self, token: str) -> int:
        return self._vocab_str_to_int.get(token, 2) # default to unk_token <unk>

    def _convert_id_to_token(self, index: int) -> str:
        return self._vocab_int_to_str[index]

    def convert_tokens_to_string(self, tokens):
        return "".join(tokens)

    def _add_eos_if_not_present(self, token_ids: List[int]) -> List[int]:
        """Do not add eos again if user already added it."""
        if len(token_ids) > 0 and token_ids[-1] == self.eos_token_id:
            return token_ids
        else:
            return token_ids + [self.eos_token_id]
        
    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
        adding special tokens. A sequence has the following format:

        - single sequence: `X </s>`
        - pair of sequences: `A </s> B </s>`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        """
        token_ids_0 = self._add_eos_if_not_present(token_ids_0)
        if token_ids_1 is None:
            return token_ids_0
        else:
            token_ids_1 = self._add_eos_if_not_present(token_ids_1)
            return token_ids_0 + token_ids_1
        
    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        # normal case: some special tokens
        if token_ids_1 is None:
            return ([0] * len(token_ids_0)) + [1]
        return ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. T5 does not make
        use of token type ids, therefore a list of zeros is returned.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of zeros.
        """
        eos = [self.eos_token_id]

        if token_ids_1 is None:
            return len(token_ids_0 + eos) * [0]
        return len(token_ids_0 + eos + token_ids_1 + eos) * [0]

    def get_config(self) -> Dict:
        return {
            "char_ords": [ord(ch) for ch in self.characters],
            "model_max_length": self.model_max_length,
        }

    @classmethod
    def from_config(cls, config: Dict) -> "CharacterTokenizer":
        cfg = {}
        cfg["characters"] = [chr(i) for i in config["char_ords"]]
        cfg["model_max_length"] = config["model_max_length"]
        return cls(**cfg)

    def save_pretrained(self, save_directory: Union[str, os.PathLike], **kwargs):
        cfg_file = Path(save_directory) / "tokenizer_config.json"
        cfg = self.get_config()
        with open(cfg_file, "w") as f:
            json.dump(cfg, f, indent=4)

    @classmethod
    def from_pretrained(cls, save_directory: Union[str, os.PathLike], **kwargs):
        cfg_file = Path(save_directory) / "tokenizer_config.json"
        with open(cfg_file) as f:
            cfg = json.load(f)
        return cls.from_config(cfg)

In [3]:
def demo():
    tokenizer = CharacterTokenizer(' BCDEFG', 512)
    tokenized = tokenizer('ABCDEFGH HELLO')
    print('tokenized')
    print(tokenized)
    print('\ninput_ids converted to tokens')
    print(tokenizer.convert_ids_to_tokens(tokenized['input_ids']))
demo()

tokenized
{'input_ids': [2, 4, 5, 6, 7, 8, 9, 2, 3, 2, 7, 2, 2, 2, 1], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

input_ids converted to tokens
['<unk>', 'B', 'C', 'D', 'E', 'F', 'G', '<unk>', ' ', '<unk>', 'E', '<unk>', '<unk>', '<unk>', '</s>']


we're just using uppercase characters and hyphen for now ...

In [4]:
characters = [chr(i) for i in range(65, 91)] + ['-']
print(characters)
tokenizer = CharacterTokenizer(characters, 128)

['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '-']


```
transformers.models.t5.configuration_t5.T5Config(
    vocab_size=32128,
    d_model=512,
    d_kv=64,
    d_ff=2048,
    num_layers=6,
    num_decoder_layers=None,
    num_heads=8,
    relative_attention_num_buckets=32,
    relative_attention_max_distance=128,
    dropout_rate=0.1,
    layer_norm_epsilon=1e-06,
    initializer_factor=1.0,
    feed_forward_proj='relu',
    is_encoder_decoder=True,
    use_cache=True,
    pad_token_id=0,
    eos_token_id=1,
    **kwargs,
)
Docstring:     
This is the configuration class to store the configuration of a [`T5Model`] or a [`TFT5Model`]. It is used to
instantiate a T5 model according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the T5
[t5-small](https://huggingface.co/t5-small) architecture.

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.

Arguments:
    vocab_size (`int`, *optional*, defaults to 32128):
        Vocabulary size of the T5 model. Defines the number of different tokens that can be represented by the
        `inputs_ids` passed when calling [`T5Model`] or [`TFT5Model`].
    d_model (`int`, *optional*, defaults to 512):
        Size of the encoder layers and the pooler layer.
    d_kv (`int`, *optional*, defaults to 64):
        Size of the key, query, value projections per attention head. The `inner_dim` of the projection layer will
        be defined as `num_heads * d_kv`.
    d_ff (`int`, *optional*, defaults to 2048):
        Size of the intermediate feed forward layer in each `T5Block`.
    num_layers (`int`, *optional*, defaults to 6):
        Number of hidden layers in the Transformer encoder.
    num_decoder_layers (`int`, *optional*):
        Number of hidden layers in the Transformer decoder. Will use the same value as `num_layers` if not set.
    num_heads (`int`, *optional*, defaults to 8):
        Number of attention heads for each attention layer in the Transformer encoder.
    relative_attention_num_buckets (`int`, *optional*, defaults to 32):
        The number of buckets to use for each attention layer.
    relative_attention_max_distance (`int`, *optional*, defaults to 128):
        The maximum distance of the longer sequences for the bucket separation.
    dropout_rate (`float`, *optional*, defaults to 0.1):
        The ratio for all dropout layers.
    layer_norm_eps (`float`, *optional*, defaults to 1e-6):
        The epsilon used by the layer normalization layers.
    initializer_factor (`float`, *optional*, defaults to 1):
        A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
        testing).
    feed_forward_proj (`string`, *optional*, defaults to `"relu"`):
        Type of feed forward layer to be used. Should be one of `"relu"` or `"gated-gelu"`. T5v1.1 uses the
        `"gated-gelu"` feed forward projection. Original T5 uses `"relu"`.
    use_cache (`bool`, *optional*, defaults to `True`):
        Whether or not the model should return the last key/values attentions (not used by all models).
```

In [5]:
# go for a v-small model of ~1m parameters - T5-Small (60 million parameters)
# TODO: not sure if these numbers give us the best model for the param budget ...
model_config = transformers.models.t5.configuration_t5.T5Config(
    decoder_start_token_id=tokenizer.pad_token_id,
    vocab_size=tokenizer.vocab_size,
    d_model=128,                        # "d_model": 512,
    d_kv=16,                            # "d_kv": 64,
    d_ff=512,                           # "d_ff": 2048,
    num_layers=3,                       # "num_layers": 6,
    num_heads=4,                        # "num_heads": 8,
    relative_attention_num_buckets=8,   # "relative_attention_num_buckets": 32,
    relative_attention_max_distance=32, # relative_attention_max_distance (`int`, *optional*, defaults to 128):
#     n_positions=128 #   "n_positions": 512
)

In [6]:
def print_number_of_trainable_model_parameters(model):
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        all_model_params += param.numel()
        if param.requires_grad:
            trainable_model_params += param.numel()
    print('trainable model parameters:', trainable_model_params)
    print('all model parameters:', all_model_params)
    print(f'percentage of trainable model parameters: {100 * trainable_model_params / all_model_params:.2f}%')

In [7]:
print_number_of_trainable_model_parameters(transformers.AutoModelForSeq2SeqLM.from_config(model_config))

trainable model parameters: 1087424
all model parameters: 1087424
percentage of trainable model parameters: 100.00%


BioBERT has ~110m parameters

In [8]:
df_eval = pd.concat([
    pd.read_csv('data/my-spelling-mistakes.txt', sep=' '),
    pd.read_csv('data/common_medical_misspellings.csv')
])
for c in df_eval.columns:
    df_eval[c] = df_eval[c].str.upper()
df_eval = df_eval.drop_duplicates()
df_eval

Unnamed: 0,correct,mistake
0,PLATFORM,PLAFFORM
1,PLATFORM,PLATFORN
2,EXISTING,EXISTSING
3,PYTHON,PHYTON
4,FOLLOWING,FOLOWING
...,...,...
125,TRACHEA,TRACHEIA
126,VAGINA,VAGNIA
127,VERTEBRA,VERTEBRE
128,VOMIT,VOMMIT


In [9]:
df_train = pd.concat([pd.read_csv(f'data/{f}.csv') for f in [
    'RogerMitton/roger_mitton_common_misspellings', 'wikipedia/wikipedia_common_misspellings']])
df_train = df_train.drop_duplicates()
print(len(df_train))
df_train = df_train[~df_train['correct'].isin(df_eval['correct'])]
print(len(df_train))
df_train.sample(5)

39101
38437


Unnamed: 0,correct,mistake
7781,CONSCIOUS,CONNIES
7415,CONDITIONS,GONES
13680,EXPLORATORY,EXPLARATORY
19438,LEEDS,LESS
3643,AUDITORIUM,AUDITORIM


In [10]:
max_input_length = None # 128??
max_target_length = None # 128?

def preprocess_function(examples):
    model_inputs = tokenizer(examples['correct'], max_length=max_input_length, truncation=True)
    labels = tokenizer(examples['mistake'], max_length=max_target_length, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [11]:
def mk_dataset(df):
    return Dataset.from_pandas(df, preserve_index=False).map(
        preprocess_function, batched=True, remove_columns=['correct', 'mistake'])
train_dataset, eval_dataset = [mk_dataset(df) for df in [df_train, df_eval]]

Map:   0%|          | 0/38437 [00:00<?, ? examples/s]

Map:   0%|          | 0/167 [00:00<?, ? examples/s]

In [12]:
eval_dataset

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 167
})

In [13]:
finetune_model_name = f'models/t5-quick'
args = transformers.Seq2SeqTrainingArguments(
    finetune_model_name,
    optim="adamw_torch",
    evaluation_strategy='steps',
    eval_steps=100,
    save_strategy="epoch",
    learning_rate=3e-3,
    per_device_train_batch_size=256, # large batch size is fast but doesn't learn much
    per_device_eval_batch_size=64,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=15, # TODO: xxx
    predict_with_generate=True,
    fp16=True,
    push_to_hub=False,
    logging_steps=50
)

In [14]:
from transformers.utils.notebook import format_time

def _update(self, value: int, force_update: bool = False, comment: str = None):
        """
        The main method to update the progress bar to `value`.

        Args:
            value (`int`):
                The value to use. Must be between 0 and `total`.
            force_update (`bool`, *optional*, defaults to `False`):
                Whether or not to force and update of the internal state and display (by default, the bar will wait for
                `value` to reach the value it predicted corresponds to a time of more than the `update_every` attribute
                since the last update to avoid adding boilerplate).
            comment (`str`, *optional*):
                A comment to add on the left of the progress bar.
        """
        self.value = value
        if comment is not None:
            self.comment = comment
        if self.last_value is None:
            self.start_time = self.last_time = time.time()
            self.start_value = self.last_value = value
            self.elapsed_time = self.predicted_remaining = None
            self.first_calls = self.warmup
            self.wait_for = 1
            self.update_bar(value)
        elif value <= self.last_value and not force_update:
            return
        elif force_update or self.first_calls > 0 or value >= min(self.last_value + self.wait_for, self.total):
            if self.first_calls > 0:
                self.first_calls -= 1
            current_time = time.time()
            self.elapsed_time = current_time - self.start_time
            # We could have value = self.start_value if the update is called twixe with the same start value.
            if value > self.start_value:
                self.average_time_per_item = self.elapsed_time / (value - self.start_value)
            else:
                self.average_time_per_item = None
            if value >= self.total:
                value = self.total
                self.predicted_remaining = None
                if not self.leave:
                    self.close()
            elif self.average_time_per_item is not None:
                self.predicted_remaining = self.average_time_per_item * (self.total - value)
            self.update_bar(value)
            self.last_value = value
            self.last_time = current_time
            if self.average_time_per_item is None or self.average_time_per_item == 0:
                self.wait_for = 1
            else:
                self.wait_for = max(int(self.update_every / self.average_time_per_item), 1)

def _update_bar(self, value, comment=None):
    spaced_value = " " * (len(str(self.total)) - len(str(value))) + str(value)
    if self.elapsed_time is None:
        self.label = f"[{spaced_value}/{self.total} : < :"
    elif self.predicted_remaining is None:
        self.label = f"[{spaced_value}/{self.total} {format_time(self.elapsed_time)}"
    else:
        self.label = (
            f"[{spaced_value}/{self.total} {format_time(self.elapsed_time)} <"
            f" {format_time(self.predicted_remaining)}"
        )
        if self.average_time_per_item == 0:
            self.label += ", +inf it/s"
        else:
            self.label += f", {1/self.average_time_per_item:.2f} it/s"
    self.label += "]" if self.comment is None or len(self.comment) == 0 else f", {self.comment}]"
    self.display()
    
transformers.utils.notebook.NotebookProgressBar.update = _update
transformers.utils.notebook.NotebookProgressBar.update_bar = _update_bar

In [15]:
model = transformers.AutoModelForSeq2SeqLM.from_config(model_config)
trainer = transformers.Seq2SeqTrainer(
    model,
    args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=transformers.DataCollatorForSeq2Seq(tokenizer, model=model), # dynamic padding for efficiency
    tokenizer=tokenizer,
#     compute_metrics=compute_metrics, # TODO: see if metric other than loss helps
)
trainer.train()

Step,Training Loss,Validation Loss
100,1.9959,1.992876
200,1.7122,1.835418
300,1.5756,1.725214
400,1.4182,1.49742
500,1.3252,1.347561
600,1.2373,1.138963
700,1.168,0.999473
800,1.1179,0.853691
900,1.07,0.809013
1000,1.0428,0.76669


TrainOutput(global_step=2265, training_loss=1.159190054129291, metrics={'train_runtime': 91.0563, 'train_samples_per_second': 6331.849, 'train_steps_per_second': 24.875, 'total_flos': 60098017566720.0, 'train_loss': 1.159190054129291, 'epoch': 15.0})

In [16]:
pipe = transformers.pipeline('text2text-generation', model.cpu(), tokenizer=tokenizer)

In [17]:
# one_from_the_train_set = 'YORKSHIRE'
one_from_the_train_set = random.choice(df_train['correct'].to_list())
print(one_from_the_train_set)
pipe([one_from_the_train_set]*5, max_new_tokens=50)

CONCENTRATE


[{'generated_text': 'CONSENTERATE'},
 {'generated_text': 'CONSENTRATE'},
 {'generated_text': 'CONCENTRATE'},
 {'generated_text': 'CONCENTRATE'},
 {'generated_text': 'CONCENTRATE'}]

In [18]:
# one_from_the_eval_set = 'ASPIRIN'
one_from_the_eval_set = random.choice(df_eval['correct'].to_list())
print(one_from_the_eval_set)
generation_config = transformers.generation.GenerationConfig(do_sample=False)
# generation_config = transformers.generation.GenerationConfig(do_sample=True, temperature=0.01, num_beams=1)
pipe([one_from_the_eval_set]*5, generation_config=generation_config, max_new_tokens=50)

NETWORK


[{'generated_text': 'NETWORK'},
 {'generated_text': 'NETORK'},
 {'generated_text': 'NETWERK'},
 {'generated_text': 'NETWORK'},
 {'generated_text': 'NETWORK'}]

TODO: work out how to do greedy generation

In [19]:
input_ids = tokenizer('MORTGAGE', return_tensors='pt')['input_ids']
# model.generate(input_ids, max_new_tokens=128)
tokenizer.decode(model.generate(input_ids, max_new_tokens=128)[0])

'<pad>MORTGAGE</s>'