transformer_small.yaml
name: "toy_experiment" # name of experiment
joeynmt_version: "2.3.0" # joeynmt version
task: "MT" # task, either "MT" for machine translation, or "S2T" for speech-to-text
model_dir: "models/toy_transformer" # directory where models and validation results are stored, required
use_cuda: False # use CUDA for acceleration on GPU, required. Set to False when working on CPU.
fp16: True # whether to use 16-bit half-precision training instead of 32-bit training.
random_seed: 42 # set this seed to make training deterministic
data:
train: "test/data/toy/train" # training data path (If dataset_type="huggingface", `path` argument in huggingface's datasets.load_dataset() func)
dev: "test/data/toy/dev" # validation data path (If dataset_type="huggingface", `path` argument in huggingface's datasets.load_dataset() func)
test: "test/data/toy/test" # test data path (If dataset_type="huggingface", `path` argument in huggingface's datasets.load_dataset() func)
dataset_type: "plain" # dataset type: one of {"plain", "tsv", "huggingface"} (default: "plain")
#dataset_cfg: # dataset config (used only if dataset_type="huggingface")
# name: "de-en" # `name` argument passed to huggingface's datasets.load_dataset() func
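# (illustration, assumed mapping: with dataset_type: "huggingface", the data path and this name
# roughly correspond to a call like datasets.load_dataset(path=<train path>, name="de-en");
# the exact invocation happens inside JoeyNMT)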
sample_train_subset: -1 # randomly sample this many training examples in each epoch, default: -1 (no subsampling)
sample_dev_subset: -1 # randomly sample this many validation examples for validation, default: -1 (no subsampling)
src:
lang: "de" # src language: If dataset_type is "plain", expected suffix of train files, e.g. "train.de"
max_length: 30 # filter out longer sentences from training
min_length: 1 # filter out shorter sentences from training
lowercase: False # whether to lowercase the data; also applied to validation and test data
normalize: False # whether to normalize the data; also applied to validation and test data
level: "bpe" # segmentation level: either "char", "word", or "bpe"
voc_limit: 400 # vocabulary only includes this many most frequent tokens, default: sys.maxsize (unlimited)
voc_min_freq: 1 # minimum frequency for a token to become part of the vocabulary
voc_file: "test/data/toy/bpe200.txt" # load a vocabulary from this file
tokenizer_type: "subword-nmt" # tokenizer type, currently either "subword-nmt" or "sentencepiece" is supported.
tokenizer_cfg: # tokenizer config
num_merges: 200 # Number of merges for subword-nmt training. Resulting vocabulary size can be slightly smaller or larger.
codes: "test/data/toy/bpe200.codes" # BPE codes file (for subword-nmt)
dropout: 0.0 # BPE dropout probability (for subword-nmt)
#model_file: # sentencepiece model file path (for sentencepiece)
#alpha: # BPE dropout probability (for sentencepiece)
#nbest_size: 10 # size of the nbest list from where you pick a tokenization, used only if alpha > 0 (for sentencepiece)
#character_coverage: 1.0 # Character coverage, default 1.0 (for sentencepiece training)
#model_type: "bpe" # Model type, default "unigram" (for sentencepiece training)
pretokenizer: "none" # either "moses" or "none" (default: "none"). If "moses", the data is tokenized with the Moses tokenizer before any subword segmentation; the sacremoses package has to be installed.
trg:
lang: "en" # trg language. If dataset_type is "plain", expected suffix of train files, e.g. "train.en".
max_length: 30 # filter out longer sentences from training
min_length: 1 # filter out shorter sentences from training
lowercase: False # whether to lowercase the data; also applied to validation and test data
normalize: False # whether to normalize the data; also applied to validation and test data
level: "bpe" # segmentation level: either "char", "word", or "bpe"
voc_limit: 400 # vocabulary only includes this many most frequent tokens, default: sys.maxsize (unlimited)
voc_min_freq: 1 # minimum frequency for a token to become part of the vocabulary
voc_file: "test/data/toy/bpe200.txt" # load a vocabulary from this file
tokenizer_type: "subword-nmt" # tokenizer type, currently either "subword-nmt" or "sentencepiece" is supported.
tokenizer_cfg: # tokenizer config
num_merges: 200 # Number of merges for subword-nmt training. Resulting vocabulary size can be slightly smaller or larger.
codes: "test/data/toy/bpe200.codes" # BPE codes file (for subword-nmt)
dropout: 0.0 # BPE dropout probability (for subword-nmt)
#model_file: # sentencepiece model file path (for sentencepiece)
#alpha: # BPE dropout probability (for sentencepiece)
#nbest_size: 10 # size of the nbest list from where you pick a tokenization, used only if alpha > 0 (for sentencepiece)
#character_coverage: 1.0 # Character coverage, default 1.0 (for sentencepiece training)
#model_type: "bpe" # Model type, default "unigram" (for sentencepiece training)
pretokenizer: "none" # either "moses" or "none" (default: "none"). If "moses", the data is tokenized with the Moses tokenizer before any subword segmentation; the sacremoses package has to be installed.
special_symbols: # special symbols
unk_token: "<unk>" # surface form for unknown token
pad_token: "<pad>" # surface form for pad token
bos_token: "<s>" # surface form for begin-of-sentence token
eos_token: "</s>" # surface form for end-of-sentence token
#sep_token: "<sep>" # surface form for separator token; used for prompts
unk_id: 0 # unknown token index
pad_id: 1 # pad token index
bos_id: 2 # begin-of-sentence token index
eos_id: 3 # end-of-sentence token index
#sep_id: 4 # separator token index
#lang_tags: ["<de>", "<en>"] # language tags; Used for multi-task training
testing: # specify which inference algorithm to use for testing (for validation it's always greedy decoding)
load_model: "models/toy_model/best.ckpt" # if given, load a pre-trained model from this checkpoint
n_best: 1 # n_best size, must be smaller than or equal to beam_size
beam_size: 5 # size of the beam for beam search
beam_alpha: 1.0 # length penalty for beam search
batch_size: 10 # mini-batch size for evaluation
batch_type: "sentence" # evaluation batch type ("sentence", default) or tokens ("token")
eval_metrics: ["bleu"] # validation metrics, default: "bleu"; other options: "chrf", "token_accuracy", "sequence_accuracy"
max_output_length: 31 # maximum output length for decoding, default: None. If set to None, allow sentences of max 1.5*src length
min_output_length: 1 # minimum output length for decoding, default: 1.
return_prob: "none" # whether to return probabilities of references ("ref") or hypotheses ("hyp"). default: "none".
return_attention: False # whether to return attention scores, default: False. (enabled if --save_attention flag is set.)
generate_unk: True # whether to generate unk token
no_repeat_ngram_size: -1 # ngram size to prohibit repetition, default -1. If set to -1, no blocker applied.
repetition_penalty: -1 # repetition penalty, default: -1. A value between 0.0 and 1.0 penalizes repeated tokens; if set to -1, no penalty is applied.
sacrebleu_cfg: # sacrebleu options
whitespace: False # `whitespace` option in sacrebleu.metrics.CHRF() class (default: False)
tokenize: "13a" # `tokenize` option in sacrebleu.metrics.BLEU() class (default: 13a)
training: # specify training details here
#load_model: "toy_model/60.ckpt" # if given, load a pre-trained model from this checkpoint
reset_best_ckpt: False # if True, reset the tracking of the best checkpoint and scores. Use for domain adaptation or fine-tuning with new metrics or dev data.
reset_scheduler: False # if True, overwrite scheduler in loaded checkpoint with parameters specified in this config. Use for domain adaptation or fine-tuning.
reset_optimizer: False # if True, overwrite optimizer in loaded checkpoint with parameters specified in this config. Use for domain adaptation or fine-tuning.
reset_iter_state: False # if True, overwrite iterator state in loaded checkpoint with parameters specified in this config. Use for resumed training.
optimizer: "adamw" # choices: "sgd", "adam", "adamw", "adadelta", "adagrad", "rmsprop", default is SGD
adam_betas: [0.9, 0.999] # beta parameters for Adam. These are the defaults. Typically these are different for Transformer models.
learning_rate: 0.005 # initial learning rate, default: 3.0e-4
learning_rate_min: 0.0001 # stop learning when learning rate is reduced below this threshold, default: 1.0e-8
#learning_rate_factor: 1 # factor for Noam scheduler (used with Transformer)
#learning_rate_warmup: 4000 # warmup steps for Noam scheduler (used with Transformer)
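# For reference: the standard Noam schedule (Vaswani et al., 2017), which these two options are
# assumed to parameterize, is lr(step) = learning_rate_factor * hidden_size^(-0.5) *
# min(step^(-0.5), step * learning_rate_warmup^(-1.5)), i.e. linear warmup for learning_rate_warmup
# steps followed by inverse-square-root decay.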
clip_grad_val: 1.0 # clip the gradients to this value when they exceed it, optional
#clip_grad_norm: 1.0 # norm clipping instead of value clipping
weight_decay: 0. # l2 regularization, default: 0
loss: "crossentropy" # loss type, default: "crossentropy"
label_smoothing: 0.0 # label smoothing: reference tokens will have 1-label_smoothing probability instead of 1, rest of probability mass is uniformly distributed over the rest of the vocabulary, default: 0.0 (off)
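# Worked example (illustrative): with label_smoothing: 0.1 and a 400-token vocabulary as above, the
# reference token would receive probability 0.9 and the remaining 0.1 would be spread uniformly over
# the other 399 entries (~0.00025 each); 0.0 keeps plain one-hot targets.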
batch_size: 10 # mini-batch size as number of sentences (when batch_type is "sentence"; default) or total number of tokens (when batch_type is "token"). When you use more than 1 GPUs, the actual batch size per device will be: batch_size // n_gpu.
batch_type: "sentence" # create batches with sentences ("sentence", default) or tokens ("token")
batch_multiplier: 1 # increase the effective batch size with values >1 to batch_multiplier*batch_size without increasing memory consumption by making updates only every batch_multiplier batches
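# Worked example (illustrative): with batch_size: 10 and batch_type: "sentence", each update sees 10
# sentences; on 2 GPUs each device would get 10 // 2 = 5 sentences, and batch_multiplier: 4 would
# accumulate gradients to an effective batch of 4 * 10 = 40 sentences without extra memory.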
normalization: "batch" # loss normalization of a mini-batch, default: "batch" (by number of sequences in batch), other options: "tokens" (by number of tokens in batch), "none" (don't normalize, sum up loss)
scheduling: "plateau" # learning rate scheduling, optional, if not specified stays constant, options: "plateau", "exponential", "decaying", "noam" (for Transformer), "warmupexponentialdecay", "warmupinversesquareroot"
patience: 5 # specific to plateau scheduler: wait for this many validations without improvement before decreasing the learning rate
decrease_factor: 0.5 # specific to plateau & exponential scheduler: decrease the learning rate by this factor
epochs: 1 # train for this many epochs (reset when training is resumed)
updates: 100 # train for this many updates (not reset when training is resumed)
validation_freq: 10 # validate after this many updates (number of mini-batches), default: 1000
logging_freq: 10 # log the training progress after this many updates, default: 100
early_stopping_metric: "loss" # a checkpoint is written whenever a new best score on this metric is achieved; "eval_metric" (default) is maximized, "loss" and "ppl" are minimized
shuffle: True # shuffle the training data, default: True
overwrite: False # overwrite existing model directory, default: False. Do not set to True unless for debugging!
print_valid_sents: [0, 1, 2] # print the validation sentences at these indices during each validation run, default: [0, 1, 2]
keep_best_ckpts: 3 # keep this many of the best checkpoints, if -1: all of them, default: 5
model: # specify your model architecture here
initializer: "xavier_uniform" # initializer for all trainable weights (xavier_uniform, xavier_normal, zeros, normal, uniform)
init_gain: 1.0 # gain for Xavier initializer (default: 1.0)
bias_initializer: "zeros" # initializer for bias terms (xavier_uniform, xavier_normal, zeros, normal, uniform)
embed_initializer: "xavier_uniform" # initializer for embeddings (xavier_uniform, xavier_normal, zeros, normal, uniform)
embed_init_gain: 1.0 # gain for Xavier initializer for embeddings (default: 1.0)
tied_embeddings: False # tie src and trg embeddings, only applicable if vocabularies are the same, default: False
tied_softmax: True # tie trg embeddings and the output (softmax) layer weights; requires matching dimensions
encoder:
type: "transformer" # encoder type: "recurrent" for LSTM or GRU, or "transformer" for a Transformer
num_layers: 3 # number of layers
num_heads: 4 # number of transformer heads
embeddings:
embedding_dim: 64 # size of embeddings (for Transformer set equal to hidden_size)
scale: True # scale the embeddings by sqrt of their size, default: False
freeze: False # if True, embeddings are not updated during training
#load_pretrained: "my_data/train.trg.embed.txt" # path to the pretrained embedding file
hidden_size: 64 # size of hidden layer; must be divisible by number of heads
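# (e.g. with hidden_size: 64 and num_heads: 4 above, each attention head operates on 64 / 4 = 16 dimensions)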
ff_size: 128 # size of position-wise feed-forward layer
dropout: 0.1 # apply dropout to the outputs of the transformer layers, default: 0.0
freeze: False # if True, encoder parameters are not updated during training (does not include embedding parameters)
layer_norm: "pre" # where to apply layer norm. either "pre" or "post". default "post"
activation: "relu" # activation function. choices: "relu", "gelu", "tanh", "swish". default: "relu"
decoder:
type: "transformer" # decoder type: "recurrent" for LSTM or GRU, or "transformer" for a Transformer
num_layers: 3 # number of layers
num_heads: 4 # number of transformer heads
embeddings:
embedding_dim: 64 # size of embeddings (for Transformer set equal to hidden_size)
scale: True # scale the embeddings by sqrt of their size, default: False
freeze: False # if True, embeddings are not updated during training
#load_pretrained: "my_data/train.trg.embed.txt" # path to the pretrained embedding file
hidden_size: 64 # size of hidden layer; must be divisible by number of heads
ff_size: 128 # size of position-wise feed-forward layer
dropout: 0.1 # apply dropout to the outputs of the transformer layers, default: 0.0
freeze: False # if True, decoder parameters are not updated during training (does not include embedding parameters, but does include attention)
layer_norm: "pre" # where to apply layer norm. either "pre" or "post". default "post"
activation: "relu" # activation function. choices: "relu", "gelu", "tanh", "swish". default: "relu"