transformer_small.yaml
name: "toy_experiment" # name of experiment
joeynmt_version: "2.3.0" # joeynmt version
task: "MT" # task, either "MT" for machine translation, or "S2T" for speech-to-text
model_dir: "models/toy_transformer" # directory where models and validation results are stored, required
use_cuda: False # use CUDA for acceleration on GPU, required. Set to False when working on CPU.
fp16: True # whether to use 16-bit half-precision training instead of 32-bit training.
random_seed: 42 # set this seed to make training deterministic
data:
train: "test/data/toy/train" # training data path (If dataset_type="huggingface", `path` argument in huggingface's datasets.load_dataset() func)
dev: "test/data/toy/dev" # validation data path (If dataset_type="huggingface", `path` argument in huggingface's datasets.load_dataset() func)
test: "test/data/toy/test" # test data path (If dataset_type="huggingface", `path` argument in huggingface's datasets.load_dataset() func)
dataset_type: "plain" # dataset type: one of {"plain", "tsv", "huggingface"} (default: "plain")
#dataset_cfg: # dataset config (used only if dataset_type="huggingface")
# name: "de-en" # `name` argument passed to huggingface's datasets.load_dataset() func
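# (illustration, assumed mapping: with dataset_type: "huggingface", the data path and this name
# roughly correspond to a call like datasets.load_dataset(path=<train path>, name="de-en");
# the exact invocation happens inside JoeyNMT)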
sample_train_subset: -1 # randomly sample this many training examples in each epoch, default: -1 (no subsampling)
sample_dev_subset: -1 # randomly sample this many validation examples for validation, default: -1 (no subsampling)
src:
lang: "de" # src language: If dataset_type is "plain", expected suffix of train files, e.g. "train.de"
max_length: 30 # filter out longer sentences from training
min_length: 1 # filter out shorter sentences from training
lowercase: False # whether to lowercase the data; also applied to validation and test data
normalize: False # whether to normalize the data; also applied to validation and test data
level: "bpe" # segmentation level: either "char", "word", or "bpe"
voc_limit: 400 # vocabulary only includes this many most frequent tokens, default: sys.maxsize (unlimited)
voc_min_freq: 1 # minimum frequency for a token to become part of the vocabulary
voc_file: "test/data/toy/bpe200.txt" # load a vocabulary from this file
tokenizer_type: "subword-nmt" # tokenizer type, currently either "subword-nmt" or "sentencepiece" is supported.
tokenizer_cfg: # tokenizer config
num_merges: 200 # Number of merges for subword-nmt training. Resulting vocabulary size can be slightly smaller or larger.
codes: "test/data/toy/bpe200.codes" # BPE codes file (for subword-nmt)
dropout: 0.0 # BPE dropout probability (for subword-nmt)
#model_file: # sentencepiece model file path (for sentencepiece)
#alpha: # BPE dropout probability (for sentencepiece)
#nbest_size: 10 # size of the nbest list from where you pick a tokenization, used only if alpha > 0 (for sentencepiece)
#character_coverage: 1.0 # Character coverage, default 1.0 (for sentencepiece training)
#model_type: "bpe" # Model type, default "unigram" (for sentencepiece training)
pretokenizer: "none" # either "moses" or "none" (default: "none"). If "moses", the data is tokenized with the Moses tokenizer before any subword segmentation; the sacremoses package has to be installed.
trg:
lang: "en" # trg language. If dataset_type is "plain", expected suffix of train files, e.g. "train.en".
max_length: 30 # filter out longer sentences from training
min_length: 1 # filter out shorter sentences from training
lowercase: False # whether to lowercase the data; also applied to validation and test data
normalize: False # whether to normalize the data; also applied to validation and test data
level: "bpe" # segmentation level: either "char", "word", or "bpe"
voc_limit: 400 # vocabulary only includes this many most frequent tokens, default: sys.maxsize (unlimited)
voc_min_freq: 1 # minimum frequency for a token to become part of the vocabulary
voc_file: "test/data/toy/bpe200.txt" # load a vocabulary from this file
tokenizer_type: "subword-nmt" # tokenizer type, currently either "subword-nmt" or "sentencepiece" is supported.
tokenizer_cfg: # tokenizer config
num_merges: 200 # Number of merges for subword-nmt training. Resulting vocabulary size can be slightly smaller or larger.
codes: "test/data/toy/bpe200.codes" # BPE codes file (for subword-nmt)
dropout: 0.0 # BPE dropout probability (for subword-nmt)
#model_file: # sentencepiece model file path (for sentencepiece)
#alpha: # BPE dropout probability (for sentencepiece)
#nbest_size: 10 # size of the nbest list from where you pick a tokenization, used only if alpha > 0 (for sentencepiece)
#character_coverage: 1.0 # Character coverage, default 1.0 (for sentencepiece training)
#model_type: "bpe" # Model type, default "unigram" (for sentencepiece training)
pretokenizer: "none" # either "moses" or "none" (default: "none"). If "moses", the data is tokenized with the Moses tokenizer before any subword segmentation; the sacremoses package has to be installed.
special_symbols: # special symbols
unk_token: "<unk>" # surface form for unknown token
pad_token: "<pad>" # surface form for pad token
bos_token: "<s>" # surface form for begin-of-sentence token
eos_token: "</s>" # surface form for end-of-sentence token
#sep_token: "<sep>" # surface form for separator token; used for prompts
unk_id: 0 # unknown token index
pad_id: 1 # pad token index
bos_id: 2 # begin-of-sentence token index
eos_id: 3 # end-of-sentence token index
#sep_id: 4 # separator token index
#lang_tags: ["<de>", "<en>"] # language tags; Used for multi-task training
testing: # specify which inference algorithm to use for testing (for validation it's always greedy decoding)
load_model: "models/toy_model/best.ckpt" # if given, load a pre-trained model from this checkpoint
n_best: 1 # n_best size, must be smaller than or equal to beam_size
beam_size: 5 # size of the beam for beam search
beam_alpha: 1.0 # length penalty for beam search
batch_size: 10 # mini-batch size for evaluation
batch_type: "sentence" # evaluation batch type ("sentence", default) or tokens ("token")
eval_metrics: ["bleu"] # validation metrics, default: "bleu"; other options: "chrf", "token_accuracy", "sequence_accuracy"
max_output_length: 31 # maximum output length for decoding, default: None. If set to None, allow sentences of max 1.5*src length
min_output_length: 1 # minimum output length for decoding, default: 1.
return_prob: "none" # whether to return probabilities of references ("ref") or hypotheses ("hyp"). default: "none".
return_attention: False # whether to return attention scores, default: False. (enabled if --save_attention flag is set.)
generate_unk: True # whether to generate unk token
no_repeat_ngram_size: -1 # ngram size to prohibit repetition, default -1. If set to -1, no blocker applied.
repetition_penalty: -1 # repetition penalty, default: -1. A value between 0.0 and 1.0 penalizes repeated tokens; if set to -1, no penalty is applied.
sacrebleu_cfg: # sacrebleu options
whitespace: False # `whitespace` option in sacrebleu.metrics.CHRF() class (default: False)
tokenize: "13a" # `tokenize` option in sacrebleu.metrics.BLEU() class (default: 13a)
training: # specify training details here
#load_model: "toy_model/60.ckpt" # if given, load a pre-trained model from this checkpoint
reset_best_ckpt: False # if True, reset the tracking of the best checkpoint and scores. Use for domain adaptation or fine-tuning with new metrics or dev data.
reset_scheduler: False # if True, overwrite scheduler in loaded checkpoint with parameters specified in this config. Use for domain adaptation or fine-tuning.
reset_optimizer: False # if True, overwrite optimizer in loaded checkpoint with parameters specified in this config. Use for domain adaptation or fine-tuning.
reset_iter_state: False # if True, overwrite iterator state in loaded checkpoint with parameters specified in this config. Use for resumed training.
optimizer: "adamw" # choices: "sgd", "adam", "adamw", "adadelta", "adagrad", "rmsprop", default is SGD
adam_betas: [0.9, 0.999] # beta parameters for Adam. These are the defaults. Typically these are different for Transformer models.
learning_rate: 0.005 # initial learning rate, default: 3.0e-4
learning_rate_min: 0.0001 # stop learning when learning rate is reduced below this threshold, default: 1.0e-8
#learning_rate_factor: 1 # factor for Noam scheduler (used with Transformer)
#learning_rate_warmup: 4000 # warmup steps for Noam scheduler (used with Transformer)
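# For reference: the standard Noam schedule (Vaswani et al., 2017), which these two options are
# assumed to parameterize, is lr(step) = learning_rate_factor * hidden_size^(-0.5) *
# min(step^(-0.5), step * learning_rate_warmup^(-1.5)), i.e. linear warmup for learning_rate_warmup
# steps followed by inverse-square-root decay.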
clip_grad_val: 1.0 # clip the gradients to this value when they exceed it, optional
#clip_grad_norm: 1.0 # norm clipping instead of value clipping
weight_decay: 0. # l2 regularization, default: 0
loss: "crossentropy" # loss type, default: "crossentropy"
label_smoothing: 0.0 # label smoothing: reference tokens will have 1-label_smoothing probability instead of 1, rest of probability mass is uniformly distributed over the rest of the vocabulary, default: 0.0 (off)
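# Worked example (illustrative): with label_smoothing: 0.1 and a 400-token vocabulary as above, the
# reference token would receive probability 0.9 and the remaining 0.1 would be spread uniformly over
# the other 399 entries (~0.00025 each); 0.0 keeps plain one-hot targets.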
batch_size: 10 # mini-batch size as number of sentences (when batch_type is "sentence"; default) or total number of tokens (when batch_type is "token"). When you use more than 1 GPUs, the actual batch size per device will be: batch_size // n_gpu.
batch_type: "sentence" # create batches with sentences ("sentence", default) or tokens ("token")
batch_multiplier: 1 # increase the effective batch size with values >1 to batch_multiplier*batch_size without increasing memory consumption by making updates only every batch_multiplier batches
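# Worked example (illustrative): with batch_size: 10 and batch_type: "sentence", each update sees 10
# sentences; on 2 GPUs each device would get 10 // 2 = 5 sentences, and batch_multiplier: 4 would
# accumulate gradients to an effective batch of 4 * 10 = 40 sentences without extra memory.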
normalization: "batch" # loss normalization of a mini-batch, default: "batch" (by number of sequences in batch), other options: "tokens" (by number of tokens in batch), "none" (don't normalize, sum up loss)
scheduling: "plateau" # learning rate scheduling, optional, if not specified stays constant, options: "plateau", "exponential", "decaying", "noam" (for Transformer), "warmupexponentialdecay", "warmupinversesquareroot"
patience: 5 # specific to plateau scheduler: wait for this many validations without improvement before decreasing the learning rate
decrease_factor: 0.5 # specific to plateau & exponential scheduler: decrease the learning rate by this factor
epochs: 1 # train for this many epochs (reset when training is resumed)
updates: 100 # train for this many updates (not reset when training is resumed)
validation_freq: 10 # validate after this many updates (number of mini-batches), default: 1000
logging_freq: 10 # log the training progress after this many updates, default: 100
early_stopping_metric: "loss" # a checkpoint is written whenever a new best score on this metric is achieved; "eval_metric" (default) is maximized, "loss" and "ppl" are minimized
shuffle: True # shuffle the training data, default: True
overwrite: False # overwrite existing model directory, default: False. Do not set to True unless for debugging!
print_valid_sents: [0, 1, 2] # print the validation sentences at these indices during each validation run, default: [0, 1, 2]
keep_best_ckpts: 3 # keep this many of the best checkpoints, if -1: all of them, default: 5
model: # specify your model architecture here
initializer: "xavier_uniform" # initializer for all trainable weights (xavier_uniform, xavier_normal, zeros, normal, uniform)
init_gain: 1.0 # gain for Xavier initializer (default: 1.0)
bias_initializer: "zeros" # initializer for bias terms (xavier_uniform, xavier_normal, zeros, normal, uniform)
embed_initializer: "xavier_uniform" # initializer for embeddings (xavier_uniform, xavier_normal, zeros, normal, uniform)
embed_init_gain: 1.0 # gain for Xavier initializer for embeddings (default: 1.0)
tied_embeddings: False # tie src and trg embeddings, only applicable if vocabularies are the same, default: False
tied_softmax: True # tie trg embeddings and the output (softmax) layer weights; requires matching dimensions
encoder:
type: "transformer" # encoder type: "recurrent" for LSTM or GRU, or "transformer" for a Transformer
num_layers: 3 # number of layers
num_heads: 4 # number of transformer heads
embeddings:
embedding_dim: 64 # size of embeddings (for Transformer set equal to hidden_size)
scale: True # scale the embeddings by sqrt of their size, default: False
freeze: False # if True, embeddings are not updated during training
#load_pretrained: "my_data/train.trg.embed.txt" # path to the pretrained embedding file
hidden_size: 64 # size of hidden layer; must be divisible by number of heads
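# (e.g. with hidden_size: 64 and num_heads: 4 above, each attention head operates on 64 / 4 = 16 dimensions)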
ff_size: 128 # size of position-wise feed-forward layer
dropout: 0.1 # apply dropout to the outputs of the transformer layers, default: 0.0
freeze: False # if True, encoder parameters are not updated during training (does not include embedding parameters)
layer_norm: "pre" # where to apply layer norm. either "pre" or "post". default "post"
activation: "relu" # activation function. choices: "relu", "gelu", "tanh", "swish". default: "relu"
decoder:
type: "transformer" # decoder type: "recurrent" for LSTM or GRU, or "transformer" for a Transformer
num_layers: 3 # number of layers
num_heads: 4 # number of transformer heads
embeddings:
embedding_dim: 64 # size of embeddings (for Transformer set equal to hidden_size)
scale: True # scale the embeddings by sqrt of their size, default: False
freeze: False # if True, embeddings are not updated during training
#load_pretrained: "my_data/train.trg.embed.txt" # path to the pretrained embedding file
hidden_size: 64 # size of hidden layer; must be divisible by number of heads
ff_size: 128 # size of position-wise feed-forward layer
dropout: 0.1 # apply dropout to the outputs of the transformer layers, default: 0.0
freeze: False # if True, decoder parameters are not updated during training (does not include embedding parameters, but does include attention)
layer_norm: "pre" # where to apply layer norm. either "pre" or "post". default "post"
activation: "relu" # activation function. choices: "relu", "gelu", "tanh", "swish". default: "relu"