import os
from cakechat.utils.data_structures import create_namedtuple_instance
from cakechat.utils.env import is_dev_env
MODEL_NAME = 'cakechat_v2.0_keras_tf'
INTX = 'uint16' # use unsigned 16-bit int representation for memory efficiency
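# Illustrative note (not part of the original config): token-id arrays allocated
# with this dtype, e.g. np.zeros((BATCH_SIZE, INPUT_SEQUENCE_LENGTH), dtype=INTX),
# take 2 bytes per id instead of 8 for the default int64, at the cost of capping
# ids at 2**16 - 1 = 65535 (comfortably above VOCABULARY_MAX_SIZE below)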
RANDOM_SEED = 42 # Fix the random seed to make runs reproducible
# AWS S3 params
S3_MODELS_BUCKET_NAME = 'cake-chat-data-v2' # S3 bucket with all the data
S3_NN_MODEL_REMOTE_DIR = 'nn_models' # S3 remote directory with the models themselves
S3_TOKENS_IDX_REMOTE_DIR = 'tokens_index' # S3 remote directory with tokens index
S3_CONDITIONS_IDX_REMOTE_DIR = 'conditions_index' # S3 remote directory with conditions index
S3_W2V_REMOTE_DIR = 'w2v_models' # S3 remote directory with pre-trained w2v models
# train datasets
DATA_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'data')
PROCESSED_CORPUS_DIR = os.path.join(DATA_PATH, 'corpora_processed')
TOKEN_INDEX_DIR = os.path.join(DATA_PATH, 'tokens_index') # Path to prepared tokens index directory
CONDITION_IDS_INDEX_DIR = os.path.join(DATA_PATH, 'conditions_index') # Path to prepared conditions index directory
# train & val data params
BASE_CORPUS_NAME = 'processed_dialogs' # Basic corpus name prefix
TRAIN_CORPUS_NAME = 'train_' + BASE_CORPUS_NAME # Training dataset filename prefix
CONTEXT_SENSITIVE_VAL_CORPUS_NAME = 'val_' + BASE_CORPUS_NAME # Validation dataset filename prefix for intermediate metrics calculation
CONTEXT_SENSITIVE_TEST_CORPUS_NAME = 'test_' + BASE_CORPUS_NAME # Testing dataset for final metrics calculation
MAX_VAL_LINES_NUM = 10000 # Maximum number of lines from the validation set to be used for metrics calculation
# test datasets
TEST_DATA_DIR = os.path.join(DATA_PATH, 'quality')
CONTEXT_FREE_VAL_CORPUS_NAME = 'context_free_validation_set' # Context-free validation set filename
TEST_CORPUS_NAME = 'context_free_test_set' # Context-free test set filename
QUESTIONS_CORPUS_NAME = 'context_free_questions' # Context-free questions-only set filename
# directory to store model weights and calculated metrics
RESULTS_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'results') # Directory to store training results
TENSORBOARD_LOG_DIR = os.path.join(RESULTS_PATH, 'tensorboard') # Path to tensorboard logs directory
# word embeddings params
USE_PRETRAINED_W2V_EMBEDDINGS_LAYER = True # Whether to use word2vec to pre-train weights for the embedding layer
TRAIN_WORD_EMBEDDINGS_LAYER = True # Allow fine-tuning of the word embedding layer during the model training
W2V_MODEL_DIR = os.path.join(DATA_PATH, 'w2v_models') # Path to store & load trained word2vec models
WORD_EMBEDDING_DIMENSION = 128 # word2vec embedding dimension
W2V_WINDOW_SIZE = 10 # word2vec window size, used during the w2v pre-training
USE_SKIP_GRAM = True # Use skip-gram word2vec mode. When False, CBOW is used
TOKEN_REPRESENTATION_SIZE = 256
MIN_WORD_FREQ = 1 # Minimum frequency of a word to be used in word2vec pre-calculation
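# A minimal sketch of how the parameters above map onto a word2vec training
# call, assuming gensim's pre-4.0 API (`size` became `vector_size` in gensim
# 4.0); `tokenized_lines`, a hypothetical iterable of token lists, is not part
# of this config:
#
#   from gensim.models import Word2Vec
#   w2v_model = Word2Vec(
#       tokenized_lines,
#       size=WORD_EMBEDDING_DIMENSION,
#       window=W2V_WINDOW_SIZE,
#       sg=int(USE_SKIP_GRAM),
#       min_count=MIN_WORD_FREQ)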
VOCABULARY_MAX_SIZE = 50000 # Maximum vocabulary size in tokens
MAX_CONDITIONS_NUM = 5 # Maximum number of conditions
# condition inputs. We use five major emotions to condition our model's predictions
EMOTIONS_TYPES = create_namedtuple_instance(
'EMOTIONS_TYPES', neutral='neutral', anger='anger', joy='joy', fear='fear', sadness='sadness')
DEFAULT_CONDITION = EMOTIONS_TYPES.neutral # Default condition to be used during the prediction (if not specified)
CONDITION_EMBEDDING_DIMENSION = 128 # Dimension of the conditions embedding layer (trained with the model)
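# EMOTIONS_TYPES behaves like a plain namedtuple instance, so a condition is
# referenced by attribute and its value is the raw label string, e.g.
# EMOTIONS_TYPES.joy == 'joy'; note that the five emotions above match
# MAX_CONDITIONS_NUM = 5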
# NN architecture params
HIDDEN_LAYER_DIMENSION = 768 # Dimension for the recurrent layer
DENSE_DROPOUT_RATIO = 0.2 # Use dropout with the given ratio before decoder's output
USE_CUDNN = bool(os.environ.get('CUDA_VISIBLE_DEVICES')) # True by default on GPU-enabled machines (provides ~25% inference
# speed-up) and False on CPU-only machines, since they do not support CuDNN
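# Illustrative sketch of how such a flag is typically consumed (an assumption,
# not a quote from the model code), using the Keras 2.x layer names:
#
#   from keras.layers import CuDNNGRU, GRU
#   RNNClass = CuDNNGRU if USE_CUDNN else GRU
#   recurrent_layer = RNNClass(HIDDEN_LAYER_DIMENSION, return_sequences=True)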
# training params
EPOCHS_NUM = 2 # Total number of training epochs
BATCH_SIZE = 196 # Number of samples used for gradient estimation on each training step. When training on multiple
# GPUs, each worker processes this number of samples per step.
SHUFFLE_TRAINING_BATCHES = True # Shuffle training batches in the dataset each epoch
INPUT_SEQUENCE_LENGTH = 30 # Input sequence length for the model during training
INPUT_CONTEXT_SIZE = 3 # Maximum depth of the conversational history to be used in the encoder (at least 1)
OUTPUT_SEQUENCE_LENGTH = 32 # Output sequence length. Best kept at INPUT_SEQUENCE_LENGTH + 2 to account for the start/end tokens
GRAD_CLIP = 5.0 # Gradient clipping param passed to optimizer
LEARNING_RATE = 6.0 # Learning rate for the Adadelta optimizer
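# A minimal sketch of how these two values plug into the optimizer, assuming
# the Keras 2.x API (`lr` was later renamed `learning_rate`); whether the clip
# is applied as `clipvalue` or `clipnorm` is an assumption here:
#
#   from keras.optimizers import Adadelta
#   optimizer = Adadelta(lr=LEARNING_RATE, clipvalue=GRAD_CLIP)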
LOG_RUN_METADATA = False # Set 'True' to profile memory consumption and computation time on tensorboard
AUTOENCODER_MODE = False # Set 'True' to switch seq2seq (x -> y) into autoencoder (x -> x). Used for debugging
# predictions params
MAX_PREDICTIONS_LENGTH = 40 # Max. number of tokens which can be generated on the prediction step
PREDICTION_MODES = create_namedtuple_instance(
'PREDICTION_MODES',
beamsearch='beamsearch',
beamsearch_reranking='beamsearch_reranking',
sampling='sampling',
sampling_reranking='sampling_reranking')
PREDICTION_MODE_FOR_TESTS = PREDICTION_MODES.sampling # Default prediction mode used in metrics computation
PREDICTION_DISTINCTNESS_NUM_TOKENS = 50000 # Number of tokens to generate for computing the distinctness metric
# Prediction probabilities modifiers
REPETITION_PENALIZE_COEFFICIENT = 10.0 # Divide the probabilities of tokens that have already been used during decoding
NON_PENALIZABLE_TOKENS = ['a', 'an', 'the', '*', '.', ',', '?', '!', '\'', '"', '^', '`'] # Exclude these tokens from
# repetition penalization modifier
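# A minimal sketch of the penalization step described above (a hypothetical
# helper, not CakeChat's actual decoding code); `token_probs` maps each
# candidate token to its probability at the current decoding step, and a real
# decoder would renormalize the distribution afterwards
def _penalize_repeated_tokens(token_probs, used_tokens):
    return {
        token: (prob / REPETITION_PENALIZE_COEFFICIENT
                if token in used_tokens and token not in NON_PENALIZABLE_TOKENS
                else prob)
        for token, prob in token_probs.items()
    }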
# Options for sampling and sampling-reranking:
DEFAULT_TEMPERATURE = 0.5 # Default softmax temperature used for sampling
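# Illustrative numpy sketch of temperature sampling (an assumption about the
# mechanism, not the project's actual sampling code): dividing the logits by
# the temperature sharpens the distribution for T < 1 and flattens it for T > 1:
#
#   def sample_token(logits, temperature=DEFAULT_TEMPERATURE):
#       scaled = logits / temperature
#       probs = np.exp(scaled - scaled.max())  # numerically stable softmax
#       probs /= probs.sum()
#       return np.random.choice(len(probs), p=probs)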
# Options for beamsearch and sampling-reranking:
BEAM_SIZE = 10 # Size of the beam (beamsearch only)
SAMPLES_NUM_FOR_RERANKING = 10 # Number of samples used in reranking (sampling-reranking only)
MMI_REVERSE_MODEL_SCORE_WEIGHT = 1.0 # Weight of the reverse-model score in MMI reranking (see the MMI paper for details):
# 0.0 - score using only the default model, 1.0 - using only the reverse model
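# Interpretation of this weight as a reranking score (a sketch, not the
# project's exact formula): for context x and candidate reply y,
#   score(y) = (1 - w) * log p(y|x) + w * log p(x|y)
# with w = MMI_REVERSE_MODEL_SCORE_WEIGHT, so w=0.0 ranks candidates by the
# direct model alone and w=1.0 by the reverse model alone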
# Logging params
LOG_CANDIDATES_NUM = 3 # Number of candidates to print to the output during logging
SCREEN_LOG_NUM_TEST_LINES = 10 # Number of first test lines to use when logging outputs on screen
EVAL_STATE_PER_BATCHES = 500 # Number of training batches between successive metrics computations for TensorBoard
# Use reduced parameter values for the development environment
if is_dev_env():
# train & val data params
MAX_VAL_LINES_NUM = 10
# word embeddings params
USE_PRETRAINED_W2V_EMBEDDINGS_LAYER = True
TRAIN_WORD_EMBEDDINGS_LAYER = True
WORD_EMBEDDING_DIMENSION = 64
VOCABULARY_MAX_SIZE = 1000
MAX_CONDITIONS_NUM = 5
# condition inputs
CONDITION_EMBEDDING_DIMENSION = 1
# NN architecture params
HIDDEN_LAYER_DIMENSION = 128
DENSE_DROPOUT_RATIO = 0.2
USE_CUDNN = False
# training params
INPUT_SEQUENCE_LENGTH = 3
INPUT_CONTEXT_SIZE = 1
OUTPUT_SEQUENCE_LENGTH = 5
BATCH_SIZE = 4
SHUFFLE_TRAINING_BATCHES = False
EPOCHS_NUM = 4
LEARNING_RATE = 1.0
LOG_RUN_METADATA = False
AUTOENCODER_MODE = False
# predictions params
MAX_PREDICTIONS_LENGTH = 4
# options for beamsearch and sampling-reranking:
SAMPLES_NUM_FOR_RERANKING = 5
BEAM_SIZE = 5
# logging params
LOG_CANDIDATES_NUM = 3
SCREEN_LOG_NUM_TEST_LINES = 4
EVAL_STATE_PER_BATCHES = 5