Commit

Changed dropout location, removed activation from highway for faster convergence
localminimum committed Apr 27, 2018
1 parent bb5769d commit f0c79cc
Showing 3 changed files with 13 additions and 21 deletions.
1 change: 0 additions & 1 deletion config.py
@@ -107,7 +107,6 @@
 flags.DEFINE_float("l2_norm", 3e-7, "L2 norm scale")
 flags.DEFINE_integer("hidden", 96, "Hidden size")
 flags.DEFINE_integer("num_heads", 1, "Number of heads in self attention")
-flags.DEFINE_boolean("q2c", True, "Whether to use query to context attention or not")
 flags.DEFINE_integer("early_stop", 3, "Checkpoints for early stop")
 
 # Extensions (Uncomment corresponding code in download.sh to download the required data)
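For context, a minimal runnable sketch of how such a boolean flag is defined and consumed, assuming tf.app.flags (only the DEFINE_boolean line is from the repo; the rest is illustrative):

import tensorflow as tf  # TF 1.x

flags = tf.app.flags
# The flag removed by this commit. model.py used to branch on config.q2c to decide
# whether the query-to-context term was appended to attention_outputs; after this
# commit that term is always included (see the model.py hunk further down).
flags.DEFINE_boolean("q2c", True, "Whether to use query to context attention or not")
config = flags.FLAGS

print(config.q2c)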
12 changes: 6 additions & 6 deletions layers.py
@@ -68,7 +68,7 @@ def layer_norm(x, filters=None, epsilon=1e-6, scope=None, reuse=None):
 
 norm_fn = layer_norm#tf.contrib.layers.layer_norm #tf.contrib.layers.layer_norm or noam_norm
 
-def highway(x, size = None, activation = tf.nn.relu,
+def highway(x, size = None, activation = None,
             num_layers = 2, scope = "highway", dropout = 0.0, reuse = None):
     with tf.variable_scope(scope, reuse):
         if size is None:
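For reference, a minimal TF 1.x sketch of a highway stack (tf.layers.dense stands in for the repo's projections; this is not the repo's exact highway() body) showing what activation = None changes: the transform branch H becomes a plain linear projection, while the sigmoid gate T and the carry path are untouched.

import tensorflow as tf  # TF 1.x

def highway_sketch(x, size, num_layers = 2, activation = None, dropout = 0.0, scope = "highway_sketch"):
    with tf.variable_scope(scope):
        x = tf.layers.dense(x, size, name = "input_projection")
        for i in range(num_layers):
            # Gate T decides how much transformed signal vs. carried input passes through.
            T = tf.layers.dense(x, size, activation = tf.sigmoid, name = "gate_%d" % i)
            # With activation = None (this commit), the transform H is linear.
            H = tf.layers.dense(x, size, activation = activation, name = "transform_%d" % i)
            H = tf.nn.dropout(H, 1.0 - dropout)
            x = H * T + x * (1.0 - T)
    return x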
@@ -116,9 +116,9 @@ def conv_block(inputs, num_conv_layers, kernel_size, num_filters,
         l, L = sublayers
         for i in range(num_conv_layers):
             residual = outputs
+            outputs = norm_fn(outputs, scope = "layer_norm_%d"%i, reuse = reuse)
             if (i) % 2 == 0:
                 outputs = tf.nn.dropout(outputs, 1.0 - dropout)
-            outputs = norm_fn(outputs, scope = "layer_norm_%d"%i, reuse = reuse)
             outputs = depthwise_separable_convolution(outputs,
                 kernel_size = (kernel_size, 1), num_filters = num_filters,
                 scope = "depthwise_conv_layers_%d"%i, is_training = is_training, reuse = reuse)
@@ -132,16 +132,16 @@ def self_attention_block(inputs, num_filters, seq_len, mask = None, num_heads =
     with tf.variable_scope(scope, reuse = reuse):
         l, L = sublayers
         # Self attention
-        outputs = tf.nn.dropout(inputs, 1.0 - dropout)
-        outputs = norm_fn(outputs, scope = "layer_norm_1", reuse = reuse)
+        outputs = norm_fn(inputs, scope = "layer_norm_1", reuse = reuse)
+        outputs = tf.nn.dropout(outputs, 1.0 - dropout)
         outputs = multihead_attention(outputs, num_filters,
             num_heads = num_heads, seq_len = seq_len, reuse = reuse,
             mask = mask, is_training = is_training, bias = bias, dropout = dropout)
         residual = layer_dropout(outputs, inputs, dropout * float(l) / L)
         l += 1
         # Feed-forward
-        outputs = tf.nn.dropout(residual, 1.0 - dropout)
-        outputs = norm_fn(outputs, scope = "layer_norm_2", reuse = reuse)
+        outputs = norm_fn(residual, scope = "layer_norm_2", reuse = reuse)
+        outputs = tf.nn.dropout(outputs, 1.0 - dropout)
         outputs = conv(outputs, num_filters, True, tf.nn.relu, name = "FFN_1", reuse = reuse)
         outputs = conv(outputs, num_filters, True, None, name = "FFN_2", reuse = reuse)
         outputs = layer_dropout(outputs, residual, dropout * float(l) / L)
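The same reordering is applied to both sub-layers here: norm first, then dropout, then the self-attention or the FFN_1/FFN_2 pair, with the residual combined via layer_dropout(outputs, residual, dropout * float(l) / L). That call looks like stochastic depth with a drop probability growing linearly in the sub-layer index l out of L; a hedged sketch of the idea (the repo's actual layer_dropout may differ in detail):

import tensorflow as tf  # TF 1.x

def layer_dropout_sketch(inputs, residual, drop_prob, is_training):
    # Stochastic depth: during training, with probability drop_prob the whole
    # sub-layer output is discarded and only the residual passes through.
    def train_branch():
        survive = tf.cast(tf.random_uniform([]) >= drop_prob, tf.float32)
        return survive * inputs + residual
    def eval_branch():
        return inputs + residual
    return tf.cond(tf.convert_to_tensor(is_training), train_branch, eval_branch)

# Usage mirroring the diff: deeper sub-layers (larger l) get a higher drop
# probability, capped by the global dropout rate:
#   residual = layer_dropout_sketch(outputs, inputs, dropout * float(l) / L, is_training)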
21 changes: 7 additions & 14 deletions model.py
@@ -124,19 +124,17 @@ def forward(self):
                 dropout = self.dropout)
 
         with tf.variable_scope("Context_to_Query_Attention_Layer"):
-            C = tf.tile(tf.expand_dims(c,2),[1,1,self.q_maxlen,1])
-            Q = tf.tile(tf.expand_dims(q,1),[1,self.c_maxlen,1,1])
-            S = trilinear([C, Q, C*Q], input_keep_prob = 1.0 - self.dropout)
+            # C = tf.tile(tf.expand_dims(c,2),[1,1,self.q_maxlen,1])
+            # Q = tf.tile(tf.expand_dims(q,1),[1,self.c_maxlen,1,1])
+            # S = trilinear([C, Q, C*Q], input_keep_prob = 1.0 - self.dropout)
+            S = optimized_trilinear_for_attention([c, q], self.c_maxlen, self.q_maxlen, input_keep_prob = 1.0 - self.dropout)
             mask_q = tf.expand_dims(self.q_mask, 1)
             S_ = tf.nn.softmax(mask_logits(S, mask = mask_q))
             mask_c = tf.expand_dims(self.c_mask, 2)
             S_T = tf.transpose(tf.nn.softmax(mask_logits(S, mask = mask_c), dim = 1),(0,2,1))
             self.c2q = tf.matmul(S_, q)
             self.q2c = tf.matmul(tf.matmul(S_, S_T), c)
-            attention_outputs = [c, self.c2q, c * self.c2q]
-            if config.q2c:
-                attention_outputs.append(c * self.q2c)
+            attention_outputs = [c, self.c2q, c * self.c2q, c * self.q2c]
 
         with tf.variable_scope("Model_Encoder_Layer"):
             inputs = tf.concat(attention_outputs, axis = -1)
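Switching from trilinear([C, Q, C*Q], ...) on tiled tensors to optimized_trilinear_for_attention([c, q], ...) avoids materializing the [batch, c_maxlen, q_maxlen, hidden] tensors C and Q. A hedged sketch of how the same trilinear similarity can be computed with broadcasting instead of tiling (the weight split w_c, w_q, w_m is an assumption, not the repo's exact code):

import tensorflow as tf  # TF 1.x

def trilinear_similarity_sketch(c, q, scope = "trilinear_sketch"):
    # c: [batch, c_maxlen, d], q: [batch, q_maxlen, d]
    # S[b, i, j] = w_c . c_i + w_q . q_j + w_m . (c_i * q_j), built term by term.
    with tf.variable_scope(scope):
        d = c.get_shape().as_list()[-1]
        w_c = tf.get_variable("w_c", [d, 1])
        w_q = tf.get_variable("w_q", [d, 1])
        w_m = tf.get_variable("w_m", [1, 1, d])
        part_c = tf.tensordot(c, w_c, axes = [[2], [0]])                           # [batch, c_maxlen, 1]
        part_q = tf.transpose(tf.tensordot(q, w_q, axes = [[2], [0]]), [0, 2, 1])  # [batch, 1, q_maxlen]
        part_m = tf.matmul(c * w_m, q, transpose_b = True)                         # [batch, c_maxlen, q_maxlen]
        return part_c + part_q + part_m  # broadcasts to [batch, c_maxlen, q_maxlen]

The rest of the hunk is unchanged: c2q applies the row-softmax of S to q, and q2c multiplies the two masked softmaxes together before applying the result to c.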
@@ -169,7 +167,7 @@ def forward(self):
 
             outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2),
                               tf.expand_dims(tf.nn.softmax(logits2), axis=1))
-            outer = tf.matrix_band_part(outer, 0, 15)
+            outer = tf.matrix_band_part(outer, 0, config.ans_limit)
             self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1)
             self.yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1)
             losses = tf.nn.softmax_cross_entropy_with_logits(
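Replacing the hard-coded 15 with config.ans_limit makes the maximum answer length configurable. tf.matrix_band_part(outer, 0, k) zeroes every (start, end) probability whose end index precedes the start or trails it by more than k positions; a small runnable illustration with made-up values (k = 2, a 4x4 matrix of ones):

import numpy as np
import tensorflow as tf  # TF 1.x

outer = tf.constant(np.ones((1, 4, 4)), dtype = tf.float32)  # [batch, start, end]
limited = tf.matrix_band_part(outer, 0, 2)  # keep entries with 0 <= end - start <= 2

with tf.Session() as sess:
    print(sess.run(limited)[0])
    # [[1. 1. 1. 0.]
    #  [0. 1. 1. 1.]
    #  [0. 0. 1. 1.]
    #  [0. 0. 0. 1.]]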
@@ -189,16 +187,11 @@ def forward(self):
             with tf.control_dependencies([ema_op]):
                 self.loss = tf.identity(self.loss)
 
-            self.shadow_vars = []
-            self.global_vars = []
             self.assign_vars = []
             for var in tf.global_variables():
                 v = self.var_ema.average(var)
                 if v:
-                    self.shadow_vars.append(v)
-                    self.global_vars.append(var)
-            self.assign_vars = []
-            for g,v in zip(self.global_vars, self.shadow_vars):
-                self.assign_vars.append(tf.assign(g,v))
+                    self.assign_vars.append(tf.assign(var,v))
 
     def get_loss(self):
         return self.loss
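The simplified loop builds tf.assign ops that copy each variable's exponential-moving-average shadow value back into the variable itself, dropping the separate shadow_vars/global_vars bookkeeping and the second loop. A self-contained sketch of the pattern (the toy variable and decay value are made up):

import tensorflow as tf  # TF 1.x

x = tf.get_variable("x", initializer = 0.0)
ema = tf.train.ExponentialMovingAverage(decay = 0.9999)
ema_op = ema.apply([x])

# One pass over the graph's variables: if a variable has an EMA shadow,
# create an op that overwrites the variable with its shadow value.
assign_vars = []
for var in tf.global_variables():
    v = ema.average(var)
    if v is not None:
        assign_vars.append(tf.assign(var, v))

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(tf.assign(x, 1.0))
    sess.run(ema_op)       # shadow value moves slightly toward 1.0
    sess.run(assign_vars)  # x is overwritten with its shadow value
    print(sess.run(x))     # ~1e-4 rather than 1.0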
