Commit

Changed dropout location, removed activation from highway for faster convergence
localminimum committed Apr 27, 2018
1 parent bb5769d commit f0c79cc
Showing 3 changed files with 13 additions and 21 deletions.
1 change: 0 additions & 1 deletion config.py
@@ -107,7 +107,6 @@
 flags.DEFINE_float("l2_norm", 3e-7, "L2 norm scale")
 flags.DEFINE_integer("hidden", 96, "Hidden size")
 flags.DEFINE_integer("num_heads", 1, "Number of heads in self attention")
-flags.DEFINE_boolean("q2c", True, "Whether to use query to context attention or not")
 flags.DEFINE_integer("early_stop", 3, "Checkpoints for early stop")
 
 # Extensions (Uncomment corresponding code in download.sh to download the required data)
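For context, a minimal runnable sketch of how such a boolean flag is defined and consumed, assuming tf.app.flags (only the DEFINE_boolean line is from the repo; the rest is illustrative):

import tensorflow as tf  # TF 1.x

flags = tf.app.flags
# The flag removed by this commit. model.py used to branch on config.q2c to decide
# whether the query-to-context term was appended to attention_outputs; after this
# commit that term is always included (see the model.py hunk further down).
flags.DEFINE_boolean("q2c", True, "Whether to use query to context attention or not")
config = flags.FLAGS

print(config.q2c)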
12 changes: 6 additions & 6 deletions layers.py
@@ -68,7 +68,7 @@ def layer_norm(x, filters=None, epsilon=1e-6, scope=None, reuse=None):
 
 norm_fn = layer_norm#tf.contrib.layers.layer_norm #tf.contrib.layers.layer_norm or noam_norm
 
-def highway(x, size = None, activation = tf.nn.relu,
+def highway(x, size = None, activation = None,
             num_layers = 2, scope = "highway", dropout = 0.0, reuse = None):
     with tf.variable_scope(scope, reuse):
         if size is None:
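For reference, a minimal TF 1.x sketch of a highway stack (tf.layers.dense stands in for the repo's projections; this is not the repo's exact highway() body) showing what activation = None changes: the transform branch H becomes a plain linear projection, while the sigmoid gate T and the carry path are untouched.

import tensorflow as tf  # TF 1.x

def highway_sketch(x, size, num_layers = 2, activation = None, dropout = 0.0, scope = "highway_sketch"):
    with tf.variable_scope(scope):
        x = tf.layers.dense(x, size, name = "input_projection")
        for i in range(num_layers):
            # Gate T decides how much transformed signal vs. carried input passes through.
            T = tf.layers.dense(x, size, activation = tf.sigmoid, name = "gate_%d" % i)
            # With activation = None (this commit), the transform H is linear.
            H = tf.layers.dense(x, size, activation = activation, name = "transform_%d" % i)
            H = tf.nn.dropout(H, 1.0 - dropout)
            x = H * T + x * (1.0 - T)
    return x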
@@ -116,9 +116,9 @@ def conv_block(inputs, num_conv_layers, kernel_size, num_filters,
         l, L = sublayers
         for i in range(num_conv_layers):
             residual = outputs
+            outputs = norm_fn(outputs, scope = "layer_norm_%d"%i, reuse = reuse)
             if (i) % 2 == 0:
                 outputs = tf.nn.dropout(outputs, 1.0 - dropout)
-            outputs = norm_fn(outputs, scope = "layer_norm_%d"%i, reuse = reuse)
             outputs = depthwise_separable_convolution(outputs,
                 kernel_size = (kernel_size, 1), num_filters = num_filters,
                 scope = "depthwise_conv_layers_%d"%i, is_training = is_training, reuse = reuse)
@@ -132,16 +132,16 @@ def self_attention_block(inputs, num_filters, seq_len, mask = None, num_heads =
     with tf.variable_scope(scope, reuse = reuse):
         l, L = sublayers
         # Self attention
-        outputs = tf.nn.dropout(inputs, 1.0 - dropout)
-        outputs = norm_fn(outputs, scope = "layer_norm_1", reuse = reuse)
+        outputs = norm_fn(inputs, scope = "layer_norm_1", reuse = reuse)
+        outputs = tf.nn.dropout(outputs, 1.0 - dropout)
         outputs = multihead_attention(outputs, num_filters,
             num_heads = num_heads, seq_len = seq_len, reuse = reuse,
             mask = mask, is_training = is_training, bias = bias, dropout = dropout)
         residual = layer_dropout(outputs, inputs, dropout * float(l) / L)
         l += 1
         # Feed-forward
-        outputs = tf.nn.dropout(residual, 1.0 - dropout)
-        outputs = norm_fn(outputs, scope = "layer_norm_2", reuse = reuse)
+        outputs = norm_fn(residual, scope = "layer_norm_2", reuse = reuse)
+        outputs = tf.nn.dropout(outputs, 1.0 - dropout)
         outputs = conv(outputs, num_filters, True, tf.nn.relu, name = "FFN_1", reuse = reuse)
         outputs = conv(outputs, num_filters, True, None, name = "FFN_2", reuse = reuse)
         outputs = layer_dropout(outputs, residual, dropout * float(l) / L)
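The same reordering is applied to both sub-layers here: norm first, then dropout, then the self-attention or the FFN_1/FFN_2 pair, with the residual combined via layer_dropout(outputs, residual, dropout * float(l) / L). That call looks like stochastic depth with a drop probability growing linearly in the sub-layer index l out of L; a hedged sketch of the idea (the repo's actual layer_dropout may differ in detail):

import tensorflow as tf  # TF 1.x

def layer_dropout_sketch(inputs, residual, drop_prob, is_training):
    # Stochastic depth: during training, with probability drop_prob the whole
    # sub-layer output is discarded and only the residual passes through.
    def train_branch():
        survive = tf.cast(tf.random_uniform([]) >= drop_prob, tf.float32)
        return survive * inputs + residual
    def eval_branch():
        return inputs + residual
    return tf.cond(tf.convert_to_tensor(is_training), train_branch, eval_branch)

# Usage mirroring the diff: deeper sub-layers (larger l) get a higher drop
# probability, capped by the global dropout rate:
#   residual = layer_dropout_sketch(outputs, inputs, dropout * float(l) / L, is_training)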
21 changes: 7 additions & 14 deletions model.py
@@ -124,19 +124,17 @@ def forward(self):
                 dropout = self.dropout)
 
         with tf.variable_scope("Context_to_Query_Attention_Layer"):
-            C = tf.tile(tf.expand_dims(c,2),[1,1,self.q_maxlen,1])
-            Q = tf.tile(tf.expand_dims(q,1),[1,self.c_maxlen,1,1])
-            S = trilinear([C, Q, C*Q], input_keep_prob = 1.0 - self.dropout)
+            # C = tf.tile(tf.expand_dims(c,2),[1,1,self.q_maxlen,1])
+            # Q = tf.tile(tf.expand_dims(q,1),[1,self.c_maxlen,1,1])
+            # S = trilinear([C, Q, C*Q], input_keep_prob = 1.0 - self.dropout)
+            S = optimized_trilinear_for_attention([c, q], self.c_maxlen, self.q_maxlen, input_keep_prob = 1.0 - self.dropout)
             mask_q = tf.expand_dims(self.q_mask, 1)
             S_ = tf.nn.softmax(mask_logits(S, mask = mask_q))
             mask_c = tf.expand_dims(self.c_mask, 2)
             S_T = tf.transpose(tf.nn.softmax(mask_logits(S, mask = mask_c), dim = 1),(0,2,1))
             self.c2q = tf.matmul(S_, q)
             self.q2c = tf.matmul(tf.matmul(S_, S_T), c)
-            attention_outputs = [c, self.c2q, c * self.c2q]
-            if config.q2c:
-                attention_outputs.append(c * self.q2c)
+            attention_outputs = [c, self.c2q, c * self.c2q, c * self.q2c]
 
         with tf.variable_scope("Model_Encoder_Layer"):
             inputs = tf.concat(attention_outputs, axis = -1)
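Switching from trilinear([C, Q, C*Q], ...) on tiled tensors to optimized_trilinear_for_attention([c, q], ...) avoids materializing the [batch, c_maxlen, q_maxlen, hidden] tensors C and Q. A hedged sketch of how the same trilinear similarity can be computed with broadcasting instead of tiling (the weight split w_c, w_q, w_m is an assumption, not the repo's exact code):

import tensorflow as tf  # TF 1.x

def trilinear_similarity_sketch(c, q, scope = "trilinear_sketch"):
    # c: [batch, c_maxlen, d], q: [batch, q_maxlen, d]
    # S[b, i, j] = w_c . c_i + w_q . q_j + w_m . (c_i * q_j), built term by term.
    with tf.variable_scope(scope):
        d = c.get_shape().as_list()[-1]
        w_c = tf.get_variable("w_c", [d, 1])
        w_q = tf.get_variable("w_q", [d, 1])
        w_m = tf.get_variable("w_m", [1, 1, d])
        part_c = tf.tensordot(c, w_c, axes = [[2], [0]])                           # [batch, c_maxlen, 1]
        part_q = tf.transpose(tf.tensordot(q, w_q, axes = [[2], [0]]), [0, 2, 1])  # [batch, 1, q_maxlen]
        part_m = tf.matmul(c * w_m, q, transpose_b = True)                         # [batch, c_maxlen, q_maxlen]
        return part_c + part_q + part_m  # broadcasts to [batch, c_maxlen, q_maxlen]

The rest of the hunk is unchanged: c2q applies the row-softmax of S to q, and q2c multiplies the two masked softmaxes together before applying the result to c.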
@@ -169,7 +167,7 @@ def forward(self):
 
             outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2),
                               tf.expand_dims(tf.nn.softmax(logits2), axis=1))
-            outer = tf.matrix_band_part(outer, 0, 15)
+            outer = tf.matrix_band_part(outer, 0, config.ans_limit)
             self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1)
             self.yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1)
             losses = tf.nn.softmax_cross_entropy_with_logits(
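Replacing the hard-coded 15 with config.ans_limit makes the maximum answer length configurable. tf.matrix_band_part(outer, 0, k) zeroes every (start, end) probability whose end index precedes the start or trails it by more than k positions; a small runnable illustration with made-up values (k = 2, a 4x4 matrix of ones):

import numpy as np
import tensorflow as tf  # TF 1.x

outer = tf.constant(np.ones((1, 4, 4)), dtype = tf.float32)  # [batch, start, end]
limited = tf.matrix_band_part(outer, 0, 2)  # keep entries with 0 <= end - start <= 2

with tf.Session() as sess:
    print(sess.run(limited)[0])
    # [[1. 1. 1. 0.]
    #  [0. 1. 1. 1.]
    #  [0. 0. 1. 1.]
    #  [0. 0. 0. 1.]]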
@@ -189,16 +187,11 @@ def forward(self):
             with tf.control_dependencies([ema_op]):
                 self.loss = tf.identity(self.loss)
 
-            self.shadow_vars = []
-            self.global_vars = []
             self.assign_vars = []
             for var in tf.global_variables():
                 v = self.var_ema.average(var)
                 if v:
-                    self.shadow_vars.append(v)
-                    self.global_vars.append(var)
-            self.assign_vars = []
-            for g,v in zip(self.global_vars, self.shadow_vars):
-                self.assign_vars.append(tf.assign(g,v))
+                    self.assign_vars.append(tf.assign(var,v))
 
     def get_loss(self):
         return self.loss
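The simplified loop builds tf.assign ops that copy each variable's exponential-moving-average shadow value back into the variable itself, dropping the separate shadow_vars/global_vars bookkeeping and the second loop. A self-contained sketch of the pattern (the toy variable and decay value are made up):

import tensorflow as tf  # TF 1.x

x = tf.get_variable("x", initializer = 0.0)
ema = tf.train.ExponentialMovingAverage(decay = 0.9999)
ema_op = ema.apply([x])

# One pass over the graph's variables: if a variable has an EMA shadow,
# create an op that overwrites the variable with its shadow value.
assign_vars = []
for var in tf.global_variables():
    v = ema.average(var)
    if v is not None:
        assign_vars.append(tf.assign(var, v))

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(tf.assign(x, 1.0))
    sess.run(ema_op)       # shadow value moves slightly toward 1.0
    sess.run(assign_vars)  # x is overwritten with its shadow value
    print(sess.run(x))     # ~1e-4 rather than 1.0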
