From 675059bfea3f7a6d80bc238dce7446bc2439d60d Mon Sep 17 00:00:00 2001
From: Alexandre Marques
Date: Thu, 28 Apr 2022 12:54:20 -0400
Subject: [PATCH 1/2] "Moved scaling by dimensions per head to attention scores"

---
 src/transformers/models/distilbert/modeling_distilbert.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/transformers/models/distilbert/modeling_distilbert.py b/src/transformers/models/distilbert/modeling_distilbert.py
index 248dbfcbbbd7..8459c430a63d 100755
--- a/src/transformers/models/distilbert/modeling_distilbert.py
+++ b/src/transformers/models/distilbert/modeling_distilbert.py
@@ -222,10 +222,14 @@ def unshape(x):
         k = shape(self.k_lin(key))  # (bs, n_heads, k_length, dim_per_head)
         v = shape(self.v_lin(value))  # (bs, n_heads, k_length, dim_per_head)
 
-        q = q / math.sqrt(dim_per_head)  # (bs, n_heads, q_length, dim_per_head)
+        #q = q / math.sqrt(dim_per_head)  # (bs, n_heads, q_length, dim_per_head)
         scores = self.attention_scores_matmul(q, k.transpose(2, 3))  # (bs, n_heads, q_length, k_length)
+        scores = scores / math.sqrt(dim_per_head)  # (bs, n_heads, q_length, k_length)
+
         mask = (mask == 0).view(mask_reshp).expand_as(scores)  # (bs, n_heads, q_length, k_length)
         scores = scores.masked_fill(mask, -float("inf"))  # (bs, n_heads, q_length, k_length)
+        #mask = (mask - 1.0) * 10000.0
+        #scores = scores + mask.view(mask_reshp).expand_as(scores)
 
         weights = nn.functional.softmax(scores, dim=-1)  # (bs, n_heads, q_length, k_length)
         weights = self.dropout(weights)  # (bs, n_heads, q_length, k_length)

From b14a35d1350f49e94c856fa3ffa8db735e04491c Mon Sep 17 00:00:00 2001
From: Alexandre Marques
Date: Thu, 28 Apr 2022 13:28:29 -0400
Subject: [PATCH 2/2] "Removed temporary code."

---
 src/transformers/models/distilbert/modeling_distilbert.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/src/transformers/models/distilbert/modeling_distilbert.py b/src/transformers/models/distilbert/modeling_distilbert.py
index 8459c430a63d..dbd16f0b7e6a 100755
--- a/src/transformers/models/distilbert/modeling_distilbert.py
+++ b/src/transformers/models/distilbert/modeling_distilbert.py
@@ -222,14 +222,11 @@ def unshape(x):
         k = shape(self.k_lin(key))  # (bs, n_heads, k_length, dim_per_head)
         v = shape(self.v_lin(value))  # (bs, n_heads, k_length, dim_per_head)
 
-        #q = q / math.sqrt(dim_per_head)  # (bs, n_heads, q_length, dim_per_head)
         scores = self.attention_scores_matmul(q, k.transpose(2, 3))  # (bs, n_heads, q_length, k_length)
         scores = scores / math.sqrt(dim_per_head)  # (bs, n_heads, q_length, k_length)
 
         mask = (mask == 0).view(mask_reshp).expand_as(scores)  # (bs, n_heads, q_length, k_length)
         scores = scores.masked_fill(mask, -float("inf"))  # (bs, n_heads, q_length, k_length)
-        #mask = (mask - 1.0) * 10000.0
-        #scores = scores + mask.view(mask_reshp).expand_as(scores)
 
         weights = nn.functional.softmax(scores, dim=-1)  # (bs, n_heads, q_length, k_length)
         weights = self.dropout(weights)  # (bs, n_heads, q_length, k_length)
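
For reference, a minimal standalone sketch of why moving the 1/sqrt(dim_per_head) scaling from the query to the attention scores is numerically safe. This is not the DistilBERT module itself: it uses plain torch.matmul in place of the module's attention_scores_matmul wrapper, and the tensor sizes are made up for illustration. Scaling q before the matmul and scaling the q @ k^T product afterwards differ only by floating-point rounding, since the scalar factors out of the matrix product.

    import math
    import torch

    # Hypothetical shapes, chosen only for this sketch.
    bs, n_heads, q_length, k_length, dim_per_head = 2, 4, 5, 7, 16
    q = torch.randn(bs, n_heads, q_length, dim_per_head)
    k = torch.randn(bs, n_heads, k_length, dim_per_head)

    # Pre-patch ordering: scale the query, then multiply.
    scores_before = torch.matmul(q / math.sqrt(dim_per_head), k.transpose(2, 3))

    # Post-patch ordering: multiply first, then scale the scores.
    scores_after = torch.matmul(q, k.transpose(2, 3)) / math.sqrt(dim_per_head)

    # The two orderings agree up to floating-point rounding.
    assert torch.allclose(scores_before, scores_after, atol=1e-6)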