Skip to content

Commit

Permalink
allow for noising of gates
Browse files Browse the repository at this point in the history
  • Loading branch information
lucidrains committed Sep 21, 2023
1 parent b7ace1b commit d9f5f08
Show file tree
Hide file tree
Showing 2 changed files with 33 additions and 6 deletions.
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
setup(
name = 'st-moe-pytorch',
packages = find_packages(exclude=[]),
version = '0.1.1',
version = '0.1.2',
license='MIT',
description = 'ST - Mixture of Experts - Pytorch',
author = 'Phil Wang',
Expand Down
37 changes: 32 additions & 5 deletions st_moe_pytorch/st_moe_pytorch.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,13 @@ def cumsum_exclusive(t, dim = -3):
pre_padding = (0, 0) * num_pad_dims
return F.pad(t, (*pre_padding, 1, -1)).cumsum(dim = dim)

def log(t, eps = 1e-20):
    """Numerically safe elementwise natural logarithm.

    Args:
        t: input tensor.
        eps: lower bound applied to `t` before taking the log, so that zero
            (or negative) entries yield log(eps) rather than -inf / nan.

    Returns:
        Tensor of the same shape as `t` holding log(max(t, eps)) elementwise.
    """
    # clamp first: torch.log(0) is -inf and would poison downstream softmax / sums
    return torch.log(t.clamp(min = eps))

def gumbel_noise(t):
    """Sample standard Gumbel noise with the same shape, dtype and device as `t`.

    Uses the inverse-CDF transform g = -log(-log(u)) with u ~ Uniform[0, 1).
    Both logarithms go through the module's clamped `log` helper, so a draw
    of exactly u == 0 cannot produce an infinite value.

    Args:
        t: tensor whose shape / dtype / device the noise should match; its
            values are not read.

    Returns:
        Tensor of Gumbel(0, 1) samples shaped like `t`.
    """
    # rand_like draws U[0, 1) directly instead of allocating zeros and overwriting them
    noise = torch.rand_like(t)
    return -log(-log(noise))

# pytorch's one_hot raises an error if any index is out of bounds.
# tensorflow's one_hot, in contrast, does not raise an error in that case

Expand Down Expand Up @@ -378,7 +385,12 @@ def __init__(
self.straight_through_dispatch_tensor = straight_through_dispatch_tensor
self.register_buffer('zero', torch.zeros((1,)), persistent = False)

def forward(self, x):
def forward(
self,
x,
noise_gates = False,
noise_mult = 1.
):
"""
einstein notation:
Expand Down Expand Up @@ -407,6 +419,11 @@ def forward(self, x):
# gate logits and gates

gate_logits = self.to_gates(x)

if noise_gates:
noise = gumbel_noise(gate_logits)
gate_logits = gate_logits + noise * noise_mult

raw_gates = gate_logits.softmax(dim = -1)

# find top N experts per position
Expand Down Expand Up @@ -582,8 +599,13 @@ def __init__(self,
self.balance_loss_coef = balance_loss_coef
self.router_z_loss_coef = router_z_loss_coef

def forward(self, x):
dispatch_tensor, combine_tensor, balance_loss, router_z_loss = self.gate(x)
def forward(
self,
x,
noise_gates = False,
noise_mult = 1.
):
dispatch_tensor, combine_tensor, balance_loss, router_z_loss = self.gate(x, noise_gates = noise_gates, noise_mult = noise_mult)

# dispatch

Expand Down Expand Up @@ -630,7 +652,12 @@ def __init__(
self.ff_before = Expert(dim, prenorm = True) if add_ff_before else None
self.ff_after = Expert(dim, prenorm = True) if add_ff_after else None

def forward(self, x):
def forward(
self,
x,
noise_gates = False,
noise_mult = 1.
):

# feedforward before

Expand All @@ -641,7 +668,7 @@ def forward(self, x):

residual = x

moe_out, total_aux_loss, balance_loss, router_z_loss = self.moe(self.moe_prenorm(x))
moe_out, total_aux_loss, balance_loss, router_z_loss = self.moe(self.moe_prenorm(x), noise_gates = noise_gates, noise_mult = noise_mult)

x = moe_out + residual

Expand Down

0 comments on commit d9f5f08

Please sign in to comment.