diff --git a/setup.py b/setup.py
index 244a58c..ea972da 100644
--- a/setup.py
+++ b/setup.py
@@ -3,7 +3,7 @@
 setup(
   name = 'soundstorm-pytorch',
   packages = find_packages(exclude=[]),
-  version = '0.1.1',
+  version = '0.1.2',
   license='MIT',
   description = 'SoundStorm - Efficient Parallel Audio Generation from Google Deepmind, in Pytorch',
   author = 'Phil Wang',
diff --git a/soundstorm_pytorch/soundstorm.py b/soundstorm_pytorch/soundstorm.py
index 2898fbe..f844105 100644
--- a/soundstorm_pytorch/soundstorm.py
+++ b/soundstorm_pytorch/soundstorm.py
@@ -1052,11 +1052,17 @@ def forward(
 
         seq_mask = mask
 
-        if not exists(seq_mask) and exists(self.pad_id):
-            seq_mask = (x != self.pad_id).any(dim = -1)
-        elif not exists(seq_mask):
+        if not exists(seq_mask):
             seq_mask = torch.ones((b, n), device = device, dtype = torch.bool)
 
+        if exists(self.pad_id):
+            pad_mask = (x == self.pad_id).any(dim = -1)
+            seq_mask = seq_mask & ~pad_mask
+
+            if self.pad_id < 0:
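+                # negative pad id (e.g. -1): overwrite padded positions with 0, presumably so they stay valid indices downstream; seq_mask already excludes them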
+                x = torch.where(rearrange(pad_mask, 'b n -> b n 1'), 0, x)
+
         # maybe condition
 
         cond_tokens = self.maybe_get_condition(cond_semantic_token_ids, length = x.shape[-2])