Magnet xformers 0.0.22 compatibility fix (facebookresearch#394)

MAGNeT - Fix for xformers 0.0.22 compatibility, thanks to @nateraw's catch. The following smaller fixes are also included in this PR:
* MAGNeT notebook - use the stride1 span arrangement by default.
* MAGNeT doc - fix a typo.
* MAGNeT music training grid - fix a typo.
lucataco · Jan 17, 2024 · 2a5c5e9 · 2a5c5e9
1 parent 6577d18
commit 2a5c5e9
Show file tree

Hide file tree

Showing 5 changed files with 23 additions and 12 deletions.
diff --git a/audiocraft/grids/magnet/magnet_32khz.py b/audiocraft/grids/magnet/magnet_32khz.py
@@ -12,7 +12,7 @@
 def explorer(launcher):
     partitions = AudioCraftEnvironment.get_slurm_partitions(['team', 'global'])
     launcher.slurm_(gpus=32, partition=partitions)
-    launcher.bind_(solver='magnet/magnet_base_32khz')
+    launcher.bind_(solver='magnet/magnet_32khz')
     # replace this by the desired music dataset
     launcher.bind_(dset='internal/music_400k_32khz')
 

diff --git a/audiocraft/models/lm_magnet.py b/audiocraft/models/lm_magnet.py
@@ -40,7 +40,9 @@ def __init__(self, subcodes_context: int = 5, compression_model_framerate: int =
         self.causal = kwargs['causal']
         self.subcodes_context = subcodes_context
         self.span_len = span_len
-        self._build_attn_masks(compression_model_framerate, segment_duration,
+        self._build_attn_masks(compression_model_framerate=compression_model_framerate,
+                               segment_duration=segment_duration,
+                               num_heads=kwargs['num_heads'],
                                device=kwargs['device'], dtype=kwargs['dtype'])
 
     def restricted_context_attn_mask(self, seq_len: int, device: torch.device, dtype: torch.dtype) -> torch.Tensor:
@@ -64,12 +66,13 @@ def restricted_context_attn_mask(self, seq_len: int, device: torch.device, dtype
             torch.zeros([], device=device, dtype=dtype),
             torch.full([], float('-inf'), device=device, dtype=dtype))
 
-    def _stage_attn_mask(self, stage: int, seq_len: int,
+    def _stage_attn_mask(self, stage: int, seq_len: int, num_heads: int,
                          device: torch.device, dtype: torch.dtype) -> tp.Optional[torch.Tensor]:
         """Creates a restricted attention mask given the stage (codebook index).
         Args:
             stage (int): The codebook index. Takes values in [0, n_q].
             seq_len (int): Token sequence length.
+            num_heads (int): Num transformer attention heads.
             device (torch.device): device of the output tensor.
             dtype (torch.dtype): data type of the output tensor.
         Returns:
@@ -82,29 +85,34 @@ def _stage_attn_mask(self, stage: int, seq_len: int,
             sa_mask = self.restricted_context_attn_mask(seq_len, device=device, dtype=dtype)
 
         if sa_mask is not None:
+            # Repeat for each attention head
+            sa_mask = sa_mask.repeat((1, num_heads, 1, 1))
+
             # align8 to enable memory efficient attention
             MEMORY_EFFICIENT_ATTN_ALIGN_FACTOR = 8
             seq_len_aligned = \
                 int(np.ceil(seq_len / MEMORY_EFFICIENT_ATTN_ALIGN_FACTOR)) * MEMORY_EFFICIENT_ATTN_ALIGN_FACTOR
 
-            sa_mask_aligned = torch.zeros((seq_len_aligned, seq_len_aligned), device=device, dtype=dtype)
-            sa_mask_aligned[:seq_len, :seq_len] = sa_mask
+            sa_mask_aligned = torch.zeros((1, num_heads, seq_len_aligned, seq_len_aligned), device=device, dtype=dtype)
+            sa_mask_aligned[..., :seq_len, :seq_len] = sa_mask
             sa_mask = sa_mask_aligned
 
         return sa_mask
 
-    def _build_attn_masks(self, compression_model_framerate: int, segment_duration: int,
+    def _build_attn_masks(self, compression_model_framerate: int, segment_duration: int, num_heads: int,
                           device: torch.device, dtype: torch.dtype):
         """Construct attention mask per stage. For each of the RVQ codebook levels in the [0, n_q] range,
            either a local attention map or None would be stored as an entry in the self.attn_mask_per_stage list.
         Args:
             compression_model_framerate (int): The frame rate of the tokenizer.
             segment_duration (int): Sample length in seconds.
+            num_heads (int): Num transformer attention heads.
             device (torch.device): device of the output tensor.
             dtype (torch.dtype): data type of the output tensor.
         """
         seq_len = compression_model_framerate * segment_duration
-        self.attn_mask_per_stage = [self._stage_attn_mask(stage, seq_len, device, dtype) for stage in range(self.n_q)]
+        self.attn_mask_per_stage = [self._stage_attn_mask(stage, seq_len, num_heads,
+                                                          device, dtype) for stage in range(self.n_q)]
 
     @torch.no_grad()
     def generate(self,

diff --git a/audiocraft/modules/transformer.py b/audiocraft/modules/transformer.py
@@ -401,10 +401,13 @@ def forward(self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor,
                 q, k, v = [x.float() for x in [q, k, v]]
             if self.memory_efficient:
                 if custom_attn_mask:
-                    # When using a custom attn mask: move to query's device + remove align8 padding
+                    # When using a custom attn mask:
+                    # Move to query's device, repeat for each sample, remove align8 padding
                     seq_len = query.shape[1]
                     attn_mask = attn_mask.to(q.dtype)
-                    attn_mask = attn_mask[:seq_len, :seq_len]
+                    attn_mask = attn_mask.repeat((q.shape[0], 1, 1, 1))
+                    attn_mask = attn_mask[..., :seq_len, :seq_len]
+
                 p = self.dropout if self.training else 0
                 if _efficient_attention_backend == 'torch':
                     x = torch.nn.functional.scaled_dot_product_attention(

diff --git a/demos/magnet_demo.ipynb b/demos/magnet_demo.ipynb
@@ -60,7 +60,7 @@
     "    max_cfg_coef=10.0,\n",
     "    min_cfg_coef=1.0,\n",
     "    decoding_steps=[int(20 * model.lm.cfg.dataset.segment_duration // 10),  10, 10, 10],\n",
-    "    span_arrangement='nonoverlap'\n",
+    "    span_arrangement='stride1'\n",
     ")"
    ]
   },
@@ -153,7 +153,7 @@
     "    max_cfg_coef=20.0,\n",
     "    min_cfg_coef=1.0,\n",
     "    decoding_steps=[int(20 * model.lm.cfg.dataset.segment_duration // 10),  10, 10, 10],\n",
-    "    span_arrangement='nonoverlap'\n",
+    "    span_arrangement='stride1'\n",
     ")"
    ]
   },

diff --git a/docs/MAGNET.md b/docs/MAGNET.md
@@ -38,7 +38,7 @@ We provide a simple API and 6 pre-trained models. The pre trained models are:
 - `facebook/magnet-small-30secs`: 300M model, text to music, generates 30-second samples - [🤗 Hub](https://huggingface.co/facebook/magnet-small-30secs)
 - `facebook/magnet-medium-30secs`: 1.5B model, text to music, generates 30-second samples - [🤗 Hub](https://huggingface.co/facebook/magnet-medium-30secs)
 - `facebook/audio-magnet-small`: 300M model, text to sound-effect - [🤗 Hub](https://huggingface.co/facebook/audio-magnet-small)
-- `facebook/audio-magnet-small`: 300M model, text to sound-effect - [🤗 Hub](https://huggingface.co/facebook/audio-magnet-medium)
+- `facebook/audio-magnet-medium`: 1.5B model, text to sound-effect - [🤗 Hub](https://huggingface.co/facebook/audio-magnet-medium)
 
 In order to use MAGNeT locally **you must have a GPU**. We recommend 16GB of memory, especially for 
 the medium size models.