In [1]:
import torch
import intel_extension_for_pytorch as ipex

  warn(
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from transformers import AutoProcessor, MusicgenForConditionalGeneration

processor = AutoProcessor.from_pretrained("facebook/musicgen-small")
model = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small").to("xpu")

  self.register_buffer("padding_total", torch.tensor(kernel_size - stride, dtype=torch.int64), persistent=False)


In [3]:
inputs = processor(
    text=["Dramatic piano music"],
    padding=True,
    return_tensors="pt",
)
inputs = {k:v.to('xpu') for k,v in inputs.items()}

with torch.no_grad():
    audio_values = model.generate(**inputs, max_new_tokens=258).cpu()

In [4]:
from IPython.display import Audio

sampling_rate = model.config.audio_encoder.sampling_rate
Audio(audio_values[0].numpy(), rate=sampling_rate)

In [5]:
audio_values.shape

torch.Size([1, 1, 163200])

In [6]:
inputs = processor(
    audio=audio_values[0][0],
    text=["Drums coming in"],
    padding=True,
    return_tensors="pt",
)
inputs = {k:v.to('xpu') for k,v in inputs.items()}

with torch.no_grad():
    audio_values = model.generate(**inputs, max_new_tokens=248).cpu()

It is strongly recommended to pass the `sampling_rate` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


In [7]:
Audio(audio_values[0].numpy(), rate=sampling_rate)

In [None]:
import scipy

sampling_rate = model.config.audio_encoder.sampling_rate
scipy.io.wavfile.write("musicgen_out.wav", rate=sampling_rate, data=audio_values[0, 0].numpy())

# lookig on the internals

we will now look at the 3 fundemental models that make this work.

1. a text encoder model that takes text and outputs  embeddings
2. an audio encoding model that translates between audio and its own tokens
3. a new token predictor for these audio tokens that can use model 1s embedding as conditionals


the pipeline goes as follows:

1. encode the user prompt into embeddings
2. encode any past musical context into tokens
3. use the decoder to predict the next tokens
4. decode the tokens back into sound

In [330]:
#we will now look at all the parts of the model

#text -> embeddings
text=set(model.text_encoder.modules())
proj=set(model.enc_to_dec_proj.modules()) 

#audio -> tokens / tokens -> audio
audio_enc=set(model.audio_encoder.modules())

#text_embeddings + audio_tokens -> next_audion_tokens
audio_dec=set(model.decoder.modules())

all_layers = text.union(audio_enc).union(audio_dec).union(proj)

In [331]:
[type(x) for x in model.modules() if x not in all_layers]

[transformers.models.musicgen.modeling_musicgen.MusicgenForConditionalGeneration]

In [329]:
proj

{Linear(in_features=768, out_features=1024, bias=True)}

#### text to embedding

In [166]:

text_inputs = processor.tokenizer(["Drums coming in"],return_tensors='pt')
text_inputs = {k:v.xpu() for k,v in text_inputs.items()}
with torch.no_grad():
    text_emb=model.text_encoder(**text_inputs)
text_emb

BaseModelOutputWithPastAndCrossAttentions(last_hidden_state=tensor([[[-1.8343e-01, -1.6788e-01, -1.3638e-01,  ...,  7.4312e-02,
          -5.7504e-01, -3.2075e-01],
         [-5.2078e-01,  1.2646e-01, -1.2453e-01,  ...,  2.9757e-01,
           4.8505e-02, -2.4622e-01],
         [-6.2492e-01,  1.0176e-01, -9.7727e-02,  ...,  1.6174e-01,
          -2.8732e-02, -4.1226e-02],
         [-3.7238e-01, -1.1041e-01,  2.1421e-01,  ..., -2.7740e-02,
          -3.6800e-01, -5.7025e-02],
         [-7.5804e-03, -3.5105e-04,  8.4690e-03,  ..., -1.6502e-03,
          -2.4512e-03,  1.8491e-03]]], device='xpu:0'), past_key_values=None, hidden_states=None, attentions=None, cross_attentions=None)

In [167]:
text_emb.keys()

odict_keys(['last_hidden_state'])

In [168]:
with torch.no_grad():
    text_emb = model.enc_to_dec_proj(text_emb.last_hidden_state)
text_emb.shape

torch.Size([1, 5, 1024])

#### audio to tokens

In [169]:
with torch.no_grad():
    encoded=model.audio_encoder(input_values=audio_values.xpu())

In [170]:
encoded.audio_codes

tensor([[[[ 648,  315, 1771,  ..., 1788,  773,  801],
          [1519,  971, 1048,  ..., 1954, 1958, 1753],
          [ 924, 1895, 1190,  ..., 2025, 1974, 1878],
          [1628, 1595, 1456,  ..., 1116, 1116, 1409]]]], device='xpu:0')

In [171]:
encoded.keys()

odict_keys(['audio_codes', 'audio_values'])

In [172]:
encoded.audio_codes.shape,text_emb.shape

(torch.Size([1, 1, 4, 500]), torch.Size([1, 5, 1024]))

#### embeddings + tokens to next tokens

In [173]:
with torch.no_grad():
    ans=model.decoder(encoded.audio_codes,encoder_hidden_states=text_emb)

In [174]:
ans.keys() 

odict_keys(['logits', 'past_key_values'])

In [175]:
#note that the answer acts just like any other generative transformer. we have logits and past key values
ans.logits.shape

torch.Size([4, 500, 2048])

In [176]:
model.decoder

MusicgenForCausalLM(
  (model): MusicgenModel(
    (decoder): MusicgenDecoder(
      (embed_tokens): ModuleList(
        (0-3): 4 x Embedding(2049, 1024)
      )
      (embed_positions): MusicgenSinusoidalPositionalEmbedding()
      (layers): ModuleList(
        (0-23): 24 x MusicgenDecoderLayer(
          (self_attn): MusicgenAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=False)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=False)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=False)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=False)
          )
          (activation_fn): GELUActivation()
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (encoder_attn): MusicgenAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=False)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=False)


# playing with codes

to demonstrate how the music tokens are translated to audio we will try and modify them directly

In [370]:
codes=encoded.audio_codes.clone()
codes.shape

torch.Size([1, 1, 4, 500])

In [371]:
type(model.audio_encoder)

transformers.models.encodec.modeling_encodec.EncodecModel

In [372]:
model.decoder.config.audio_channels

1

In [373]:
with torch.no_grad():
    ans=model.audio_encoder.decode(codes,audio_scales=[None])
ans

EncodecDecoderOutput(audio_values=tensor([[[-0.0230, -0.0213, -0.0169,  ...,  0.0069,  0.0074,  0.0118]]],
       device='xpu:0'))

In [374]:
ans.audio_values.shape

torch.Size([1, 1, 320000])

In [375]:
#here we will just see the original audio
Audio(ans.audio_values[0].cpu().numpy(), rate=sampling_rate)

In [376]:
#lets automate this
@torch.no_grad
def codes_to_audio(codes):
    ans=model.audio_encoder.decode(codes,audio_scales=[None])
    return ans.audio_values[0].cpu().numpy()

In [377]:
Audio(codes_to_audio(codes[:,:,:,300:500]), rate=sampling_rate)

In [378]:
#changing a few things barely matters
codes[0,0,0,300]=23
Audio(codes_to_audio(codes[:,:,:,300:500]), rate=sampling_rate)

In [379]:
codes[0,0,0,300:320]=23
Audio(codes_to_audio(codes[:,:,:,300:500]), rate=sampling_rate)

In [380]:
#now we take more of it
codes[0,0,0,300:400]=23
Audio(codes_to_audio(codes[:,:,:,300:500]), rate=sampling_rate)

In [381]:
#now we took over more chanels it will break
codes[0,0,2,300:400]=23
Audio(codes_to_audio(codes[:,:,:,300:500]), rate=sampling_rate)

In [382]:
#start will still sound fine
Audio(codes_to_audio(codes), rate=sampling_rate)

In [383]:
#looking at the effects of chanels
codes=encoded.audio_codes.clone()
codes[0,0,2,300:400]=23
Audio(codes_to_audio(codes[:,:,:,300:500]), rate=sampling_rate)

In [384]:
#looking at the effects of chanels
codes=encoded.audio_codes.clone()
codes[0,0,1,300:400]=23
Audio(codes_to_audio(codes[:,:,:,300:500]), rate=sampling_rate)

In [385]:
#looking at the effects of chanels
codes=encoded.audio_codes.clone()
codes[0,0,3,300:400]=23
Audio(codes_to_audio(codes[:,:,:,300:500]), rate=sampling_rate)