In [1]:
import copy
import librosa
import numpy as np
import pretty_midi
from transformers import Pop2PianoForConditionalGeneration, Pop2PianoProcessor, Pop2PianoTokenizer
from encoder import encode_plus
import sys
sys.path.append("./pop2piano")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import copy
def crop_midi(midi, start_beat, end_beat, extrapolated_beatsteps):
    start = extrapolated_beatsteps[start_beat]
    end = extrapolated_beatsteps[end_beat]
    out = copy.deepcopy(midi)
    for note in out.instruments[0].notes.copy():
        if note.start > end or note.start < start:
            out.instruments[0].notes.remove(note)
        # interpolate index of start note

        # lower = len(extrapolated_beatsteps[extrapolated_beatsteps <= note.start]) - 1
        lower = np.searchsorted(extrapolated_beatsteps, note.start, side='left') - 1
        note.start = lower
        note.start = int(note.start - start_beat)

        lower = np.searchsorted(extrapolated_beatsteps, note.end, side='left') - 1
        # lower = len(extrapolated_beatsteps[extrapolated_beatsteps <= note.end]) - 1
        note.end = lower
        note.end = int(note.end - start_beat)
        if note.end == note.start:
            note.end += 1
    return out

In [3]:
model = Pop2PianoForConditionalGeneration.from_pretrained("./cache/model")
processor = Pop2PianoProcessor.from_pretrained("./cache/processor")
tokenizer = Pop2PianoTokenizer.from_pretrained("./cache/tokenizer")

print("Loaded pretrained model, processor, and tokenizer.")
# cache the model, processor, and tokenizer to avoid downloading them again
# model.save_pretrained("./cache/model")
# processor.save_pretrained("./cache/processor")
# tokenizer.save_pretrained("./cache/tokenizer")



Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model, processor, and tokenizer.


In [32]:
model

Pop2PianoForConditionalGeneration(
  (shared): Embedding(2400, 512)
  (mel_conditioner): Pop2PianoConcatEmbeddingToMel(
    (embedding): Embedding(21, 512)
  )
  (encoder): Pop2PianoStack(
    (embed_tokens): Embedding(2400, 512)
    (block): ModuleList(
      (0): Pop2PianoBlock(
        (layer): ModuleList(
          (0): Pop2PianoLayerSelfAttention(
            (SelfAttention): Pop2PianoAttention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): Pop2PianoLayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): Pop2PianoLayerFF(
            (DenseReluDense): Pop2PianoDenseGatedActDense(
              (wi_0): Linear

In [40]:
for name, parameter in model.named_parameters():
    if any([layer in name for layer in ["block.5.layer.2.DenseReluDense.wo", "decoder.final_layer_norm", "lm_head"]]):
      print(name)

decoder.block.5.layer.2.DenseReluDense.wo.weight
decoder.final_layer_norm.weight
lm_head.weight


In [5]:
# load an example audio file and corresponding ground truth midi file
audio_path = "./processed/audio/Aerosmith - Same Old Song & Dance.ogg"
# audio_path = "./processed/audio/Aerosmith - Same Old Song & Dance.ogg"
audio, sr = librosa.load(audio_path, sr=44100)  # feel free to change the sr to a suitable value.

# convert the audio file to tokens
inputs = processor(audio=audio, sampling_rate=sr, return_tensors="pt")


# load ground truth midi file
# midi = pretty_midi.PrettyMIDI("./processed/midi/Mountain - Mississippi Queen.mid")
# ground_truth_midi_path = "./processed/midi/Mountain - Mississippi Queen.mid"
# ground_truth_midi_path = "mountain_out_gen.mid"
ground_truth_midi_path = "./processed/piano_midi/Aerosmith - Same Old Song & Dance.mid"
midi = pretty_midi.PrettyMIDI(ground_truth_midi_path)



In [6]:
inputs.beatsteps[0].shape

torch.Size([823])

In [7]:
# # convert the midi file to tokens
batches = [crop_midi(midi, i, i+8, inputs.extrapolated_beatstep[0]).instruments[0].notes for i in range(2, len(inputs.extrapolated_beatstep[0])-10, 8)]
# # remove empty batches
# batches = [batch for batch in batches if len(batch) > 0]

In [8]:
batches[-4]

[Note(start=3.000000, end=5.000000, pitch=47, velocity=77),
 Note(start=1.000000, end=5.000000, pitch=52, velocity=77),
 Note(start=5.000000, end=9.000000, pitch=47, velocity=77),
 Note(start=5.000000, end=9.000000, pitch=52, velocity=77),
 Note(start=5.000000, end=9.000000, pitch=59, velocity=77),
 Note(start=5.000000, end=9.000000, pitch=64, velocity=77),
 Note(start=5.000000, end=9.000000, pitch=68, velocity=77),
 Note(start=5.000000, end=9.000000, pitch=71, velocity=77),
 Note(start=5.000000, end=10.000000, pitch=40, velocity=77)]

In [9]:
len(inputs.beatsteps[0])

823

In [10]:
711/8

88.875

In [11]:
len(batches)

103

In [45]:
labels = []
offset = 0
for batch in batches:
    # print(f"outer offset: {offset}")
    label, offset = encode_plus(tokenizer, batch, return_tensors="pt", time_offset=0)
    print(label)
    labels.append(label["token_ids"])
labels = [np.append([0], np.append(label, [1, 0])) for label in labels]
gt_longest_length = max([len(label) for label in labels])
model_output = model.generate(inputs["input_features"], generation_config=model.generation_config, return_dict_in_generate=True, output_logits=True, min_new_tokens=gt_longest_length)
longest_length = len(model_output.sequences[0])
padded_labels = np.array([np.pad(label, (0, longest_length - len(label))) for label in labels])


{'token_ids': [134, 133, 56, 135, 132, 56, 133, 56, 63, 135, 132, 56, 63, 133, 56, 63, 56, 63, 135, 132, 56, 63, 56, 63, 133, 56, 63, 135, 132, 56, 63, 133, 56, 63, 135, 132, 56, 63, 133, 56, 63, 135, 132, 56, 63, 135, 133, 63, 135, 132, 63]}
{'token_ids': [134, 133, 66, 56, 135, 132, 66, 133, 66, 59, 63, 135, 132, 56, 66, 133, 66, 56, 135, 132, 59, 63, 66, 133, 59, 63, 66, 135, 132, 56, 133, 56, 135, 132, 59, 63, 133, 59, 63, 135, 132, 56, 133, 56, 135, 132, 56, 59, 63, 66, 133, 44, 56, 135, 132, 44, 56]}
{'token_ids': [134, 133, 44, 56, 135, 132, 44, 56, 133, 46, 58, 66, 135, 132, 46, 58, 66, 133, 47, 59, 66, 135, 132, 47, 59, 133, 47, 59, 135, 132, 47, 59, 133, 47, 59, 135, 132, 47, 59, 133, 47, 59, 71, 135, 132, 47, 59, 71, 135, 66, 133, 78, 137, 132, 78]}
{'token_ids': [134, 133, 73, 135, 132, 73, 133, 73, 135, 78, 135, 132, 73, 78, 133, 71, 78, 44, 56, 135, 132, 71, 78, 133, 71, 78, 135, 132, 71, 78, 133, 71, 78, 135, 132, 71, 78, 133, 71, 78, 135, 132, 71, 78, 133, 78, 135, 132,

KeyboardInterrupt: 

In [44]:
labels

[array([  0, 134, 133,  56, 135, 132,  56, 133,  56,  63, 135, 132,  56,
         63, 133,  56,  63,  56,  63, 135, 132,  56,  63,  56,  63, 133,
         56,  63, 135, 132,  56,  63, 133,  56,  63, 135, 132,  56,  63,
        133,  56,  63, 135, 132,  56,  63, 135, 133,  63, 135, 132,  63,
          1,   0]),
 array([  0, 134, 133,  66,  56, 135, 132,  66, 133,  66,  59,  63, 135,
        132,  56,  66, 133,  66,  56, 135, 132,  59,  63,  66, 133,  59,
         63,  66, 135, 132,  56, 133,  56, 135, 132,  59,  63, 133,  59,
         63, 135, 132,  56, 133,  56, 135, 132,  56,  59,  63,  66, 133,
         44,  56, 135, 132,  44,  56,   1,   0]),
 array([  0, 134, 133,  44,  56, 135, 132,  44,  56, 133,  46,  58,  66,
        135, 132,  46,  58,  66, 133,  47,  59,  66, 135, 132,  47,  59,
        133,  47,  59, 135, 132,  47,  59, 133,  47,  59, 135, 132,  47,
         59, 133,  47,  59,  71, 135, 132,  47,  59,  71, 135,  66, 133,
         78, 137, 132,  78,   1,   0]),
 array([  0, 1

In [13]:
gt_longest_length

84

In [14]:
padded_labels.shape

(103, 162)

In [15]:
def one_hot_convert(t_labels, vocab_size):
    # Your vocabulary size
    vocab_size = 2400

    # Create a tensor to hold the one-hot encoded versions
    one_hot_tensor = torch.zeros((*t_labels.shape, vocab_size))

    # Iterate over each element of the original tensor
    for i in range(t_labels.size(0)):
        for j in range(t_labels.size(1)):
            # Get the value from the original tensor
            value = int(t_labels[i, j])
            # One-hot encode the value
            one_hot = torch.zeros(vocab_size)
            one_hot[value] = 1
            # Assign it to the corresponding position in the new tensor
            one_hot_tensor[i, j] = one_hot
    return one_hot_tensor

In [16]:
from torch.nn import CrossEntropyLoss
import torch
loss_fct = CrossEntropyLoss()
logits = torch.stack(model_output.logits).transpose(0,1)
# logits = torch.nan_to_num(logits, nan=0.0, posinf=5, neginf=-5)
print(logits.transpose(0,1).shape)
t_labels = torch.tensor(padded_labels)
t_labels = t_labels[:,1:]
one_hot = one_hot_convert(t_labels, 2400)
# generate one hot from t_labels

# print(t_labels.shape)
loss = loss_fct(logits, one_hot)

torch.Size([161, 103, 2400])


In [17]:
infs = logits==float('-inf')
infs[0][80:86]

tensor([[False, False, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False]])

In [18]:
loss

tensor(1.3789)

In [24]:
model.enable_input_require_grads()
model.zero_grad()
loss = loss_fct(logits, one_hot)
loss.backward()

RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn

In [19]:
# tokenizer.num_bars = 2
# output = tokenizer.batch_decode(np.array(padded_labels),feature_extractor_output=inputs)

In [20]:
# output['pretty_midi_objects'][0].write("mountain_out_sanity_check.mid")

In [27]:
np.save("padded_labels.npy", [[1,2,3],4])

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (2,) + inhomogeneous part.

In [26]:
np.load("padded_labels.npy")

array([[  0., 134., 133., ...,   0.,   0.,   0.],
       [  0., 134., 133., ...,   0.,   0.,   0.],
       [  0., 134., 133., ...,   0.,   0.,   0.],
       ...,
       [  0.,   1.,   0., ...,   0.,   0.,   0.],
       [  0.,   1.,   0., ...,   0.,   0.,   0.],
       [  0.,   1.,   0., ...,   0.,   0.,   0.]])