In [41]:
import copy
import librosa
import numpy as np
import pretty_midi
from transformers import Pop2PianoForConditionalGeneration, Pop2PianoProcessor, Pop2PianoTokenizer
from encoder import encode_plus
import sys
sys.path.append("./pop2piano")

In [42]:
import copy
def crop_midi(midi, start_beat, end_beat, extrapolated_beatsteps):
    start = extrapolated_beatsteps[start_beat]
    end = extrapolated_beatsteps[end_beat]
    out = copy.deepcopy(midi)
    for note in out.instruments[0].notes.copy():
        if note.start > end or note.start < start:
            out.instruments[0].notes.remove(note)
        # interpolate index of start note

        lower = np.argmax(extrapolated_beatsteps[extrapolated_beatsteps <= note.start])
        note.start = lower
        note.start = int(note.start - start_beat)

        lower = np.argmax(extrapolated_beatsteps[extrapolated_beatsteps <= note.end])
        note.end = lower
        note.end = int(note.end - start_beat)
        if note.end == note.start:
            note.end += 1
    return out

In [43]:
model = Pop2PianoForConditionalGeneration.from_pretrained("./cache/model")
processor = Pop2PianoProcessor.from_pretrained("./cache/processor")
tokenizer = Pop2PianoTokenizer.from_pretrained("./cache/tokenizer")

print("Loaded pretrained model, processor, and tokenizer.")
# cache the model, processor, and tokenizer to avoid downloading them again
# model.save_pretrained("./cache/model")
# processor.save_pretrained("./cache/processor")
# tokenizer.save_pretrained("./cache/tokenizer")



Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model, processor, and tokenizer.


In [44]:
# load an example audio file and corresponding ground truth midi file
audio_path = "./processed/audio/Mountain - Mississippi Queen.ogg"
# audio_path = "./processed/audio/Aerosmith - Same Old Song & Dance.ogg"
audio, sr = librosa.load(audio_path, sr=44100)  # feel free to change the sr to a suitable value.

# convert the audio file to tokens
inputs = processor(audio=audio, sampling_rate=sr, return_tensors="pt")


# load ground truth midi file
# midi = pretty_midi.PrettyMIDI("./processed/midi/Mountain - Mississippi Queen.mid")
# ground_truth_midi_path = "./processed/midi/Mountain - Mississippi Queen.mid"
# ground_truth_midi_path = "mountain_out_gen.mid"
ground_truth_midi_path = "./processed/piano_midi/Aerosmith - Same Old Song & Dance.mid"
midi = pretty_midi.PrettyMIDI(ground_truth_midi_path)



In [45]:
inputs.beatsteps[0].shape

torch.Size([711])

In [46]:
# # convert the midi file to tokens
batches = [crop_midi(midi, i, i+8, inputs.extrapolated_beatstep[0]).instruments[0].notes for i in range(2, len(inputs.extrapolated_beatstep[0])-10, 8)]
# # remove empty batches
# batches = [batch for batch in batches if len(batch) > 0]

In [47]:
batches[-4]

[Note(start=0.000000, end=1.000000, pitch=74, velocity=77),
 Note(start=1.000000, end=2.000000, pitch=74, velocity=77),
 Note(start=2.000000, end=4.000000, pitch=43, velocity=77),
 Note(start=2.000000, end=4.000000, pitch=55, velocity=77),
 Note(start=2.000000, end=4.000000, pitch=62, velocity=77),
 Note(start=2.000000, end=4.000000, pitch=67, velocity=77),
 Note(start=2.000000, end=4.000000, pitch=69, velocity=77),
 Note(start=4.000000, end=5.000000, pitch=55, velocity=77),
 Note(start=5.000000, end=6.000000, pitch=55, velocity=77),
 Note(start=6.000000, end=7.000000, pitch=57, velocity=77),
 Note(start=6.000000, end=8.000000, pitch=45, velocity=77),
 Note(start=7.000000, end=8.000000, pitch=57, velocity=77)]

In [48]:
len(inputs.beatsteps[0])

711

In [49]:
711/8

88.875

In [50]:
len(batches)

89

In [51]:
model_output = model.generate(inputs["input_features"], generation_config=model.generation_config, return_dict_in_generate=True, output_logits=True, )  

In [52]:
labels = []
offset = 0
for batch in batches:
    # print(f"outer offset: {offset}")
    label, offset = encode_plus(tokenizer, batch, return_tensors="pt", time_offset=0)        
    labels.append(label["token_ids"])
labels = [np.append([0], np.append(label, [1, 0])) for label in labels]
# longest_length = max([len(label) for label in labels])
longest_length = len(model_output.sequences[0])
padded_labels = np.array([np.pad(label, (0, longest_length - len(label))) for label in labels])
print(padded_labels[2])



[  0 134 133  56 135 132  56 133  44  56 135 132  44  56 133  44  56 135
 132  44  56 133  46  58  66 135 132  46  58  66 133  47  59  66 135 132
  47  59 133  47  59 135 132  47  59 133  47  59 135 132  47  59 133  47
  59  71 136 132  47  59  71 135  66   1   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0]


In [53]:
def one_hot_convert(t_labels, vocab_size):
    # Your vocabulary size
    vocab_size = 2400

    # Create a tensor to hold the one-hot encoded versions
    one_hot_tensor = torch.zeros((*t_labels.shape, vocab_size))

    # Iterate over each element of the original tensor
    for i in range(t_labels.size(0)):
        for j in range(t_labels.size(1)):
            # Get the value from the original tensor
            value = int(t_labels[i, j])
            # One-hot encode the value
            one_hot = torch.zeros(vocab_size)
            one_hot[value] = 1
            # Assign it to the corresponding position in the new tensor
            one_hot_tensor[i, j] = one_hot
    return one_hot_tensor

In [54]:
from torch.nn import CrossEntropyLoss
import torch
loss_fct = CrossEntropyLoss()
logits = torch.stack(model_output.logits)
print(logits.transpose(0,1).shape)
t_labels = torch.tensor(padded_labels)
t_labels = t_labels[:,1:]
one_hot = one_hot_convert(t_labels, 2400)
# generate one hot from t_labels

# print(t_labels.shape)
loss = loss_fct(logits.transpose(0,1), one_hot)

torch.Size([89, 107, 2400])


In [55]:
loss

tensor(0.5809)