In [21]:

NOTE_SIZE = 128
DUR_SIZE = 160
TIM_SIZE = 1000
VEL_SIZE = 128


NOTE_TOKS = [f'n{i}' for i in range(NOTE_SIZE)] 
DUR_TOKS = [f'd{i}' for i in range(DUR_SIZE)]
TIM_TOKS = [f't{i}' for i in range(TIM_SIZE)]
VEL_TOKS = [f'v{i}' for i in range(VEL_SIZE)]

VOCAB = NOTE_TOKS + DUR_TOKS + TIM_TOKS + VEL_TOKS

DICT = [(element, index) for index, element in enumerate(VOCAB)]


In [22]:
from torchtext.vocab import vocab
from collections import OrderedDict

custom_vocab = vocab(OrderedDict(DICT))
itos_vocab = custom_vocab.get_itos()

In [23]:
import numpy as np
from tqdm import tqdm


In [24]:
from music21 import *
import os


# Load the MIDI file
midi_file = midi.MidiFile()

les_tokens = []

folder_path = "midi_data"  

# Get all the file names in the folder
file_names = os.listdir(folder_path)
for f in tqdm(file_names):
    print(f)
    midi_file = midi.MidiFile()
    midi_file.open("midi_data/" +f)
    midi_file.read()
    midi_file.close()
    # Create a stream from the MIDI file
    stream = midi.translate.midiFileToStream(midi_file)

    # Iterate over the notes in the stream and extract the note information
    last_time = 0

    for note in stream.flat.notes:
        if note.isNote:
            note_pitch = note.pitch.midi
            # A terme il faudra arrondir plutot que de prendre la partie entiere
            note_duration = int(note.duration.quarterLength*4)
            note_offset = int(note.offset*4 - last_time)
            last_time = note.offset*4
            note_velocity = note.volume.velocity
            les_tokens.append(NOTE_TOKS[note_pitch])
            les_tokens.append(DUR_TOKS[note_duration])
            les_tokens.append(TIM_TOKS[note_offset])
            les_tokens.append(VEL_TOKS[note_velocity])
            # print("Note Pitch:", note_pitch)
            # print("Note Duration:", note_duration)
            # print("Note TimeShift:", note_offset)
            # print("Note Velocity:", note_velocity)

        if note.isChord:

            for note2 in note:
                note_pitch = note2.pitch.midi
                note_duration = int(note.duration.quarterLength*4)
                note_offset = int(note.offset*4 - last_time)
                last_time = note.offset*4
                note_velocity = note2.volume.velocity
                les_tokens.append(NOTE_TOKS[note_pitch])
                les_tokens.append(DUR_TOKS[note_duration])
                les_tokens.append(TIM_TOKS[note_offset])
                les_tokens.append(VEL_TOKS[note_velocity])
                # print("Note Pitch:", note_pitch)
                # print("Note Duration:", note_duration)
                # print("Note Time:", note_offset)
                # print("Note Velocity:", note_velocity)
          


  0%|          | 0/3 [00:00<?, ?it/s]

BennyGoodman_TigerRag-1_FINAL.mid


 33%|███▎      | 1/3 [00:00<00:00,  2.46it/s]

BennyGoodman_TigerRag-2_FINAL.mid


100%|██████████| 3/3 [00:00<00:00,  3.86it/s]

BennyGoodman_Whispering_FINAL.mid


100%|██████████| 3/3 [00:00<00:00,  3.51it/s]


In [25]:
#répartir le data du morceau en blocs de 120 attributs (30 notes)
#Et associer à chaque bloc la réponse attendue (l'attribut suivant)

taille_bloc = 120
les_morceaux = []
les_morceaux_shift = []

for i in range(len(les_tokens)//(taille_bloc+1)-1):
    les_morceaux.append(les_tokens[i:i+taille_bloc])
    les_morceaux_shift.append(les_tokens[i+1:i+1+taille_bloc])


def vectorize(i):
    return [0]*i + [1] + [0]* (len(custom_vocab)-i-1)
def unvectorize(v):
    for i in range(len(v)):
        if v[i]:
            return i

input_vect = [ [vectorize(custom_vocab[tok]) for tok in morceau] for morceau in les_morceaux ]
rep_vect = [ [vectorize(custom_vocab[tok]) for tok in morceau] for morceau in les_morceaux_shift ]

In [26]:
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
#from transformers import TransformerDecoder, TransformerDecoderLayer

# Define hyperparameters
batch_size = 32
num_epochs = 5
num_tokens = len(custom_vocab)
embed_dim = len(custom_vocab) #la taille du voc car on fait du one hot
num_heads = 283
hidden_dim = 64
dropout = 0.1

# Define the Transformer model
class TransformerModel(torch.nn.Module):
    def __init__(self):
        super(TransformerModel, self).__init__()
        decoder_layer = torch.nn.TransformerDecoderLayer(d_model=embed_dim, nhead=num_heads, dim_feedforward=hidden_dim, dropout=dropout)
        self.transformer_decoder = torch.nn.TransformerDecoder(decoder_layer, num_layers=6)
        self.decoder = torch.nn.Linear(embed_dim, num_tokens)
        self.init_weights()
        
    def forward(self, x, memory):
        x = x.to(torch.float32)
        x = self.transformer_decoder(x, memory)
        x = self.decoder(x)
        return x
        
    def init_weights(self):
        init_range = 0.1
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-init_range, init_range)
        
# Define the dataset
class MyDataset(Dataset):
    def __init__(self, input_data, output_data):
        self.input_data = input_data
        self.output_data = output_data    
    def __len__(self):
        return len(self.input_data)
    
    def __getitem__(self, idx):
        input_tensor = torch.tensor(self.input_data[idx], dtype=torch.long)
        output_tensor = torch.tensor(self.output_data[idx], dtype=torch.long)
        return input_tensor, output_tensor

# Load the dataset

dataset = MyDataset(input_vect, rep_vect)


# Create a dataloader
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Initialize the model
model = TransformerModel()

# Define the optimizer and loss function
optimizer = torch.optim.Adam(model.parameters())
loss_fn = torch.nn.CrossEntropyLoss()

# Train the model
for epoch in range(num_epochs):
    for batch_idx, batch in enumerate(dataloader):
        inp, rep  = batch
        inp = inp.to(torch.float32)
        rep = rep.to(torch.float32)
        # Create fake encoder outputs
        memory = torch.zeros(batch_size,len(inp[0]), embed_dim).to(inp.device)
        optimizer.zero_grad()
        output = model(inp, memory)
        output_prob = torch.softmax(output, dim=2)
        # Flatten the batch and target tensors for use in the loss function
        output_flat = output[0]

        loss = loss_fn(output_flat, rep[0])
        loss.backward()
        optimizer.step()
        if batch_idx % 10 == 0:
            print("Epoch {:3d}, Batch {:3d}, Loss: {:.4f}".format(epoch, batch_idx, loss.item()))


Epoch   0, Batch   0, Loss: 9.7323
Epoch   1, Batch   0, Loss: 10.3755
Epoch   2, Batch   0, Loss: 9.4902
Epoch   3, Batch   0, Loss: 10.2703
Epoch   4, Batch   0, Loss: 10.0086


In [27]:
example = torch.tensor(input_vect[15], dtype=torch.long).unsqueeze(0) # unsqueeze adds a batch dimension
print(example)


tensor([[[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]]])


In [28]:
custom_vocab[les_morceaux[15][1]]

58

In [29]:
memory = torch.zeros(1, example.size(1), embed_dim)
output = model(example, memory)
predicted_tokens = torch.argmax(output[0], dim=1)
predicted_tokens

tensor([  58,   58,   58,   58,   58,   58,   58,   58,   62,   55,   58,   58,
          58, 1387,   58,   58,   58,   58,   58,   58,   58,   58,   58,   58,
          58, 1385,   62, 1385,   58, 1394,   58,   58,   62,   58,   58,   58,
          58,   58,   58,   58,   58, 1387,   58,   58,   58,   58,   58,  292,
          58,   58,   58,   62,  130,   62,   62,   62,  292,   62,   62,   58,
         292, 1385,  292, 1385,   58, 1394, 1385, 1394,   58,   58,   58,   58,
          58,   58,   58,   58,   62,   58,   58,  292,   58,   58,   62,   58,
          62,   58,   58,   62,   58,   58,   58,   62,   58,   58, 1385,   58,
          58,   58,   58,   58,   58,   62,   58,   58, 1385, 1394,   58,   58,
          58,   58, 1385,   58,   58,   58,   58,   58,   58, 1385,   55,   58])

In [30]:
[itos_vocab[unvectorize(el)] for el in input_vect[15]]

['v116',
 'n59',
 'd4',
 't4',
 'v98',
 'n60',
 'd4',
 't5',
 'v100',
 'n63',
 'd3',
 't4',
 'v104',
 'n63',
 'd1',
 't3',
 'v104',
 'n59',
 'd2',
 't2',
 'v91',
 'n60',
 'd4',
 't3',
 'v98',
 'n63',
 'd4',
 't3',
 'v109',
 'n59',
 'd2',
 't4',
 'v96',
 'n59',
 'd1',
 't2',
 'v96',
 'n60',
 'd4',
 't2',
 'v100',
 'n65',
 'd3',
 't3',
 'v106',
 'n59',
 'd4',
 't4',
 'v98',
 'n60',
 'd2',
 't4',
 'v98',
 'n60',
 'd2',
 't2',
 'v98',
 'n63',
 'd4',
 't2',
 'v101',
 'n53',
 'd3',
 't4',
 'v100',
 'n51',
 'd3',
 't4',
 'v92',
 'n56',
 'd2',
 't4',
 'v99',
 'n56',
 'd4',
 't2',
 'v99',
 'n51',
 'd2',
 't5',
 'v101',
 'n53',
 'd1',
 't3',
 'v86',
 'n56',
 'd2',
 't1',
 'v107',
 'n60',
 'd1',
 't3',
 'v105',
 'n63',
 'd2',
 't1',
 'v105',
 'n63',
 'd2',
 't2',
 'v105',
 'n51',
 'd2',
 't2',
 'v118',
 'n53',
 'd1',
 't2',
 'v98',
 'n56',
 'd2',
 't1',
 'v101',
 'n60',
 'd1',
 't2',
 'v107',
 'n63',
 'd4',
 't1']

In [31]:
[itos_vocab[el] for el in predicted_tokens]

['n59',
 'n59',
 'n59',
 'n59',
 'n59',
 'n59',
 'n59',
 'n59',
 'n63',
 'n56',
 'n59',
 'n59',
 'n59',
 'v100',
 'n59',
 'n59',
 'n59',
 'n59',
 'n59',
 'n59',
 'n59',
 'n59',
 'n59',
 'n59',
 'n59',
 'v98',
 'n63',
 'v98',
 'n59',
 'v107',
 'n59',
 'n59',
 'n63',
 'n59',
 'n59',
 'n59',
 'n59',
 'n59',
 'n59',
 'n59',
 'n59',
 'v100',
 'n59',
 'n59',
 'n59',
 'n59',
 'n59',
 't5',
 'n59',
 'n59',
 'n59',
 'n63',
 'd3',
 'n63',
 'n63',
 'n63',
 't5',
 'n63',
 'n63',
 'n59',
 't5',
 'v98',
 't5',
 'v98',
 'n59',
 'v107',
 'v98',
 'v107',
 'n59',
 'n59',
 'n59',
 'n59',
 'n59',
 'n59',
 'n59',
 'n59',
 'n63',
 'n59',
 'n59',
 't5',
 'n59',
 'n59',
 'n63',
 'n59',
 'n63',
 'n59',
 'n59',
 'n63',
 'n59',
 'n59',
 'n59',
 'n63',
 'n59',
 'n59',
 'v98',
 'n59',
 'n59',
 'n59',
 'n59',
 'n59',
 'n59',
 'n63',
 'n59',
 'n59',
 'v98',
 'v107',
 'n59',
 'n59',
 'n59',
 'n59',
 'v98',
 'n59',
 'n59',
 'n59',
 'n59',
 'n59',
 'n59',
 'v98',
 'n56',
 'n59']