In [1]:
from miditok import REMI, TokenizerConfig
from miditok.pytorch_data import DatasetMIDI, DataCollator
from transformers import DataCollatorForLanguageModeling
from miditok.utils import split_files_for_training
from torch.utils.data import DataLoader
from pathlib import Path
from random import shuffle

  from .autonotebook import tqdm as notebook_tqdm

A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.1.1 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/home/maximos/miniconda3/envs/torch/lib/python3.12/site-packages/ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "/home/maximos/miniconda3/envs/torch/lib/python3.12/site-packages/traitlets/config/application.py", line 1075, in launch_instance
    app.start()
  File "/home/maximos/miniconda3/envs/torch/lib/python3.12/site-packages

In [2]:
# Load the tokenizer
prefix = '/media/maindisk/maximos/data/GiantMIDI-PIano/midis_v1.2/aug/midis'
saved_tokenizer_path = f'{prefix}_REMI_BPE_tokenizer.json'
path_to_dataset = prefix
path_to_train_splits = f'{prefix}_splits_REMI_BPE/train/midis'
path_to_valid_splits = f'{prefix}_splits_REMI_BPE/valid/midis'

max_seq_len = 1024

tokenizer = REMI(params=Path(saved_tokenizer_path))

tokenizer.pad_token = tokenizer.special_tokens[0]
tokenizer.mask_token = tokenizer.special_tokens[1]

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm_probability=0.15
)

In [3]:
print(tokenizer)

10000 tokens with ('I', 'T') io format, trained with BPE


In [4]:
tokenizer.pad_token = tokenizer.special_tokens[0]

In [5]:
tokenizer.special_tokens

['PAD_None', 'MASK_None', 'BOS_None', 'EOS_None']

In [6]:
files_paths = list(Path(path_to_dataset).glob("**/*.mid"))
shuffle(files_paths)

In [7]:
total_num_files = len(files_paths)
num_files_valid = round(total_num_files * 0.10)

In [8]:
print(total_num_files, num_files_valid)

159141 15914


In [9]:
midi_paths_valid = files_paths[:num_files_valid]
midi_paths_train = files_paths[num_files_valid:]

In [10]:
# Split MIDIs into smaller chunks for validation
dataset_chunks_dir = Path(path_to_valid_splits)
split_files_for_training(
    files_paths=midi_paths_valid,
    tokenizer=tokenizer,
    save_dir=dataset_chunks_dir,
    max_seq_len=max_seq_len,
)

Splitting music files (/media/maindisk/maximos/data/GiantMIDI-PIano/midis_v1.2/aug/midis_splits_REMI_BPE/valid/midis): 100%|██████████| 15914/15914 [00:40<00:00, 388.53it/s]


[PosixPath('/media/maindisk/maximos/data/GiantMIDI-PIano/midis_v1.2/aug/midis_splits_REMI_BPE/valid/midis/Benjamin, Arthur, Forest Peace, lQAut6zwn5k#p-4_0.mid'),
 PosixPath('/media/maindisk/maximos/data/GiantMIDI-PIano/midis_v1.2/aug/midis_splits_REMI_BPE/valid/midis/Benjamin, Arthur, Forest Peace, lQAut6zwn5k#p-4_1.mid'),
 PosixPath('/media/maindisk/maximos/data/GiantMIDI-PIano/midis_v1.2/aug/midis_splits_REMI_BPE/valid/midis/Arensky, Anton, 4 Etudes, Op.41, 0VqP0SKq5kY#p-3_0.mid'),
 PosixPath('/media/maindisk/maximos/data/GiantMIDI-PIano/midis_v1.2/aug/midis_splits_REMI_BPE/valid/midis/Arensky, Anton, 4 Etudes, Op.41, 0VqP0SKq5kY#p-3_1.mid'),
 PosixPath('/media/maindisk/maximos/data/GiantMIDI-PIano/midis_v1.2/aug/midis_splits_REMI_BPE/valid/midis/Vrangel, Vasily, 6 Pieces, Op.1, H_hMS16cVfA#p2_0.mid'),
 PosixPath('/media/maindisk/maximos/data/GiantMIDI-PIano/midis_v1.2/aug/midis_splits_REMI_BPE/valid/midis/Vrangel, Vasily, 6 Pieces, Op.1, H_hMS16cVfA#p2_1.mid'),
 PosixPath('/media/m

In [11]:

# Split MIDIs into smaller chunks for training
dataset_chunks_dir = Path(path_to_train_splits)
split_files_for_training(
    files_paths=midi_paths_train,
    tokenizer=tokenizer,
    save_dir=dataset_chunks_dir,
    max_seq_len=max_seq_len,
)

Splitting music files (/media/maindisk/maximos/data/GiantMIDI-PIano/midis_v1.2/aug/midis_splits_REMI_BPE/train/midis): 100%|██████████| 143227/143227 [06:16<00:00, 380.05it/s]


[PosixPath("/media/maindisk/maximos/data/GiantMIDI-PIano/midis_v1.2/aug/midis_splits_REMI_BPE/train/midis/Czerny, Carl, Impromptu sentimental sur 'O nume benefico', Op.523, pz1lJvlddr4#v-4_0.mid"),
 PosixPath("/media/maindisk/maximos/data/GiantMIDI-PIano/midis_v1.2/aug/midis_splits_REMI_BPE/train/midis/Czerny, Carl, Impromptu sentimental sur 'O nume benefico', Op.523, pz1lJvlddr4#v-4_1.mid"),
 PosixPath("/media/maindisk/maximos/data/GiantMIDI-PIano/midis_v1.2/aug/midis_splits_REMI_BPE/train/midis/Czerny, Carl, Impromptu sentimental sur 'O nume benefico', Op.523, pz1lJvlddr4#v-4_2.mid"),
 PosixPath("/media/maindisk/maximos/data/GiantMIDI-PIano/midis_v1.2/aug/midis_splits_REMI_BPE/train/midis/Czerny, Carl, Impromptu sentimental sur 'O nume benefico', Op.523, pz1lJvlddr4#v-4_3.mid"),
 PosixPath("/media/maindisk/maximos/data/GiantMIDI-PIano/midis_v1.2/aug/midis_splits_REMI_BPE/train/midis/Czerny, Carl, Impromptu sentimental sur 'O nume benefico', Op.523, pz1lJvlddr4#v-4_4.mid"),
 PosixPath

In [12]:
from transformers import AutoModelForMaskedLM, TrainingArguments, Trainer

In [13]:
train_dataset = DatasetMIDI(
    files_paths=Path(path_to_train_splits).glob('**/*.mid'),
    tokenizer=tokenizer,
    max_seq_len=1024,
    bos_token_id=tokenizer["BOS_None"],
    eos_token_id=tokenizer["EOS_None"],
)
valid_dataset = DatasetMIDI(
    files_paths=Path(path_to_valid_splits).glob('**/*.mid'),
    tokenizer=tokenizer,
    max_seq_len=1024,
    bos_token_id=tokenizer["BOS_None"],
    eos_token_id=tokenizer["EOS_None"],
)
collator = DataCollator(pad_token_id=tokenizer.pad_token_id, copy_inputs_as_labels=True)
# dataloader = DataLoader(dataset, batch_size=64, collate_fn=collator)


In [14]:
print(train_dataset[0])

{'input_ids': tensor([   2, 1312,  185,  ..., 2052, 8328,  217])}
