In [10]:
from pathlib import Path
from random import shuffle

from evaluate import load as load_metric
from miditok import REMI, TokenizerConfig
from miditok.pytorch_data import DatasetMIDI, DataCollator
from miditok.utils import split_files_for_training
from miditok.data_augmentation import augment_dataset
from torch import Tensor, argmax
from torch.utils.data import DataLoader
from torch.cuda import is_available as cuda_available, is_bf16_supported
from torch.backends.mps import is_available as mps_available
from transformers import (
    AutoModelForCausalLM,
    GPT2Config,
    Trainer,
    TrainingArguments,
)
from transformers.trainer_utils import set_seed

In [2]:
# Seed
set_seed(777)

# Our tokenizer's configuration
BEAT_RES = {(0, 1): 12, (1, 2): 4, (2, 4): 2, (4, 8): 1}
TOKENIZER_PARAMS = {
    "pitch_range": (21, 109),
    "beat_res": BEAT_RES,
    "num_velocities": 24,
    "special_tokens": ["PAD", "BOS", "EOS"],
    "use_chords": True,
    "use_rests": True,
    "use_tempos": True,
    "use_time_signatures": True,
    "use_programs": False,  # no multitrack here
    "num_tempos": 32,
    "tempo_range": (50, 200),  # (min_tempo, max_tempo)
}
config = TokenizerConfig(**TOKENIZER_PARAMS)

# Creates the tokenizer
tokenizer = REMI(config)

# Trains the tokenizer with Byte Pair Encoding (BPE) to build the vocabulary, here 30k tokens
midi_paths = list(Path("../data/adl-piano-midi").resolve().glob("**/*.mid")) + list(
    Path("../data/adl-piano-midi").resolve().glob("**/*.midi")
)
print(midi_paths)

[WindowsPath('C:/Users/mikol/MiniBach/minibach/data/adl-piano-midi/Ambient/Ambient/Roger Eno/Sunburst (Album Version).mid'), WindowsPath('C:/Users/mikol/MiniBach/minibach/data/adl-piano-midi/Ambient/Ambient/Roger Eno/While The City Sleeps (Album Version).mid'), WindowsPath('C:/Users/mikol/MiniBach/minibach/data/adl-piano-midi/Ambient/Ambient Psychill/Dhamika/Forever Free.mid'), WindowsPath('C:/Users/mikol/MiniBach/minibach/data/adl-piano-midi/Ambient/Asmr/Factory/Lagt Kort Ligger.mid'), WindowsPath('C:/Users/mikol/MiniBach/minibach/data/adl-piano-midi/Ambient/Asmr/Factory/Paula_ Tva Ar.mid'), WindowsPath('C:/Users/mikol/MiniBach/minibach/data/adl-piano-midi/Ambient/Calming Instrumental/Elisa/Come Speak To Me.mid'), WindowsPath('C:/Users/mikol/MiniBach/minibach/data/adl-piano-midi/Ambient/Calming Instrumental/Elisa/Rainbow.mid'), WindowsPath('C:/Users/mikol/MiniBach/minibach/data/adl-piano-midi/Ambient/Calming Instrumental/Elisa/Tell Me.mid'), WindowsPath('C:/Users/mikol/MiniBach/miniba

In [3]:
tokenizer.train(
    vocab_size=30000,
    files_paths=midi_paths,
)
tokenizer.save_params("tokenizer.json")

  tokenizer.save_params("tokenizer.json")


In [4]:
# Split MIDI paths in train/valid/test sets
total_num_files = len(midi_paths)
num_files_valid = round(total_num_files * 0.15)
num_files_test = round(total_num_files * 0.15)
shuffle(midi_paths)
midi_paths_valid = midi_paths[:num_files_valid]
midi_paths_test = midi_paths[num_files_valid : num_files_valid + num_files_test]
midi_paths_train = midi_paths[num_files_valid + num_files_test :]

# Chunk MIDIs and perform data augmentation on each subset independently
for files_paths, subset_name in (
    (midi_paths_train, "train"),
    (midi_paths_valid, "valid"),
    (midi_paths_test, "test"),
):

    # Split the MIDIs into chunks of sizes approximately about 1024 tokens
    subset_chunks_dir = Path(f"Maestro_{subset_name}")
    split_files_for_training(
        files_paths=files_paths,
        tokenizer=tokenizer,
        save_dir=subset_chunks_dir,
        max_seq_len=1024,
        num_overlap_bars=2,
    )

    # # Perform data augmentation
    # augment_dataset(
    #     subset_chunks_dir,
    #     pitch_offsets=[-12, 12],
    #     velocity_offsets=[-4, 4],
    #     duration_offsets=[-0.5, 0.5],
    # )

# Create Dataset and Collator for training
midi_paths_train = list(Path("Maestro_train").glob("**/*.mid")) + list(
    Path("Maestro_train").glob("**/*.midi")
)
midi_paths_valid = list(Path("Maestro_valid").glob("**/*.mid")) + list(
    Path("Maestro_valid").glob("**/*.midi")
)
midi_paths_test = list(Path("Maestro_test").glob("**/*.mid")) + list(
    Path("Maestro_test").glob("**/*.midi")
)
kwargs_dataset = {
    "max_seq_len": 1024,
    "tokenizer": tokenizer,
    "bos_token_id": tokenizer["BOS_None"],
    "eos_token_id": tokenizer["EOS_None"],
}
dataset_train = DatasetMIDI(midi_paths_train, **kwargs_dataset)
dataset_valid = DatasetMIDI(midi_paths_valid, **kwargs_dataset)
dataset_test = DatasetMIDI(midi_paths_test, **kwargs_dataset)

Splitting music files (Maestro_train): 100%|██████████| 7754/7754 [00:32<00:00, 239.56it/s]
Splitting music files (Maestro_valid): 100%|██████████| 1661/1661 [00:14<00:00, 112.81it/s]
Splitting music files (Maestro_test): 100%|██████████| 1661/1661 [00:27<00:00, 61.32it/s]


In [5]:
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [21]:
# model_config = MistralConfig(
#     vocab_size=len(tokenizer),
#     hidden_size=512,
#     intermediate_size=2048,
#     num_hidden_layers=8,
#     num_attention_heads=8,
#     num_key_value_heads=4,
#     sliding_window=256,
#     max_position_embeddings=8192,
#     pad_token_id=tokenizer["PAD_None"],
#     bos_token_id=tokenizer["BOS_None"],
#     eos_token_id=tokenizer["EOS_None"],
# )

model_config = GPT2Config(
    vocab_size=len(tokenizer),
    n_embd=256,
    n_inner=1024,
    n_layer=4,
    n_head=4,
    n_positions=1024,
    n_ctx=1024,
    pad_token_id=tokenizer["PAD_None"],
    bos_token_id=tokenizer["BOS_None"],
    eos_token_id=tokenizer["EOS_None"],
)

model = AutoModelForCausalLM.from_config(model_config)

Instantiating GPT2LMHeadModel model under default dtype torch.float32.
Generate config GenerationConfig {
  "bos_token_id": 1,
  "eos_token_id": 2,
  "pad_token_id": 0
}



In [19]:
metrics = {metric: load_metric(metric) for metric in ["accuracy"]}


def compute_metrics(eval_pred):
    """
    Compute metrics for pretraining.

    Must use preprocess_logits function that converts logits to predictions (argmax or sampling).

    :param eval_pred: EvalPrediction containing predictions and labels
    :return: metrics
    """
    predictions, labels = eval_pred
    not_pad_mask = labels != -100
    labels, predictions = labels[not_pad_mask], predictions[not_pad_mask]
    return metrics["accuracy"].compute(
        predictions=predictions.flatten(), references=labels.flatten()
    )


def preprocess_logits(logits: Tensor, _: Tensor) -> Tensor:
    """
    Preprocess the logits before accumulating them during evaluation.

    This allows to significantly reduce the memory usage and make the training tractable.
    """
    pred_ids = argmax(logits, dim=-1)  # long dtype
    return pred_ids

In [22]:
# Create config for the Trainer
USE_CUDA = cuda_available()
if not cuda_available():
    FP16 = FP16_EVAL = BF16 = BF16_EVAL = False
elif is_bf16_supported():
    BF16 = BF16_EVAL = True
    FP16 = FP16_EVAL = False
else:
    BF16 = BF16_EVAL = False
    FP16 = FP16_EVAL = True
USE_MPS = not USE_CUDA and mps_available()
training_config = TrainingArguments(
    "runs",
    True,
    True,
    True,
    False,
    "steps",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=2,
    eval_accumulation_steps=None,
    eval_steps=500,
    learning_rate=1e-4,
    weight_decay=0.01,
    max_grad_norm=3.0,
    max_steps=10000,
    lr_scheduler_type="cosine_with_restarts",
    warmup_ratio=0.3,
    log_level="debug",
    logging_strategy="steps",
    logging_steps=20,
    save_strategy="steps",
    save_steps=1000,
    save_total_limit=5,
    no_cuda=not USE_CUDA,
    seed=444,
    fp16=FP16,
    fp16_full_eval=FP16_EVAL,
    bf16=BF16,
    bf16_full_eval=BF16_EVAL,
    load_best_model_at_end=True,
    label_smoothing_factor=0.0,
    optim="adamw_torch",
    report_to=["tensorboard"],
    gradient_checkpointing=True,
)

collator = DataCollator(tokenizer["PAD_None"], copy_inputs_as_labels=True)
trainer = Trainer(
    model=model,
    args=training_config,
    data_collator=collator,
    train_dataset=dataset_train,
    eval_dataset=dataset_valid,
    compute_metrics=compute_metrics,
    callbacks=None,
    preprocess_logits_for_metrics=preprocess_logits,
)

# Training
train_result = trainer.train()
trainer.save_model()
trainer.log_metrics("train", train_result.metrics)
trainer.save_metrics("train", train_result.metrics)
trainer.save_state()

PyTorch: setting up devices
max_steps is given, it will override any value given in num_train_epochs
Using auto half precision backend
Currently training with a batch size of: 8
***** Running training *****
  Num examples = 19,142
  Num Epochs = 9
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 2
  Total optimization steps = 10,000
  Number of trainable parameters = 11,101,696


  0%|          | 0/10000 [00:00<?, ?it/s]

{'loss': 10.2824, 'grad_norm': 2.6315062046051025, 'learning_rate': 6.666666666666667e-07, 'epoch': 0.02}
{'loss': 10.2765, 'grad_norm': 2.503431558609009, 'learning_rate': 1.3333333333333334e-06, 'epoch': 0.03}
{'loss': 10.2832, 'grad_norm': 2.53613543510437, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.05}
{'loss': 10.2768, 'grad_norm': 2.238269567489624, 'learning_rate': 2.666666666666667e-06, 'epoch': 0.07}
{'loss': 10.2663, 'grad_norm': 2.315215587615967, 'learning_rate': 3.3333333333333333e-06, 'epoch': 0.08}
{'loss': 10.2537, 'grad_norm': 2.355879783630371, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.1}
{'loss': 10.2437, 'grad_norm': 2.255587577819824, 'learning_rate': 4.666666666666667e-06, 'epoch': 0.12}
{'loss': 10.2244, 'grad_norm': 1.6200621128082275, 'learning_rate': 5.333333333333334e-06, 'epoch': 0.13}
{'loss': 10.2059, 'grad_norm': 1.5989787578582764, 'learning_rate': 6e-06, 'epoch': 0.15}
{'loss': 10.196, 'grad_norm': 1.665369987487793, 'learning_rate': 6


***** Running Evaluation *****
  Num examples = 13571
  Batch size = 16


{'loss': 9.7543, 'grad_norm': 1.3393068313598633, 'learning_rate': 1.6666666666666667e-05, 'epoch': 0.42}


  0%|          | 0/849 [00:00<?, ?it/s]

{'eval_loss': 9.734908103942871, 'eval_accuracy': 0.021322979164752065, 'eval_runtime': 177.3796, 'eval_samples_per_second': 76.508, 'eval_steps_per_second': 4.786, 'epoch': 0.42}
{'loss': 9.7344, 'grad_norm': 1.2173593044281006, 'learning_rate': 1.7333333333333336e-05, 'epoch': 0.43}
{'loss': 9.6943, 'grad_norm': 1.2339277267456055, 'learning_rate': 1.8e-05, 'epoch': 0.45}
{'loss': 9.6318, 'grad_norm': 1.3974556922912598, 'learning_rate': 1.866666666666667e-05, 'epoch': 0.47}
{'loss': 9.608, 'grad_norm': 1.4609849452972412, 'learning_rate': 1.9333333333333333e-05, 'epoch': 0.48}
{'loss': 9.5645, 'grad_norm': 1.3278007507324219, 'learning_rate': 2e-05, 'epoch': 0.5}
{'loss': 9.5475, 'grad_norm': 1.3203685283660889, 'learning_rate': 2.0666666666666666e-05, 'epoch': 0.52}
{'loss': 9.5203, 'grad_norm': 1.120071530342102, 'learning_rate': 2.1333333333333335e-05, 'epoch': 0.53}
{'loss': 9.4909, 'grad_norm': 1.2433277368545532, 'learning_rate': 2.2000000000000003e-05, 'epoch': 0.55}
{'loss':


***** Running Evaluation *****
  Num examples = 13571
  Batch size = 16


{'loss': 9.1617, 'grad_norm': 1.0239696502685547, 'learning_rate': 3.3333333333333335e-05, 'epoch': 0.84}


  0%|          | 0/849 [00:00<?, ?it/s]

Saving model checkpoint to runs\checkpoint-1000
Configuration saved in runs\checkpoint-1000\config.json
Configuration saved in runs\checkpoint-1000\generation_config.json
Model weights saved in runs\checkpoint-1000\model.safetensors


{'eval_loss': 9.160441398620605, 'eval_accuracy': 0.009121492995838202, 'eval_runtime': 376.046, 'eval_samples_per_second': 36.089, 'eval_steps_per_second': 2.258, 'epoch': 0.84}
{'loss': 9.1025, 'grad_norm': 1.0296344757080078, 'learning_rate': 3.4000000000000007e-05, 'epoch': 0.85}
{'loss': 9.1302, 'grad_norm': 2.4442877769470215, 'learning_rate': 3.466666666666667e-05, 'epoch': 0.87}
{'loss': 9.1058, 'grad_norm': 1.096023440361023, 'learning_rate': 3.5333333333333336e-05, 'epoch': 0.89}
{'loss': 9.1534, 'grad_norm': 1.2286454439163208, 'learning_rate': 3.6e-05, 'epoch': 0.9}
{'loss': 9.1309, 'grad_norm': 0.9464104771614075, 'learning_rate': 3.6666666666666666e-05, 'epoch': 0.92}
{'loss': 9.0836, 'grad_norm': 1.2241411209106445, 'learning_rate': 3.733333333333334e-05, 'epoch': 0.94}
{'loss': 9.0855, 'grad_norm': 0.9449096322059631, 'learning_rate': 3.8e-05, 'epoch': 0.95}
{'loss': 9.0926, 'grad_norm': 1.1322864294052124, 'learning_rate': 3.866666666666667e-05, 'epoch': 0.97}
{'loss':


***** Running Evaluation *****
  Num examples = 13571
  Batch size = 16


{'loss': 8.9355, 'grad_norm': 1.727672815322876, 'learning_rate': 5e-05, 'epoch': 1.25}


  0%|          | 0/849 [00:00<?, ?it/s]

{'eval_loss': 8.974980354309082, 'eval_accuracy': 0.031133072509813703, 'eval_runtime': 396.5176, 'eval_samples_per_second': 34.225, 'eval_steps_per_second': 2.141, 'epoch': 1.25}
{'loss': 8.9223, 'grad_norm': 1.5812466144561768, 'learning_rate': 5.0666666666666674e-05, 'epoch': 1.27}
{'loss': 8.9131, 'grad_norm': 1.3266043663024902, 'learning_rate': 5.133333333333333e-05, 'epoch': 1.29}
{'loss': 8.9204, 'grad_norm': 1.7995704412460327, 'learning_rate': 5.2000000000000004e-05, 'epoch': 1.3}
{'loss': 8.9107, 'grad_norm': 1.875595211982727, 'learning_rate': 5.266666666666666e-05, 'epoch': 1.32}
{'loss': 8.9188, 'grad_norm': 1.7334181070327759, 'learning_rate': 5.333333333333333e-05, 'epoch': 1.34}
{'loss': 8.8751, 'grad_norm': 1.339880108833313, 'learning_rate': 5.4000000000000005e-05, 'epoch': 1.35}
{'loss': 8.8973, 'grad_norm': 1.6172689199447632, 'learning_rate': 5.466666666666666e-05, 'epoch': 1.37}
{'loss': 8.8432, 'grad_norm': 1.8075425624847412, 'learning_rate': 5.5333333333333334


***** Running Evaluation *****
  Num examples = 13571
  Batch size = 16


{'loss': 8.5005, 'grad_norm': 1.8087491989135742, 'learning_rate': 6.666666666666667e-05, 'epoch': 1.67}


  0%|          | 0/849 [00:00<?, ?it/s]

Saving model checkpoint to runs\checkpoint-2000
Configuration saved in runs\checkpoint-2000\config.json
Configuration saved in runs\checkpoint-2000\generation_config.json
Model weights saved in runs\checkpoint-2000\model.safetensors


{'eval_loss': 8.480177879333496, 'eval_accuracy': 0.021099134818627, 'eval_runtime': 339.4989, 'eval_samples_per_second': 39.974, 'eval_steps_per_second': 2.501, 'epoch': 1.67}
{'loss': 8.4474, 'grad_norm': 2.296853542327881, 'learning_rate': 6.733333333333333e-05, 'epoch': 1.69}
{'loss': 8.4189, 'grad_norm': 2.3037753105163574, 'learning_rate': 6.800000000000001e-05, 'epoch': 1.7}
{'loss': 8.4052, 'grad_norm': 2.638603687286377, 'learning_rate': 6.866666666666666e-05, 'epoch': 1.72}
{'loss': 8.386, 'grad_norm': 1.899048089981079, 'learning_rate': 6.933333333333334e-05, 'epoch': 1.74}
{'loss': 8.3561, 'grad_norm': 2.1889007091522217, 'learning_rate': 7e-05, 'epoch': 1.76}
{'loss': 8.2385, 'grad_norm': 2.670670509338379, 'learning_rate': 7.066666666666667e-05, 'epoch': 1.77}
{'loss': 8.2293, 'grad_norm': 2.2041068077087402, 'learning_rate': 7.133333333333334e-05, 'epoch': 1.79}
{'loss': 8.2357, 'grad_norm': 2.124166965484619, 'learning_rate': 7.2e-05, 'epoch': 1.81}
{'loss': 8.1866, 'gr


***** Running Evaluation *****
  Num examples = 13571
  Batch size = 16


{'loss': 7.7522, 'grad_norm': 2.4026150703430176, 'learning_rate': 8.333333333333334e-05, 'epoch': 2.09}


  0%|          | 0/849 [00:00<?, ?it/s]

{'eval_loss': 7.749161720275879, 'eval_accuracy': 0.009978797148446218, 'eval_runtime': 339.4268, 'eval_samples_per_second': 39.982, 'eval_steps_per_second': 2.501, 'epoch': 2.09}
{'loss': 7.7531, 'grad_norm': 2.6129350662231445, 'learning_rate': 8.4e-05, 'epoch': 2.11}
{'loss': 7.6865, 'grad_norm': 2.821141242980957, 'learning_rate': 8.466666666666667e-05, 'epoch': 2.12}
{'loss': 7.6704, 'grad_norm': 2.509450674057007, 'learning_rate': 8.533333333333334e-05, 'epoch': 2.14}
{'loss': 7.6208, 'grad_norm': 3.3153576850891113, 'learning_rate': 8.6e-05, 'epoch': 2.16}
{'loss': 7.6065, 'grad_norm': 2.715280055999756, 'learning_rate': 8.666666666666667e-05, 'epoch': 2.17}
{'loss': 7.5764, 'grad_norm': 2.910350799560547, 'learning_rate': 8.733333333333333e-05, 'epoch': 2.19}
{'loss': 7.5949, 'grad_norm': 2.6454567909240723, 'learning_rate': 8.800000000000001e-05, 'epoch': 2.21}
{'loss': 7.5747, 'grad_norm': 2.8833394050598145, 'learning_rate': 8.866666666666668e-05, 'epoch': 2.22}
{'loss': 7.5


***** Running Evaluation *****
  Num examples = 13571
  Batch size = 16


{'loss': 7.2176, 'grad_norm': 3.101234197616577, 'learning_rate': 0.0001, 'epoch': 2.51}


  0%|          | 0/849 [00:00<?, ?it/s]

Saving model checkpoint to runs\checkpoint-3000
Configuration saved in runs\checkpoint-3000\config.json
Configuration saved in runs\checkpoint-3000\generation_config.json
Model weights saved in runs\checkpoint-3000\model.safetensors


{'eval_loss': 7.18705940246582, 'eval_accuracy': 0.007620554293741548, 'eval_runtime': 344.4665, 'eval_samples_per_second': 39.397, 'eval_steps_per_second': 2.465, 'epoch': 2.51}
{'loss': 7.1452, 'grad_norm': 4.153716564178467, 'learning_rate': 9.999798580854356e-05, 'epoch': 2.52}
{'loss': 7.1163, 'grad_norm': 2.924123525619507, 'learning_rate': 9.999194339645292e-05, 'epoch': 2.54}
{'loss': 7.1145, 'grad_norm': 2.855323076248169, 'learning_rate': 9.998187325055106e-05, 'epoch': 2.56}
{'loss': 7.1297, 'grad_norm': 3.890458583831787, 'learning_rate': 9.996777618216607e-05, 'epoch': 2.57}
{'loss': 7.0961, 'grad_norm': 3.3591866493225098, 'learning_rate': 9.994965332706573e-05, 'epoch': 2.59}
{'loss': 7.11, 'grad_norm': 3.654918909072876, 'learning_rate': 9.992750614536605e-05, 'epoch': 2.61}
{'loss': 7.0952, 'grad_norm': 3.5988922119140625, 'learning_rate': 9.990133642141359e-05, 'epoch': 2.62}
{'loss': 7.088, 'grad_norm': 3.6090259552001953, 'learning_rate': 9.987114626364171e-05, 'epo


***** Running Evaluation *****
  Num examples = 13571
  Batch size = 16


{'loss': 6.7324, 'grad_norm': 3.691117286682129, 'learning_rate': 9.874639560909117e-05, 'epoch': 2.93}


  0%|          | 0/849 [00:00<?, ?it/s]

{'eval_loss': 6.82030725479126, 'eval_accuracy': 0.007703265107852276, 'eval_runtime': 351.4954, 'eval_samples_per_second': 38.609, 'eval_steps_per_second': 2.415, 'epoch': 2.93}
{'loss': 6.783, 'grad_norm': 3.641303062438965, 'learning_rate': 9.864456609700726e-05, 'epoch': 2.94}
{'loss': 6.7488, 'grad_norm': 3.6003551483154297, 'learning_rate': 9.853881740614591e-05, 'epoch': 2.96}
{'loss': 6.7442, 'grad_norm': 3.55020809173584, 'learning_rate': 9.842915805643155e-05, 'epoch': 2.98}
{'loss': 6.7556, 'grad_norm': 3.3639769554138184, 'learning_rate': 9.831559688286121e-05, 'epoch': 2.99}
{'loss': 6.7046, 'grad_norm': 3.1689882278442383, 'learning_rate': 9.819814303479267e-05, 'epoch': 3.01}
{'loss': 6.6575, 'grad_norm': 2.9262936115264893, 'learning_rate': 9.807680597520746e-05, 'epoch': 3.03}
{'loss': 6.6095, 'grad_norm': 3.57114577293396, 'learning_rate': 9.79515954799483e-05, 'epoch': 3.04}
{'loss': 6.6505, 'grad_norm': 3.7331273555755615, 'learning_rate': 9.782252163693158e-05, 'ep


***** Running Evaluation *****
  Num examples = 13571
  Batch size = 16


{'loss': 6.4937, 'grad_norm': 3.1996209621429443, 'learning_rate': 9.504844339512095e-05, 'epoch': 3.34}


  0%|          | 0/849 [00:00<?, ?it/s]