In [1]:
from pathlib import Path
from random import shuffle

# from evaluate import load as load_metric
from miditok import REMI, TokenizerConfig
from miditok.pytorch_data import DatasetMIDI, DataCollator
from miditok.utils import split_files_for_training
from miditok.data_augmentation import augment_dataset
from torch import Tensor, argmax
from torch.utils.data import DataLoader
from torch.cuda import is_available as cuda_available, is_bf16_supported
from torch.backends.mps import is_available as mps_available
from transformers import (
    AutoModelForCausalLM,
    GPT2Config,
    Trainer,
    TrainingArguments,
)
from transformers.trainer_utils import set_seed




In [2]:
# Seed
set_seed(777)

# Our tokenizer's configuration
BEAT_RES = {(0, 1): 12, (1, 2): 4, (2, 4): 2, (4, 8): 1}
TOKENIZER_PARAMS = {
    "pitch_range": (21, 109),
    "beat_res": BEAT_RES,
    "num_velocities": 24,
    "special_tokens": ["PAD", "BOS", "EOS"],
    "use_chords": True,
    "use_rests": True,
    "use_tempos": False,
    "use_time_signatures": False,
    "use_programs": False,  # no multitrack here
    "num_tempos": 32,
    "tempo_range": (50, 200),  # (min_tempo, max_tempo)
}
config = TokenizerConfig(**TOKENIZER_PARAMS)

# Creates the tokenizer
tokenizer = REMI(config)

file_name = "filtered-midi-files"
# Trains the tokenizer with Byte Pair Encoding (BPE) to build the vocabulary, here 30k tokens
midi_paths = list(
    Path("../data/filtered-midi-files").resolve().glob("**/*.mid")
) + list(Path("../data/filtered-midi-files").resolve().glob("**/*.midi"))
print(midi_paths)

[WindowsPath('C:/Users/mikol/mini/MiniBach/minibach/data/filtered-midi-files/Blues/(Sittin On) The Dock Of The Bay.mid'), WindowsPath('C:/Users/mikol/mini/MiniBach/minibach/data/filtered-midi-files/Blues/American Beauty.mid'), WindowsPath('C:/Users/mikol/mini/MiniBach/minibach/data/filtered-midi-files/Blues/Boogie Woogie Santa.mid'), WindowsPath('C:/Users/mikol/mini/MiniBach/minibach/data/filtered-midi-files/Blues/Breathing.mid'), WindowsPath('C:/Users/mikol/mini/MiniBach/minibach/data/filtered-midi-files/Blues/Cascades.mid'), WindowsPath('C:/Users/mikol/mini/MiniBach/minibach/data/filtered-midi-files/Blues/Chicago Breakdown.mid'), WindowsPath('C:/Users/mikol/mini/MiniBach/minibach/data/filtered-midi-files/Blues/Cleanin Up Christmas.mid'), WindowsPath('C:/Users/mikol/mini/MiniBach/minibach/data/filtered-midi-files/Blues/Cleopha.mid'), WindowsPath('C:/Users/mikol/mini/MiniBach/minibach/data/filtered-midi-files/Blues/Combination March.mid'), WindowsPath('C:/Users/mikol/mini/MiniBach/mini

In [12]:
# tokenizer.train(
#     vocab_size=20000,
#     files_paths=midi_paths,
# )
# tokenizer.save_params("tokenizer_filtered_reduced_half.json")

tokenizer_path = Path("tokenizer_filtered.json")
tokenizer = REMI(params=tokenizer_path)

In [4]:
print(len(tokenizer))

20000


In [13]:
# Split MIDI paths in train/valid/test sets
total_num_files = len(midi_paths)
num_files_valid = round(total_num_files * 0.15)
num_files_test = round(total_num_files * 0.15)
shuffle(midi_paths)
midi_paths_valid = midi_paths[:num_files_valid]
midi_paths_test = midi_paths[num_files_valid : num_files_valid + num_files_test]
midi_paths_train = midi_paths[num_files_valid + num_files_test :]

# # Chunk MIDIs and perform data augmentation on each subset independently
for files_paths, subset_name in (
    (midi_paths_train, "train"),
    (midi_paths_valid, "valid"),
    (midi_paths_test, "test"),
):

    # Split the MIDIs into chunks of sizes approximately about 1024 tokens
    subset_chunks_dir = Path(f"filtered_midi/Maestro_{subset_name}")
    split_files_for_training(
        files_paths=files_paths,
        tokenizer=tokenizer,
        save_dir=subset_chunks_dir,
        max_seq_len=1024,
        num_overlap_bars=2,
    )

# # Perform data augmentation
# augment_dataset(
#     subset_chunks_dir,
#     pitch_offsets=[-12, 12],
#     velocity_offsets=[-4, 4],
#     duration_offsets=[-0.5, 0.5],
# )

Splitting music files (filtered_midi\Maestro_train): 100%|██████████| 6145/6145 [00:24<00:00, 250.29it/s]
Splitting music files (filtered_midi\Maestro_valid): 100%|██████████| 1317/1317 [00:21<00:00, 61.33it/s]
Splitting music files (filtered_midi\Maestro_test): 100%|██████████| 1317/1317 [00:10<00:00, 123.62it/s]


In [14]:
dir_name = "filtered_midi"
# Create Dataset and Collator for training
midi_paths_train = list(Path(f"{dir_name}/Maestro_train").glob("**/*.mid")) + list(
    Path(f"{dir_name}/Maestro_train").glob("**/*.midi")
)
midi_paths_valid = list(Path(f"{dir_name}/Maestro_valid").glob("**/*.mid")) + list(
    Path(f"{dir_name}/Maestro_valid").glob("**/*.midi")
)
midi_paths_test = list(Path(f"{dir_name}/Maestro_test").glob("**/*.mid")) + list(
    Path(f"{dir_name}/Maestro_test").glob("**/*.midi")
)
kwargs_dataset = {
    "max_seq_len": 1024,
    "tokenizer": tokenizer,
    "bos_token_id": tokenizer["BOS_None"],
    "eos_token_id": tokenizer["EOS_None"],
}
dataset_train = DatasetMIDI(midi_paths_train, **kwargs_dataset)
dataset_valid = DatasetMIDI(midi_paths_valid, **kwargs_dataset)
dataset_test = DatasetMIDI(midi_paths_test, **kwargs_dataset)

In [6]:
print(dataset_train)

46056 files.


In [7]:
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [15]:
from transformers import GPT2LMHeadModel
# # Normal trening
# model_config = GPT2Config(
#     vocab_size=len(tokenizer),
#     n_embd=768,
#     n_inner=1024,
#     n_layer=12,
#     n_head=12,
#     n_positions=2048,
#     n_ctx=1024,
#     pad_token_id=tokenizer["PAD_None"],
#     bos_token_id=tokenizer["BOS_None"],
#     eos_token_id=tokenizer["EOS_None"],
# )

# model = AutoModelForCausalLM.from_config(model_config)

# Fine Tuning

model = GPT2LMHeadModel.from_pretrained('gpt2')
model.resize_token_embeddings(len(tokenizer))

loading configuration file config.json from cache at C:\Users\mikol\.cache\huggingface\hub\models--gpt2\snapshots\607a30d783dfa663caf39e06633721c8d4cfcd7e\config.json
Model config GPT2Config {
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "transformers_version": "4.46.2",
  

Embedding(30000, 768)

In [10]:
from evaluate import load as load_metric

metrics = {metric: load_metric(metric) for metric in ["accuracy"]}


def compute_metrics(eval_pred):
    """
    Compute metrics for pretraining.

    Must use preprocess_logits function that converts logits to predictions (argmax or sampling).

    :param eval_pred: EvalPrediction containing predictions and labels
    :return: metrics
    """
    predictions, labels = eval_pred
    not_pad_mask = labels != -100
    labels, predictions = labels[not_pad_mask], predictions[not_pad_mask]
    return metrics["accuracy"].compute(
        predictions=predictions.flatten(), references=labels.flatten()
    )


def preprocess_logits(logits: Tensor, _: Tensor) -> Tensor:
    """
    Preprocess the logits before accumulating them during evaluation.

    This allows to significantly reduce the memory usage and make the training tractable.
    """
    pred_ids = argmax(logits, dim=-1)  # long dtype
    return pred_ids

In [13]:
# Create config for the Trainer
USE_CUDA = cuda_available()
if not cuda_available():
    FP16 = FP16_EVAL = BF16 = BF16_EVAL = False
elif is_bf16_supported():
    BF16 = BF16_EVAL = True
    FP16 = FP16_EVAL = False
else:
    BF16 = BF16_EVAL = False
    FP16 = FP16_EVAL = True
USE_MPS = not USE_CUDA and mps_available()
training_config = TrainingArguments(
    "filtered_midi",
    True,
    True,
    False,
    False,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,
    learning_rate=1e-4,
    weight_decay=0.01,
    max_grad_norm=3.0,
    max_steps=10000,
    lr_scheduler_type="cosine_with_restarts",
    warmup_ratio=0.3,
    log_level="debug",
    logging_strategy="steps",
    logging_steps=20,
    save_strategy="steps",
    save_steps=2000,
    save_total_limit=5,
    no_cuda=not USE_CUDA,
    seed=444,
    fp16=FP16,
    bf16=BF16,
    label_smoothing_factor=0.0,
    optim="adamw_torch",
    report_to=["tensorboard"],
    gradient_checkpointing=True,
)

collator = DataCollator(tokenizer["PAD_None"], copy_inputs_as_labels=True)
trainer = Trainer(
    model=model,
    args=training_config,
    data_collator=collator,
    train_dataset=dataset_train,
    compute_metrics=compute_metrics,
    callbacks=None,
    preprocess_logits_for_metrics=preprocess_logits,
)

# Training
train_result = trainer.train()
trainer.save_model()
trainer.log_metrics("train", train_result.metrics)
trainer.save_metrics("train", train_result.metrics)
trainer.save_state()

max_steps is given, it will override any value given in num_train_epochs
Using auto half precision backend
Currently training with a batch size of: 4
***** Running training *****
  Num examples = 46,056
  Num Epochs = 2
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 2
  Total optimization steps = 10,000
  Number of trainable parameters = 64,215,552


  0%|          | 0/10000 [00:00<?, ?it/s]

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


{'loss': 10.0485, 'grad_norm': 25.025156021118164, 'learning_rate': 6.666666666666667e-07, 'epoch': 0.0}
{'loss': 10.0058, 'grad_norm': 19.512693405151367, 'learning_rate': 1.3333333333333334e-06, 'epoch': 0.01}
{'loss': 9.9395, 'grad_norm': 22.386886596679688, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.01}
{'loss': 9.8753, 'grad_norm': 19.597890853881836, 'learning_rate': 2.666666666666667e-06, 'epoch': 0.01}
{'loss': 9.7999, 'grad_norm': 12.81788444519043, 'learning_rate': 3.3333333333333333e-06, 'epoch': 0.02}
{'loss': 9.7506, 'grad_norm': 11.897374153137207, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.02}
{'loss': 9.6807, 'grad_norm': 10.399171829223633, 'learning_rate': 4.666666666666667e-06, 'epoch': 0.02}
{'loss': 9.6649, 'grad_norm': 8.415386199951172, 'learning_rate': 5.333333333333334e-06, 'epoch': 0.03}
{'loss': 9.5973, 'grad_norm': 8.56799602508545, 'learning_rate': 6e-06, 'epoch': 0.03}
{'loss': 9.527, 'grad_norm': 10.134136199951172, 'learning_rate': 6.666

Saving model checkpoint to filtered_midi\checkpoint-2000
Configuration saved in filtered_midi\checkpoint-2000\config.json
Configuration saved in filtered_midi\checkpoint-2000\generation_config.json


{'loss': 7.1849, 'grad_norm': 6.470387935638428, 'learning_rate': 6.666666666666667e-05, 'epoch': 0.35}


Model weights saved in filtered_midi\checkpoint-2000\model.safetensors
Deleting older checkpoint [filtered_midi\checkpoint-2000] due to args.save_total_limit


{'loss': 7.1283, 'grad_norm': 5.50190544128418, 'learning_rate': 6.733333333333333e-05, 'epoch': 0.35}
{'loss': 7.1077, 'grad_norm': 4.157002925872803, 'learning_rate': 6.800000000000001e-05, 'epoch': 0.35}
{'loss': 7.1385, 'grad_norm': 6.464125633239746, 'learning_rate': 6.866666666666666e-05, 'epoch': 0.36}
{'loss': 7.0304, 'grad_norm': 5.405418872833252, 'learning_rate': 6.933333333333334e-05, 'epoch': 0.36}
{'loss': 7.1043, 'grad_norm': 6.217262268066406, 'learning_rate': 7e-05, 'epoch': 0.36}
{'loss': 6.9987, 'grad_norm': 7.270103931427002, 'learning_rate': 7.066666666666667e-05, 'epoch': 0.37}
{'loss': 7.0678, 'grad_norm': 5.04953145980835, 'learning_rate': 7.133333333333334e-05, 'epoch': 0.37}
{'loss': 6.9554, 'grad_norm': 6.9713215827941895, 'learning_rate': 7.2e-05, 'epoch': 0.38}
{'loss': 6.9653, 'grad_norm': 5.7055840492248535, 'learning_rate': 7.266666666666667e-05, 'epoch': 0.38}
{'loss': 6.914, 'grad_norm': 5.466586112976074, 'learning_rate': 7.333333333333333e-05, 'epoch

Saving model checkpoint to filtered_midi\checkpoint-4000
Configuration saved in filtered_midi\checkpoint-4000\config.json
Configuration saved in filtered_midi\checkpoint-4000\generation_config.json


{'loss': 5.89, 'grad_norm': 4.433174133300781, 'learning_rate': 9.504844339512095e-05, 'epoch': 0.69}


Model weights saved in filtered_midi\checkpoint-4000\model.safetensors
Deleting older checkpoint [filtered_midi\checkpoint-4000] due to args.save_total_limit


{'loss': 5.7961, 'grad_norm': 4.630579948425293, 'learning_rate': 9.485190471934843e-05, 'epoch': 0.7}
{'loss': 5.8439, 'grad_norm': 4.28175163269043, 'learning_rate': 9.465175243064428e-05, 'epoch': 0.7}
{'loss': 5.6735, 'grad_norm': 4.182397842407227, 'learning_rate': 9.444800265480967e-05, 'epoch': 0.71}
{'loss': 5.7532, 'grad_norm': 4.083531379699707, 'learning_rate': 9.424067180748692e-05, 'epoch': 0.71}
{'loss': 5.6047, 'grad_norm': 4.759305477142334, 'learning_rate': 9.40297765928369e-05, 'epoch': 0.71}
{'loss': 5.7215, 'grad_norm': 6.451472759246826, 'learning_rate': 9.381533400219318e-05, 'epoch': 0.72}
{'loss': 5.7148, 'grad_norm': 4.581519603729248, 'learning_rate': 9.359736131269312e-05, 'epoch': 0.72}
{'loss': 5.6244, 'grad_norm': 7.41244649887085, 'learning_rate': 9.337587608588588e-05, 'epoch': 0.72}
{'loss': 5.6267, 'grad_norm': 4.687442302703857, 'learning_rate': 9.315089616631752e-05, 'epoch': 0.73}
{'loss': 5.5528, 'grad_norm': 5.297405242919922, 'learning_rate': 9.2

Saving model checkpoint to filtered_midi\checkpoint-6000
Configuration saved in filtered_midi\checkpoint-6000\config.json
Configuration saved in filtered_midi\checkpoint-6000\generation_config.json


{'loss': 4.7894, 'grad_norm': 5.431129455566406, 'learning_rate': 6.112604669781572e-05, 'epoch': 1.04}


Model weights saved in filtered_midi\checkpoint-6000\model.safetensors


{'loss': 4.899, 'grad_norm': 4.5181403160095215, 'learning_rate': 6.068805774960573e-05, 'epoch': 1.05}
{'loss': 4.8406, 'grad_norm': 5.7438063621521, 'learning_rate': 6.0249207689611533e-05, 'epoch': 1.05}
{'loss': 5.1011, 'grad_norm': 4.073493957519531, 'learning_rate': 5.980953187495476e-05, 'epoch': 1.05}
{'loss': 4.9021, 'grad_norm': 5.521363735198975, 'learning_rate': 5.9369065729286245e-05, 'epoch': 1.06}
{'loss': 5.0411, 'grad_norm': 5.327245712280273, 'learning_rate': 5.8927844739931834e-05, 'epoch': 1.06}
{'loss': 4.7589, 'grad_norm': 5.215595722198486, 'learning_rate': 5.8485904455033444e-05, 'epoch': 1.06}
{'loss': 4.8315, 'grad_norm': 4.041055679321289, 'learning_rate': 5.804328048068492e-05, 'epoch': 1.07}
{'loss': 4.8796, 'grad_norm': 4.450559139251709, 'learning_rate': 5.760000847806337e-05, 'epoch': 1.07}
{'loss': 4.8471, 'grad_norm': 5.182246685028076, 'learning_rate': 5.715612416055598e-05, 'epoch': 1.07}
{'loss': 4.8701, 'grad_norm': 4.736745834350586, 'learning_rat

Saving model checkpoint to filtered_midi\checkpoint-8000
Configuration saved in filtered_midi\checkpoint-8000\config.json
Configuration saved in filtered_midi\checkpoint-8000\generation_config.json


{'loss': 4.4514, 'grad_norm': 4.843384265899658, 'learning_rate': 1.8825509907063327e-05, 'epoch': 1.39}


Model weights saved in filtered_midi\checkpoint-8000\model.safetensors


{'loss': 4.5537, 'grad_norm': 5.448801040649414, 'learning_rate': 1.8475885297764305e-05, 'epoch': 1.39}
{'loss': 4.5714, 'grad_norm': 4.638421535491943, 'learning_rate': 1.8128800512565513e-05, 'epoch': 1.4}
{'loss': 4.3439, 'grad_norm': 5.697246074676514, 'learning_rate': 1.778428351527529e-05, 'epoch': 1.4}
{'loss': 4.4374, 'grad_norm': 4.34145975112915, 'learning_rate': 1.744236206282132e-05, 'epoch': 1.4}
{'loss': 4.5475, 'grad_norm': 4.651174545288086, 'learning_rate': 1.7103063703014376e-05, 'epoch': 1.41}
{'loss': 4.365, 'grad_norm': 5.79328727722168, 'learning_rate': 1.676641577232873e-05, 'epoch': 1.41}
{'loss': 4.5507, 'grad_norm': 4.700215816497803, 'learning_rate': 1.64324453936998e-05, 'epoch': 1.41}
{'loss': 4.3927, 'grad_norm': 5.619043350219727, 'learning_rate': 1.610117947433897e-05, 'epoch': 1.42}
{'loss': 4.4566, 'grad_norm': 4.775585651397705, 'learning_rate': 1.5772644703565565e-05, 'epoch': 1.42}
{'loss': 4.5112, 'grad_norm': 4.7405500411987305, 'learning_rate': 

Saving model checkpoint to filtered_midi\checkpoint-10000
Configuration saved in filtered_midi\checkpoint-10000\config.json
Configuration saved in filtered_midi\checkpoint-10000\generation_config.json
Model weights saved in filtered_midi\checkpoint-10000\model.safetensors


{'loss': 4.2519, 'grad_norm': 4.2653913497924805, 'learning_rate': 0.0, 'epoch': 1.74}




Training completed. Do not forget to share your model on huggingface.co/models =)


Saving model checkpoint to filtered_midi
Configuration saved in filtered_midi\config.json
Configuration saved in filtered_midi\generation_config.json


{'train_runtime': 8582.5155, 'train_samples_per_second': 9.321, 'train_steps_per_second': 1.165, 'train_loss': 5.83156932220459, 'epoch': 1.74}


Model weights saved in filtered_midi\model.safetensors


***** train metrics *****
  epoch                    =      1.737
  total_flos               = 12077560GF
  train_loss               =     5.8316
  train_runtime            = 2:23:02.51
  train_samples_per_second =      9.321
  train_steps_per_second   =      1.165


# Fine Tuning

In [16]:
# Create config for the Trainer
USE_CUDA = cuda_available()
if not cuda_available():
    FP16 = FP16_EVAL = BF16 = BF16_EVAL = False
elif is_bf16_supported():
    BF16 = BF16_EVAL = True
    FP16 = FP16_EVAL = False
else:
    BF16 = BF16_EVAL = False
    FP16 = FP16_EVAL = True
USE_MPS = not USE_CUDA and mps_available()
training_config = TrainingArguments(
    "fine_tuned",
    True,
    True,
    False,
    False,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,
    learning_rate=1e-4,
    weight_decay=0.01,
    max_grad_norm=3.0,
    max_steps=10000,
    lr_scheduler_type="cosine_with_restarts",
    warmup_ratio=0.3,
    log_level="debug",
    logging_strategy="steps",
    logging_steps=20,
    save_strategy="steps",
    save_steps=2000,
    save_total_limit=5,
    no_cuda=not USE_CUDA,
    seed=444,
    fp16=FP16,
    bf16=BF16,
    label_smoothing_factor=0.0,
    optim="adamw_torch",
    report_to=["tensorboard"],
    gradient_checkpointing=True,
)

collator = DataCollator(tokenizer["PAD_None"], copy_inputs_as_labels=True)
trainer = Trainer(
    model=model,
    args=training_config,
    data_collator=collator,
    train_dataset=dataset_train,
)

# Training
train_result = trainer.train()
trainer.save_model()
trainer.log_metrics("train", train_result.metrics)
trainer.save_metrics("train", train_result.metrics)
trainer.save_state()

PyTorch: setting up devices
max_steps is given, it will override any value given in num_train_epochs
Using auto half precision backend
Currently training with a batch size of: 4
***** Running training *****
  Num examples = 50,751
  Num Epochs = 2
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 2
  Total optimization steps = 10,000
  Number of trainable parameters = 108,882,432


  0%|          | 0/10000 [00:00<?, ?it/s]

{'loss': 8.2509, 'grad_norm': 22.95566749572754, 'learning_rate': 6.666666666666667e-07, 'epoch': 0.0}
{'loss': 8.0973, 'grad_norm': 21.953716278076172, 'learning_rate': 1.3333333333333334e-06, 'epoch': 0.01}
{'loss': 8.0982, 'grad_norm': 17.18205451965332, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.01}


KeyboardInterrupt: 