In [1]:

from copy import deepcopy
from pathlib import Path
from random import shuffle

from evaluate import load as load_metric
from miditok import REMI, TokenizerConfig
from miditok.pytorch_data import DatasetMIDI, DataCollator
from miditok.utils import split_files_for_training
from miditok.data_augmentation import augment_dataset
from torch import Tensor, argmax
from torch.utils.data import DataLoader
from torch.cuda import is_available as cuda_available, is_bf16_supported
from torch.backends.mps import is_available as mps_available
from transformers import AutoModelForCausalLM, MistralConfig, Trainer, TrainingArguments, GenerationConfig
from transformers.trainer_utils import set_seed
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import pandas as pd

# Dataset splits; the "file" column of each CSV holds the path of one MIDI file
df_train = pd.read_csv("dump/train.csv")
df_test = pd.read_csv("dump/test.csv")
df_valid = pd.read_csv("dump/valid.csv")

# Seed (set before the shuffle below so the sampled training subset is repeatable)
set_seed(777)

# Our tokenizer's configuration
BEAT_RES = {(0, 1): 12, (1, 2): 4, (2, 4): 2, (4, 8): 1}
TOKENIZER_PARAMS = {
    "pitch_range": (21, 109),
    "beat_res": BEAT_RES,
    "num_velocities": 24,
    "special_tokens": ["PAD", "BOS", "EOS"],
    "use_chords": True,
    "use_rests": True,
    "use_tempos": True,
    "use_time_signatures": True,
    "use_programs": False,  # no multitrack here
    "num_tempos": 32,
    "tempo_range": (50, 200),  # (min_tempo, max_tempo)
}
config = TokenizerConfig(**TOKENIZER_PARAMS)

# Creates the tokenizer
tokenizer = REMI(config)

# Size of the BPE vocabulary and number of files sampled to learn it
TOKENIZER_VOCAB_SIZE = 30000
TOKENIZER_NUM_TRAINING_FILES = 1000

# Trains the tokenizer with Byte Pair Encoding (BPE) to build the vocabulary,
# using a random sample of the training files only
midi_paths = df_train['file'].to_list()
shuffle(midi_paths)

tokenizer.train(
    vocab_size=TOKENIZER_VOCAB_SIZE,
    files_paths=midi_paths[:TOKENIZER_NUM_TRAINING_FILES],
)
# `save_params` is deprecated in MidiTok (it emits a warning); `save` is its replacement
tokenizer.save("tokenizer.json")






  tokenizer.save_params("tokenizer.json")


In [3]:

import os

# Split MIDI paths in train/valid/test sets
midi_paths_valid = df_valid["file"].to_list()
midi_paths_test = df_test["file"].to_list()
midi_paths_train = df_train["file"].to_list()

# Directory against which the CSV paths are resolved (they are expected to be relative).
# Set the MIDI_DATA_ROOT environment variable to run on another machine; the
# default keeps the location this notebook was originally written for.
DATA_ROOT = Path(
    os.environ.get(
        "MIDI_DATA_ROOT",
        "/Users/suryakant.sahu/Documents/Projects/MyMusicTransformer/",
    )
)

# Chunk MIDIs and perform data augmentation on each subset independently
for files_paths, subset_name in (
    (midi_paths_train, "train"), (midi_paths_valid, "valid"), (midi_paths_test, "test")
):

    # Split the MIDIs into chunks of sizes approximately about 1024 tokens
    subset_chunks_dir = Path(f"processed_{subset_name}")
    split_files_for_training(
        files_paths=[DATA_ROOT / f for f in files_paths],
        tokenizer=tokenizer,
        save_dir=subset_chunks_dir,
        max_seq_len=1024,
        num_overlap_bars=2,
    )

    # Perform data augmentation (called without an output path, on the chunk directory itself)
    augment_dataset(
        subset_chunks_dir,
        pitch_offsets=[-12, 12],
        velocity_offsets=[-4, 4],
        duration_offsets=[-0.5, 0.5],
    )

# Build the train / valid / test datasets from the chunked MIDI files
def _find_midi_files(chunks_dir):
    """Return all ``*.mid`` files, then all ``*.midi`` files, found recursively under ``chunks_dir``."""
    root = Path(chunks_dir)
    return [*root.glob("**/*.mid"), *root.glob("**/*.midi")]


midi_paths_train = _find_midi_files("processed_train")
midi_paths_valid = _find_midi_files("processed_valid")
midi_paths_test = _find_midi_files("processed_test")

# Arguments shared by the three datasets
kwargs_dataset = dict(
    max_seq_len=1024,
    tokenizer=tokenizer,
    bos_token_id=tokenizer["BOS_None"],
    eos_token_id=tokenizer["EOS_None"],
)
dataset_train = DatasetMIDI(midi_paths_train, **kwargs_dataset)
dataset_valid = DatasetMIDI(midi_paths_valid, **kwargs_dataset)
dataset_test = DatasetMIDI(midi_paths_test, **kwargs_dataset)

Splitting music files (processed_train):   0%|          | 0/11427 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Splitting music files (processed_train): 100%|██████████| 11427/11427 [00:34<00:00, 331.71it/s]
Performing data augmentation:   9%|▉         | 34693/378795 [01:52<18:39, 307.37it/s]


KeyboardInterrupt: 

In [None]:
from mingrulm import MinGRULM, MinGRUConfig

# Hyper-parameters of the minGRU language model, kept in one place
MINGRU_PARAMS = {
    "num_tokens": len(tokenizer),  # vocabulary size
    "dim": 64,
    "depth": 8,
    "ff_mult": 2,
    "min_gru_expansion": 2.0,
    "conv_kernel_size": 3,
    "enable_conv": False,
}

# NOTE: this rebinds `config`, which referred to the tokenizer's TokenizerConfig
# before this cell runs.
config = MinGRUConfig(**MINGRU_PARAMS)
model = MinGRULM(config)

In [10]:
metrics = {metric: load_metric(metric) for metric in ["accuracy"]}

def compute_metrics(eval_pred):
    """
    Compute next-token accuracy for pretraining.

    Must use preprocess_logits function that converts logits to predictions (argmax or sampling),
    so that ``predictions`` holds token ids with the same shape as ``labels``.

    :param eval_pred: EvalPrediction containing predictions and labels
    :return: metrics (the result of the "accuracy" metric)
    """
    predictions, labels = eval_pred
    # A causal LM's output at position t is its guess for the token at t + 1, while
    # the labels are an unshifted copy of the inputs. Align them before scoring.
    # Assumes the model shifts labels internally for its loss (the Hugging Face
    # causal-LM convention) -- TODO confirm for MinGRULM.
    predictions, labels = predictions[..., :-1], labels[..., 1:]
    # -100 marks positions to ignore
    not_pad_mask = labels != -100
    labels, predictions = labels[not_pad_mask], predictions[not_pad_mask]
    return metrics["accuracy"].compute(predictions=predictions.flatten(), references=labels.flatten())

def preprocess_logits(logits: Tensor, _: Tensor) -> Tensor:
    """
    Preprocess the logits before accumulating them during evaluation.

    This allows to significantly reduce the memory usage and make the training tractable.

    :param logits: model logits with the vocabulary on the last dimension. A tuple of
        outputs is also accepted, in which case its first element is taken as the logits.
    :param _: labels, unused.
    :return: ids of the highest-scoring token at each position (long dtype).
    """
    if isinstance(logits, tuple):
        # Some models return extra tensors alongside the logits; the logits come first.
        logits = logits[0]
    pred_ids = argmax(logits, dim=-1)  # long dtype
    return pred_ids

# Create config for the Trainer
USE_CUDA = cuda_available()
# Apple-silicon (MPS) backend, only considered when no CUDA device is present
USE_MPS = not USE_CUDA and mps_available()

# Mixed precision is only enabled on CUDA: bf16 when supported, fp16 otherwise
if not USE_CUDA:
    FP16 = FP16_EVAL = BF16 = BF16_EVAL = False
elif is_bf16_supported():
    BF16 = BF16_EVAL = True
    FP16 = FP16_EVAL = False
else:
    BF16 = BF16_EVAL = False
    FP16 = FP16_EVAL = True

training_config = TrainingArguments(
    # Positional fields, in order: output_dir, overwrite_output_dir, do_train,
    # do_eval, do_predict, evaluation strategy
    "runs", False, True, True, False, "steps",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=48,
    gradient_accumulation_steps=3,
    eval_accumulation_steps=None,
    eval_steps=1000,
    learning_rate=1e-4,
    weight_decay=0.01,
    max_grad_norm=3.0,
    max_steps=20000,
    lr_scheduler_type="cosine_with_restarts",
    warmup_ratio=0.3,
    log_level="debug",
    logging_strategy="steps",
    logging_steps=20,
    save_strategy="steps",
    save_steps=1000,
    save_total_limit=5,
    # Fall back to the CPU only when neither CUDA nor MPS is available. The previous
    # `no_cuda=not USE_CUDA` forced CPU training on machines that only have MPS.
    use_cpu=not (USE_CUDA or USE_MPS),
    seed=444,
    fp16=FP16,
    fp16_full_eval=FP16_EVAL,
    bf16=BF16,
    bf16_full_eval=BF16_EVAL,
    load_best_model_at_end=True,
    label_smoothing_factor=0.,
    optim="adamw_torch",
    report_to=["tensorboard"],
    gradient_checkpointing=True,
)

# Batch collator built from the PAD token id; `copy_inputs_as_labels=True`
# requests that the inputs are reused as the labels.
collator = DataCollator(tokenizer["PAD_None"], copy_inputs_as_labels=True)

# Wire the model, the datasets and the metric hooks into the Trainer
trainer = Trainer(
    model=model,
    args=training_config,
    # data
    train_dataset=dataset_train,
    eval_dataset=dataset_valid,
    data_collator=collator,
    # evaluation: logits are reduced to token ids before the metrics are computed
    preprocess_logits_for_metrics=preprocess_logits,
    compute_metrics=compute_metrics,
)

# Training
# Runs the training loop, then writes the model, the training metrics and the
# Trainer state.
train_result = trainer.train()
# NOTE(review): no tokenizer is passed to the Trainer in this notebook, so this
# presumably saves the model only; the tokenizer is written separately to
# "tokenizer.json" right after it is trained -- confirm.
trainer.save_model()
# Log and store the metrics returned by `trainer.train()` under the "train" split name
trainer.log_metrics("train", train_result.metrics)
trainer.save_metrics("train", train_result.metrics)
trainer.save_state()

Downloading builder script: 100%|██████████| 4.20k/4.20k [00:00<00:00, 5.62MB/s]
max_steps is given, it will override any value given in num_train_epochs
Currently training with a batch size of: 16


[2024-10-20 03:06:39,520] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to mps (auto detect)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
W1020 03:06:39.999000 8393030336 torch/distributed/elastic/multiprocessing/redirects.py:28] NOTE: Redirects are currently not supported in Windows or MacOs.
  from google.protobuf import service as _service
***** Running training *****
  Num examples = 378,795
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 48
  Gradient Accumulation steps = 3
  Total optimization steps = 20,000
  Number of trainable parameters = 3,988,032
  0%|          | 0/20000 [00:00<?, ?it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx,

{'loss': 10.3212, 'grad_norm': 0.12039567530155182, 'learning_rate': 3.3333333333333335e-07, 'epoch': 0.0}


  0%|          | 40/20000 [03:43<28:11:25,  5.08s/it]

{'loss': 10.3215, 'grad_norm': 0.2001177966594696, 'learning_rate': 6.666666666666667e-07, 'epoch': 0.01}


  0%|          | 48/20000 [04:30<31:41:29,  5.72s/it]

KeyboardInterrupt: 