In [1]:
from pathlib import Path
from random import shuffle

# from evaluate import load as load_metric
from miditok import REMI
from miditok.pytorch_data import DatasetMIDI, DataCollator
from miditok.utils import split_files_for_training
from miditok.data_augmentation import augment_dataset
from torch import Tensor, argmax
from torch.utils.data import DataLoader
from transformers.trainer_utils import set_seed
import sys
import os

# Resolve the absolute path of the main project folder (adjust if necessary)
project_path = os.path.abspath("../minGPT_")
# Add that path to `sys.path`
if project_path not in sys.path:
    sys.path.append(project_path)

# The import below should work now
from minGPT_.projects.midi.midi import MidiDataset

In [2]:
# Split MIDI paths in train/valid/test sets (15% valid, 15% test, remainder train).
# NOTE(review): `midi_paths` and `tokenizer` must already exist when this cell runs;
# neither is defined above it in this notebook (the tokenizer is loaded in a later cell).
total_num_files = len(midi_paths)
num_files_valid = round(total_num_files * 0.15)
num_files_test = round(total_num_files * 0.15)
shuffle(midi_paths)  # in-place shuffle so the three slices below are random
midi_paths_valid = midi_paths[:num_files_valid]
midi_paths_test = midi_paths[num_files_valid : num_files_valid + num_files_test]
midi_paths_train = midi_paths[num_files_valid + num_files_test :]

# Chunk MIDIs and perform data augmentation on each subset independently
for files_paths, subset_name in (
    (midi_paths_train, "train"),
    (midi_paths_valid, "valid"),
    (midi_paths_test, "test"),
):
    # Split the MIDIs into chunks of sizes approximately about 1024 tokens
    subset_chunks_dir = Path(f"filtered_midi/aug/Maestro_{subset_name}")
    split_files_for_training(
        files_paths=files_paths,
        tokenizer=tokenizer,
        save_dir=subset_chunks_dir,
        max_seq_len=1024,
        num_overlap_bars=2,
    )

    # Perform data augmentation.
    # This call has to stay inside the loop: when it sits after the loop,
    # `subset_chunks_dir` only refers to the last subset ("test"), so the
    # train and valid chunks are never augmented.
    augment_dataset(
        subset_chunks_dir,
        pitch_offsets=[-12, 12],
        velocity_offsets=[-4, 4],
        duration_offsets=[-0.5, 0.5],
    )

NameError: name 'midi_paths' is not defined

# Loading tokenizer

In [2]:
# Rebuild the REMI tokenizer from its saved parameter file.
tokenizer_path = Path("tokenizer_filtered.json")
tokenizer = REMI(params=tokenizer_path)

# Id of the padding token; passed to the training dataset wrappers further down.
pad_token = tokenizer["PAD_None"]
tokenizer_path  # echo the path as the cell output

WindowsPath('tokenizer_filtered.json')

# Loading datasets

In [3]:
dir_name = "filtered_midi"


def _collect_midi_files(subset_dir):
    """Return all ``.mid`` files under ``subset_dir`` (recursive), followed by all ``.midi`` files."""
    root = Path(subset_dir)
    return [*root.glob("**/*.mid"), *root.glob("**/*.midi")]


# File lists for the three pre-chunked subsets
midi_paths_train = _collect_midi_files(f"{dir_name}/Maestro_train")
midi_paths_valid = _collect_midi_files(f"{dir_name}/Maestro_valid")
midi_paths_test = _collect_midi_files(f"{dir_name}/Maestro_test")

# Keyword arguments shared by every DatasetMIDI instance
kwargs_dataset = dict(
    max_seq_len=1024,
    tokenizer=tokenizer,
    bos_token_id=tokenizer["BOS_None"],
    eos_token_id=tokenizer["EOS_None"],
)

dataset_train = DatasetMIDI(midi_paths_train, **kwargs_dataset)
dataset_valid = DatasetMIDI(midi_paths_valid, **kwargs_dataset)
dataset_test = DatasetMIDI(midi_paths_test, **kwargs_dataset)

# Show one training path as a quick sanity check
print(midi_paths_train[0])

filtered_midi\Maestro_train\Blues\(Sittin On) The Dock Of The Bay_0.mid


In [4]:
# Settings shared by the three MidiDataset wrappers
tran_config = dict(max_seq_len=1024, pad_token_id=pad_token, pred_num=1)

# Wrap each miditok dataset, in train / valid / test order
train_dataset, valid_dataset, test_dataset = (
    MidiDataset(subset, **tran_config)
    for subset in (dataset_train, dataset_valid, dataset_test)
)

# Setting params

In [5]:
from mingpt.model import GPT

# Start from the library defaults and pick the "gpt-mini" preset.
# Other presets referenced in this notebook: "gpt-nano", "gpt-micro".
# (The earlier "gpt-nano" assignment was dead code: it was overwritten
# before the model was built.)
model_config = GPT.get_default_config()
model_config.model_type = "gpt-mini"

# One output class per tokenizer entry; context length equals the
# max_seq_len used when building the datasets.
model_config.vocab_size = len(tokenizer)
model_config.block_size = 1024


model = GPT(model_config)

number of parameters: 8.63M


In [6]:
# Show the attention-head count, layer count and embedding width of the chosen preset
print(*(getattr(model_config, field) for field in ("n_head", "n_layer", "n_embd")))

6 6 192


In [7]:
import torch
from mingpt.trainer import Trainer


# Release cached, currently unused GPU memory before building the trainer
torch.cuda.empty_cache()

train_config = Trainer.get_default_config()

# Overrides applied on top of the trainer defaults
_train_overrides = {
    "learning_rate": 2e-5,
    "batch_size": 4,
    "max_iters": 10000,
    "num_workers": 4,
    "weight_decay": 0.01,
    "lr_decay": True,
}
for _option, _value in _train_overrides.items():
    setattr(train_config, _option, _value)

trainer = Trainer(train_config, model, train_dataset)

running on device cuda


# Training

In [8]:
import json
import torch

losses = []  # training-loss samples appended by the batch-end callback


def model_info_to_json(model_name):
    """Write the recorded losses and the trainer configuration to ``eval/``.

    Creates the ``eval`` directory if needed, then writes:

    * ``eval/losses.json`` - the module-level ``losses`` list;
    * ``eval/config.json`` - the module-level ``train_config`` as a JSON object.

    Args:
        model_name: Accepted for interface compatibility; currently unused.
    """
    (path := Path("eval")).mkdir(exist_ok=True)

    # json.dump needs an open file object, not a path string
    with open(path / "losses.json", "w") as f:
        json.dump(losses, f)

    # The config object itself is not JSON-serializable; convert it to a plain dict first
    if hasattr(train_config, "to_dict"):
        config = train_config.to_dict()
    elif isinstance(train_config, dict):
        config = train_config
    else:
        config = vars(train_config)

    with open(path / "config.json", "w") as f:
        # default=str keeps the dump from failing on values json cannot encode natively
        json.dump(config, f, default=str)
    


def batch_end_callback(trainer):
    """Every 100th iteration, print timing and loss and record the loss in ``losses``.

    Args:
        trainer: Object exposing ``iter_num``, ``iter_dt`` (multiplied by 1000
            and printed as milliseconds) and ``loss`` (with an ``.item()`` method).
    """
    if trainer.iter_num % 100 != 0:
        return

    loss_value = trainer.loss.item()
    elapsed_ms = trainer.iter_dt * 1000
    print(f"iter_dt {elapsed_ms:.2f}ms; iter {trainer.iter_num}: train loss {loss_value:.5f}")
    losses.append(loss_value)
    
# Register the logging callback for the "on_batch_end" event, then start training.
trainer.set_callback("on_batch_end", batch_end_callback)
trainer.run()


iter_dt 0.00ms; iter 0: train loss 10.35037
iter_dt 140.00ms; iter 100: train loss 7.24491
iter_dt 149.28ms; iter 200: train loss 7.09906
iter_dt 150.49ms; iter 300: train loss 6.72025
iter_dt 145.54ms; iter 400: train loss 7.65948
iter_dt 150.51ms; iter 500: train loss 6.10308
iter_dt 155.29ms; iter 600: train loss 4.81339
iter_dt 155.49ms; iter 700: train loss 4.56344
iter_dt 152.79ms; iter 800: train loss 3.59995
iter_dt 145.43ms; iter 900: train loss 3.75859
iter_dt 152.75ms; iter 1000: train loss 3.70889
iter_dt 156.75ms; iter 1100: train loss 5.32608
iter_dt 157.64ms; iter 1200: train loss 3.58974
iter_dt 145.54ms; iter 1300: train loss 3.13748
iter_dt 153.42ms; iter 1400: train loss 4.14285
iter_dt 157.55ms; iter 1500: train loss 3.27124
iter_dt 162.90ms; iter 1600: train loss 3.22933
iter_dt 153.73ms; iter 1700: train loss 3.98922
iter_dt 156.08ms; iter 1800: train loss 2.12678
iter_dt 152.57ms; iter 1900: train loss 2.25691
iter_dt 150.02ms; iter 2000: train loss 2.64809
iter_

In [9]:
def model_info_to_json(model_name):
    """Dump the module-level ``losses`` list to ``eval/losses_6.json``.

    The ``eval`` directory is created when missing.

    Args:
        model_name: Accepted but not used by this function.
    """
    out_dir = Path("eval")
    out_dir.mkdir(exist_ok=True)

    target = out_dir / "losses_6.json"
    target.write_text(json.dumps(losses))


In [10]:
# Persist the recorded losses, then save the trained weights (state dict only).
model_info_to_json("gpt-mini")
torch.save(model.state_dict(), "gpt_mini.pth")

<!-- @format -->

### Model GPT micro

- Średni czas na iterację: 80 ms
- Łączna liczba iteracji: 20 000
- Loss: ok. 6


# Generating

In [37]:
# Output folder for the generated MIDI files; created together with any missing parents
gen_results_path = Path("gen_res")
gen_results_path.mkdir(parents=True, exist_ok=True)

In [38]:
from mingpt.model import GPT

# Rebuild the same "gpt-mini" architecture that was trained, so the saved
# weights can be loaded into it. (The earlier "gpt-nano" assignment was dead
# code: it was overwritten before the model was built.)
model_config = GPT.get_default_config()
model_config.model_type = "gpt-mini"

# Must match the values used at training time
model_config.vocab_size = len(tokenizer)
model_config.block_size = 1024


# Generation below feeds CUDA tensors, so the model is placed on the GPU
model = GPT(model_config).to("cuda")

number of parameters: 8.63M


In [39]:
path_to_model = "gpt_mini.pth"
# weights_only=True restricts unpickling to tensors and plain containers, which
# is all a state dict holds; it also avoids the FutureWarning torch.load emits
# when the argument is omitted.
state_dict = torch.load(path_to_model, weights_only=True)
model.load_state_dict(state_dict)
# Switch to inference mode; the trailing semicolon keeps the cell from echoing the module repr
model.eval();

  model.load_state_dict(torch.load(path_to_model))


In [40]:
from tqdm import tqdm
from copy import deepcopy

# Collator used for prompting: prompts are padded on the left.
# NOTE(review): `eos_token = None` presumably stops the collator from adding an
# EOS token to the prompts - confirm against the miditok DataCollator docs.
collator = DataCollator(tokenizer["PAD_None"], copy_inputs_as_labels=True)
collator.pad_on_left = True
collator.eos_token = None
dataloader_test = DataLoader(dataset_test, batch_size=4, collate_fn=collator)

count = 0  # running index used to name the output files
for batch in tqdm(dataloader_test, desc="Testing model / Generating results"):
    res = model.generate(
        idx=batch["input_ids"].to("cuda"),
        max_new_tokens=800,
        do_sample=True,
        top_k=50,
        temperature=0.87,
    )

    # Save every generated sample as a MIDI file with three tracks
    for prompt, continuation in zip(batch["input_ids"], res):
        # `continuation` holds the prompt followed by the new tokens
        generated = continuation[len(prompt) :]

        # Track 0: the generated part only. `.tolist()` already returns a
        # fresh list, so no additional copy is needed before decoding.
        midi = tokenizer.decode([generated.tolist()])

        # Tracks 1 and 2: the prompt alone, then prompt + continuation
        for tok_seq in (prompt, continuation):
            _midi = tokenizer.decode([tok_seq.tolist()])
            midi.tracks.append(_midi.tracks[0])

        midi_name = [
            f"Continuation of original sample ({len(generated)} tokens)",
            f"Original sample ({len(prompt)} tokens)",
            "Original sample and continuation",
        ]

        for i in range(min(len(midi.tracks), len(midi_name))):
            midi.tracks[i].name = midi_name[i]
        midi.dump_midi(gen_results_path / f"{count}.mid")

        count += 1

Testing model / Generating results:   0%|          | 10/3114 [05:08<26:35:15, 30.84s/it]


KeyboardInterrupt: 