In [1]:
from pathlib import Path
from random import shuffle

# from evaluate import load as load_metric
from miditok import REMI
from miditok.pytorch_data import DatasetMIDI, DataCollator
from miditok.utils import split_files_for_training
from miditok.data_augmentation import augment_dataset
from torch import Tensor, argmax
from torch.utils.data import DataLoader
from transformers.trainer_utils import set_seed
import sys
import os

# Pobranie ścieżki do folderu głównego projektu (dostosuj, jeśli to konieczne)
project_path = os.path.abspath("../minGPT_")
# Dodanie tej ścieżki do `sys.path`
if project_path not in sys.path:
    sys.path.append(project_path)

# Teraz powinno działać
from minGPT_.projects.midi.midi import MidiDataset

In [2]:
tokenizer_path = Path("tokenizer_filtered.json")
tokenizer = REMI(params=tokenizer_path)

pad_token = tokenizer["PAD_None"]
tokenizer_path

WindowsPath('tokenizer_filtered.json')

In [3]:
dir_name = "filtered_midi"

# Create Dataset and Collator for training

midi_paths_train = list(Path(f"{dir_name}/Maestro_train").glob("**/*.mid")) + list(
    Path(f"{dir_name}/Maestro_train").glob("**/*.midi")
)

midi_paths_valid = list(Path(f"{dir_name}/Maestro_valid").glob("**/*.mid")) + list(
    Path(f"{dir_name}/Maestro_valid").glob("**/*.midi")
)

midi_paths_test = list(Path(f"{dir_name}/Maestro_test").glob("**/*.mid")) + list(
    Path(f"{dir_name}/Maestro_test").glob("**/*.midi")
)

kwargs_dataset = {
    "max_seq_len": 1024,
    "tokenizer": tokenizer,
    "bos_token_id": tokenizer["BOS_None"],
    "eos_token_id": tokenizer["EOS_None"],
}

dataset_train = DatasetMIDI(midi_paths_train, **kwargs_dataset)

dataset_valid = DatasetMIDI(midi_paths_valid, **kwargs_dataset)

dataset_test = DatasetMIDI(midi_paths_test, **kwargs_dataset)
print(midi_paths_train[0])

filtered_midi\Maestro_train\Blues\(Sittin On) The Dock Of The Bay_0.mid


In [16]:
tran_config = {"max_seq_len": 1024, "pad_token_id": pad_token, "pred_num": 4}

train_dataset = MidiDataset(dataset_train, **tran_config)
valid_dataset = MidiDataset(dataset_valid, **tran_config)
test_dataset = MidiDataset(dataset_test, **tran_config)

In [17]:
from mingpt.model import GPT

model_config = GPT.get_default_config()
model_config.model_type = "gpt-nano"
# model_config.model_type = "gpt-micro"
model_config.model_type = "gpt-mini"

model_config.vocab_size = len(tokenizer)
model_config.block_size = 1024


model = GPT(model_config)

number of parameters: 8.63M


In [6]:
print(model_config.n_head, model_config.n_layer, model_config.n_embd)

3 3 48


In [22]:
import torch
from mingpt.trainer import Trainer


torch.cuda.empty_cache()

train_config = Trainer.get_default_config()

train_config.learning_rate = (
    2e-6  # the model we're using is so small that we can go a bit faster
)
train_config.batch_size = 4
train_config.max_iters = 5000
train_config.num_workers = 4
# train_config.weight_decay = 0.01
train_config.lr_decay = True
trainer = Trainer(train_config, model, train_dataset)

running on device cuda


In [23]:
import json
import torch


# losses = []


# def model_info_to_json(model_name):
#     path = Path(f"eval/{model_name}")
#     torch.save(model.state_dict(), f"{path}.pth")
#     path_json = path.with_suffix(".json")

#     json.dump(losses, open(path_json, "w"))
#     print(f"Model weights saved to {path}.pth")
#     print(f"Model losses saved to {path_json}")


losses = []
def model_info_to_json(model_name):
    (path := Path("eval")).mkdir(exist_ok=True)
    
    path_json_l = f"{path}/losses.json"
    # path_json_c = f"{path}/config.json"
    # path_torch = f"{path}/model.pth"

    # torch.save(model.state_dict(), path_torch)
    json.dump(losses, path_json_l)
    # json.dump(config, open(path_json_c, "w"))
    # print(f"Model weights saved to {path_torch}")
    # print(f"Model losses saved to {path_json_l}")
    # print(f"Model config saved to {path_json_c}")


def batch_end_callback(trainer):
    if trainer.iter_num % 100 == 0:
        print(f"iter_dt {trainer.iter_dt * 1000:.2f}ms; iter {trainer.iter_num}: train loss {trainer.loss.item():.5f}")
        losses.append([iter ,trainer.loss.item()])
    
trainer.set_callback("on_batch_end", batch_end_callback)
trainer.run()


iter_dt 0.00ms; iter 0: train loss 1.93820
iter_dt 208.42ms; iter 100: train loss 2.21672
iter_dt 204.29ms; iter 200: train loss 1.65771
iter_dt 198.80ms; iter 300: train loss 2.08820
iter_dt 184.01ms; iter 400: train loss 1.70446
iter_dt 202.76ms; iter 500: train loss 1.75010
iter_dt 218.72ms; iter 600: train loss 1.93416
iter_dt 221.95ms; iter 700: train loss 2.03775
iter_dt 167.96ms; iter 800: train loss 2.21131
iter_dt 155.20ms; iter 900: train loss 2.08227
iter_dt 152.00ms; iter 1000: train loss 1.95207
iter_dt 156.73ms; iter 1100: train loss 2.44804
iter_dt 301.43ms; iter 1200: train loss 2.22825


KeyboardInterrupt: 

In [9]:
#model_info_to_json("gpt-mini")
torch.save(model.state_dict(), "gpt_mini_test.pth")

<!-- @format -->

### Model GPT micro

śr czas na iteracje 80ms
łączna ilość iteracji - 20 000
loss~6


In [10]:
model.eval()
None

In [11]:
(gen_results_path := Path("gen_res")).mkdir(parents=True, exist_ok=True)

In [12]:
from mingpt.model import GPT

model_config = GPT.get_default_config()
model_config.model_type = "gpt-mini"
model_config.model_type = "gpt-nano"

model_config.vocab_size = len(tokenizer)
model_config.block_size = 1024


model_loaded = GPT(model_config)
path_to_model = Path("trained/model_nano_1_0.pth")

number of parameters: 1.57M


In [17]:
model_loaded.load_state_dict(torch.load(path_to_model))

  model_loaded.load_state_dict(torch.load(path_to_model))


<All keys matched successfully>

In [14]:
from tqdm import tqdm
from copy import deepcopy

collator = DataCollator(tokenizer["PAD_None"], copy_inputs_as_labels=True)
collator.pad_on_left = True
collator.eos_token = None
dataloader_test = DataLoader(dataset_test, batch_size=4, collate_fn=collator)

count = 0
for batch in tqdm(dataloader_test, desc="Testing model / Generating results"):
    res = model.generate(
        idx=batch["input_ids"].to("cuda"), max_new_tokens=100, do_sample=True, top_k=10
    )

    # Saves the generated music, as MIDI files and tokens (json)
    for prompt, continuation in zip(batch["input_ids"], res):
        generated = continuation[len(prompt) :]
        midi = tokenizer.decode([deepcopy(generated.tolist())])

        tokens = [generated, prompt, continuation]
        tokens = [seq.tolist() for seq in tokens]

        for tok_seq in tokens[1:]:
            _midi = tokenizer.decode([deepcopy(tok_seq)])
            midi.tracks.append(_midi.tracks[0])

        # midi.tracks[0].name = (
        #     f"Continuation of original sample ({len(generated)} tokens)"
        # )
        # midi.tracks[1].name = f"Original sample ({len(prompt)} tokens)"
        # midi.tracks[2].name = f"Original sample and continuation"
        midi_name = [f"Continuation of original sample ({len(generated)} tokens)", f"Original sample ({len(prompt)} tokens)", f"Original sample and continuation"]

        for i in range(min(len(midi.tracks), len(midi_name))):
            midi.tracks[i].name = midi_name[i]
        midi.dump_midi(gen_results_path / f"{count}.mid")
        # tokenizer.save_tokens(tokens, gen_results_path / f'{count}.json')

        count += 1

Testing model / Generating results:   0%|          | 3/2172 [00:39<7:55:39, 13.16s/it]


KeyboardInterrupt: 