In [21]:
from data_utils import MergedMelHarmDataset, MLMCollator
import os
import numpy as np
from harmony_tokenizers_m21 import ChordSymbolTokenizer, RootTypeTokenizer, \
    PitchClassTokenizer, RootPCTokenizer, GCTRootPCTokenizer, \
    GCTSymbolTokenizer, GCTRootTypeTokenizer, MelodyPitchTokenizer, \
    MergedMelHarmTokenizer
from torch.utils.data import DataLoader
from transformers import RobertaConfig, RobertaForMaskedLM
import torch
from torch.optim import AdamW
from tqdm import tqdm

In [22]:
# root_dir = '/media/datadisk/datasets/hooktheory_xmls'
# root_dir = 'data/gjt_melodies/Library_melodies/'
root_dir = '/media/maindisk/maximos/data/gjt_melodies/Library_melodies/'
# data_files = []

# # Walk through all subdirectories and files
# for dirpath, _, filenames in os.walk(root_dir):
#     for file in filenames:
#         if file.endswith('.xml') or file.endswith('.mxl'):
#             full_path = os.path.join(dirpath, file)
#             data_files.append(full_path)

# print('Total files from Hook Theory dataset:', len(data_files))

In [23]:
chordSymbolTokenizer = ChordSymbolTokenizer.from_pretrained('saved_tokenizers/ChordSymbolTokenizer')
rootTypeTokenizer = RootTypeTokenizer.from_pretrained('saved_tokenizers/RootTypeTokenizer')
pitchClassTokenizer = PitchClassTokenizer.from_pretrained('saved_tokenizers/PitchClassTokenizer')
rootPCTokenizer = RootPCTokenizer.from_pretrained('saved_tokenizers/RootPCTokenizer')
gctRootPCTokenizer = GCTRootPCTokenizer.from_pretrained('saved_tokenizers/GCTRootPCTokenizer')
gctSymbolTokenizer = GCTSymbolTokenizer.from_pretrained('saved_tokenizers/GCTSymbolTokenizer')
gctRootTypeTokenizer = GCTRootTypeTokenizer.from_pretrained('saved_tokenizers/GCTRootTypeTokenizer')
melodyPitchTokenizer = MelodyPitchTokenizer.from_pretrained('saved_tokenizers/MelodyPitchTokenizer')

In [24]:
m_chordSymbolTokenizer = MergedMelHarmTokenizer(melodyPitchTokenizer, chordSymbolTokenizer)
m_rootTypeTokenizer = MergedMelHarmTokenizer(melodyPitchTokenizer, rootTypeTokenizer)
m_pitchClassTokenizer = MergedMelHarmTokenizer(melodyPitchTokenizer, pitchClassTokenizer)
m_rootPCTokenizer = MergedMelHarmTokenizer(melodyPitchTokenizer, rootPCTokenizer)
m_gctRootPCTokenizer = MergedMelHarmTokenizer(melodyPitchTokenizer, gctRootPCTokenizer)
m_gctSymbolTokenizer = MergedMelHarmTokenizer(melodyPitchTokenizer, gctSymbolTokenizer)
m_gctRootTypeTokenizer = MergedMelHarmTokenizer(melodyPitchTokenizer, gctRootTypeTokenizer)

In [25]:
tokenizer = m_chordSymbolTokenizer

dataset = MergedMelHarmDataset(root_dir, tokenizer, max_length=2048)
collator = MLMCollator(tokenizer)

dataloader = DataLoader(dataset, batch_size=16, shuffle=True, collate_fn=collator)


In [26]:
b = next(iter(dataloader))
print(b)

{'input_ids': tensor([[ 2,  6, 95,  ...,  1,  1,  1],
        [ 2,  5, 95,  ...,  1,  1,  1],
        [ 2,  6, 95,  ...,  1,  1,  1],
        ...,
        [ 2,  6, 95,  ...,  1,  1,  1],
        [ 2,  6, 95,  ...,  1,  1,  1],
        [ 2,  6, 95,  ...,  1,  1,  1]]), 'labels': tensor([[-100, -100, -100,  ..., -100, -100, -100],
        [-100,    6, -100,  ..., -100, -100, -100],
        [-100, -100, -100,  ..., -100, -100, -100],
        ...,
        [-100, -100, -100,  ..., -100, -100, -100],
        [-100, -100, -100,  ..., -100, -100, -100],
        [-100, -100, -100,  ..., -100, -100, -100]])}


In [27]:
print(b['input_ids'])
print(b['labels'])

tensor([[ 2,  6, 95,  ...,  1,  1,  1],
        [ 2,  5, 95,  ...,  1,  1,  1],
        [ 2,  6, 95,  ...,  1,  1,  1],
        ...,
        [ 2,  6, 95,  ...,  1,  1,  1],
        [ 2,  6, 95,  ...,  1,  1,  1],
        [ 2,  6, 95,  ...,  1,  1,  1]])
tensor([[-100, -100, -100,  ..., -100, -100, -100],
        [-100,    6, -100,  ..., -100, -100, -100],
        [-100, -100, -100,  ..., -100, -100, -100],
        ...,
        [-100, -100, -100,  ..., -100, -100, -100],
        [-100, -100, -100,  ..., -100, -100, -100],
        [-100, -100, -100,  ..., -100, -100, -100]])


In [28]:
model_config = RobertaConfig(
    vocab_size=len(tokenizer.vocab),
    hidden_size=256,
    num_hidden_layers=4,
    num_attention_heads=4,
    pad_token_id=tokenizer.vocab[tokenizer.pad_token],
    bos_token_id=tokenizer.vocab[tokenizer.bos_token],
    eos_token_id=tokenizer.vocab[tokenizer.eos_token],
    mask_token_id=tokenizer.vocab[tokenizer.mask_token],
    max_position_embeddings=2048,
)

In [29]:
model = RobertaForMaskedLM(model_config)
model.train()

RobertaForMaskedLM(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(545, 256, padding_idx=1)
      (position_embeddings): Embedding(2048, 256, padding_idx=1)
      (token_type_embeddings): Embedding(2, 256)
      (LayerNorm): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-3): 4 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=256, out_features=256, bias=True)
              (key): Linear(in_features=256, out_features=256, bias=True)
              (value): Linear(in_features=256, out_features=256, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=256, out_features=256, bias=True)
              (LayerNorm):

In [30]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
optimizer = AdamW(model.parameters(), lr=5e-5)

# Training loop
for epoch in range(3):  # Number of epochs
    train_loss = 0
    running_loss = 0
    batch_num = 0
    running_accuracy = 0
    train_accuracy = 0
    with tqdm(dataloader, unit='batch') as tepoch:
        tepoch.set_description(f"Epoch {epoch} | trn")
        for batch in tepoch:
            input_ids = batch["input_ids"].to(device)
            labels = batch["labels"].to(device)
            
            outputs = model(input_ids, labels=labels)
            loss = outputs.loss
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # update loss
            batch_num += 1
            running_loss += loss.item()
            train_loss = running_loss/batch_num
            # accuracy
            predictions = outputs.logits.argmax(dim=-1)
            mask = labels != -100
            running_accuracy += (predictions[mask] == labels[mask]).sum().item()/mask.sum().item()
            train_accuracy = running_accuracy/batch_num
            
            tepoch.set_postfix(loss=train_loss, accuracy=train_accuracy) # tepoch.set_postfix(loss=loss.item(), accuracy=100. * accuracy)

Epoch 0 | trn:   0%|          | 0/41 [00:00<?, ?batch/s]

Epoch 0 | trn:   0%|          | 0/41 [00:01<?, ?batch/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 384.00 MiB. GPU 0 has a total capacity of 9.77 GiB of which 235.06 MiB is free. Including non-PyTorch memory, this process has 9.52 GiB memory in use. Of the allocated memory 9.05 GiB is allocated by PyTorch, and 209.75 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)