In [42]:
# USER OPTIONS
# Tokenizer name - must be one of the keys of the `tokenizers` dictionary
# that is defined in a later cell.
tokenizer_name = 'ChordSymbolTokenizer' # or any other key of the tokenizers dictionary
# Folder containing the XML files used to build the validation dataset.
# NOTE(review): this is an absolute, machine-specific path - adjust it to your environment.
val_dir = '/media/maindisk/maximos/data/gjt_melodies/Library_melodies'
# Task selection: generation or masked language modelling (MLM).
generation = False # True if generation, False if MLM
# Batch size - choose it depending on GPU availability / status.
batchsize = 16
# Device name - could be 'cpu', 'cuda', 'cuda:0', 'cuda:1'...
device_name = 'cuda'

In [41]:
from data_utils import MergedMelHarmDataset, MLMCollator, GenCollator
import os
import numpy as np
from harmony_tokenizers_m21 import ChordSymbolTokenizer, RootTypeTokenizer, \
    PitchClassTokenizer, RootPCTokenizer, GCTRootPCTokenizer, \
    GCTSymbolTokenizer, GCTRootTypeTokenizer, MelodyPitchTokenizer, \
    MergedMelHarmTokenizer
from torch.utils.data import DataLoader
from transformers import RobertaConfig, RobertaForMaskedLM, AutoConfig, GPT2LMHeadModel
import torch
from tqdm import tqdm

In [43]:
# Registry of the selectable harmony tokenizer classes. Each entry is keyed by
# the name under which the class is imported; `tokenizer_name` picks one of them.
tokenizers = dict(
    ChordSymbolTokenizer=ChordSymbolTokenizer,
    RootTypeTokenizer=RootTypeTokenizer,
    PitchClassTokenizer=PitchClassTokenizer,
    RootPCTokenizer=RootPCTokenizer,
    GCTRootPCTokenizer=GCTRootPCTokenizer,
    GCTSymbolTokenizer=GCTSymbolTokenizer,
    GCTRootTypeTokenizer=GCTRootTypeTokenizer,
)

In [44]:
# Restore the melody tokenizer and the selected harmony tokenizer from the
# local `saved_tokenizers` folder, then wrap both in a single merged tokenizer.
tokenizer_root = 'saved_tokenizers'
melody_tokenizer = MelodyPitchTokenizer.from_pretrained(f'{tokenizer_root}/MelodyPitchTokenizer')
harmony_tokenizer_cls = tokenizers[tokenizer_name]
harmony_tokenizer = harmony_tokenizer_cls.from_pretrained(f'{tokenizer_root}/{tokenizer_name}')

tokenizer = MergedMelHarmTokenizer(
    melody_tokenizer,
    harmony_tokenizer,
)

In [45]:
# Build the collator, validation dataset and checkpoint path for the selected task.
# Checkpoints live under saved_models/<task>/<tokenizer_name>/<tokenizer_name>.pt
task_dir = 'gen' if generation else 'mlm'
model_path = 'saved_models/' + task_dir + '/' + tokenizer_name + '/' + tokenizer_name + '.pt'

if generation:
    collator = GenCollator(tokenizer)
    # the generation task additionally asks the dataset for harmonization labels
    val_dataset = MergedMelHarmDataset(val_dir, tokenizer, max_length=2048, return_harmonization_labels=True)
else:
    collator = MLMCollator(tokenizer)
    val_dataset = MergedMelHarmDataset(val_dir, tokenizer, max_length=2048)

# Validation does not benefit from shuffling. The metrics computed later are
# means of per-batch values, so a fixed order keeps the batch composition (and
# therefore the reported numbers) stable from run to run.
valloader = DataLoader(val_dataset, batch_size=batchsize, shuffle=False, collate_fn=collator)

In [None]:
# Instantiate the architecture matching the selected task, then load the
# trained weights from `model_path` and switch the model to evaluation mode.

# Special-token ids used by both architectures, looked up in the merged vocabulary.
merged_vocab = tokenizer.vocab
special_token_ids = {
    'pad_token_id': merged_vocab[tokenizer.pad_token],
    'bos_token_id': merged_vocab[tokenizer.bos_token],
    'eos_token_id': merged_vocab[tokenizer.eos_token],
}

if not generation:
    # Masked language model: RoBERTa encoder built from scratch.
    model_config = RobertaConfig(
        vocab_size=len(merged_vocab),
        hidden_size=256,
        num_hidden_layers=4,
        num_attention_heads=4,
        mask_token_id=merged_vocab[tokenizer.mask_token],
        max_position_embeddings=2048,
        **special_token_ids,
    )
    model = RobertaForMaskedLM(model_config)
else:
    # Generation: GPT-2 configuration with the sizes overridden below.
    config = AutoConfig.from_pretrained(
        "gpt2",
        vocab_size=len(merged_vocab),
        n_positions=2048,
        n_layer=4,
        n_head=4,
        n_embd=256,
        **special_token_ids,
    )
    model = GPT2LMHeadModel(config)

# Weights are first mapped to CPU; moving the model to a device is a separate step.
checkpoint = torch.load(model_path, map_location="cpu", weights_only=True)
model.load_state_dict(checkpoint)

model.eval()

In [None]:
# Resolve the torch device requested in the user options and move the model to it.
if device_name != 'cpu' and torch.cuda.is_available():
    device = torch.device(device_name)
else:
    if device_name != 'cpu':
        # A CUDA device was requested but CUDA is unavailable. Without this
        # fallback `device` would stay undefined (or keep a stale value from
        # earlier kernel state) and `model.to(device)` below would fail.
        print('Selected device not available: ' + device_name + ' - falling back to cpu')
    device = torch.device('cpu')
model.to(device)

In [None]:
# Validation pass over `valloader`.
# Reports the running mean of the per-batch loss and of the per-batch accuracy,
# where accuracy is measured only on supervised positions (labels != -100).
val_loss = 0
running_loss = 0
batch_num = 0
running_accuracy = 0
val_accuracy = 0
print('validation')
with torch.no_grad():
    with tqdm(valloader, unit='batch') as tepoch:
        tepoch.set_description('Running')
        for batch in tepoch:
            input_ids = batch['input_ids'].to(device)
            labels = batch['labels'].to(device)

            # Positions that carry a target; -100 marks positions to ignore.
            mask = labels != -100
            num_targets = mask.sum().item()
            if num_targets == 0:
                # Nothing to score in this batch: the accuracy ratio would
                # divide by zero and a mean loss over zero targets is NaN,
                # which would poison the running averages. Skip the batch.
                continue

            if generation:
                attention_mask = batch['attention_mask'].to(device)
                outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                # Shift predictions one step to the right along the sequence
                # axis (no shift along the batch axis) so that the prediction
                # made at position t-1 is compared with the label at position t.
                predictions = outputs.logits.argmax(dim=-1).roll(shifts=(0,1), dims=(0,1))
            else:
                outputs = model(input_ids, labels=labels)
                predictions = outputs.logits.argmax(dim=-1)
            loss = outputs.loss

            # update loss
            batch_num += 1
            running_loss += loss.item()
            val_loss = running_loss/batch_num
            # accuracy
            running_accuracy += (predictions[mask] == labels[mask]).sum().item()/num_targets
            val_accuracy = running_accuracy/batch_num

            tepoch.set_postfix(loss=val_loss, accuracy=val_accuracy)