In [6]:
import os
from data_utils import MergedMelHarmDataset, PureGenCollator
from harmony_tokenizers_m21 import ChordSymbolTokenizer, RootTypeTokenizer, \
    PitchClassTokenizer, RootPCTokenizer, GCTRootPCTokenizer, \
    GCTSymbolTokenizer, GCTRootTypeTokenizer, MelodyPitchTokenizer, \
    MergedMelHarmTokenizer
from torch.utils.data import DataLoader
from tqdm import tqdm
import pickle
import pandas as pd

In [7]:
# Root folder of the test set; it is searched recursively below.
root_dir = '/media/maindisk/maximos/data/hooktheory_test'

# Collect the full path of every .xml / .mxl file found anywhere under
# root_dir, in the order os.walk yields them.
score_suffixes = ('.xml', '.mxl')
data_files = [
    os.path.join(folder, fname)
    for folder, _, fnames in os.walk(root_dir)
    for fname in fnames
    if fname.endswith(score_suffixes)
]
print(len(data_files))

1520


In [8]:
# Lookup table from a harmony tokenizer's name to its class, so that a
# tokenizer can be selected by name.
tokenizers = dict(
    ChordSymbolTokenizer=ChordSymbolTokenizer,
    RootTypeTokenizer=RootTypeTokenizer,
    PitchClassTokenizer=PitchClassTokenizer,
    RootPCTokenizer=RootPCTokenizer,
    GCTRootPCTokenizer=GCTRootPCTokenizer,
    GCTSymbolTokenizer=GCTSymbolTokenizer,
    GCTRootTypeTokenizer=GCTRootTypeTokenizer,
)

In [9]:
# Configuration: which harmony tokenizer to use (a key of `tokenizers`),
# where the validation data lives, and the batch size of the loader.
tokenizer_name = 'ChordSymbolTokenizer'
val_dir = root_dir
batchsize = 16

# Load the saved melody and harmony tokenizers and merge them into one.
melody_tokenizer = MelodyPitchTokenizer.from_pretrained(
    'saved_tokenizers/MelodyPitchTokenizer'
)
harmony_tokenizer = tokenizers[tokenizer_name].from_pretrained(
    f'saved_tokenizers/{tokenizer_name}'
)
tokenizer = MergedMelHarmTokenizer(melody_tokenizer, harmony_tokenizer)

# Build the dataset over val_dir and wrap it in a non-shuffled DataLoader
# that batches items with the PureGenCollator.
val_dataset = MergedMelHarmDataset(
    val_dir,
    tokenizer,
    max_length=512,
    return_harmonization_labels=True,
    num_bars=8,
)
collator = PureGenCollator(tokenizer)
valloader = DataLoader(
    val_dataset,
    batch_size=batchsize,
    shuffle=False,
    collate_fn=collator,
)

In [10]:
# Disabled sanity check (kept commented out, so this cell does nothing):
# when enabled it iterates over every batch of `valloader` with a tqdm
# progress bar, counts the entries of batch['input_ids'], and prints the
# total number of items the loader yields.
# tmp_counter = 0
# with tqdm(valloader, unit='batch') as tepoch:
#     tepoch.set_description(f'run')
#     for batch in tepoch:
#         for b in batch['input_ids']:
#             tmp_counter += 1
# print(tmp_counter)

In [11]:
# Load the pickled results stored for gpt_1.0 / ChordSymbolTokenizer and
# report how many entries its 'generated' field holds.
# NOTE(review): pickle.load can execute arbitrary code while loading; only
# open pickle files that come from a trusted source.
with open('tokenized/gpt_1.0/ChordSymbolTokenizer.pickle', mode='rb') as pickle_fh:
    p = pickle.load(pickle_fh)
m = p['generated']
print(len(m))

1520


In [13]:
# Inspect the 'generated' column of the gpt_0.8 / ChordSymbolTokenizer CSV:
# print its length, its size, and the column itself, one per line.
c = pd.read_csv('tokenized/gpt_0.8/ChordSymbolTokenizer.csv')
generated_col = c['generated']
print(len(generated_col), generated_col.size, generated_col, sep='\n')

1520
1520
0       <h> <bar> position_0x00 C:maj <bar> position_0...
1       <h> <bar> position_0x00 C:maj <bar> position_0...
2       <h> <bar> position_0x00 G:maj <bar> position_0...
3       <h> <bar> position_0x00 C:maj position_2x00 F:...
4       <h> <bar> position_0x00 D:min position_3x50 C:...
                              ...                        
1515    <h> <bar> position_0x00 C:maj position_2x00 D:...
1516    <h> <bar> position_0x00 F:maj position_1x50 F:...
1517    <h> <bar> position_0x00 A:min <bar> position_0...
1518    <h> <bar> position_0x00 F:maj position_2x00 C:...
1519    <h> <bar> position_0x00 E:min <bar> position_0...
Name: generated, Length: 1520, dtype: object


In [15]:
# Check that every (folder, tokenizer) CSV under tokenized/ holds the expected
# number of rows in its 'generated' column.
tokenized_folders = ['gpt_1.0', 'gpt_0.8', 'gpt_1.2',
                     'bart_1.0', 'bart_0.8', 'bart_1.2']
tokenizer_names = ['ChordSymbolTokenizer', 'RootTypeTokenizer',
                   'PitchClassTokenizer', 'RootPCTokenizer']

# 1520 is the number of score files that were found under root_dir when this
# notebook was run; update it if the test set changes.
expected_rows = 1520

for tok_folder in tokenized_folders:
    # The inner loop variable is deliberately NOT called `tokenizer_name`:
    # that name holds the tokenizer selected in the setup cell, and reusing
    # it here would silently leave it pointing at the last list entry.
    for tok_name in tokenizer_names:
        csv_path = f'tokenized/{tok_folder}/{tok_name}.csv'
        c = pd.read_csv(csv_path)
        n_generated = len(c['generated'])
        print(tok_folder, '-', tok_name, ':', n_generated)
        # Name the folder as well as the tokenizer so a failure identifies
        # exactly which file is incomplete.
        assert n_generated == expected_rows, (
            f'Error: {tok_folder}/{tok_name} has {n_generated} rows, '
            f'expected {expected_rows}'
        )

gpt_1.0 - ChordSymbolTokenizer : 1520
gpt_1.0 - RootTypeTokenizer : 1520
gpt_1.0 - PitchClassTokenizer : 1520
gpt_1.0 - RootPCTokenizer : 1520
gpt_0.8 - ChordSymbolTokenizer : 1520
gpt_0.8 - RootTypeTokenizer : 1520
gpt_0.8 - PitchClassTokenizer : 1520
gpt_0.8 - RootPCTokenizer : 1520
gpt_1.2 - ChordSymbolTokenizer : 1520
gpt_1.2 - RootTypeTokenizer : 1520
gpt_1.2 - PitchClassTokenizer : 1520
gpt_1.2 - RootPCTokenizer : 1520
bart_1.0 - ChordSymbolTokenizer : 1520
bart_1.0 - RootTypeTokenizer : 1520
bart_1.0 - PitchClassTokenizer : 1520
bart_1.0 - RootPCTokenizer : 1520
bart_0.8 - ChordSymbolTokenizer : 1520
bart_0.8 - RootTypeTokenizer : 1520
bart_0.8 - PitchClassTokenizer : 1520
bart_0.8 - RootPCTokenizer : 1520
bart_1.2 - ChordSymbolTokenizer : 1520
bart_1.2 - RootTypeTokenizer : 1520
bart_1.2 - PitchClassTokenizer : 1520
bart_1.2 - RootPCTokenizer : 1520
