In [1]:
import os
import csv
import zlib
import numpy as np
from harmony_tokenizers_m21 import ChordSymbolTokenizer, RootTypeTokenizer, \
    PitchClassTokenizer, RootPCTokenizer, GCTRootPCTokenizer, \
    GCTSymbolTokenizer, GCTRootTypeTokenizer, MelodyPitchTokenizer, \
    MergedMelHarmTokenizer

In [2]:
# root_dir = '/media/datadisk/datasets/hooktheory_xmls'
root_dir = 'data/gjt_melodies/Library_melodies/'


def find_score_files(root, extensions=('.xml', '.mxl')):
    """Recursively collect score files below ``root``.

    Parameters:
        root (str): Directory to walk, including all of its subdirectories.
        extensions (str or iterable of str): File-name suffix(es) to keep.
            Matching is case-sensitive.

    Returns:
        list[str]: Paths of the matching files (each joined onto the directory
            it was found in), sorted so the order is identical on every run.
    """
    # A bare string must be wrapped, otherwise tuple() would split it into characters.
    suffixes = (extensions,) if isinstance(extensions, str) else tuple(extensions)
    matches = []
    # Walk through all subdirectories and files
    for dirpath, _, filenames in os.walk(root):
        for file in filenames:
            # str.endswith accepts a tuple, so one call covers every suffix.
            if file.endswith(suffixes):
                matches.append(os.path.join(dirpath, file))
    # Directory entries come back in arbitrary, filesystem-dependent order; sorting
    # keeps data_files[0] (used for the example printouts) stable across runs.
    return sorted(matches)


data_files = find_score_files(root_dir)

# NOTE(review): the label says "Hook Theory" but root_dir points at gjt_melodies - confirm.
print('Total files from Hook Theory dataset:', len(data_files))

Total files from Hook Theory dataset: 650


In [3]:
# Registry of per-tokenizer statistics, keyed by the tokenizer name passed to the
# helper functions. Re-running this cell discards everything collected so far.
stats = dict()

def compute_compression_rate(array: np.ndarray, compression_method=zlib.compress) -> float:
    """
    Compute the compression rate of a NumPy array.

    The array is serialized with `tobytes()` and handed to `compression_method`;
    the rate is the compressed byte count divided by the raw byte count, so lower
    values mean the data is more compressible. The raw byte count depends on the
    array's dtype as well as on its length.

    Parameters:
        array (np.ndarray or array-like): The data to compress. Inputs that are not
                                          already arrays (e.g. a list of token ids)
                                          are converted with `np.asarray`.
        compression_method (callable): The compression method to use. It is called
                                       with a bytes object and must return an object
                                       supporting `len()`.
                                       Default is `zlib.compress`.

    Returns:
        float: The compression rate (compressed size / original size), or NaN when
               the input holds no bytes, because the ratio is undefined there.
    """
    # Convert the array to bytes
    array_bytes = np.asarray(array).tobytes()
    original_size = len(array_bytes)

    # An empty input would otherwise divide by zero below.
    if original_size == 0:
        return float('nan')

    # Compress the byte representation and relate the two sizes
    compressed_size = len(compression_method(array_bytes))
    return compressed_size / original_size

def initialize_stats(key, tokenizer):
    """Create (or overwrite) the entry for ``key`` in the module-level ``stats`` dict.

    Parameters:
        key (str): Name under which the statistics are stored.
        tokenizer: Object exposing a ``vocab`` attribute that supports ``len()``.

    The new entry holds the vocabulary size plus two empty lists that
    ``update_stats`` appends to.
    """
    record = dict(
        vocab_size=len(tokenizer.vocab),
        seq_lens=[],
        compression_rates=[],
    )
    stats[key] = record
# end initialize_stats

def update_stats(key, toks):
    """Append per-sequence measurements for ``key`` and refresh its summary statistics.

    Parameters:
        key (str): Name of an entry previously created in ``stats``.
        toks: Mapping whose ``'ids'`` item is an iterable of id sequences.

    For every sequence the length and the compression rate of ``np.array(sequence)``
    are appended to the entry's lists; mean and standard deviation of both lists
    are then recomputed over everything accumulated so far.
    """
    sequences = toks['ids']
    entry = stats[key]
    lengths = entry['seq_lens']
    rates = entry['compression_rates']
    for ids in sequences:
        lengths.append(len(ids))
        rates.append(compute_compression_rate(np.array(ids)))
    # Keep this assignment order: the summary keys must follow the three initial ones.
    entry['mean_len'] = np.mean(lengths)
    entry['std_len'] = np.std(lengths)
    entry['mean_compression'] = np.mean(rates)
    entry['std_compression'] = np.std(rates)
# end update_stats

def print_stats(key):
    """Print the vocabulary size and the four summary statistics stored for ``key``.

    Parameters:
        key (str): Name of an entry in ``stats`` whose summary fields have been computed.
    """
    entry = stats[key]
    labelled_fields = (
        ('vocab_size: ', 'vocab_size'),
        ('mean len: ', 'mean_len'),
        ('std len: ', 'std_len'),
        ('mean cr: ', 'mean_compression'),
        ('std cr: ', 'std_compression'),
    )
    for label, field in labelled_fields:
        print(label, entry[field])

In [4]:
# Chord-symbol tokenizer: build it, tokenize every file in data_files, show the
# first sequence as a sanity check, then record and print its statistics.
print('ChordSymbolTokenizer_m21')
chordSymbolTokenizer = ChordSymbolTokenizer()
print('len(chordSymbolTokenizer.vocab): ', len(chordSymbolTokenizer.vocab))
# Create the stats entry first so update_stats has lists to append to.
initialize_stats('ChordSymbolTokenizer', chordSymbolTokenizer)
# The tokenizer is called on the whole list of paths; the result is indexed by
# 'tokens' and 'ids' below.
toks_cs = chordSymbolTokenizer(data_files)
# First file only: token count, token strings, then the matching ids.
# In the recorded output each chord is a single token such as 'F:min7'.
print('example sentence length: ', len(toks_cs['tokens'][0]))
print(toks_cs['tokens'][0])
print(toks_cs['ids'][0])
update_stats('ChordSymbolTokenizer', toks_cs)
print_stats('ChordSymbolTokenizer')

ChordSymbolTokenizer_m21
len(chordSymbolTokenizer.vocab):  456


  return self.iter().getElementsByClass(classFilterList)
Processing Files: 100%|██████████| 650/650 [00:35<00:00, 18.22it/s]

example sentence length:  58
['<h>', '<bar>', 'position_0x00', 'F:min7', '<bar>', 'position_0x00', 'G:min7', '<bar>', 'position_0x00', 'G#:maj7', 'position_1x50', 'G:min7', '<bar>', 'position_0x00', 'F:min7', '<bar>', 'position_0x00', 'F:min7', '<bar>', 'position_0x00', 'G:min7', '<bar>', 'position_0x00', 'G#:maj7', 'position_1x50', 'G:min7', '<bar>', 'position_0x00', 'F:min7', '<bar>', 'position_0x00', 'D#:maj', '<bar>', 'position_0x00', 'D#:maj', '<bar>', 'position_0x00', 'C#:maj', 'position_1x50', 'D#:maj', '<bar>', 'position_0x00', 'F:min7', '<bar>', 'position_0x00', 'D#:maj', '<bar>', 'position_0x00', 'D#:maj', '<bar>', 'position_0x00', 'C#:maj', 'position_1x50', 'D#:maj', '<bar>', 'position_0x00', 'F:min7', '</s>']
[7, 6, 8, 261, 6, 8, 319, 6, 8, 347, 20, 319, 6, 8, 261, 6, 8, 261, 6, 8, 319, 6, 8, 347, 20, 319, 6, 8, 261, 6, 8, 195, 6, 8, 195, 6, 8, 137, 20, 195, 6, 8, 261, 6, 8, 195, 6, 8, 195, 6, 8, 137, 20, 195, 6, 8, 261, 3]
vocab_size:  456
mean len:  105.60307692307693
std




In [5]:
# Root/type tokenizer: same pipeline as the other tokenizer cells (build, tokenize
# all files, show the first sequence, record and print statistics).
print('RootTypeTokenizer')
rootTypeTokenizer = RootTypeTokenizer()
print('len(rootTypeTokenizer.vocab): ', len(rootTypeTokenizer.vocab))
# Create the stats entry first so update_stats has lists to append to.
initialize_stats('RootTypeTokenizer', rootTypeTokenizer)
toks_rt = rootTypeTokenizer(data_files)
# First file only. In the recorded output each chord is split into a root token
# and a type token (e.g. 'F', 'min7').
print('example sentence length: ', len(toks_rt['tokens'][0]))
print(toks_rt['tokens'][0])
print(toks_rt['ids'][0])
update_stats('RootTypeTokenizer', toks_rt)
print_stats('RootTypeTokenizer')

RootTypeTokenizer
len(rootTypeTokenizer.vocab):  149


Processing Files: 100%|██████████| 650/650 [01:03<00:00, 10.26it/s]

example sentence length:  78
['<h>', '<bar>', 'position_0x00', 'F', 'min7', '<bar>', 'position_0x00', 'G', 'min7', '<bar>', 'position_0x00', 'G#', 'maj7', 'position_1x50', 'G', 'min7', '<bar>', 'position_0x00', 'F', 'min7', '<bar>', 'position_0x00', 'F', 'min7', '<bar>', 'position_0x00', 'G', 'min7', '<bar>', 'position_0x00', 'G#', 'maj7', 'position_1x50', 'G', 'min7', '<bar>', 'position_0x00', 'F', 'min7', '<bar>', 'position_0x00', 'D#', 'maj', '<bar>', 'position_0x00', 'D#', 'maj', '<bar>', 'position_0x00', 'C#', 'maj', 'position_1x50', 'D#', 'maj', '<bar>', 'position_0x00', 'F', 'min7', '<bar>', 'position_0x00', 'D#', 'maj', '<bar>', 'position_0x00', 'D#', 'maj', '<bar>', 'position_0x00', 'C#', 'maj', 'position_1x50', 'D#', 'maj', '<bar>', 'position_0x00', 'F', 'min7', '</s>']
[7, 6, 8, 113, 128, 6, 8, 115, 128, 6, 8, 116, 127, 20, 115, 128, 6, 8, 113, 128, 6, 8, 113, 128, 6, 8, 115, 128, 6, 8, 116, 127, 20, 115, 128, 6, 8, 113, 128, 6, 8, 111, 120, 6, 8, 111, 120, 6, 8, 109, 120, 2




In [6]:
# Pitch-class tokenizer: build, tokenize all files, show the first sequence,
# record and print statistics.
print('PitchClassTokenizer')
pitchClassTokenizer = PitchClassTokenizer()
print('len(pitchClassTokenizer.vocab): ', len(pitchClassTokenizer.vocab))
# Create the stats entry first so update_stats has lists to append to.
initialize_stats('PitchClassTokenizer', pitchClassTokenizer)
toks_pc = pitchClassTokenizer(data_files)
# First file only. In the recorded output each chord is spelled as a run of
# 'chord_pc_N' tokens.
print('example sentence length: ', len(toks_pc['tokens'][0]))
print(toks_pc['tokens'][0])
print(toks_pc['ids'][0])
update_stats('PitchClassTokenizer', toks_pc)
print_stats('PitchClassTokenizer')

PitchClassTokenizer
len(pitchClassTokenizer.vocab):  120


Processing Files: 100%|██████████| 650/650 [00:46<00:00, 13.98it/s]

example sentence length:  110
['<h>', '<bar>', 'position_0x00', 'chord_pc_5', 'chord_pc_8', 'chord_pc_0', 'chord_pc_3', '<bar>', 'position_0x00', 'chord_pc_7', 'chord_pc_10', 'chord_pc_2', 'chord_pc_5', '<bar>', 'position_0x00', 'chord_pc_8', 'chord_pc_0', 'chord_pc_3', 'chord_pc_7', 'position_1x50', 'chord_pc_7', 'chord_pc_10', 'chord_pc_2', 'chord_pc_5', '<bar>', 'position_0x00', 'chord_pc_5', 'chord_pc_8', 'chord_pc_0', 'chord_pc_3', '<bar>', 'position_0x00', 'chord_pc_5', 'chord_pc_8', 'chord_pc_0', 'chord_pc_3', '<bar>', 'position_0x00', 'chord_pc_7', 'chord_pc_10', 'chord_pc_2', 'chord_pc_5', '<bar>', 'position_0x00', 'chord_pc_8', 'chord_pc_0', 'chord_pc_3', 'chord_pc_7', 'position_1x50', 'chord_pc_7', 'chord_pc_10', 'chord_pc_2', 'chord_pc_5', '<bar>', 'position_0x00', 'chord_pc_5', 'chord_pc_8', 'chord_pc_0', 'chord_pc_3', '<bar>', 'position_0x00', 'chord_pc_3', 'chord_pc_7', 'chord_pc_10', '<bar>', 'position_0x00', 'chord_pc_3', 'chord_pc_7', 'chord_pc_10', '<bar>', 'position




In [7]:
# Root + pitch-class tokenizer: build, tokenize all files, show the first sequence,
# record and print statistics.
print('RootPCTokenizer')
rootPCTokenizer = RootPCTokenizer()
print('len(rootPCTokenizer.vocab): ', len(rootPCTokenizer.vocab))
# Create the stats entry first so update_stats has lists to append to.
initialize_stats('RootPCTokenizer', rootPCTokenizer)
toks_rpc = rootPCTokenizer(data_files)
# First file only. In the recorded output each chord starts with a 'chord_root_N'
# token followed by 'chord_pc_N' tokens.
print('example sentence length: ', len(toks_rpc['tokens'][0]))
print(toks_rpc['tokens'][0])
print(toks_rpc['ids'][0])
update_stats('RootPCTokenizer', toks_rpc)
print_stats('RootPCTokenizer')

RootPCTokenizer
len(rootPCTokenizer.vocab):  132


Processing Files: 100%|██████████| 650/650 [01:06<00:00,  9.76it/s]

example sentence length:  110
['<h>', '<bar>', 'position_0x00', 'chord_root_5', 'chord_pc_8', 'chord_pc_0', 'chord_pc_3', '<bar>', 'position_0x00', 'chord_root_7', 'chord_pc_10', 'chord_pc_2', 'chord_pc_5', '<bar>', 'position_0x00', 'chord_root_8', 'chord_pc_0', 'chord_pc_3', 'chord_pc_7', 'position_1x50', 'chord_root_7', 'chord_pc_10', 'chord_pc_2', 'chord_pc_5', '<bar>', 'position_0x00', 'chord_root_5', 'chord_pc_8', 'chord_pc_0', 'chord_pc_3', '<bar>', 'position_0x00', 'chord_root_5', 'chord_pc_8', 'chord_pc_0', 'chord_pc_3', '<bar>', 'position_0x00', 'chord_root_7', 'chord_pc_10', 'chord_pc_2', 'chord_pc_5', '<bar>', 'position_0x00', 'chord_root_8', 'chord_pc_0', 'chord_pc_3', 'chord_pc_7', 'position_1x50', 'chord_root_7', 'chord_pc_10', 'chord_pc_2', 'chord_pc_5', '<bar>', 'position_0x00', 'chord_root_5', 'chord_pc_8', 'chord_pc_0', 'chord_pc_3', '<bar>', 'position_0x00', 'chord_root_3', 'chord_pc_7', 'chord_pc_10', '<bar>', 'position_0x00', 'chord_root_3', 'chord_pc_7', 'chord_pc




In [8]:
# GCT root + pitch-class tokenizer: build, tokenize all files, show the first
# sequence, record and print statistics.
print('GCTRootPCTokenizer')
gctRootPCTokenizer = GCTRootPCTokenizer()
print('len(gctRootPCTokenizer.vocab): ', len(gctRootPCTokenizer.vocab))
# Create the stats entry first so update_stats has lists to append to.
initialize_stats('GCTRootPCTokenizer', gctRootPCTokenizer)
toks_gct_rpc = gctRootPCTokenizer(data_files)
# First file only. The recorded output uses the same 'chord_root_N' / 'chord_pc_N'
# token types as RootPCTokenizer, but some chords get a different root
# (e.g. the second chord is chord_root_10 here and chord_root_7 there).
print('example sentence length: ', len(toks_gct_rpc['tokens'][0]))
print(toks_gct_rpc['tokens'][0])
print(toks_gct_rpc['ids'][0])
update_stats('GCTRootPCTokenizer', toks_gct_rpc)
print_stats('GCTRootPCTokenizer')

GCTRootPCTokenizer
len(gctRootPCTokenizer.vocab):  132


Processing Files: 100%|██████████| 650/650 [01:21<00:00,  7.99it/s]

example sentence length:  110
['<h>', '<bar>', 'position_0x00', 'chord_root_5', 'chord_pc_8', 'chord_pc_0', 'chord_pc_3', '<bar>', 'position_0x00', 'chord_root_10', 'chord_pc_2', 'chord_pc_5', 'chord_pc_7', '<bar>', 'position_0x00', 'chord_root_0', 'chord_pc_3', 'chord_pc_7', 'chord_pc_8', 'position_1x50', 'chord_root_10', 'chord_pc_2', 'chord_pc_5', 'chord_pc_7', '<bar>', 'position_0x00', 'chord_root_5', 'chord_pc_8', 'chord_pc_0', 'chord_pc_3', '<bar>', 'position_0x00', 'chord_root_5', 'chord_pc_8', 'chord_pc_0', 'chord_pc_3', '<bar>', 'position_0x00', 'chord_root_10', 'chord_pc_2', 'chord_pc_5', 'chord_pc_7', '<bar>', 'position_0x00', 'chord_root_0', 'chord_pc_3', 'chord_pc_7', 'chord_pc_8', 'position_1x50', 'chord_root_10', 'chord_pc_2', 'chord_pc_5', 'chord_pc_7', '<bar>', 'position_0x00', 'chord_root_5', 'chord_pc_8', 'chord_pc_0', 'chord_pc_3', '<bar>', 'position_0x00', 'chord_root_3', 'chord_pc_7', 'chord_pc_10', '<bar>', 'position_0x00', 'chord_root_3', 'chord_pc_7', 'chord_pc




In [9]:
# GCT symbol tokenizer: unlike the cells above, fit() is called on the data before
# the vocabulary size is read.
print('GCTSymbolTokenizer')
gctSymbolTokenizer = GCTSymbolTokenizer()
print('training')
# NOTE(review): presumably fit() builds the vocabulary from data_files - confirm
# against harmony_tokenizers_m21. It processes all files once (see the progress bar).
gctSymbolTokenizer.fit( data_files )
print('len(gctSymbolTokenizer.vocab): ', len(gctSymbolTokenizer.vocab))
# The stats entry is created after fit() so vocab_size reflects the fitted tokenizer.
initialize_stats('GCTSymbolTokenizer', gctSymbolTokenizer)
toks_gct_symb = gctSymbolTokenizer(data_files)
# First file only. In the recorded output each chord is one bracketed token such
# as '[ 5  0  3  7 10]'.
print('example sentence length: ', len(toks_gct_symb['tokens'][0]))
print(toks_gct_symb['tokens'][0])
print(toks_gct_symb['ids'][0])
update_stats('GCTSymbolTokenizer', toks_gct_symb)
print_stats('GCTSymbolTokenizer')

GCTSymbolTokenizer
training


Processing Files: 100%|██████████| 650/650 [01:18<00:00,  8.28it/s]


len(gctSymbolTokenizer.vocab):  336


Processing Files: 100%|██████████| 650/650 [01:00<00:00, 10.69it/s]

example sentence length:  58
['<h>', '<bar>', 'position_0x00', '[ 5  0  3  7 10]', '<bar>', 'position_0x00', '[10  0  4  7  9]', '<bar>', 'position_0x00', '[0 0 3 7 8]', 'position_1x50', '[10  0  4  7  9]', '<bar>', 'position_0x00', '[ 5  0  3  7 10]', '<bar>', 'position_0x00', '[ 5  0  3  7 10]', '<bar>', 'position_0x00', '[10  0  4  7  9]', '<bar>', 'position_0x00', '[0 0 3 7 8]', 'position_1x50', '[10  0  4  7  9]', '<bar>', 'position_0x00', '[ 5  0  3  7 10]', '<bar>', 'position_0x00', '[3 0 4 7]', '<bar>', 'position_0x00', '[3 0 4 7]', '<bar>', 'position_0x00', '[1 0 4 7]', 'position_1x50', '[3 0 4 7]', '<bar>', 'position_0x00', '[ 5  0  3  7 10]', '<bar>', 'position_0x00', '[3 0 4 7]', '<bar>', 'position_0x00', '[3 0 4 7]', '<bar>', 'position_0x00', '[1 0 4 7]', 'position_1x50', '[3 0 4 7]', '<bar>', 'position_0x00', '[ 5  0  3  7 10]', '</s>']
[7, 6, 8, 108, 6, 8, 109, 6, 8, 110, 20, 109, 6, 8, 108, 6, 8, 108, 6, 8, 109, 6, 8, 110, 20, 109, 6, 8, 108, 6, 8, 111, 6, 8, 111, 6, 8,




In [5]:
# GCT root/type tokenizer: fit() is called on the data before the vocabulary size
# is read, then the usual tokenize / inspect / record-statistics steps follow.
print('GCTRootTypeTokenizer')
gctRootTypeTokenizer = GCTRootTypeTokenizer()
print('training')
# NOTE(review): presumably fit() builds the vocabulary from data_files - confirm
# against harmony_tokenizers_m21. It processes all files once (see the progress bar).
gctRootTypeTokenizer.fit( data_files )
print('len(gctRootTypeTokenizer.vocab): ', len(gctRootTypeTokenizer.vocab))
# The stats entry is created after fit() so vocab_size reflects the fitted tokenizer.
initialize_stats('GCTRootTypeTokenizer', gctRootTypeTokenizer)
toks_gct_rt = gctRootTypeTokenizer(data_files)
# First file only. In the recorded output each chord is a 'chord_root_N' token
# followed by one bracketed token such as '[ 0  3  7 10]'.
print('example sentence length: ', len(toks_gct_rt['tokens'][0]))
print(toks_gct_rt['tokens'][0])
print(toks_gct_rt['ids'][0])
update_stats('GCTRootTypeTokenizer', toks_gct_rt)
print_stats('GCTRootTypeTokenizer')

GCTRootTypeTokenizer
training


Processing Files: 100%|██████████| 650/650 [01:15<00:00,  8.63it/s]


len(gctRootTypeTokenizer.vocab):  165


Processing Files: 100%|██████████| 650/650 [01:30<00:00,  7.18it/s]

example sentence length:  78
['<h>', '<bar>', 'position_0x00', 'chord_root_5', '[ 0  3  7 10]', '<bar>', 'position_0x00', 'chord_root_10', '[0 4 7 9]', '<bar>', 'position_0x00', 'chord_root_0', '[0 3 7 8]', 'position_1x50', 'chord_root_10', '[0 4 7 9]', '<bar>', 'position_0x00', 'chord_root_5', '[ 0  3  7 10]', '<bar>', 'position_0x00', 'chord_root_5', '[ 0  3  7 10]', '<bar>', 'position_0x00', 'chord_root_10', '[0 4 7 9]', '<bar>', 'position_0x00', 'chord_root_0', '[0 3 7 8]', 'position_1x50', 'chord_root_10', '[0 4 7 9]', '<bar>', 'position_0x00', 'chord_root_5', '[ 0  3  7 10]', '<bar>', 'position_0x00', 'chord_root_3', '[0 4 7]', '<bar>', 'position_0x00', 'chord_root_3', '[0 4 7]', '<bar>', 'position_0x00', 'chord_root_1', '[0 4 7]', 'position_1x50', 'chord_root_3', '[0 4 7]', '<bar>', 'position_0x00', 'chord_root_5', '[ 0  3  7 10]', '<bar>', 'position_0x00', 'chord_root_3', '[0 4 7]', '<bar>', 'position_0x00', 'chord_root_3', '[0 4 7]', '<bar>', 'position_0x00', 'chord_root_1', '




In [6]:
# Melody tokenizer: build it, tokenize every file in data_files, show the first
# sequence as a sanity check, then record and print its statistics.
print('MelodyPitchTokenizer_m21')
melodyPitchTokenizer = MelodyPitchTokenizer(min_pitch=21, max_pitch=108) #default range, need to adjust
print('len(melodyPitchTokenizer.vocab): ', len(melodyPitchTokenizer.vocab))
# Create the stats entry first so update_stats has lists to append to.
initialize_stats('MelodyPitchTokenizer', melodyPitchTokenizer)
# Stored under its own name, separate from `toks_cs` (the chord-symbol result),
# so the chord-symbol tokenization stays available after this cell has run.
toks_mel = melodyPitchTokenizer(data_files)
# First file only. In the recorded output the sequence starts with '<s>' and
# pitches appear as 'P:<number>' tokens.
print('example sentence length: ', len(toks_mel['tokens'][0]))
print(toks_mel['tokens'][0])
print(toks_mel['ids'][0])
update_stats('MelodyPitchTokenizer', toks_mel)
print_stats('MelodyPitchTokenizer')

MelodyPitchTokenizer_m21
len(melodyPitchTokenizer.vocab):  195


Processing Melody Files: 100%|██████████| 650/650 [00:32<00:00, 19.87it/s]

example sentence length:  78
['<s>', '<bar>', 'position_0x00', 'P:65', 'position_1x00', 'P:72', 'position_2x50', 'P:68', '<bar>', 'position_0x00', 'P:70', '<bar>', 'position_0x00', 'P:68', 'position_1x50', 'P:70', '<bar>', 'position_0x00', 'P:65', 'position_2x00', '<rest>', 'position_2x50', 'P:60', '<bar>', 'position_0x00', 'P:65', 'position_1x00', 'P:72', 'position_2x50', 'P:68', '<bar>', 'position_0x00', 'P:70', '<bar>', 'position_0x00', 'P:68', 'position_1x50', 'P:70', '<bar>', 'position_0x00', 'P:65', '<bar>', 'position_0x00', 'P:63', 'position_1x00', 'P:63', 'position_2x00', 'P:67', '<bar>', 'position_0x00', 'P:63', '<bar>', 'position_0x00', 'P:61', 'position_1x50', 'P:63', '<bar>', 'position_0x00', 'P:65', '<bar>', 'position_0x00', 'P:63', 'position_1x00', 'P:63', 'position_2x00', 'P:67', '<bar>', 'position_0x00', 'P:63', '<bar>', 'position_0x00', 'P:61', 'position_1x50', 'P:68', '<bar>', 'position_0x00', 'P:65', '</s>']
[2, 6, 95, 51, 103, 58, 115, 54, 6, 95, 56, 6, 95, 54, 107,




In [12]:
# Write the summary statistics of the selected tokenizers to a CSV file.
# Only tokenizers whose stats were initialized AND updated can be listed here.
tokenizers = ['ChordSymbolTokenizer', 'MelodyPitchTokenizer'
              ]

results_path = 'vocab_stats_hk_m21.csv' #for hook theory

# Name the summary columns explicitly instead of slicing the stats dict by
# position, so the export does not depend on key insertion order.
summary_fields = ['mean_len', 'std_len', 'mean_compression', 'std_compression']
result_fields = ['Tokenizer_m21', 'vocab_size'] + summary_fields

# newline='' lets the csv module control line endings itself; without it,
# platforms that translate '\n' on write end up with an extra blank line per row.
# Header and rows are written through a single handle rather than reopening the
# file in append mode for every tokenizer.
with open( results_path, 'w', newline='' ) as f:
    writer = csv.writer(f)
    writer.writerow( result_fields )
    for tok in tokenizers:
        tok_stats = stats[tok]
        writer.writerow( [tok, tok_stats['vocab_size']] + [tok_stats[field] for field in summary_fields] )

In [7]:
# Combine the melody tokenizer with the chord-symbol harmony tokenizer. Both must
# already exist in the kernel (they are created in the tokenizer cells above).
# With verbose=1 the recorded output shows a 'Merging harmony vocab' message.
m_chordSymbolTokenizer = MergedMelHarmTokenizer(melodyPitchTokenizer, chordSymbolTokenizer, verbose=1)
# Alternative pairings with the other harmony tokenizers, currently disabled:
#m_rootTypeTokenizer = MergedMelHarmTokenizer(melodyPitchTokenizer, rootTypeTokenizer)
#m_pitchClassTokenizer = MergedMelHarmTokenizer(melodyPitchTokenizer, pitchClassTokenizer)
#m_rootPCTokenizer = MergedMelHarmTokenizer(melodyPitchTokenizer, rootPCTokenizer)
#m_gctRootPCTokenizer = MergedMelHarmTokenizer(melodyPitchTokenizer, gctRootPCTokenizer)
#m_gctSymbolTokenizer = MergedMelHarmTokenizer(melodyPitchTokenizer, gctSymbolTokenizer)
#m_gctRootTypeTokenizer = MergedMelHarmTokenizer(melodyPitchTokenizer, gctRootTypeTokenizer)

Merging harmony vocab


In [8]:
# Fixing combined MergedMelHarmTokenizer
# Inspect the merged vocabulary, then fit and run the merged tokenizer on every file.
print('Length of combined vocab:', len(m_chordSymbolTokenizer.vocab))
# NOTE(review): this prints the entire vocab mapping, which makes the cell output very long.
print('Combined vocab:', m_chordSymbolTokenizer.vocab)

# NOTE(review): fit() runs after the vocab was printed above; whether it changes
# the vocab is not visible from this notebook - confirm.
m_chordSymbolTokenizer.fit( data_files )
toks_symb_m = m_chordSymbolTokenizer(data_files)
# First file only: token count, token strings, then the matching ids. In the
# recorded output the melody tokens ('<s>' ... '</s>') come first, followed by
# '<h>' and the harmony tokens.
print('example sentence length: ', len(toks_symb_m['tokens'][0]))
print(toks_symb_m['tokens'][0])
print(toks_symb_m['ids'][0])



Length of combined vocab: 545
Combined vocab: {'<unk>': 0, '<pad>': 1, '<s>': 2, '</s>': 3, '<rest>': 4, '<mask>': 5, '<bar>': 6, 'P:21': 7, 'P:22': 8, 'P:23': 9, 'P:24': 10, 'P:25': 11, 'P:26': 12, 'P:27': 13, 'P:28': 14, 'P:29': 15, 'P:30': 16, 'P:31': 17, 'P:32': 18, 'P:33': 19, 'P:34': 20, 'P:35': 21, 'P:36': 22, 'P:37': 23, 'P:38': 24, 'P:39': 25, 'P:40': 26, 'P:41': 27, 'P:42': 28, 'P:43': 29, 'P:44': 30, 'P:45': 31, 'P:46': 32, 'P:47': 33, 'P:48': 34, 'P:49': 35, 'P:50': 36, 'P:51': 37, 'P:52': 38, 'P:53': 39, 'P:54': 40, 'P:55': 41, 'P:56': 42, 'P:57': 43, 'P:58': 44, 'P:59': 45, 'P:60': 46, 'P:61': 47, 'P:62': 48, 'P:63': 49, 'P:64': 50, 'P:65': 51, 'P:66': 52, 'P:67': 53, 'P:68': 54, 'P:69': 55, 'P:70': 56, 'P:71': 57, 'P:72': 58, 'P:73': 59, 'P:74': 60, 'P:75': 61, 'P:76': 62, 'P:77': 63, 'P:78': 64, 'P:79': 65, 'P:80': 66, 'P:81': 67, 'P:82': 68, 'P:83': 69, 'P:84': 70, 'P:85': 71, 'P:86': 72, 'P:87': 73, 'P:88': 74, 'P:89': 75, 'P:90': 76, 'P:91': 77, 'P:92': 78, 'P:93': 7

Processing Melody Files: 100%|██████████| 650/650 [00:31<00:00, 20.50it/s]


Processing harmony


Processing Files: 100%|██████████| 650/650 [00:48<00:00, 13.50it/s]

example sentence length:  136
['<s>', '<bar>', 'position_0x00', 'P:65', 'position_1x00', 'P:72', 'position_2x50', 'P:68', '<bar>', 'position_0x00', 'P:70', '<bar>', 'position_0x00', 'P:68', 'position_1x50', 'P:70', '<bar>', 'position_0x00', 'P:65', 'position_2x00', '<rest>', 'position_2x50', 'P:60', '<bar>', 'position_0x00', 'P:65', 'position_1x00', 'P:72', 'position_2x50', 'P:68', '<bar>', 'position_0x00', 'P:70', '<bar>', 'position_0x00', 'P:68', 'position_1x50', 'P:70', '<bar>', 'position_0x00', 'P:65', '<bar>', 'position_0x00', 'P:63', 'position_1x00', 'P:63', 'position_2x00', 'P:67', '<bar>', 'position_0x00', 'P:63', '<bar>', 'position_0x00', 'P:61', 'position_1x50', 'P:63', '<bar>', 'position_0x00', 'P:65', '<bar>', 'position_0x00', 'P:63', 'position_1x00', 'P:63', 'position_2x00', 'P:67', '<bar>', 'position_0x00', 'P:63', '<bar>', 'position_0x00', 'P:61', 'position_1x50', 'P:68', '<bar>', 'position_0x00', 'P:65', '</s>', '<h>', '<bar>', 'position_0x00', 'F:min7', '<bar>', 'posit




In [9]:
# Spot-check token -> id lookup in the merged tokenizer: first a special token and
# a chord symbol, then 'ts_4x4' (presumably a time-signature token) and a position token.
print(m_chordSymbolTokenizer.convert_tokens_to_ids(['<mask>', 'C:7']))
print(m_chordSymbolTokenizer.convert_tokens_to_ids(['ts_4x4', 'position_7x33']))

[5, 203]
[180, 154]
