In [1]:
import os
import csv
import zlib
import numpy as np
from harmony_tokenizers_m21 import ChordSymbolTokenizer, RootTypeTokenizer, \
    PitchClassTokenizer, RootPCTokenizer, GCTRootPCTokenizer, \
    GCTSymbolTokenizer, GCTRootTypeTokenizer, MelodyPitchTokenizer, \
    MergedMelHarmTokenizer

  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [2]:
# root_dir = '/media/datadisk/datasets/hooktheory_xmls'
root_dir = 'data/gjt_melodies/Library_melodies/'


def find_score_files(root, extensions=('.xml', '.mxl')):
    """Recursively collect score files below ``root``.

    Parameters:
        root (str): Directory to walk; all subdirectories are visited.
        extensions (iterable of str): File-name suffixes to keep.

    Returns:
        list of str: Paths of the matching files, sorted. ``os.walk``
        yields entries in arbitrary filesystem order, so sorting keeps
        positional lookups such as ``data_files[0]`` reproducible.
        The list is empty when ``root`` does not exist.
    """
    suffixes = tuple(extensions)  # str.endswith needs a tuple, not a list
    matches = [
        os.path.join(dirpath, filename)
        for dirpath, _, filenames in os.walk(root)
        for filename in filenames
        if filename.endswith(suffixes)
    ]
    return sorted(matches)


data_files = find_score_files(root_dir)

# Report the directory that was actually scanned rather than a hard-coded
# dataset name that no longer matches root_dir.
print(f'Total files found under {root_dir!r}:', len(data_files))

Total files from Hook Theory dataset: 650


In [3]:
# prepare stats
# Module-level registry of per-tokenizer statistics, keyed by tokenizer name.
# initialize_stats() creates an entry holding 'vocab_size', 'seq_lens' and
# 'compression_rates'; update_stats() appends to the two lists and stores the
# 'mean_len', 'std_len', 'mean_compression' and 'std_compression' summaries.
stats = {}

def compute_compression_rate(array: np.ndarray, compression_method=zlib.compress) -> float:
    """
    Compute the compression rate of a NumPy array.

    The array's raw buffer (``tobytes()``) is what gets compressed, so the
    result depends on the array's dtype as well as on its values.

    Parameters:
        array (np.ndarray): The NumPy array to compress. Array-likes such as
                            lists are converted with ``np.asarray``.
        compression_method (callable): The compression method to use; it
                                       receives ``bytes`` and must return a
                                       sized bytes-like object.
                                       Default is `zlib.compress`.

    Returns:
        float: The compression rate (compressed size / original size).
               An empty array has no bytes to compress, so the rate is
               defined as 0.0 instead of raising ZeroDivisionError.
    """
    # Convert the array to bytes
    array_bytes = np.asarray(array).tobytes()
    original_size = len(array_bytes)

    # Guard the division below: an empty sequence has a zero-length buffer.
    if original_size == 0:
        return 0.0

    # Compress the byte representation and relate the two sizes
    compressed_size = len(compression_method(array_bytes))
    return compressed_size / original_size

def initialize_stats(key, tokenizer):
    """Create (or reset) the ``stats`` entry for ``key``.

    The entry records the size of ``tokenizer.vocab`` and starts with empty
    accumulators for sequence lengths and compression rates. Any existing
    entry under ``key`` is replaced.
    """
    entry = dict(vocab_size=len(tokenizer.vocab))
    entry['seq_lens'] = []
    entry['compression_rates'] = []
    stats[key] = entry
# end initialize_stats

def update_stats(key, toks):
    """Accumulate statistics for the id sequences in ``toks['ids']``.

    For every sequence, its length and its compression rate (computed on
    ``np.array(sequence)``) are appended to the entry stored under ``key``.
    The mean and standard deviation of both accumulators are then
    recomputed over everything collected so far and stored on the entry.
    """
    entry = stats[key]
    lengths = entry['seq_lens']
    rates = entry['compression_rates']
    for ids in toks['ids']:
        lengths.append(len(ids))
        rates.append(compute_compression_rate(np.array(ids)))
    # Summaries cover all sequences recorded for this key, not only this call.
    entry['mean_len'] = np.mean(lengths)
    entry['std_len'] = np.std(lengths)
    entry['mean_compression'] = np.mean(rates)
    entry['std_compression'] = np.std(rates)
# end update_stats

def print_stats(key):
    """Print the vocabulary size and the summary statistics stored for ``key``."""
    entry = stats[key]
    labelled_fields = (
        ('vocab_size: ', 'vocab_size'),
        ('mean len: ', 'mean_len'),
        ('std len: ', 'std_len'),
        ('mean cr: ', 'mean_compression'),
        ('std cr: ', 'std_compression'),
    )
    # One line per statistic, in a fixed order.
    for label, field in labelled_fields:
        print(label, entry[field])

In [4]:
# Tokenize every score with ChordSymbolTokenizer and record its statistics.
print('ChordSymbolTokenizer_m21')
chordSymbolTokenizer = ChordSymbolTokenizer()
print('len(chordSymbolTokenizer.vocab): ', len(chordSymbolTokenizer.vocab))
# Register the stats entry (vocab size plus empty accumulators) before tokenizing.
initialize_stats('ChordSymbolTokenizer', chordSymbolTokenizer)
# Calling the tokenizer on the file list returns a mapping whose 'tokens' and
# 'ids' entries are indexed per file below.
toks_cs = chordSymbolTokenizer(data_files)
# Show the first file's tokens and ids as a sanity check.
print('example sentence length: ', len(toks_cs['tokens'][0]))
print(toks_cs['tokens'][0])
print(toks_cs['ids'][0])
# Accumulate sequence-length and compression statistics, then report them.
update_stats('ChordSymbolTokenizer', toks_cs)
print_stats('ChordSymbolTokenizer')

ChordSymbolTokenizer_m21
len(chordSymbolTokenizer.vocab):  456


  return self.iter().getElementsByClass(classFilterList)
  return self.iter().getElementsByClass(classFilterList)
Processing Files:   5%|▍         | 32/650 [00:02<00:52, 11.67it/s]In /home/maximos/anaconda3/envs/midi/lib/python3.11/site-packages/matplotlib/mpl-data/stylelib/_classic_test_patch.mplstyle: .flat is deprecated.  Call .flatten() instead
Processing Files:   5%|▌         | 34/650 [00:02<00:50, 12.13it/s]In /home/maximos/anaconda3/envs/midi/lib/python3.11/site-packages/matplotlib/mpl-data/stylelib/seaborn-v0_8-whitegrid.mplstyle: .flat is deprecated.  Call .flatten() instead
In /home/maximos/anaconda3/envs/midi/lib/python3.11/site-packages/matplotlib/mpl-data/stylelib/seaborn-v0_8-dark-palette.mplstyle: .flat is deprecated.  Call .flatten() instead
Processing Files:   6%|▌         | 36/650 [00:02<00:47, 12.90it/s]In /home/maximos/anaconda3/envs/midi/lib/python3.11/site-packages/matplotlib/mpl-data/stylelib/classic.mplstyle: .flat is deprecated.  Call .flatten() instead
  retur

example sentence length:  90
['<h>', '<bar>', 'position_0x00', 'A:min7', '<bar>', 'position_0x00', 'D:7', '<bar>', 'position_0x00', 'A:min7', '<bar>', 'position_0x00', 'D:7', '<bar>', 'position_0x00', 'G:maj7', '<bar>', 'position_0x00', 'C:7(#11)', '<bar>', 'position_0x00', 'B:min7', '<bar>', 'position_0x00', 'E:min7', '<bar>', 'position_0x00', 'B:maj7', 'position_2x00', 'F#:7', '<bar>', 'position_0x00', 'B:maj6', '<bar>', 'position_0x00', 'D:min7', '<bar>', 'position_0x00', 'G:7', '<bar>', 'position_0x00', 'D:min7', '<bar>', 'position_0x00', 'G:7', '<bar>', 'position_0x00', 'C:maj7', '<bar>', 'position_0x00', 'F:7', '<bar>', 'position_0x00', 'E:min7', 'position_2x00', 'A:7', '<bar>', 'position_0x00', 'A:min7', 'position_2x00', 'D:7', '<bar>', 'position_0x00', 'A:min7', '<bar>', 'position_0x00', 'D:7', '<bar>', 'position_0x00', 'G:maj7', 'position_2x00', 'F:7', '<bar>', 'position_0x00', 'E:7(b9)', '<bar>', 'position_0x00', 'A:min7', '<bar>', 'position_0x00', 'F:7', '<bar>', 'position_0




In [5]:
# Persist the tokenizer, then reload it from disk to check the round trip.
chordSymbolTokenizer.save_pretrained('saved_tokenizers/ChordSymbolTokenizer')
# from_pretrained is used as a constructor elsewhere in this notebook (its
# return value is assigned), so keep the result here: previously it was
# discarded and the print below showed the in-memory vocab, not the saved one.
chordSymbolTokenizer_reloaded = ChordSymbolTokenizer.from_pretrained('saved_tokenizers/ChordSymbolTokenizer')
print(chordSymbolTokenizer_reloaded.vocab)

{'<unk>': 0, '<pad>': 1, '<s>': 2, '</s>': 3, '<emp>': 4, '<mask>': 5, '<bar>': 6, '<h>': 7, 'position_0x00': 8, 'position_0x16': 9, 'position_0x25': 10, 'position_0x33': 11, 'position_0x50': 12, 'position_0x66': 13, 'position_0x75': 14, 'position_0x83': 15, 'position_1x00': 16, 'position_1x16': 17, 'position_1x25': 18, 'position_1x33': 19, 'position_1x50': 20, 'position_1x66': 21, 'position_1x75': 22, 'position_1x83': 23, 'position_2x00': 24, 'position_2x16': 25, 'position_2x25': 26, 'position_2x33': 27, 'position_2x50': 28, 'position_2x66': 29, 'position_2x75': 30, 'position_2x83': 31, 'position_3x00': 32, 'position_3x16': 33, 'position_3x25': 34, 'position_3x33': 35, 'position_3x50': 36, 'position_3x66': 37, 'position_3x75': 38, 'position_3x83': 39, 'position_4x00': 40, 'position_4x16': 41, 'position_4x25': 42, 'position_4x33': 43, 'position_4x50': 44, 'position_4x66': 45, 'position_4x75': 46, 'position_4x83': 47, 'position_5x00': 48, 'position_5x16': 49, 'position_5x25': 50, 'posit

In [6]:
# Tokenize every score with RootTypeTokenizer and record its statistics.
# NOTE(review): the output recorded for this cell shows '<unk>' at every chord
# position — confirm the tokenizer maps this dataset's chords before relying
# on these statistics.
print('RootTypeTokenizer')
rootTypeTokenizer = RootTypeTokenizer()
print('len(rootTypeTokenizer.vocab): ', len(rootTypeTokenizer.vocab))
# Register the stats entry (vocab size plus empty accumulators) before tokenizing.
initialize_stats('RootTypeTokenizer', rootTypeTokenizer)
# The returned mapping's 'tokens' and 'ids' entries are indexed per file below.
toks_rt = rootTypeTokenizer(data_files)
# Show the first file's tokens and ids as a sanity check.
print('example sentence length: ', len(toks_rt['tokens'][0]))
print(toks_rt['tokens'][0])
print(toks_rt['ids'][0])
# Accumulate sequence-length and compression statistics, then report them.
update_stats('RootTypeTokenizer', toks_rt)
print_stats('RootTypeTokenizer')

RootTypeTokenizer
len(rootTypeTokenizer.vocab):  149


Processing Files: 100%|██████████| 650/650 [00:47<00:00, 13.55it/s]

example sentence length:  90
['<h>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_0x00', '<unk>', 'position_2x00', '<unk>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_0x00', '<unk>', 'position_2x00', '<unk>', '<bar>', 'position_0x00', '<unk>', 'position_2x00', '<unk>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_0x00', '<unk>', 'position_2x00', '<unk>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_




In [7]:
# Persist the tokenizer, then reload it from disk to check the round trip.
rootTypeTokenizer.save_pretrained('saved_tokenizers/RootTypeTokenizer')
# from_pretrained is used as a constructor elsewhere in this notebook (its
# return value is assigned), so keep the result here: previously it was
# discarded and the print below showed the in-memory vocab, not the saved one.
rootTypeTokenizer_reloaded = RootTypeTokenizer.from_pretrained('saved_tokenizers/RootTypeTokenizer')
print(rootTypeTokenizer_reloaded.vocab)

{'<unk>': 0, '<pad>': 1, '<s>': 2, '</s>': 3, '<emp>': 4, '<mask>': 5, '<bar>': 6, '<h>': 7, 'position_0x00': 8, 'position_0x16': 9, 'position_0x25': 10, 'position_0x33': 11, 'position_0x50': 12, 'position_0x66': 13, 'position_0x75': 14, 'position_0x83': 15, 'position_1x00': 16, 'position_1x16': 17, 'position_1x25': 18, 'position_1x33': 19, 'position_1x50': 20, 'position_1x66': 21, 'position_1x75': 22, 'position_1x83': 23, 'position_2x00': 24, 'position_2x16': 25, 'position_2x25': 26, 'position_2x33': 27, 'position_2x50': 28, 'position_2x66': 29, 'position_2x75': 30, 'position_2x83': 31, 'position_3x00': 32, 'position_3x16': 33, 'position_3x25': 34, 'position_3x33': 35, 'position_3x50': 36, 'position_3x66': 37, 'position_3x75': 38, 'position_3x83': 39, 'position_4x00': 40, 'position_4x16': 41, 'position_4x25': 42, 'position_4x33': 43, 'position_4x50': 44, 'position_4x66': 45, 'position_4x75': 46, 'position_4x83': 47, 'position_5x00': 48, 'position_5x16': 49, 'position_5x25': 50, 'posit

In [8]:
# Tokenize every score with PitchClassTokenizer and record its statistics.
# NOTE(review): the output recorded for this cell shows '<unk>' at every chord
# position — confirm the tokenizer maps this dataset's chords before relying
# on these statistics.
print('PitchClassTokenizer')
pitchClassTokenizer = PitchClassTokenizer()
print('len(pitchClassTokenizer.vocab): ', len(pitchClassTokenizer.vocab))
# Register the stats entry (vocab size plus empty accumulators) before tokenizing.
initialize_stats('PitchClassTokenizer', pitchClassTokenizer)
# The returned mapping's 'tokens' and 'ids' entries are indexed per file below.
toks_pc = pitchClassTokenizer(data_files)
# Show the first file's tokens and ids as a sanity check.
print('example sentence length: ', len(toks_pc['tokens'][0]))
print(toks_pc['tokens'][0])
print(toks_pc['ids'][0])
# Accumulate sequence-length and compression statistics, then report them.
update_stats('PitchClassTokenizer', toks_pc)
print_stats('PitchClassTokenizer')

PitchClassTokenizer
len(pitchClassTokenizer.vocab):  120


Processing Files: 100%|██████████| 650/650 [00:48<00:00, 13.51it/s]

example sentence length:  90
['<h>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_0x00', '<unk>', 'position_2x00', '<unk>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_0x00', '<unk>', 'position_2x00', '<unk>', '<bar>', 'position_0x00', '<unk>', 'position_2x00', '<unk>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_0x00', '<unk>', 'position_2x00', '<unk>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_




In [9]:
# Persist the tokenizer, then reload it from disk to check the round trip.
pitchClassTokenizer.save_pretrained('saved_tokenizers/PitchClassTokenizer')
# from_pretrained is used as a constructor elsewhere in this notebook (its
# return value is assigned), so keep the result here: previously it was
# discarded and the print below showed the in-memory vocab, not the saved one.
pitchClassTokenizer_reloaded = PitchClassTokenizer.from_pretrained('saved_tokenizers/PitchClassTokenizer')
print(pitchClassTokenizer_reloaded.vocab)

{'<unk>': 0, '<pad>': 1, '<s>': 2, '</s>': 3, '<emp>': 4, '<mask>': 5, '<bar>': 6, '<h>': 7, 'position_0x00': 8, 'position_0x16': 9, 'position_0x25': 10, 'position_0x33': 11, 'position_0x50': 12, 'position_0x66': 13, 'position_0x75': 14, 'position_0x83': 15, 'position_1x00': 16, 'position_1x16': 17, 'position_1x25': 18, 'position_1x33': 19, 'position_1x50': 20, 'position_1x66': 21, 'position_1x75': 22, 'position_1x83': 23, 'position_2x00': 24, 'position_2x16': 25, 'position_2x25': 26, 'position_2x33': 27, 'position_2x50': 28, 'position_2x66': 29, 'position_2x75': 30, 'position_2x83': 31, 'position_3x00': 32, 'position_3x16': 33, 'position_3x25': 34, 'position_3x33': 35, 'position_3x50': 36, 'position_3x66': 37, 'position_3x75': 38, 'position_3x83': 39, 'position_4x00': 40, 'position_4x16': 41, 'position_4x25': 42, 'position_4x33': 43, 'position_4x50': 44, 'position_4x66': 45, 'position_4x75': 46, 'position_4x83': 47, 'position_5x00': 48, 'position_5x16': 49, 'position_5x25': 50, 'posit

In [10]:
# Tokenize every score with RootPCTokenizer and record its statistics.
# NOTE(review): the output recorded for this cell shows '<unk>' at every chord
# position — confirm the tokenizer maps this dataset's chords before relying
# on these statistics.
print('RootPCTokenizer')
rootPCTokenizer = RootPCTokenizer()
print('len(rootPCTokenizer.vocab): ', len(rootPCTokenizer.vocab))
# Register the stats entry (vocab size plus empty accumulators) before tokenizing.
initialize_stats('RootPCTokenizer', rootPCTokenizer)
# The returned mapping's 'tokens' and 'ids' entries are indexed per file below.
toks_rpc = rootPCTokenizer(data_files)
# Show the first file's tokens and ids as a sanity check.
print('example sentence length: ', len(toks_rpc['tokens'][0]))
print(toks_rpc['tokens'][0])
print(toks_rpc['ids'][0])
# Accumulate sequence-length and compression statistics, then report them.
update_stats('RootPCTokenizer', toks_rpc)
print_stats('RootPCTokenizer')

RootPCTokenizer
len(rootPCTokenizer.vocab):  132


Processing Files: 100%|██████████| 650/650 [00:48<00:00, 13.46it/s]

example sentence length:  90
['<h>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_0x00', '<unk>', 'position_2x00', '<unk>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_0x00', '<unk>', 'position_2x00', '<unk>', '<bar>', 'position_0x00', '<unk>', 'position_2x00', '<unk>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_0x00', '<unk>', 'position_2x00', '<unk>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_




In [11]:
# Persist the tokenizer, then reload it from disk to check the round trip.
rootPCTokenizer.save_pretrained('saved_tokenizers/RootPCTokenizer')
# from_pretrained is used as a constructor elsewhere in this notebook (its
# return value is assigned), so keep the result here: previously it was
# discarded and the print below showed the in-memory vocab, not the saved one.
rootPCTokenizer_reloaded = RootPCTokenizer.from_pretrained('saved_tokenizers/RootPCTokenizer')
print(rootPCTokenizer_reloaded.vocab)

{'<unk>': 0, '<pad>': 1, '<s>': 2, '</s>': 3, '<emp>': 4, '<mask>': 5, '<bar>': 6, '<h>': 7, 'position_0x00': 8, 'position_0x16': 9, 'position_0x25': 10, 'position_0x33': 11, 'position_0x50': 12, 'position_0x66': 13, 'position_0x75': 14, 'position_0x83': 15, 'position_1x00': 16, 'position_1x16': 17, 'position_1x25': 18, 'position_1x33': 19, 'position_1x50': 20, 'position_1x66': 21, 'position_1x75': 22, 'position_1x83': 23, 'position_2x00': 24, 'position_2x16': 25, 'position_2x25': 26, 'position_2x33': 27, 'position_2x50': 28, 'position_2x66': 29, 'position_2x75': 30, 'position_2x83': 31, 'position_3x00': 32, 'position_3x16': 33, 'position_3x25': 34, 'position_3x33': 35, 'position_3x50': 36, 'position_3x66': 37, 'position_3x75': 38, 'position_3x83': 39, 'position_4x00': 40, 'position_4x16': 41, 'position_4x25': 42, 'position_4x33': 43, 'position_4x50': 44, 'position_4x66': 45, 'position_4x75': 46, 'position_4x83': 47, 'position_5x00': 48, 'position_5x16': 49, 'position_5x25': 50, 'posit

In [12]:
# Tokenize every score with GCTRootPCTokenizer and record its statistics.
# NOTE(review): the output recorded for this cell shows '<unk>' at every chord
# position — confirm the tokenizer maps this dataset's chords before relying
# on these statistics.
print('GCTRootPCTokenizer')
gctRootPCTokenizer = GCTRootPCTokenizer()
print('len(gctRootPCTokenizer.vocab): ', len(gctRootPCTokenizer.vocab))
# Register the stats entry (vocab size plus empty accumulators) before tokenizing.
initialize_stats('GCTRootPCTokenizer', gctRootPCTokenizer)
# The returned mapping's 'tokens' and 'ids' entries are indexed per file below.
toks_gct_rpc = gctRootPCTokenizer(data_files)
# Show the first file's tokens and ids as a sanity check.
print('example sentence length: ', len(toks_gct_rpc['tokens'][0]))
print(toks_gct_rpc['tokens'][0])
print(toks_gct_rpc['ids'][0])
# Accumulate sequence-length and compression statistics, then report them.
update_stats('GCTRootPCTokenizer', toks_gct_rpc)
print_stats('GCTRootPCTokenizer')

GCTRootPCTokenizer
len(gctRootPCTokenizer.vocab):  132


Processing Files: 100%|██████████| 650/650 [00:48<00:00, 13.43it/s]

example sentence length:  90
['<h>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_0x00', '<unk>', 'position_2x00', '<unk>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_0x00', '<unk>', 'position_2x00', '<unk>', '<bar>', 'position_0x00', '<unk>', 'position_2x00', '<unk>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_0x00', '<unk>', 'position_2x00', '<unk>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_




In [13]:
# Persist the tokenizer, then reload it from disk to check the round trip.
gctRootPCTokenizer.save_pretrained('saved_tokenizers/GCTRootPCTokenizer')
# from_pretrained is used as a constructor elsewhere in this notebook (its
# return value is assigned), so keep the result here: previously it was
# discarded and the print below showed the in-memory vocab, not the saved one.
gctRootPCTokenizer_reloaded = GCTRootPCTokenizer.from_pretrained('saved_tokenizers/GCTRootPCTokenizer')
print(gctRootPCTokenizer_reloaded.vocab)

{'<unk>': 0, '<pad>': 1, '<s>': 2, '</s>': 3, '<emp>': 4, '<mask>': 5, '<bar>': 6, '<h>': 7, 'position_0x00': 8, 'position_0x16': 9, 'position_0x25': 10, 'position_0x33': 11, 'position_0x50': 12, 'position_0x66': 13, 'position_0x75': 14, 'position_0x83': 15, 'position_1x00': 16, 'position_1x16': 17, 'position_1x25': 18, 'position_1x33': 19, 'position_1x50': 20, 'position_1x66': 21, 'position_1x75': 22, 'position_1x83': 23, 'position_2x00': 24, 'position_2x16': 25, 'position_2x25': 26, 'position_2x33': 27, 'position_2x50': 28, 'position_2x66': 29, 'position_2x75': 30, 'position_2x83': 31, 'position_3x00': 32, 'position_3x16': 33, 'position_3x25': 34, 'position_3x33': 35, 'position_3x50': 36, 'position_3x66': 37, 'position_3x75': 38, 'position_3x83': 39, 'position_4x00': 40, 'position_4x16': 41, 'position_4x25': 42, 'position_4x33': 43, 'position_4x50': 44, 'position_4x66': 45, 'position_4x75': 46, 'position_4x83': 47, 'position_5x00': 48, 'position_5x16': 49, 'position_5x25': 50, 'posit

In [14]:
# Fit GCTSymbolTokenizer on the dataset, tokenize every score and record its
# statistics.
# NOTE(review): the output recorded for this cell shows '<unk>' at every chord
# position — confirm the tokenizer maps this dataset's chords before relying
# on these statistics.
print('GCTSymbolTokenizer')
gctSymbolTokenizer = GCTSymbolTokenizer()
print('training')
# fit() runs before the vocab size is read — presumably it builds the
# vocabulary from the files; verify in harmony_tokenizers_m21.
gctSymbolTokenizer.fit( data_files )
print('len(gctSymbolTokenizer.vocab): ', len(gctSymbolTokenizer.vocab))
# Register the stats entry (vocab size plus empty accumulators) before tokenizing.
initialize_stats('GCTSymbolTokenizer', gctSymbolTokenizer)
# The returned mapping's 'tokens' and 'ids' entries are indexed per file below.
toks_gct_symb = gctSymbolTokenizer(data_files)
# Show the first file's tokens and ids as a sanity check.
print('example sentence length: ', len(toks_gct_symb['tokens'][0]))
print(toks_gct_symb['tokens'][0])
print(toks_gct_symb['ids'][0])
# Accumulate sequence-length and compression statistics, then report them.
update_stats('GCTSymbolTokenizer', toks_gct_symb)
print_stats('GCTSymbolTokenizer')

GCTSymbolTokenizer
training


Processing Files: 100%|██████████| 650/650 [01:04<00:00, 10.07it/s]


len(gctSymbolTokenizer.vocab):  336


Processing Files: 100%|██████████| 650/650 [00:48<00:00, 13.44it/s]

example sentence length:  90
['<h>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_0x00', '<unk>', 'position_2x00', '<unk>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_0x00', '<unk>', 'position_2x00', '<unk>', '<bar>', 'position_0x00', '<unk>', 'position_2x00', '<unk>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_0x00', '<unk>', 'position_2x00', '<unk>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_




In [15]:
# Persist the fitted tokenizer, then reload it from disk to check the round trip.
gctSymbolTokenizer.save_pretrained('saved_tokenizers/GCTSymbolTokenizer')
# from_pretrained is used as a constructor elsewhere in this notebook (its
# return value is assigned), so keep the result here: previously it was
# discarded and the print below showed the in-memory vocab, not the saved one.
gctSymbolTokenizer_reloaded = GCTSymbolTokenizer.from_pretrained('saved_tokenizers/GCTSymbolTokenizer')
print(gctSymbolTokenizer_reloaded.vocab)

{'<unk>': 0, '<pad>': 1, '<s>': 2, '</s>': 3, '<emp>': 4, '<mask>': 5, '<bar>': 6, '<h>': 7, 'position_0x00': 8, 'position_0x16': 9, 'position_0x25': 10, 'position_0x33': 11, 'position_0x50': 12, 'position_0x66': 13, 'position_0x75': 14, 'position_0x83': 15, 'position_1x00': 16, 'position_1x16': 17, 'position_1x25': 18, 'position_1x33': 19, 'position_1x50': 20, 'position_1x66': 21, 'position_1x75': 22, 'position_1x83': 23, 'position_2x00': 24, 'position_2x16': 25, 'position_2x25': 26, 'position_2x33': 27, 'position_2x50': 28, 'position_2x66': 29, 'position_2x75': 30, 'position_2x83': 31, 'position_3x00': 32, 'position_3x16': 33, 'position_3x25': 34, 'position_3x33': 35, 'position_3x50': 36, 'position_3x66': 37, 'position_3x75': 38, 'position_3x83': 39, 'position_4x00': 40, 'position_4x16': 41, 'position_4x25': 42, 'position_4x33': 43, 'position_4x50': 44, 'position_4x66': 45, 'position_4x75': 46, 'position_4x83': 47, 'position_5x00': 48, 'position_5x16': 49, 'position_5x25': 50, 'posit

In [16]:
# Fit GCTRootTypeTokenizer on the dataset, tokenize every score and record its
# statistics.
# NOTE(review): the output recorded for this cell shows '<unk>' at every chord
# position — confirm the tokenizer maps this dataset's chords before relying
# on these statistics.
print('GCTRootTypeTokenizer')
gctRootTypeTokenizer = GCTRootTypeTokenizer()
print('training')
# fit() runs before the vocab size is read — presumably it builds the
# vocabulary from the files; verify in harmony_tokenizers_m21.
gctRootTypeTokenizer.fit( data_files )
print('len(gctRootTypeTokenizer.vocab): ', len(gctRootTypeTokenizer.vocab))
# Register the stats entry (vocab size plus empty accumulators) before tokenizing.
initialize_stats('GCTRootTypeTokenizer', gctRootTypeTokenizer)
# The returned mapping's 'tokens' and 'ids' entries are indexed per file below.
toks_gct_rt = gctRootTypeTokenizer(data_files)
# Show the first file's tokens and ids as a sanity check.
print('example sentence length: ', len(toks_gct_rt['tokens'][0]))
print(toks_gct_rt['tokens'][0])
print(toks_gct_rt['ids'][0])
# Accumulate sequence-length and compression statistics, then report them.
update_stats('GCTRootTypeTokenizer', toks_gct_rt)
print_stats('GCTRootTypeTokenizer')

GCTRootTypeTokenizer
training


Processing Files: 100%|██████████| 650/650 [01:04<00:00, 10.04it/s]


len(gctRootTypeTokenizer.vocab):  165


Processing Files: 100%|██████████| 650/650 [00:48<00:00, 13.32it/s]

example sentence length:  90
['<h>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_0x00', '<unk>', 'position_2x00', '<unk>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_0x00', '<unk>', 'position_2x00', '<unk>', '<bar>', 'position_0x00', '<unk>', 'position_2x00', '<unk>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_0x00', '<unk>', 'position_2x00', '<unk>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_0x00', '<unk>', '<bar>', 'position_




In [17]:
# Persist the fitted tokenizer, then reload it from disk to check the round trip.
gctRootTypeTokenizer.save_pretrained('saved_tokenizers/GCTRootTypeTokenizer')
# from_pretrained is used as a constructor elsewhere in this notebook (its
# return value is assigned), so keep the result here: previously it was
# discarded and the print below showed the in-memory vocab, not the saved one.
gctRootTypeTokenizer_reloaded = GCTRootTypeTokenizer.from_pretrained('saved_tokenizers/GCTRootTypeTokenizer')
print(gctRootTypeTokenizer_reloaded.vocab)

{'<unk>': 0, '<pad>': 1, '<s>': 2, '</s>': 3, '<emp>': 4, '<mask>': 5, '<bar>': 6, '<h>': 7, 'position_0x00': 8, 'position_0x16': 9, 'position_0x25': 10, 'position_0x33': 11, 'position_0x50': 12, 'position_0x66': 13, 'position_0x75': 14, 'position_0x83': 15, 'position_1x00': 16, 'position_1x16': 17, 'position_1x25': 18, 'position_1x33': 19, 'position_1x50': 20, 'position_1x66': 21, 'position_1x75': 22, 'position_1x83': 23, 'position_2x00': 24, 'position_2x16': 25, 'position_2x25': 26, 'position_2x33': 27, 'position_2x50': 28, 'position_2x66': 29, 'position_2x75': 30, 'position_2x83': 31, 'position_3x00': 32, 'position_3x16': 33, 'position_3x25': 34, 'position_3x33': 35, 'position_3x50': 36, 'position_3x66': 37, 'position_3x75': 38, 'position_3x83': 39, 'position_4x00': 40, 'position_4x16': 41, 'position_4x25': 42, 'position_4x33': 43, 'position_4x50': 44, 'position_4x66': 45, 'position_4x75': 46, 'position_4x83': 47, 'position_5x00': 48, 'position_5x16': 49, 'position_5x25': 50, 'posit

In [18]:
# Tokenize the melody of every score with MelodyPitchTokenizer and record its
# statistics.
print('MelodyPitchTokenizer_m21')
melodyPitchTokenizer = MelodyPitchTokenizer(min_pitch=21, max_pitch=108) #default range, need to adjust
print('len(melodyPitchTokenizer.vocab): ', len(melodyPitchTokenizer.vocab))
# Register the stats entry (vocab size plus empty accumulators) before tokenizing.
initialize_stats('MelodyPitchTokenizer', melodyPitchTokenizer)
# NOTE(review): this reuses the name toks_cs, overwriting the chord-symbol
# tokens produced by the ChordSymbolTokenizer cell; a distinct name would
# avoid hidden-state confusion when cells are re-run out of order.
toks_cs = melodyPitchTokenizer(data_files)
# Show the first file's tokens and ids as a sanity check.
print('example sentence length: ', len(toks_cs['tokens'][0]))
print(toks_cs['tokens'][0])
print(toks_cs['ids'][0])
# Accumulate sequence-length and compression statistics, then report them.
update_stats('MelodyPitchTokenizer', toks_cs)
print_stats('MelodyPitchTokenizer')

MelodyPitchTokenizer_m21
len(melodyPitchTokenizer.vocab):  195


Processing Melody Files: 100%|██████████| 650/650 [00:33<00:00, 19.54it/s]


example sentence length:  198
['<s>', '<bar>', 'position_0x00', 'P:69', 'position_3x00', 'P:64', '<bar>', 'position_0x00', 'P:71', 'position_0x75', 'P:71', 'position_1x00', 'P:71', 'position_3x00', 'P:64', '<bar>', 'position_0x00', 'P:69', 'position_1x00', 'P:71', 'position_2x00', 'P:69', 'position_3x00', 'P:64', '<bar>', 'position_0x00', 'P:71', 'position_3x00', 'P:69', '<bar>', 'position_0x00', 'P:66', 'position_1x00', 'P:67', 'position_1x50', 'P:64', 'position_2x00', 'P:64', '<bar>', 'position_0x00', 'P:66', 'position_1x00', 'P:67', 'position_1x50', 'P:64', 'position_2x00', 'P:64', 'position_3x50', 'P:66', '<bar>', 'position_0x00', 'P:59', 'position_1x00', 'P:62', 'position_2x00', 'P:64', 'position_3x00', 'P:67', '<bar>', 'position_0x00', 'P:66', 'position_0x50', 'P:67', 'position_1x00', 'P:64', 'position_3x00', 'P:62', '<bar>', 'position_0x00', 'P:63', 'position_1x00', 'P:66', 'position_2x00', 'P:68', 'position_3x00', 'P:73', '<bar>', 'position_0x00', 'P:71', 'position_0x50', 'P:73

In [19]:
# Persist the tokenizer, then reload it from disk to check the round trip.
melodyPitchTokenizer.save_pretrained('saved_tokenizers/MelodyPitchTokenizer')
# from_pretrained is used as a constructor elsewhere in this notebook (its
# return value is assigned), so keep the result here: previously it was
# discarded and the print below showed the in-memory vocab, not the saved one.
melodyPitchTokenizer_reloaded = MelodyPitchTokenizer.from_pretrained('saved_tokenizers/MelodyPitchTokenizer')
print(melodyPitchTokenizer_reloaded.vocab)

{'<unk>': 0, '<pad>': 1, '<s>': 2, '</s>': 3, '<rest>': 4, '<mask>': 5, '<bar>': 6, 'P:21': 7, 'P:22': 8, 'P:23': 9, 'P:24': 10, 'P:25': 11, 'P:26': 12, 'P:27': 13, 'P:28': 14, 'P:29': 15, 'P:30': 16, 'P:31': 17, 'P:32': 18, 'P:33': 19, 'P:34': 20, 'P:35': 21, 'P:36': 22, 'P:37': 23, 'P:38': 24, 'P:39': 25, 'P:40': 26, 'P:41': 27, 'P:42': 28, 'P:43': 29, 'P:44': 30, 'P:45': 31, 'P:46': 32, 'P:47': 33, 'P:48': 34, 'P:49': 35, 'P:50': 36, 'P:51': 37, 'P:52': 38, 'P:53': 39, 'P:54': 40, 'P:55': 41, 'P:56': 42, 'P:57': 43, 'P:58': 44, 'P:59': 45, 'P:60': 46, 'P:61': 47, 'P:62': 48, 'P:63': 49, 'P:64': 50, 'P:65': 51, 'P:66': 52, 'P:67': 53, 'P:68': 54, 'P:69': 55, 'P:70': 56, 'P:71': 57, 'P:72': 58, 'P:73': 59, 'P:74': 60, 'P:75': 61, 'P:76': 62, 'P:77': 63, 'P:78': 64, 'P:79': 65, 'P:80': 66, 'P:81': 67, 'P:82': 68, 'P:83': 69, 'P:84': 70, 'P:85': 71, 'P:86': 72, 'P:87': 73, 'P:88': 74, 'P:89': 75, 'P:90': 76, 'P:91': 77, 'P:92': 78, 'P:93': 79, 'P:94': 80, 'P:95': 81, 'P:96': 82, 'P:97':

In [20]:
# Write the per-tokenizer summary statistics to a CSV file, one row per tokenizer.
tokenizers = [
    'ChordSymbolTokenizer', 'GCTSymbolTokenizer',
    'RootTypeTokenizer', 'GCTRootTypeTokenizer',
    'RootPCTokenizer', 'GCTRootPCTokenizer',
    'PitchClassTokenizer', 'MelodyPitchTokenizer',
]

# NOTE(review): the file name and comment refer to Hook Theory, while root_dir
# in the file-discovery cell points at the gjt_melodies data — confirm the
# intended output name.
results_path = 'vocab_stats_hk_m21.csv' #for hook theory

# Name the summary columns explicitly instead of slicing the stats dict by
# position, so the export does not depend on dict insertion order.
summary_fields = ['mean_len', 'std_len', 'mean_compression', 'std_compression']
result_fields = ['Tokenizer_m21', 'vocab_size'] + summary_fields

# Open the file once; newline='' is what the csv module requires to avoid
# spurious blank lines on platforms that translate line endings.
with open( results_path, 'w', newline='' ) as f:
    writer = csv.writer(f)
    writer.writerow( result_fields )
    for tok in tokenizers:
        tok_stats = stats[tok]
        writer.writerow(
            [tok, tok_stats['vocab_size']] + [tok_stats[field] for field in summary_fields]
        )

In [4]:
# Rebuild every tokenizer from its directory under saved_tokenizers/, so the
# following cells can run without re-tokenizing the dataset. Each name is
# rebound to the instance returned by from_pretrained.
chordSymbolTokenizer = ChordSymbolTokenizer.from_pretrained('saved_tokenizers/ChordSymbolTokenizer')
rootTypeTokenizer = RootTypeTokenizer.from_pretrained('saved_tokenizers/RootTypeTokenizer')
pitchClassTokenizer = PitchClassTokenizer.from_pretrained('saved_tokenizers/PitchClassTokenizer')
rootPCTokenizer = RootPCTokenizer.from_pretrained('saved_tokenizers/RootPCTokenizer')
gctRootPCTokenizer = GCTRootPCTokenizer.from_pretrained('saved_tokenizers/GCTRootPCTokenizer')
gctSymbolTokenizer = GCTSymbolTokenizer.from_pretrained('saved_tokenizers/GCTSymbolTokenizer')
gctRootTypeTokenizer = GCTRootTypeTokenizer.from_pretrained('saved_tokenizers/GCTRootTypeTokenizer')
melodyPitchTokenizer = MelodyPitchTokenizer.from_pretrained('saved_tokenizers/MelodyPitchTokenizer')

In [5]:
# Pair the melody tokenizer with each harmony tokenizer. Only the first call
# passes verbose=1; the recorded output shows a single 'Merging harmony vocab'
# message.
m_chordSymbolTokenizer = MergedMelHarmTokenizer(melodyPitchTokenizer, chordSymbolTokenizer, verbose=1)
m_rootTypeTokenizer = MergedMelHarmTokenizer(melodyPitchTokenizer, rootTypeTokenizer)
m_pitchClassTokenizer = MergedMelHarmTokenizer(melodyPitchTokenizer, pitchClassTokenizer)
m_rootPCTokenizer = MergedMelHarmTokenizer(melodyPitchTokenizer, rootPCTokenizer)
m_gctRootPCTokenizer = MergedMelHarmTokenizer(melodyPitchTokenizer, gctRootPCTokenizer)
m_gctSymbolTokenizer = MergedMelHarmTokenizer(melodyPitchTokenizer, gctSymbolTokenizer)
m_gctRootTypeTokenizer = MergedMelHarmTokenizer(melodyPitchTokenizer, gctRootTypeTokenizer)

Merging harmony vocab


In [23]:
# Fixing combined MergedMelHarmTokenizer
# Inspect the merged vocabulary, then fit and tokenize the whole dataset.
# NOTE(review): the vocab is printed before fit() is called — if fit() changes
# the vocabulary, the size printed here is the pre-fit one; confirm.
print('Length of combined vocab:', len(m_chordSymbolTokenizer.vocab))
print('Combined vocab:', m_chordSymbolTokenizer.vocab)

m_chordSymbolTokenizer.fit( data_files )
# The returned mapping's 'tokens' and 'ids' entries are indexed per file below.
toks_symb_m = m_chordSymbolTokenizer(data_files)
# Show the first file's merged token sequence as a sanity check.
print('example sentence length: ', len(toks_symb_m['tokens'][0]))
print(toks_symb_m['tokens'][0])
print(toks_symb_m['ids'][0])



Length of combined vocab: 883
Combined vocab: {'<unk>': 0, '<pad>': 1, '<s>': 2, '</s>': 3, '<rest>': 4, '<mask>': 5, '<bar>': 6, 'P:21': 7, 'P:22': 8, 'P:23': 9, 'P:24': 10, 'P:25': 11, 'P:26': 12, 'P:27': 13, 'P:28': 14, 'P:29': 15, 'P:30': 16, 'P:31': 17, 'P:32': 18, 'P:33': 19, 'P:34': 20, 'P:35': 21, 'P:36': 22, 'P:37': 23, 'P:38': 24, 'P:39': 25, 'P:40': 26, 'P:41': 27, 'P:42': 28, 'P:43': 29, 'P:44': 30, 'P:45': 31, 'P:46': 32, 'P:47': 33, 'P:48': 34, 'P:49': 35, 'P:50': 36, 'P:51': 37, 'P:52': 38, 'P:53': 39, 'P:54': 40, 'P:55': 41, 'P:56': 42, 'P:57': 43, 'P:58': 44, 'P:59': 45, 'P:60': 46, 'P:61': 47, 'P:62': 48, 'P:63': 49, 'P:64': 50, 'P:65': 51, 'P:66': 52, 'P:67': 53, 'P:68': 54, 'P:69': 55, 'P:70': 56, 'P:71': 57, 'P:72': 58, 'P:73': 59, 'P:74': 60, 'P:75': 61, 'P:76': 62, 'P:77': 63, 'P:78': 64, 'P:79': 65, 'P:80': 66, 'P:81': 67, 'P:82': 68, 'P:83': 69, 'P:84': 70, 'P:85': 71, 'P:86': 72, 'P:87': 73, 'P:88': 74, 'P:89': 75, 'P:90': 76, 'P:91': 77, 'P:92': 78, 'P:93': 7

Processing Melody Files: 100%|██████████| 650/650 [00:33<00:00, 19.31it/s]


Processing harmony


Processing Files: 100%|██████████| 650/650 [00:49<00:00, 13.13it/s]

example sentence length:  288
['<s>', '<bar>', 'position_0x00', 'P:69', 'position_3x00', 'P:64', '<bar>', 'position_0x00', 'P:71', 'position_0x75', 'P:71', 'position_1x00', 'P:71', 'position_3x00', 'P:64', '<bar>', 'position_0x00', 'P:69', 'position_1x00', 'P:71', 'position_2x00', 'P:69', 'position_3x00', 'P:64', '<bar>', 'position_0x00', 'P:71', 'position_3x00', 'P:69', '<bar>', 'position_0x00', 'P:66', 'position_1x00', 'P:67', 'position_1x50', 'P:64', 'position_2x00', 'P:64', '<bar>', 'position_0x00', 'P:66', 'position_1x00', 'P:67', 'position_1x50', 'P:64', 'position_2x00', 'P:64', 'position_3x50', 'P:66', '<bar>', 'position_0x00', 'P:59', 'position_1x00', 'P:62', 'position_2x00', 'P:64', 'position_3x00', 'P:67', '<bar>', 'position_0x00', 'P:66', 'position_0x50', 'P:67', 'position_1x00', 'P:64', 'position_3x00', 'P:62', '<bar>', 'position_0x00', 'P:63', 'position_1x00', 'P:66', 'position_2x00', 'P:68', 'position_3x00', 'P:73', '<bar>', 'position_0x00', 'P:71', 'position_0x50', 'P:73




In [24]:
# Spot-check token -> id lookups in the merged vocabulary: the '<mask>' special
# token and a chord symbol, then 'ts_4x4' (presumably a time-signature token —
# verify) and a position token.
print(m_chordSymbolTokenizer.convert_tokens_to_ids(['<mask>', 'C:7']))
print(m_chordSymbolTokenizer.convert_tokens_to_ids(['ts_4x4', 'position_7x33']))

[5, 203]
[180, 154]


In [7]:
# Encode a single file (data_files[3]) with the merged tokenizer. The result is
# indexed by 'input_tokens', 'input_ids' and 'attention_mask' below.
# NOTE(review): with max_length=1024 and pad_to_max_length=True the three
# lengths printed at the end are presumably all 1024 — confirm on a fresh run.
x = m_chordSymbolTokenizer.encode(data_files[3], max_length=1024, pad_to_max_length=True)
print(x['input_tokens'])
print(x['input_ids'])
print(x['attention_mask'])
# The three sequences should have the same length.
print(len(x['input_tokens']))
print(len(x['input_ids']))
print(len(x['attention_mask']))

Processing melody
Processing harmony
['<s>', '<bar>', 'position_0x00', 'P:69', 'position_2x00', 'P:66', '<bar>', 'position_0x00', 'P:66', 'position_2x00', 'P:62', 'position_2x50', 'P:59', 'position_3x00', 'P:62', 'position_3x50', 'P:66', '<bar>', 'position_0x00', 'P:69', 'position_2x00', 'P:66', '<bar>', 'position_0x00', 'P:66', 'position_2x00', 'P:58', 'position_3x00', 'P:62', '<bar>', 'position_0x00', 'P:69', 'position_2x00', 'P:66', '<bar>', 'position_0x00', 'P:66', 'position_2x00', 'P:66', '<bar>', 'position_0x00', 'P:62', '<bar>', 'position_0x00', 'P:62', 'position_2x00', 'P:64', 'position_3x00', 'P:67', '<bar>', 'position_0x00', 'P:74', 'position_2x00', 'P:71', '<bar>', 'position_0x00', 'P:71', 'position_2x00', 'P:67', 'position_2x50', 'P:64', 'position_3x00', 'P:67', 'position_3x50', 'P:71', '<bar>', 'position_0x00', 'P:74', 'position_2x00', 'P:71', '<bar>', 'position_0x00', 'P:71', 'position_2x00', 'P:61', 'position_3x00', 'P:62', '<bar>', 'position_0x00', 'P:69', 'position_2x0

  return self.iter().getElementsByClass(classFilterList)
