In [15]:
import os
import csv
import zlib
import muspy as mp
import numpy as np
from tqdm import tqdm
from harmony_tokenizers import ChordSymbolTokenizer, RootTypeTokenizer, \
    PitchClassTokenizer, RootPCTokenizer, \
    GCTRootPCTokenizer, GCTSymbolTokenizer, GCTRootTypeTokenizer

In [2]:
root_dir = './hooktheory_dataset/xmls/'

# Recursively collect every file ending in ".xml" under root_dir.
# Directory listings come back in arbitrary, OS-dependent order, so the paths are
# sorted: index-based lookups on the loaded pieces (e.g. the example at index 0)
# then refer to the same file on every machine and every re-run.
data_files = sorted(
    os.path.join(dirpath, file)
    for dirpath, _, filenames in os.walk(root_dir)
    for file in filenames
    if file.endswith(".xml")
)

print('Total files from Hook Theory dataset:', len(data_files))

Total files from Hook Theory dataset: 17476


In [3]:
# Parse every collected file with muspy; files that fail to parse are reported and skipped.
hk_pieces = []
for xml_path in tqdm(data_files):
    try:
        hk_pieces.append(mp.read_musicxml(xml_path))
    except Exception as err:
        # very rare: a file contains a chord the parser rejects
        print(f"Error processing file: {xml_path}")
        print(f"Error: {err}")

print('Total files processed: ', len(hk_pieces))

 34%|███▎      | 5864/17476 [00:26<00:59, 195.35it/s]

Error processing file: ./hooktheory_dataset/xmls/h\hiroshi-miyagawa\space-battleship-yamato---autoplanet-goruba\verse.xml
Error: '7(b5)'


100%|██████████| 17476/17476 [02:12<00:00, 131.71it/s]

Total files processed:  17475





In [4]:
# Statistics registry: one entry per key, created by initialize_stats and
# filled in by update_stats.
stats = dict()

def compute_compression_rate(array: np.ndarray, compression_method=zlib.compress) -> float:
    """
    Measure how much `compression_method` shrinks the raw bytes of `array`.

    Parameters:
        array (np.ndarray): Array whose byte representation (`array.tobytes()`)
                            is compressed.
        compression_method (callable): Function taking bytes and returning the
                                       compressed bytes. Default is `zlib.compress`.

    Returns:
        float: Compressed size divided by original size, both in bytes.
    """
    raw = array.tobytes()
    return len(compression_method(raw)) / len(raw)

def initialize_stats(key, tokenizer):
    """Create (or reset) the `stats[key]` entry for a tokenizer.

    The entry stores the size of `tokenizer.vocab` under 'vocab_size' and starts
    with empty 'seq_lens' and 'compression_rates' lists.
    """
    fresh_entry = dict(
        vocab_size=len(tokenizer.vocab),
        seq_lens=[],
        compression_rates=[],
    )
    stats[key] = fresh_entry
# end initialize_stats

def update_stats(key, toks):
    """Add the sequences in `toks['ids']` to `stats[key]` and refresh its summary values.

    For every id sequence, its length and its compression rate (computed on
    `np.array(sequence)`) are appended to the entry's lists. The mean and standard
    deviation of both lists are then stored under 'mean_len', 'std_len',
    'mean_compression' and 'std_compression'.
    """
    entry = stats[key]
    lengths, rates = entry['seq_lens'], entry['compression_rates']
    for ids in toks['ids']:
        lengths.append(len(ids))
        rates.append(compute_compression_rate(np.array(ids)))
    # Summaries cover everything accumulated so far, not only this call's sequences.
    entry.update(
        mean_len=np.mean(lengths),
        std_len=np.std(lengths),
        mean_compression=np.mean(rates),
        std_compression=np.std(rates),
    )
# end update_stats

def print_stats(key):
    """Print the vocabulary size and the length / compression-rate summaries stored in `stats[key]`."""
    entry = stats[key]
    labelled_fields = (
        ('vocab_size: ', 'vocab_size'),
        ('mean len: ', 'mean_len'),
        ('std len: ', 'std_len'),
        ('mean cr: ', 'mean_compression'),
        ('std cr: ', 'std_compression'),
    )
    for label, field in labelled_fields:
        print(label, entry[field])

In [5]:
# Tokenize the loaded pieces with ChordSymbolTokenizer and collect its statistics.
stats_key = 'ChordSymbolTokenizer'
print(stats_key)
chordSymbolTokenizer = ChordSymbolTokenizer()
vocab_len = len(chordSymbolTokenizer.vocab)
print('len(chordSymbolTokenizer.vocab): ', vocab_len)
initialize_stats(stats_key, chordSymbolTokenizer)
toks_cs = chordSymbolTokenizer(hk_pieces)
# Show the first tokenized piece as an example: its length, tokens and ids.
example_tokens = toks_cs['tokens'][0]
print('example sentence length: ', len(example_tokens))
print(example_tokens)
print(toks_cs['ids'][0])
update_stats(stats_key, toks_cs)
print_stats(stats_key)

ChordSymbolTokenizer
len(chordSymbolTokenizer.vocab):  326
example sentence length:  82
['bar', 'position_0x0', 'C:maj', 'bar', 'position_0x0', 'C:maj', 'position_0x0', 'C:maj', 'bar', 'position_0x0', 'C:maj', 'position_0x0', 'C:maj', 'position_0x0', 'C:maj', 'position_0x0', 'A:min', 'bar', 'position_0x0', 'A:min', 'position_0x0', 'A:min', 'position_0x0', 'F:maj', 'position_3x0', 'F:maj', 'position_3x0', 'F:maj', 'bar', 'position_0x0', 'C:maj', 'bar', 'position_0x0', 'C:maj', 'position_0x0', 'C:maj', 'bar', 'position_0x0', 'C:maj', 'position_0x0', 'C:maj', 'position_0x0', 'C:maj', 'position_0x0', 'A:min', 'bar', 'position_0x0', 'A:min', 'position_0x0', 'A:min', 'position_0x0', 'F:maj', 'bar', 'position_0x0', 'F:maj', 'position_0x0', 'F:maj', 'position_0x0', 'F:maj', 'position_0x0', 'C:maj', 'bar', 'position_0x0', 'C:maj', 'bar', 'position_0x0', 'A:min', 'bar', 'position_0x0', 'F:maj', 'bar', 'position_0x0', 'C:maj', 'bar', 'position_0x0', 'C:maj', 'bar', 'position_0x0', 'A:min', 'bar',

In [6]:
# Tokenize the loaded pieces with RootTypeTokenizer and collect its statistics.
stats_key = 'RootTypeTokenizer'
print(stats_key)
rootTypeTokenizer = RootTypeTokenizer()
vocab_len = len(rootTypeTokenizer.vocab)
print('len(rootTypeTokenizer.vocab): ', vocab_len)
initialize_stats(stats_key, rootTypeTokenizer)
toks_rt = rootTypeTokenizer(hk_pieces)
# Show the first tokenized piece as an example: its length, tokens and ids.
example_tokens = toks_rt['tokens'][0]
print('example sentence length: ', len(example_tokens))
print(example_tokens)
print(toks_rt['ids'][0])
update_stats(stats_key, toks_rt)
print_stats(stats_key)

RootTypeTokenizer
len(rootTypeTokenizer.vocab):  63
example sentence length:  115
['bar', 'position_0x0', 'C', 'maj', 'bar', 'position_0x0', 'C', 'maj', 'position_0x0', 'C', 'maj', 'bar', 'position_0x0', 'C', 'maj', 'position_0x0', 'C', 'maj', 'position_0x0', 'C', 'maj', 'position_0x0', 'A', 'min', 'bar', 'position_0x0', 'A', 'min', 'position_0x0', 'A', 'min', 'position_0x0', 'F', 'maj', 'position_3x0', 'F', 'maj', 'position_3x0', 'F', 'maj', 'bar', 'position_0x0', 'C', 'maj', 'bar', 'position_0x0', 'C', 'maj', 'position_0x0', 'C', 'maj', 'bar', 'position_0x0', 'C', 'maj', 'position_0x0', 'C', 'maj', 'position_0x0', 'C', 'maj', 'position_0x0', 'A', 'min', 'bar', 'position_0x0', 'A', 'min', 'position_0x0', 'A', 'min', 'position_0x0', 'F', 'maj', 'bar', 'position_0x0', 'F', 'maj', 'position_0x0', 'F', 'maj', 'position_0x0', 'F', 'maj', 'position_0x0', 'C', 'maj', 'bar', 'position_0x0', 'C', 'maj', 'bar', 'position_0x0', 'A', 'min', 'bar', 'position_0x0', 'F', 'maj', 'bar', 'position_0x0'

In [8]:
# Tokenize the loaded pieces with PitchClassTokenizer and collect its statistics.
stats_key = 'PitchClassTokenizer'
print(stats_key)
pitchClassTokenizer = PitchClassTokenizer()
vocab_len = len(pitchClassTokenizer.vocab)
print('len(pitchClassTokenizer.vocab): ', vocab_len)
initialize_stats(stats_key, pitchClassTokenizer)
toks_pc = pitchClassTokenizer(hk_pieces)
# Show the first tokenized piece as an example: its length, tokens and ids.
example_tokens = toks_pc['tokens'][0]
print('example sentence length: ', len(example_tokens))
print(example_tokens)
print(toks_pc['ids'][0])
update_stats(stats_key, toks_pc)
print_stats(stats_key)

PitchClassTokenizer
len(pitchClassTokenizer.vocab):  38
example sentence length:  148
['bar', 'position_0x0', 'chord_pc_0', 'chord_pc_4', 'chord_pc_7', 'bar', 'position_0x0', 'chord_pc_0', 'chord_pc_4', 'chord_pc_7', 'position_0x0', 'chord_pc_0', 'chord_pc_4', 'chord_pc_7', 'bar', 'position_0x0', 'chord_pc_0', 'chord_pc_4', 'chord_pc_7', 'position_0x0', 'chord_pc_0', 'chord_pc_4', 'chord_pc_7', 'position_0x0', 'chord_pc_0', 'chord_pc_4', 'chord_pc_7', 'position_0x0', 'chord_pc_9', 'chord_pc_0', 'chord_pc_4', 'bar', 'position_0x0', 'chord_pc_9', 'chord_pc_0', 'chord_pc_4', 'position_0x0', 'chord_pc_9', 'chord_pc_0', 'chord_pc_4', 'position_0x0', 'chord_pc_5', 'chord_pc_9', 'chord_pc_0', 'position_3x0', 'chord_pc_5', 'chord_pc_9', 'chord_pc_0', 'position_3x0', 'chord_pc_5', 'chord_pc_9', 'chord_pc_0', 'bar', 'position_0x0', 'chord_pc_0', 'chord_pc_4', 'chord_pc_7', 'bar', 'position_0x0', 'chord_pc_0', 'chord_pc_4', 'chord_pc_7', 'position_0x0', 'chord_pc_0', 'chord_pc_4', 'chord_pc_7', '

In [10]:
# Tokenize the loaded pieces with RootPCTokenizer and collect its statistics.
stats_key = 'RootPCTokenizer'
print(stats_key)
rootPCTokenizer = RootPCTokenizer()
vocab_len = len(rootPCTokenizer.vocab)
print('len(rootPCTokenizer.vocab): ', vocab_len)
initialize_stats(stats_key, rootPCTokenizer)
toks_rpc = rootPCTokenizer(hk_pieces)
# Show the first tokenized piece as an example: its length, tokens and ids.
example_tokens = toks_rpc['tokens'][0]
print('example sentence length: ', len(example_tokens))
print(example_tokens)
print(toks_rpc['ids'][0])
update_stats(stats_key, toks_rpc)
print_stats(stats_key)

RootPCTokenizer
len(rootPCTokenizer.vocab):  50
example sentence length:  148
['bar', 'position_0x0', 'chord_root_0', 'chord_pc_4', 'chord_pc_7', 'bar', 'position_0x0', 'chord_root_0', 'chord_pc_4', 'chord_pc_7', 'position_0x0', 'chord_root_0', 'chord_pc_4', 'chord_pc_7', 'bar', 'position_0x0', 'chord_root_0', 'chord_pc_4', 'chord_pc_7', 'position_0x0', 'chord_root_0', 'chord_pc_4', 'chord_pc_7', 'position_0x0', 'chord_root_0', 'chord_pc_4', 'chord_pc_7', 'position_0x0', 'chord_root_9', 'chord_pc_0', 'chord_pc_4', 'bar', 'position_0x0', 'chord_root_9', 'chord_pc_0', 'chord_pc_4', 'position_0x0', 'chord_root_9', 'chord_pc_0', 'chord_pc_4', 'position_0x0', 'chord_root_5', 'chord_pc_9', 'chord_pc_0', 'position_3x0', 'chord_root_5', 'chord_pc_9', 'chord_pc_0', 'position_3x0', 'chord_root_5', 'chord_pc_9', 'chord_pc_0', 'bar', 'position_0x0', 'chord_root_0', 'chord_pc_4', 'chord_pc_7', 'bar', 'position_0x0', 'chord_root_0', 'chord_pc_4', 'chord_pc_7', 'position_0x0', 'chord_root_0', 'chord_

In [12]:
# Tokenize the loaded pieces with GCTRootPCTokenizer and collect its statistics.
stats_key = 'GCTRootPCTokenizer'
print(stats_key)
gctRootPCTokenizer = GCTRootPCTokenizer()
vocab_len = len(gctRootPCTokenizer.vocab)
print('len(gctRootPCTokenizer.vocab): ', vocab_len)
initialize_stats(stats_key, gctRootPCTokenizer)
toks_gct_rpc = gctRootPCTokenizer(hk_pieces)
# Show the first tokenized piece as an example: its length, tokens and ids.
example_tokens = toks_gct_rpc['tokens'][0]
print('example sentence length: ', len(example_tokens))
print(example_tokens)
print(toks_gct_rpc['ids'][0])
update_stats(stats_key, toks_gct_rpc)
print_stats(stats_key)

GCTRootPCTokenizer
len(gctRootPCTokenizer.vocab):  50
example sentence length:  148
['bar', 'position_0x0', 'chord_root_0', 'chord_pc_4', 'chord_pc_7', 'bar', 'position_0x0', 'chord_root_0', 'chord_pc_4', 'chord_pc_7', 'position_0x0', 'chord_root_0', 'chord_pc_4', 'chord_pc_7', 'bar', 'position_0x0', 'chord_root_0', 'chord_pc_4', 'chord_pc_7', 'position_0x0', 'chord_root_0', 'chord_pc_4', 'chord_pc_7', 'position_0x0', 'chord_root_0', 'chord_pc_4', 'chord_pc_7', 'position_0x0', 'chord_root_9', 'chord_pc_0', 'chord_pc_4', 'bar', 'position_0x0', 'chord_root_9', 'chord_pc_0', 'chord_pc_4', 'position_0x0', 'chord_root_9', 'chord_pc_0', 'chord_pc_4', 'position_0x0', 'chord_root_5', 'chord_pc_9', 'chord_pc_0', 'position_3x0', 'chord_root_5', 'chord_pc_9', 'chord_pc_0', 'position_3x0', 'chord_root_5', 'chord_pc_9', 'chord_pc_0', 'bar', 'position_0x0', 'chord_root_0', 'chord_pc_4', 'chord_pc_7', 'bar', 'position_0x0', 'chord_root_0', 'chord_pc_4', 'chord_pc_7', 'position_0x0', 'chord_root_0', '

In [13]:
# Fit GCTSymbolTokenizer on the loaded pieces, tokenize them, and collect its statistics.
stats_key = 'GCTSymbolTokenizer'
print(stats_key)
gctSymbolTokenizer = GCTSymbolTokenizer()
print('training')
gctSymbolTokenizer.fit(hk_pieces)
# The vocabulary size is read only after fit().
vocab_len = len(gctSymbolTokenizer.vocab)
print('len(gctSymbolTokenizer.vocab): ', vocab_len)
initialize_stats(stats_key, gctSymbolTokenizer)
toks_gct_symb = gctSymbolTokenizer(hk_pieces)
# Show the first tokenized piece as an example: its length, tokens and ids.
example_tokens = toks_gct_symb['tokens'][0]
print('example sentence length: ', len(example_tokens))
print(example_tokens)
print(toks_gct_symb['ids'][0])
update_stats(stats_key, toks_gct_symb)
print_stats(stats_key)

GCTSymbolTokenizer
training
len(gctSymbolTokenizer.vocab):  194
example sentence length:  82
['bar', 'position_0x0', '[0 0 4 7]', 'bar', 'position_0x0', '[0 0 4 7]', 'position_0x0', '[0 0 4 7]', 'bar', 'position_0x0', '[0 0 4 7]', 'position_0x0', '[0 0 4 7]', 'position_0x0', '[0 0 4 7]', 'position_0x0', '[9 0 3 7]', 'bar', 'position_0x0', '[9 0 3 7]', 'position_0x0', '[9 0 3 7]', 'position_0x0', '[5 0 4 7]', 'position_3x0', '[5 0 4 7]', 'position_3x0', '[5 0 4 7]', 'bar', 'position_0x0', '[0 0 4 7]', 'bar', 'position_0x0', '[0 0 4 7]', 'position_0x0', '[0 0 4 7]', 'bar', 'position_0x0', '[0 0 4 7]', 'position_0x0', '[0 0 4 7]', 'position_0x0', '[0 0 4 7]', 'position_0x0', '[9 0 3 7]', 'bar', 'position_0x0', '[9 0 3 7]', 'position_0x0', '[9 0 3 7]', 'position_0x0', '[5 0 4 7]', 'bar', 'position_0x0', '[5 0 4 7]', 'position_0x0', '[5 0 4 7]', 'position_0x0', '[5 0 4 7]', 'position_0x0', '[0 0 4 7]', 'bar', 'position_0x0', '[0 0 4 7]', 'bar', 'position_0x0', '[9 0 3 7]', 'bar', 'position_

In [14]:
# Fit GCTRootTypeTokenizer on the loaded pieces, tokenize them, and collect its statistics.
print('GCTRootTypeTokenizer')
gctRootTypeTokenizer = GCTRootTypeTokenizer()
print('training')
gctRootTypeTokenizer.fit( hk_pieces )
print('len(gctRootTypeTokenizer.vocab): ', len(gctRootTypeTokenizer.vocab))
# vocab_size must be taken from this tokenizer: passing chordSymbolTokenizer here
# stored the wrong vocabulary size under the 'GCTRootTypeTokenizer' key.
initialize_stats('GCTRootTypeTokenizer', gctRootTypeTokenizer)
toks_gct_rt = gctRootTypeTokenizer(hk_pieces)
# Show the first tokenized piece as an example: its length, tokens and ids.
print('example sentence length: ', len(toks_gct_rt['tokens'][0]))
print(toks_gct_rt['tokens'][0])
print(toks_gct_rt['ids'][0])
update_stats('GCTRootTypeTokenizer', toks_gct_rt)
print_stats('GCTRootTypeTokenizer')

GCTRootTypeTokenizer
training
len(gctRootTypeTokenizer.vocab):  67
example sentence length:  115
['bar', 'position_0x0', 'chord_root_0', '[0 4 7]', 'bar', 'position_0x0', 'chord_root_0', '[0 4 7]', 'position_0x0', 'chord_root_0', '[0 4 7]', 'bar', 'position_0x0', 'chord_root_0', '[0 4 7]', 'position_0x0', 'chord_root_0', '[0 4 7]', 'position_0x0', 'chord_root_0', '[0 4 7]', 'position_0x0', 'chord_root_9', '[0 3 7]', 'bar', 'position_0x0', 'chord_root_9', '[0 3 7]', 'position_0x0', 'chord_root_9', '[0 3 7]', 'position_0x0', 'chord_root_5', '[0 4 7]', 'position_3x0', 'chord_root_5', '[0 4 7]', 'position_3x0', 'chord_root_5', '[0 4 7]', 'bar', 'position_0x0', 'chord_root_0', '[0 4 7]', 'bar', 'position_0x0', 'chord_root_0', '[0 4 7]', 'position_0x0', 'chord_root_0', '[0 4 7]', 'bar', 'position_0x0', 'chord_root_0', '[0 4 7]', 'position_0x0', 'chord_root_0', '[0 4 7]', 'position_0x0', 'chord_root_0', '[0 4 7]', 'position_0x0', 'chord_root_9', '[0 3 7]', 'bar', 'position_0x0', 'chord_root_9

In [16]:
# Export the summary statistics of every tokenizer to a CSV file, one row per tokenizer.
tokenizers = ['ChordSymbolTokenizer', 'GCTSymbolTokenizer',
              'RootTypeTokenizer', 'GCTRootTypeTokenizer',
              'PitchClassTokenizer', 'RootPCTokenizer', 'GCTRootPCTokenizer'
              ]

results_path = 'vocab_stats_hk.csv' #for hook theory

# The summary columns are named explicitly instead of being sliced out of the
# stats dict by position, so the export does not depend on key insertion order.
summary_fields = ['mean_len', 'std_len', 'mean_compression', 'std_compression']
result_fields = ['Tokenizer', 'vocab_size'] + summary_fields

# newline='' is required by the csv module: without it the writer's "\r\n" row
# terminator is translated again on Windows and a blank line follows every row.
# The file is opened once and the header and all rows go through the same writer.
with open( results_path, 'w', newline='' ) as f:
    writer = csv.writer(f)
    writer.writerow( result_fields )
    for tok in tokenizers:
        writer.writerow( [tok, stats[tok]['vocab_size']] + [stats[tok][field] for field in summary_fields] )