In [1]:
import os
import zlib
import muspy as mp
import numpy as np
from tqdm import tqdm
from harmony_tokenizers import ChordSymbolTokenizer, RootTypeTokenizer, \
    PitchClassTokenizer, RootPCTokenizer, \
    GCTRootPCTokenizer, GCTSymbolTokenizer, GCTRootTypeTokenizer

In [2]:
root_dir = './hooktheory_dataset/xmls/'

# Walk through all subdirectories and collect every file ending in ".xml".
# os.walk yields directories and files in an arbitrary, filesystem-dependent
# order, so the paths are sorted to keep the index of each piece stable
# across runs and machines.
data_files = sorted(
    os.path.join(dirpath, file)
    for dirpath, _, filenames in os.walk(root_dir)
    for file in filenames
    if file.endswith(".xml")
)

print('Total files from Hook Theory dataset:', len(data_files))

Total files from Hook Theory dataset: 17476


In [3]:
# Parse every discovered MusicXML file with muspy; files that fail to parse
# are reported and left out of hk_pieces.
hk_pieces = []
for xml_path in tqdm(data_files):
    try:
        piece = mp.read_musicxml(xml_path)
    except Exception as e:
        # catch very rare chord exceptions
        print(f"Error processing file: {xml_path}")
        print(f"Error: {e}")
    else:
        hk_pieces.append(piece)

print('Total files processed: ', len(hk_pieces))

 34%|███▎      | 5880/17476 [00:30<00:58, 199.65it/s]

Error processing file: ./hooktheory_dataset/xmls/h\hiroshi-miyagawa\space-battleship-yamato---autoplanet-goruba\verse.xml
Error: '7(b5)'


100%|██████████| 17476/17476 [01:38<00:00, 178.09it/s]

Total files processed:  17475





In [4]:
# Registry of collected statistics: one entry per key, created by
# initialize_stats and filled in by update_stats.
stats = dict()

def compute_compression_rate(array: np.ndarray, compression_method=zlib.compress) -> float:
    """
    Compute the compression rate of a NumPy array.

    The array is serialized with `ndarray.tobytes()`, so the result depends
    on the array's dtype as well as on its values.

    Parameters:
        array (np.ndarray): The NumPy array to compress.
        compression_method (callable): The compression method to use; it
                                       receives the raw bytes and must return
                                       a sized bytes-like object.
                                       Default is `zlib.compress`.

    Returns:
        float: The compression rate (compressed size / original size).
               `nan` is returned for an empty array, for which the ratio is
               undefined (this previously raised ZeroDivisionError).
    """
    # Convert the array to bytes
    array_bytes = array.tobytes()
    original_size = len(array_bytes)

    # An empty array has no bytes to compress: the ratio would be x / 0.
    if original_size == 0:
        return float('nan')

    # Compress the byte representation and relate its size to the original
    compressed_size = len(compression_method(array_bytes))
    return compressed_size / original_size

def initialize_stats(key, tokenizer):
    """
    Create (or reset) the entry of the global `stats` dict stored under `key`.

    Parameters:
        key: Dictionary key under which the entry is stored.
        tokenizer: Object exposing a sized `vocab` attribute; its length is
                   recorded as 'vocab_size'.
    """
    fresh_entry = dict(
        vocab_size=len(tokenizer.vocab),
        seq_lens=[],
        compression_rates=[],
    )
    stats[key] = fresh_entry
# end initialize_stats

def update_stats(key, toks):
    """
    Add the sequences in `toks['ids']` to the statistics stored under `key`.

    For every sequence its length and its compression rate are appended to
    the entry's running lists; the mean and standard deviation of both lists
    are then recomputed over everything accumulated so far.

    Parameters:
        key: Key of an entry previously created with `initialize_stats`.
        toks: Mapping whose 'ids' item is an iterable of id sequences.
    """
    id_sequences = toks['ids']
    entry = stats[key]
    lengths = entry['seq_lens']
    rates = entry['compression_rates']

    for ids in id_sequences:
        lengths.append(len(ids))
        # NOTE(review): np.array picks the dtype itself; if `ids` is a list of
        # Python ints the byte width (and thus the rate) follows NumPy's
        # default integer type - confirm this is intended.
        rates.append(compute_compression_rate(np.array(ids)))

    entry['mean_len'] = np.mean(lengths)
    entry['std_len'] = np.std(lengths)
    entry['mean_compression'] = np.mean(rates)
    entry['std_compression'] = np.std(rates)
# end update_stats

def print_stats(key):
    """
    Print the vocabulary size and the summary statistics stored under `key`.

    The entry must already hold the aggregate fields written by
    `update_stats`; otherwise a KeyError is raised.
    """
    entry = stats[key]
    labelled_fields = (
        ('vocab_size: ', 'vocab_size'),
        ('mean len: ', 'mean_len'),
        ('std len: ', 'std_len'),
        ('mean cr: ', 'mean_compression'),
        ('std cr: ', 'std_compression'),
    )
    for label, field in labelled_fields:
        print(label, entry[field])

In [None]:
# Tokenize all loaded pieces with ChordSymbolTokenizer and collect its stats.
cs_name = 'ChordSymbolTokenizer'
print(cs_name)
chordSymbolTokenizer = ChordSymbolTokenizer()
print('len(chordSymbolTokenizer.vocab): ', len(chordSymbolTokenizer.vocab))
initialize_stats(cs_name, chordSymbolTokenizer)
toks_cs = chordSymbolTokenizer(hk_pieces)

# Show the first tokenized piece as a sanity check: tokens, then their ids.
first_cs_tokens = toks_cs['tokens'][0]
print('example sentence length: ', len(first_cs_tokens))
print(first_cs_tokens)
print(toks_cs['ids'][0])

update_stats(cs_name, toks_cs)
print_stats(cs_name)

ChordSymbolTokenizer
len(chordSymbolTokenizer.vocab):  326
example sentence length:  82
['bar', 'position_0x0', 'C:maj', 'bar', 'position_0x0', 'C:maj', 'position_0x0', 'C:maj', 'bar', 'position_0x0', 'C:maj', 'position_0x0', 'C:maj', 'position_0x0', 'C:maj', 'position_0x0', 'A:min', 'bar', 'position_0x0', 'A:min', 'position_0x0', 'A:min', 'position_0x0', 'F:maj', 'position_3x0', 'F:maj', 'position_3x0', 'F:maj', 'bar', 'position_0x0', 'C:maj', 'bar', 'position_0x0', 'C:maj', 'position_0x0', 'C:maj', 'bar', 'position_0x0', 'C:maj', 'position_0x0', 'C:maj', 'position_0x0', 'C:maj', 'position_0x0', 'A:min', 'bar', 'position_0x0', 'A:min', 'position_0x0', 'A:min', 'position_0x0', 'F:maj', 'bar', 'position_0x0', 'F:maj', 'position_0x0', 'F:maj', 'position_0x0', 'F:maj', 'position_0x0', 'C:maj', 'bar', 'position_0x0', 'C:maj', 'bar', 'position_0x0', 'A:min', 'bar', 'position_0x0', 'F:maj', 'bar', 'position_0x0', 'C:maj', 'bar', 'position_0x0', 'C:maj', 'bar', 'position_0x0', 'A:min', 'bar',

In [None]:
# Tokenize all loaded pieces with RootTypeTokenizer and collect its stats.
print('RootTypeTokenizer')
rootTypeTokenizer = RootTypeTokenizer()
print('len(rootTypeTokenizer.vocab): ', len(rootTypeTokenizer.vocab))
initialize_stats('RootTypeTokenizer', rootTypeTokenizer)
# Use hk_pieces, the corpus loaded earlier in this notebook; the previous
# name `gjt_pieces` is never defined here and raised NameError on a fresh kernel.
toks_rt = rootTypeTokenizer(hk_pieces)
# Show the first tokenized piece as a sanity check: tokens, then their ids.
print('example sentence length: ', len(toks_rt['tokens'][0]))
print(toks_rt['tokens'][0])
print(toks_rt['ids'][0])
update_stats('RootTypeTokenizer', toks_rt)
print_stats('RootTypeTokenizer')