In [5]:
import os
import csv
import zlib
import numpy as np
from harmony_tokenizers_m21 import ChordSymbolTokenizer, MelodyPitchTokenizer

In [6]:
# Dataset location (alternative kept for reference).
# root_dir = '/media/datadisk/datasets/hooktheory_xmls'
root_dir = 'data/gjt_melodies/Library_melodies/'
data_files = []

# Walk through all subdirectories and collect files ending in .xml or .mxl.
for dirpath, _, filenames in os.walk(root_dir):
    for file in filenames:
        if file.endswith(('.xml', '.mxl')):
            data_files.append(os.path.join(dirpath, file))

# os.walk yields directory entries in arbitrary, filesystem-dependent order.
# Sorting makes the file order (and therefore the "example" at index 0 shown
# by later cells) reproducible across machines and runs.
data_files.sort()

# NOTE(review): the label says "Hook Theory" although root_dir points at
# gjt_melodies -- confirm which dataset name is intended.
print('Total files from Hook Theory dataset:', len(data_files))

Total files from Hook Theory dataset: 650


In [7]:
# Module-level registry of per-tokenizer statistics, keyed by tokenizer name.
# Entries are created by initialize_stats and filled in by update_stats.
stats = dict()

def compute_compression_rate(array: np.ndarray, compression_method=zlib.compress) -> float:
    """
    Compute the compression rate of a NumPy array.

    The array's raw buffer (``array.tobytes()``) is what gets compressed, so
    the result depends on the array's dtype as well as on its values.

    Parameters:
        array (np.ndarray): The NumPy array to compress.
        compression_method (callable): Takes a ``bytes`` object and returns
                                       the compressed ``bytes``.
                                       Default is `zlib.compress`.

    Returns:
        float: The compression rate (compressed size / original size).
               ``nan`` when the array holds no bytes, because the ratio is
               undefined for an empty input.
    """
    # Convert the array to bytes
    array_bytes = array.tobytes()
    original_size = len(array_bytes)

    # An empty array would otherwise raise ZeroDivisionError below and abort
    # the whole statistics run; report the ratio as undefined instead.
    if original_size == 0:
        return float('nan')

    # Compress the byte representation and compare sizes
    compressed_size = len(compression_method(array_bytes))

    return compressed_size / original_size

def initialize_stats(key, tokenizer):
    """
    Create (or reset) the ``stats`` entry stored under ``key``.

    The entry records the size of ``tokenizer.vocab`` and starts with empty
    lists for per-sequence lengths and compression rates.
    """
    # Key order matters: the CSV export cell slices this dict positionally.
    stats[key] = dict(
        vocab_size=len(tokenizer.vocab),
        seq_lens=[],
        compression_rates=[],
    )
# end initialize_stats

def update_stats(key, toks):
    """
    Add the sequences in ``toks['ids']`` to the ``stats`` entry for ``key``.

    Each sequence contributes its length and its compression rate; the mean
    and standard deviation of both lists are then recomputed over everything
    accumulated so far and stored on the same entry.
    """
    entry = stats[key]
    lengths = entry['seq_lens']
    rates = entry['compression_rates']

    for ids in toks['ids']:
        lengths.append(len(ids))
        rates.append(compute_compression_rate(np.array(ids)))

    # Summary keys are inserted in this fixed order.
    entry.update({
        'mean_len': np.mean(lengths),
        'std_len': np.std(lengths),
        'mean_compression': np.mean(rates),
        'std_compression': np.std(rates),
    })
# end update_stats

def print_stats(key):
    """Print the vocabulary size and the summary statistics stored for ``key``."""
    entry = stats[key]
    # (label printed, field read) pairs, in display order.
    report = (
        ('vocab_size: ', 'vocab_size'),
        ('mean len: ', 'mean_len'),
        ('std len: ', 'std_len'),
        ('mean cr: ', 'mean_compression'),
        ('std cr: ', 'std_compression'),
    )
    for label, field in report:
        print(label, entry[field])

In [8]:
# --- Chord symbols: build the tokenizer and register its stats entry ---
print('ChordSymbolTokenizer_m21')
chordSymbolTokenizer = ChordSymbolTokenizer()
print('len(chordSymbolTokenizer.vocab): ', len(chordSymbolTokenizer.vocab))
initialize_stats('ChordSymbolTokenizer', chordSymbolTokenizer)

# Run the tokenizer over the collected file paths.
toks_cs = chordSymbolTokenizer(data_files)

# Show the first result: its length, then its tokens and its ids (one per line).
print('example sentence length: ', len(toks_cs['tokens'][0]))
print(toks_cs['tokens'][0], toks_cs['ids'][0], sep='\n')

# Accumulate and report length / compression statistics.
update_stats('ChordSymbolTokenizer', toks_cs)
print_stats('ChordSymbolTokenizer')

ChordSymbolTokenizer_m21
len(chordSymbolTokenizer.vocab):  444


Processing Files:  79%|███████▉  | 516/650 [00:38<00:10, 13.33it/s]

<music21.harmony.ChordSymbol B#9 alter #5>
B#
whole-tone pentachord
File 'data/gjt_melodies/Library_melodies/You_re_Everything.mxl' generated 1 'unk' tokens.


Processing Files: 100%|██████████| 650/650 [00:48<00:00, 13.49it/s]

example sentence length:  88
['bar', 'position_0x00', 'A:min7', 'bar', 'position_0x00', 'D:7', 'bar', 'position_0x00', 'A:min7', 'bar', 'position_0x00', 'D:7', 'bar', 'position_0x00', 'G:maj7', 'bar', 'position_0x00', 'C:7(#11)', 'bar', 'position_0x00', 'B:min7', 'bar', 'position_0x00', 'E:min7', 'bar', 'position_0x00', 'B:maj7', 'position_2x00', 'F#:7', 'bar', 'position_0x00', 'B:maj6', 'bar', 'position_0x00', 'D:min7', 'bar', 'position_0x00', 'G:7', 'bar', 'position_0x00', 'D:min7', 'bar', 'position_0x00', 'G:7', 'bar', 'position_0x00', 'C:maj7', 'bar', 'position_0x00', 'F:7', 'bar', 'position_0x00', 'E:min7', 'position_2x00', 'A:7', 'bar', 'position_0x00', 'A:min7', 'position_2x00', 'D:7', 'bar', 'position_0x00', 'A:min7', 'bar', 'position_0x00', 'D:7', 'bar', 'position_0x00', 'G:maj7', 'position_2x00', 'F:7', 'bar', 'position_0x00', 'E:7(b9)', 'bar', 'position_0x00', 'A:min7', 'bar', 'position_0x00', 'F:7', 'bar', 'position_0x00', 'A:min7', 'position_2x00', 'D:7', 'bar', 'position_




In [8]:
# --- Melody pitches: build the tokenizer and register its stats entry ---
print('MelodyPitchTokenizer_m21')
melodyPitchTokenizer = MelodyPitchTokenizer(min_pitch=21, max_pitch=108) #default range, need to adjust
print('len(melodyPitchTokenizer.vocab): ', len(melodyPitchTokenizer.vocab))
initialize_stats('MelodyPitchTokenizer', melodyPitchTokenizer)

# Store the melody tokens under their own name. The original reused `toks_cs`,
# which silently replaced the chord-symbol tokens from the previous cell.
toks_mp = melodyPitchTokenizer(data_files)

# Show the first result: its length, then its tokens and its ids.
print('example sentence length: ', len(toks_mp['tokens'][0]))
print(toks_mp['tokens'][0])
print(toks_mp['ids'][0])

# Accumulate and report length / compression statistics.
update_stats('MelodyPitchTokenizer', toks_mp)
print_stats('MelodyPitchTokenizer')

MelodyPitchTokenizer_m21
len(melodyPitchTokenizer.vocab):  184


Processing Melody Files: 100%|██████████| 650/650 [00:32<00:00, 19.75it/s]

example sentence length:  196
['bar', 'position_0x00', 'P:69', 'position_3x00', 'P:64', 'bar', 'position_0x00', 'P:71', 'position_0x75', 'P:71', 'position_1x00', 'P:71', 'position_3x00', 'P:64', 'bar', 'position_0x00', 'P:69', 'position_1x00', 'P:71', 'position_2x00', 'P:69', 'position_3x00', 'P:64', 'bar', 'position_0x00', 'P:71', 'position_3x00', 'P:69', 'bar', 'position_0x00', 'P:66', 'position_1x00', 'P:67', 'position_1x50', 'P:64', 'position_2x00', 'P:64', 'bar', 'position_0x00', 'P:66', 'position_1x00', 'P:67', 'position_1x50', 'P:64', 'position_2x00', 'P:64', 'position_3x50', 'P:66', 'bar', 'position_0x00', 'P:59', 'position_1x00', 'P:62', 'position_2x00', 'P:64', 'position_3x00', 'P:67', 'bar', 'position_0x00', 'P:66', 'position_0x50', 'P:67', 'position_1x00', 'P:64', 'position_3x00', 'P:62', 'bar', 'position_0x00', 'P:63', 'position_1x00', 'P:66', 'position_2x00', 'P:68', 'position_3x00', 'P:73', 'bar', 'position_0x00', 'P:71', 'position_0x50', 'P:73', 'position_1x00', 'P:71',




In [9]:
# Write one CSV row of summary statistics per tokenizer.
tokenizers = ['ChordSymbolTokenizer', 'MelodyPitchTokenizer']

results_path = 'vocab_stats_hk_m21.csv' #for hook theory

# Select the summary columns by name instead of slicing the stats dict by
# position, so the export does not depend on key insertion order.
summary_fields = ['mean_len', 'std_len', 'mean_compression', 'std_compression']
result_fields = ['Tokenizer_m21', 'vocab_size'] + summary_fields

# newline='' is required by the csv module; without it, platforms that
# translate '\n' on write (e.g. Windows) produce an extra blank line per row.
# The file is opened once for the header and all rows rather than being
# reopened in append mode for every tokenizer.
with open(results_path, 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(result_fields)
    for tok in tokenizers:
        writer.writerow(
            [tok, stats[tok]['vocab_size']]
            + [stats[tok][name] for name in summary_fields]
        )