In [1]:
import os
import csv
import zlib
import numpy as np
from harmony_tokenizers_m21 import ChordSymbolTokenizer, MelodyPitchTokenizer

In [2]:
root_dir = './data/hooktheory_dataset/xmls/'

# Recursively collect the path of every .xml file below root_dir.
# os.walk yields directories and file names in an arbitrary,
# filesystem-dependent order, so the list is sorted: this keeps data_files[0]
# (used later as the printed example) and the order of the collected
# statistics reproducible across machines and runs.
data_files = sorted(
    os.path.join(dirpath, file)
    for dirpath, _, filenames in os.walk(root_dir)
    for file in filenames
    if file.endswith(".xml")
)

print('Total files from Hook Theory dataset:', len(data_files))

Total files from Hook Theory dataset: 17476


In [3]:
# Registry of collected statistics, keyed by tokenizer label.
# Entries are created by initialize_stats and filled in by update_stats.
stats = dict()

def compute_compression_rate(array: np.ndarray, compression_method=zlib.compress) -> float:
    """
    Measure how well the raw bytes of a NumPy array compress.

    Parameters:
        array (np.ndarray): Array whose byte representation (``array.tobytes()``)
                            is compressed.
        compression_method (callable): Function mapping bytes to compressed
                                       bytes. Default is `zlib.compress`.

    Returns:
        float: len(compressed bytes) / len(original bytes).

    Raises:
        ZeroDivisionError: If the array serialises to zero bytes.
    """
    raw = array.tobytes()
    # The compressor runs first; the ratio is taken against the uncompressed size.
    return len(compression_method(raw)) / len(raw)

def initialize_stats(key, tokenizer):
    """
    Create (or reset) the entry for ``key`` in the global ``stats`` registry.

    Parameters:
        key: Label under which the entry is stored.
        tokenizer: Object exposing a sized ``vocab`` attribute; its length is
                   recorded as 'vocab_size'.

    The new entry starts with empty 'seq_lens' and 'compression_rates' lists.
    """
    # Keyword order fixes the key order of the entry: vocab_size, seq_lens,
    # compression_rates.
    entry = dict(
        vocab_size=len(tokenizer.vocab),
        seq_lens=[],
        compression_rates=[],
    )
    stats[key] = entry
# end initialize_stats

def update_stats(key, toks):
    """
    Accumulate sequence-length and compression statistics for one tokenizer.

    Parameters:
        key: Label of an entry in the global ``stats`` registry that already
             holds 'seq_lens' and 'compression_rates' lists.
        toks: Mapping whose 'ids' item is an iterable of token-id sequences;
              no other item is read.

    Every sequence contributes its length to 'seq_lens'. Non-empty sequences
    also contribute a compression rate; empty ones are skipped there because
    a zero-byte array would make compute_compression_rate divide by zero and
    abort the whole run. The aggregate fields 'mean_len', 'std_len',
    'mean_compression' and 'std_compression' are then recomputed over
    everything accumulated so far for ``key``.
    """
    entry = stats[key]
    for ids in toks['ids']:
        entry['seq_lens'].append( len(ids) )
        # NOTE(review): np.array(ids) uses NumPy's default dtype for the ids,
        # so the bytes per token (and hence the rate) may differ between
        # platforms -- confirm whether a fixed dtype is wanted.
        if len(ids) > 0:
            entry['compression_rates'].append( compute_compression_rate(np.array(ids)) )
    entry['mean_len'] = np.mean(entry['seq_lens'])
    entry['std_len'] = np.std(entry['seq_lens'])
    entry['mean_compression'] = np.mean(entry['compression_rates'])
    entry['std_compression'] = np.std(entry['compression_rates'])
# end update_stats

def print_stats(key):
    """
    Print the vocabulary size and the aggregate statistics stored for ``key``.

    Reads 'vocab_size', 'mean_len', 'std_len', 'mean_compression' and
    'std_compression' from ``stats[key]`` and prints one labelled line each,
    in that order.
    """
    entry = stats[key]
    labelled_fields = (
        ('vocab_size: ', 'vocab_size'),
        ('mean len: ', 'mean_len'),
        ('std len: ', 'std_len'),
        ('mean cr: ', 'mean_compression'),
        ('std cr: ', 'std_compression'),
    )
    for label, field in labelled_fields:
        print(label, entry[field])

In [5]:
# Chord-symbol tokenizer: build it, register its vocabulary size, tokenize all
# collected files, show the first tokenized file, then aggregate statistics.
print('ChordSymbolTokenizer_m21')
chordSymbolTokenizer = ChordSymbolTokenizer()
print('len(chordSymbolTokenizer.vocab): ', len(chordSymbolTokenizer.vocab))
initialize_stats('ChordSymbolTokenizer', chordSymbolTokenizer)

# Tokenizes every path in data_files; the result is indexed by 'tokens' and 'ids'.
toks_cs = chordSymbolTokenizer(data_files)

# Example output for the first file: its length, its tokens and its ids.
example_tokens = toks_cs['tokens'][0]
print('example sentence length: ', len(example_tokens))
print(example_tokens)
print(toks_cs['ids'][0])

update_stats('ChordSymbolTokenizer', toks_cs)
print_stats('ChordSymbolTokenizer')

ChordSymbolTokenizer_m21
len(chordSymbolTokenizer.vocab):  396


Processing Files: 100%|██████████| 17476/17476 [05:05<00:00, 57.13it/s]


example sentence length:  82
['bar', 'position_0x00', 'C:maj', 'position_2x00', 'C:maj', 'position_3x00', 'C:maj', 'bar', 'position_0x50', 'C:maj', 'position_2x00', 'C:maj', 'position_3x00', 'C:maj', 'bar', 'position_0x00', 'A:min', 'position_2x00', 'A:min', 'position_3x00', 'A:min', 'bar', 'position_0x00', 'F:maj', 'position_1x00', 'F:maj', 'position_2x00', 'F:maj', 'bar', 'position_0x00', 'C:maj', 'position_2x00', 'C:maj', 'position_3x00', 'C:maj', 'bar', 'position_0x50', 'C:maj', 'position_2x00', 'C:maj', 'position_3x00', 'C:maj', 'bar', 'position_0x00', 'A:min', 'position_2x00', 'A:min', 'position_3x00', 'A:min', 'bar', 'position_0x00', 'F:maj', 'position_1x00', 'F:maj', 'position_2x00', 'F:maj', 'position_3x00', 'F:maj', 'bar', 'position_0x00', 'C:maj', 'bar', 'position_0x00', 'C:maj', 'bar', 'position_0x00', 'A:min', 'bar', 'position_0x00', 'F:maj', 'bar', 'position_0x00', 'C:maj', 'bar', 'position_0x00', 'C:maj', 'bar', 'position_0x00', 'A:min', 'bar', 'position_0x00', 'F:maj']


In [6]:
# Melody-pitch tokenizer: build it, register its vocabulary size, tokenize all
# collected files, show the first tokenized file, then aggregate statistics.
print('MelodyPitchTokenizer_m21')
# Default pitch range; TODO: this range still needs to be adjusted.
melodyPitchTokenizer = MelodyPitchTokenizer(min_pitch=21, max_pitch=108)
print('len(melodyPitchTokenizer.vocab): ', len(melodyPitchTokenizer.vocab))
initialize_stats('MelodyPitchTokenizer', melodyPitchTokenizer)

# Stored under its own name (toks_mp) rather than reusing toks_cs, so the
# chord-symbol result is not overwritten and re-running an earlier cell cannot
# mix melody data into the chord-symbol statistics.
toks_mp = melodyPitchTokenizer(data_files)

# Example output for the first file: its length, its tokens and its ids.
print('example sentence length: ', len(toks_mp['tokens'][0]))
print(toks_mp['tokens'][0])
print(toks_mp['ids'][0])

update_stats('MelodyPitchTokenizer', toks_mp)
print_stats('MelodyPitchTokenizer')

MelodyPitchTokenizer_m21
len(melodyPitchTokenizer.vocab):  184


Processing Melody Files: 100%|██████████| 17476/17476 [06:44<00:00, 43.21it/s]


example sentence length:  148
['bar', 'position_0x00', 'Rest', 'bar', 'position_0x00', 'Rest', 'bar', 'position_0x00', 'Rest', 'bar', 'position_0x00', 'Rest', 'position_3x00', 'P:41', 'position_3x25', 'P:43', 'position_3x50', 'P:47', 'bar', 'position_0x00', 'Rest', 'bar', 'position_0x00', 'Rest', 'bar', 'position_0x00', 'Rest', 'bar', 'position_0x00', 'Rest', 'bar', 'position_0x00', 'Rest', 'position_1x00', 'P:67', 'position_1x50', 'P:67', 'position_2x00', 'P:67', 'position_2x50', 'P:67', 'position_3x00', 'P:67', 'position_3x25', 'P:67', 'position_3x50', 'P:67', 'bar', 'position_0x00', 'P:64', 'position_0x50', 'P:62', 'position_1x00', 'P:60', 'position_1x50', 'P:62', 'position_1x75', 'P:64', 'position_2x50', 'P:60', 'position_3x00', 'P:60', 'position_3x50', 'P:60', 'bar', 'position_0x00', 'P:64', 'position_0x50', 'P:64', 'position_1x00', 'P:64', 'position_1x50', 'P:62', 'position_2x50', 'P:60', 'position_3x00', 'P:62', 'position_3x50', 'P:60', 'bar', 'position_0x00', 'P:62', 'position_

In [7]:
# Export the collected statistics: one CSV row per tokenizer.
tokenizers = ['ChordSymbolTokenizer', 'MelodyPitchTokenizer'
              ]

results_path = 'vocab_stats_hk_m21.csv' #for hook theory

# Aggregate columns are named explicitly instead of slicing the stats dict by
# position, so the CSV layout does not depend on key insertion order.
stat_fields = ['mean_len', 'std_len', 'mean_compression', 'std_compression']
result_fields = ['Tokenizer_m21', 'vocab_size'] + stat_fields

# The csv module expects its file objects to be opened with newline='';
# without it, extra blank rows appear on platforms that translate line endings.
# Header and all rows are written through a single open file handle.
with open( results_path, 'w', newline='' ) as f:
    writer = csv.writer(f)
    writer.writerow( result_fields )
    for tok in tokenizers:
        writer.writerow( [tok, stats[tok]['vocab_size']] + [stats[tok][field] for field in stat_fields] )