In [1]:
import muspy
from harmony_tokenizers import ChordSymbolTokenizer, RootTypeTokenizer, \
    PitchClassTokenizer, RootPCTokenizer, \
    GCTRootPCTokenizer, GCTSymbolTokenizer, GCTRootTypeTokenizer
from tqdm import tqdm
import os
import numpy as np
import zlib
import csv

  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [2]:
# Directory whose entries are later parsed with muspy.read_musicxml.
gjt_path = 'data/gjt_melodies/Library_melodies/'
# os.listdir returns entries in arbitrary, filesystem-dependent order. Sort the
# listing so piece indices (and every "example" printed from index 0) are
# stable across machines and re-runs.
gjt_list = sorted(os.listdir(gjt_path))
print(len(gjt_list))

650


In [3]:
# Parse every listed file with muspy.read_musicxml, keeping the listing order.
gjt_pieces = []
for file_name in tqdm(gjt_list):
    gjt_pieces.append(muspy.read_musicxml(gjt_path + file_name))

100%|██████████| 650/650 [00:09<00:00, 69.19it/s]


In [4]:
# Sanity check: number of parsed pieces (compare with the file count printed earlier).
print(len(gjt_pieces))

650


In [5]:
# Module-level registry of statistics; one entry per tokenizer name.
stats = dict()

def compute_compression_rate(array: np.ndarray, compression_method=zlib.compress) -> float:
    """
    Return how much the raw bytes of a NumPy array shrink under compression.

    Parameters:
        array (np.ndarray): Array whose byte representation (`array.tobytes()`)
                            is compressed.
        compression_method (callable): Function mapping bytes to compressed
                                       bytes. Defaults to `zlib.compress`.

    Returns:
        float: compressed size divided by original size, both in bytes.

    Raises:
        ZeroDivisionError: if the array serialises to zero bytes.
    """
    raw = array.tobytes()
    packed = compression_method(raw)

    # Ratio of byte counts: smaller means more compressible.
    return len(packed) / len(raw)

def initialize_stats(key, tokenizer):
    """
    Create (or reset) the statistics record stored under `key` in `stats`.

    The record holds the tokenizer's vocabulary size (`len(tokenizer.vocab)`)
    and two empty lists that `update_stats` later fills.
    """
    stats[key] = dict(
        vocab_size=len(tokenizer.vocab),
        seq_lens=[],
        compression_rates=[],
    )
# end initialize_stats

def update_stats(key, toks):
    """
    Append per-sequence measurements from `toks['ids']` to the record stored
    under `key`, then refresh that record's summary values.

    For each id sequence, its length and its compression rate (computed on
    `np.array(sequence)`) are appended. The mean and standard deviation of
    both lists are then stored as 'mean_len', 'std_len', 'mean_compression'
    and 'std_compression', in that order.
    """
    sequences = toks['ids']
    record = stats[key]
    lengths = record['seq_lens']
    rates = record['compression_rates']

    for ids in sequences:
        lengths.append(len(ids))
        rates.append(compute_compression_rate(np.array(ids)))

    # Summary keys are written length-first, then compression.
    for suffix, values in (('len', lengths), ('compression', rates)):
        record['mean_' + suffix] = np.mean(values)
        record['std_' + suffix] = np.std(values)
# end update_stats

def print_stats(key):
    """Print the vocabulary size and the four summary values stored under `key`."""
    labelled_fields = (
        ('vocab_size: ', 'vocab_size'),
        ('mean len: ', 'mean_len'),
        ('std len: ', 'std_len'),
        ('mean cr: ', 'mean_compression'),
        ('std cr: ', 'std_compression'),
    )
    for label, field in labelled_fields:
        print(label, stats[key][field])

In [6]:
# Build the ChordSymbolTokenizer and report its vocabulary size.
print('ChordSymbolTokenizer')
chordSymbolTokenizer = ChordSymbolTokenizer()
print('len(chordSymbolTokenizer.vocab): ', len(chordSymbolTokenizer.vocab))
# Register a fresh stats record for this tokenizer (vocab size is captured here).
initialize_stats('ChordSymbolTokenizer', chordSymbolTokenizer)
# Tokenize all loaded pieces; the result is indexed by 'tokens' and 'ids' below.
toks_cs = chordSymbolTokenizer(gjt_pieces)
# Show the first piece as an example: token count, tokens, and ids.
print('example sentence length: ', len(toks_cs['tokens'][0]))
print(toks_cs['tokens'][0])
print(toks_cs['ids'][0])
# Accumulate per-sequence length / compression rate, then print the summary.
update_stats('ChordSymbolTokenizer', toks_cs)
print_stats('ChordSymbolTokenizer')

ChordSymbolTokenizer
len(chordSymbolTokenizer.vocab):  322
example sentence length:  104
['bar', 'position_0x0', 'A:min7', 'bar', 'position_0x0', 'D:7', 'bar', 'position_0x0', 'A:min7', 'bar', 'position_0x0', 'D:7', 'bar', 'position_0x0', 'G:maj7', 'bar', 'position_0x0', 'C:7', 'bar', 'position_0x0', 'B:min7', 'bar', 'position_0x0', 'E:min7', 'bar', 'position_0x0', 'A:min7', 'bar', 'position_0x0', 'D:7', 'bar', 'position_0x0', 'A:min7', 'bar', 'position_0x0', 'D:7', 'bar', 'position_0x0', 'G:maj7', 'bar', 'position_0x0', 'C:7', 'bar', 'position_0x0', 'E:min7', 'bar', 'position_0x0', 'B:maj6', 'bar', 'position_0x0', 'D:min7', 'bar', 'position_0x0', 'G:7', 'bar', 'position_0x0', 'D:min7', 'bar', 'position_0x0', 'G:7', 'bar', 'position_0x0', 'C:maj7', 'bar', 'position_0x0', 'F:7', 'bar', 'position_0x0', 'E:min7', 'position_2x0', 'A:7', 'bar', 'position_0x0', 'A:min7', 'position_1x0', 'D:7', 'bar', 'position_0x0', 'A:min7', 'bar', 'position_0x0', 'D:7', 'bar', 'position_0x0', 'G:maj7', 'po

In [7]:
# Build the RootTypeTokenizer and report its vocabulary size.
print('RootTypeTokenizer')
rootTypeTokenizer = RootTypeTokenizer()
print('len(rootTypeTokenizer.vocab): ', len(rootTypeTokenizer.vocab))
# Register a fresh stats record for this tokenizer (vocab size is captured here).
initialize_stats('RootTypeTokenizer', rootTypeTokenizer)
# Tokenize all loaded pieces; the result is indexed by 'tokens' and 'ids' below.
toks_rt = rootTypeTokenizer(gjt_pieces)
# Show the first piece as an example: token count, tokens, and ids.
print('example sentence length: ', len(toks_rt['tokens'][0]))
print(toks_rt['tokens'][0])
print(toks_rt['ids'][0])
# Accumulate per-sequence length / compression rate, then print the summary.
update_stats('RootTypeTokenizer', toks_rt)
print_stats('RootTypeTokenizer')

RootTypeTokenizer
len(rootTypeTokenizer.vocab):  59
example sentence length:  140
['bar', 'position_0x0', 'A', 'min7', 'bar', 'position_0x0', 'D', '7', 'bar', 'position_0x0', 'A', 'min7', 'bar', 'position_0x0', 'D', '7', 'bar', 'position_0x0', 'G', 'maj7', 'bar', 'position_0x0', 'C', '7', 'bar', 'position_0x0', 'B', 'min7', 'bar', 'position_0x0', 'E', 'min7', 'bar', 'position_0x0', 'A', 'min7', 'bar', 'position_0x0', 'D', '7', 'bar', 'position_0x0', 'A', 'min7', 'bar', 'position_0x0', 'D', '7', 'bar', 'position_0x0', 'G', 'maj7', 'bar', 'position_0x0', 'C', '7', 'bar', 'position_0x0', 'E', 'min7', 'bar', 'position_0x0', 'B', 'maj6', 'bar', 'position_0x0', 'D', 'min7', 'bar', 'position_0x0', 'G', '7', 'bar', 'position_0x0', 'D', 'min7', 'bar', 'position_0x0', 'G', '7', 'bar', 'position_0x0', 'C', 'maj7', 'bar', 'position_0x0', 'F', '7', 'bar', 'position_0x0', 'E', 'min7', 'position_2x0', 'A', '7', 'bar', 'position_0x0', 'A', 'min7', 'position_1x0', 'D', '7', 'bar', 'position_0x0', 'A', 

In [8]:
# Build the PitchClassTokenizer and report its vocabulary size.
print('PitchClassTokenizer')
pitchClassTokenizer = PitchClassTokenizer()
print('len(pitchClassTokenizer.vocab): ', len(pitchClassTokenizer.vocab))
# Register a fresh stats record for this tokenizer (vocab size is captured here).
initialize_stats('PitchClassTokenizer', pitchClassTokenizer)
# Tokenize all loaded pieces; the result is indexed by 'tokens' and 'ids' below.
toks_pc = pitchClassTokenizer(gjt_pieces)
# Show the first piece as an example: token count, tokens, and ids.
print('example sentence length: ', len(toks_pc['tokens'][0]))
print(toks_pc['tokens'][0])
print(toks_pc['ids'][0])
# Accumulate per-sequence length / compression rate, then print the summary.
update_stats('PitchClassTokenizer', toks_pc)
print_stats('PitchClassTokenizer')

PitchClassTokenizer
len(pitchClassTokenizer.vocab):  34


example sentence length:  212
['bar', 'position_0x0', 'chord_pc_9', 'chord_pc_0', 'chord_pc_4', 'chord_pc_7', 'bar', 'position_0x0', 'chord_pc_2', 'chord_pc_6', 'chord_pc_9', 'chord_pc_0', 'bar', 'position_0x0', 'chord_pc_9', 'chord_pc_0', 'chord_pc_4', 'chord_pc_7', 'bar', 'position_0x0', 'chord_pc_2', 'chord_pc_6', 'chord_pc_9', 'chord_pc_0', 'bar', 'position_0x0', 'chord_pc_7', 'chord_pc_11', 'chord_pc_2', 'chord_pc_6', 'bar', 'position_0x0', 'chord_pc_0', 'chord_pc_4', 'chord_pc_7', 'chord_pc_10', 'bar', 'position_0x0', 'chord_pc_11', 'chord_pc_2', 'chord_pc_6', 'chord_pc_9', 'bar', 'position_0x0', 'chord_pc_4', 'chord_pc_7', 'chord_pc_11', 'chord_pc_2', 'bar', 'position_0x0', 'chord_pc_9', 'chord_pc_0', 'chord_pc_4', 'chord_pc_7', 'bar', 'position_0x0', 'chord_pc_2', 'chord_pc_6', 'chord_pc_9', 'chord_pc_0', 'bar', 'position_0x0', 'chord_pc_9', 'chord_pc_0', 'chord_pc_4', 'chord_pc_7', 'bar', 'position_0x0', 'chord_pc_2', 'chord_pc_6', 'chord_pc_9', 'chord_pc_0', 'bar', 'position_

In [9]:
# Build the RootPCTokenizer and report its vocabulary size.
print('RootPCTokenizer')
rootPCTokenizer = RootPCTokenizer()
print('len(rootPCTokenizer.vocab): ', len(rootPCTokenizer.vocab))
# Register a fresh stats record for this tokenizer (vocab size is captured here).
initialize_stats('RootPCTokenizer', rootPCTokenizer)
# Tokenize all loaded pieces; the result is indexed by 'tokens' and 'ids' below.
toks_rpc = rootPCTokenizer(gjt_pieces)
# Show the first piece as an example: token count, tokens, and ids.
print('example sentence length: ', len(toks_rpc['tokens'][0]))
print(toks_rpc['tokens'][0])
print(toks_rpc['ids'][0])
# Accumulate per-sequence length / compression rate, then print the summary.
update_stats('RootPCTokenizer', toks_rpc)
print_stats('RootPCTokenizer')

RootPCTokenizer
len(rootPCTokenizer.vocab):  46
example sentence length:  212
['bar', 'position_0x0', 'chord_root_9', 'chord_pc_0', 'chord_pc_4', 'chord_pc_7', 'bar', 'position_0x0', 'chord_root_2', 'chord_pc_6', 'chord_pc_9', 'chord_pc_0', 'bar', 'position_0x0', 'chord_root_9', 'chord_pc_0', 'chord_pc_4', 'chord_pc_7', 'bar', 'position_0x0', 'chord_root_2', 'chord_pc_6', 'chord_pc_9', 'chord_pc_0', 'bar', 'position_0x0', 'chord_root_7', 'chord_pc_11', 'chord_pc_2', 'chord_pc_6', 'bar', 'position_0x0', 'chord_root_0', 'chord_pc_4', 'chord_pc_7', 'chord_pc_10', 'bar', 'position_0x0', 'chord_root_11', 'chord_pc_2', 'chord_pc_6', 'chord_pc_9', 'bar', 'position_0x0', 'chord_root_4', 'chord_pc_7', 'chord_pc_11', 'chord_pc_2', 'bar', 'position_0x0', 'chord_root_9', 'chord_pc_0', 'chord_pc_4', 'chord_pc_7', 'bar', 'position_0x0', 'chord_root_2', 'chord_pc_6', 'chord_pc_9', 'chord_pc_0', 'bar', 'position_0x0', 'chord_root_9', 'chord_pc_0', 'chord_pc_4', 'chord_pc_7', 'bar', 'position_0x0', 'ch

In [10]:
# Build the GCTRootPCTokenizer and report its vocabulary size.
print('GCTRootPCTokenizer')
gctRootPCTokenizer = GCTRootPCTokenizer()
print('len(gctRootPCTokenizer.vocab): ', len(gctRootPCTokenizer.vocab))
# Register a fresh stats record for this tokenizer (vocab size is captured here).
initialize_stats('GCTRootPCTokenizer', gctRootPCTokenizer)
# Tokenize all loaded pieces; the result is indexed by 'tokens' and 'ids' below.
toks_gct_rpc = gctRootPCTokenizer(gjt_pieces)
# Show the first piece as an example: token count, tokens, and ids.
print('example sentence length: ', len(toks_gct_rpc['tokens'][0]))
print(toks_gct_rpc['tokens'][0])
print(toks_gct_rpc['ids'][0])
# Accumulate per-sequence length / compression rate, then print the summary.
update_stats('GCTRootPCTokenizer', toks_gct_rpc)
print_stats('GCTRootPCTokenizer')

GCTRootPCTokenizer
len(gctRootPCTokenizer.vocab):  46
example sentence length:  212
['bar', 'position_0x0', 'chord_root_0', 'chord_pc_4', 'chord_pc_7', 'chord_pc_9', 'bar', 'position_0x0', 'chord_root_2', 'chord_pc_6', 'chord_pc_9', 'chord_pc_0', 'bar', 'position_0x0', 'chord_root_0', 'chord_pc_4', 'chord_pc_7', 'chord_pc_9', 'bar', 'position_0x0', 'chord_root_2', 'chord_pc_6', 'chord_pc_9', 'chord_pc_0', 'bar', 'position_0x0', 'chord_root_7', 'chord_pc_11', 'chord_pc_2', 'chord_pc_6', 'bar', 'position_0x0', 'chord_root_0', 'chord_pc_4', 'chord_pc_7', 'chord_pc_10', 'bar', 'position_0x0', 'chord_root_2', 'chord_pc_6', 'chord_pc_9', 'chord_pc_11', 'bar', 'position_0x0', 'chord_root_7', 'chord_pc_11', 'chord_pc_2', 'chord_pc_4', 'bar', 'position_0x0', 'chord_root_0', 'chord_pc_4', 'chord_pc_7', 'chord_pc_9', 'bar', 'position_0x0', 'chord_root_2', 'chord_pc_6', 'chord_pc_9', 'chord_pc_0', 'bar', 'position_0x0', 'chord_root_0', 'chord_pc_4', 'chord_pc_7', 'chord_pc_9', 'bar', 'position_0x0

In [11]:
# Build the GCTSymbolTokenizer; unlike the earlier tokenizers it is fitted on
# the corpus before its vocabulary is read.
print('GCTSymbolTokenizer')
gctSymbolTokenizer = GCTSymbolTokenizer()
print('training')
# NOTE(review): presumably fit() builds the vocabulary from the pieces — confirm in harmony_tokenizers.
gctSymbolTokenizer.fit( gjt_pieces )
print('len(gctSymbolTokenizer.vocab): ', len(gctSymbolTokenizer.vocab))
# Register a fresh stats record; this must run after fit() so the recorded
# vocab size matches the value printed above.
initialize_stats('GCTSymbolTokenizer', gctSymbolTokenizer)
# Tokenize all loaded pieces; the result is indexed by 'tokens' and 'ids' below.
toks_gct_symb = gctSymbolTokenizer(gjt_pieces)
# Show the first piece as an example: token count, tokens, and ids.
print('example sentence length: ', len(toks_gct_symb['tokens'][0]))
print(toks_gct_symb['tokens'][0])
print(toks_gct_symb['ids'][0])
# Accumulate per-sequence length / compression rate, then print the summary.
update_stats('GCTSymbolTokenizer', toks_gct_symb)
print_stats('GCTSymbolTokenizer')

GCTSymbolTokenizer
training
len(gctSymbolTokenizer.vocab):  210
example sentence length:  104
['bar', 'position_0x0', '[0 0 4 7 9]', 'bar', 'position_0x0', '[ 2  0  4  7 10]', 'bar', 'position_0x0', '[0 0 4 7 9]', 'bar', 'position_0x0', '[ 2  0  4  7 10]', 'bar', 'position_0x0', '[ 7  0  4  7 11]', 'bar', 'position_0x0', '[ 0  0  4  7 10]', 'bar', 'position_0x0', '[2 0 4 7 9]', 'bar', 'position_0x0', '[7 0 4 7 9]', 'bar', 'position_0x0', '[0 0 4 7 9]', 'bar', 'position_0x0', '[ 2  0  4  7 10]', 'bar', 'position_0x0', '[0 0 4 7 9]', 'bar', 'position_0x0', '[ 2  0  4  7 10]', 'bar', 'position_0x0', '[ 7  0  4  7 11]', 'bar', 'position_0x0', '[ 0  0  4  7 10]', 'bar', 'position_0x0', '[7 0 4 7 9]', 'bar', 'position_0x0', '[ 8  0  3  7 10]', 'bar', 'position_0x0', '[5 0 4 7 9]', 'bar', 'position_0x0', '[ 7  0  4  7 10]', 'bar', 'position_0x0', '[5 0 4 7 9]', 'bar', 'position_0x0', '[ 7  0  4  7 10]', 'bar', 'position_0x0', '[4 0 3 7 8]', 'bar', 'position_0x0', '[ 5  0  4  7 10]', 'bar', 'p

In [12]:
# Build the GCTRootTypeTokenizer; it is fitted on the corpus before its
# vocabulary is read.
print('GCTRootTypeTokenizer')
gctRootTypeTokenizer = GCTRootTypeTokenizer()
print('training')
# NOTE(review): presumably fit() builds the vocabulary from the pieces — confirm in harmony_tokenizers.
gctRootTypeTokenizer.fit( gjt_pieces )
print('len(gctRootTypeTokenizer.vocab): ', len(gctRootTypeTokenizer.vocab))
# Register the stats record with THIS cell's tokenizer. Passing
# chordSymbolTokenizer here stored the ChordSymbolTokenizer vocabulary size
# under the 'GCTRootTypeTokenizer' key, and that wrong value reached the CSV.
initialize_stats('GCTRootTypeTokenizer', gctRootTypeTokenizer)
# Tokenize all loaded pieces; the result is indexed by 'tokens' and 'ids' below.
toks_gct_rt = gctRootTypeTokenizer(gjt_pieces)
# Show the first piece as an example: token count, tokens, and ids.
print('example sentence length: ', len(toks_gct_rt['tokens'][0]))
print(toks_gct_rt['tokens'][0])
print(toks_gct_rt['ids'][0])
# Accumulate per-sequence length / compression rate, then print the summary.
update_stats('GCTRootTypeTokenizer', toks_gct_rt)
print_stats('GCTRootTypeTokenizer')

GCTRootTypeTokenizer
training
len(gctRootTypeTokenizer.vocab):  68
example sentence length:  140
['bar', 'position_0x0', 'chord_root_0', '[0 4 7 9]', 'bar', 'position_0x0', 'chord_root_2', '[ 0  4  7 10]', 'bar', 'position_0x0', 'chord_root_0', '[0 4 7 9]', 'bar', 'position_0x0', 'chord_root_2', '[ 0  4  7 10]', 'bar', 'position_0x0', 'chord_root_7', '[ 0  4  7 11]', 'bar', 'position_0x0', 'chord_root_0', '[ 0  4  7 10]', 'bar', 'position_0x0', 'chord_root_2', '[0 4 7 9]', 'bar', 'position_0x0', 'chord_root_7', '[0 4 7 9]', 'bar', 'position_0x0', 'chord_root_0', '[0 4 7 9]', 'bar', 'position_0x0', 'chord_root_2', '[ 0  4  7 10]', 'bar', 'position_0x0', 'chord_root_0', '[0 4 7 9]', 'bar', 'position_0x0', 'chord_root_2', '[ 0  4  7 10]', 'bar', 'position_0x0', 'chord_root_7', '[ 0  4  7 11]', 'bar', 'position_0x0', 'chord_root_0', '[ 0  4  7 10]', 'bar', 'position_0x0', 'chord_root_7', '[0 4 7 9]', 'bar', 'position_0x0', 'chord_root_8', '[ 0  3  7 10]', 'bar', 'position_0x0', 'chord_root

In [13]:
# Write one CSV row of summary statistics per tokenizer.
tokenizers = ['ChordSymbolTokenizer', 'GCTSymbolTokenizer',
              'RootTypeTokenizer', 'GCTRootTypeTokenizer',
              'PitchClassTokenizer', 'RootPCTokenizer', 'GCTRootPCTokenizer'
              ]

results_path = 'vocab_stats.csv'

# Name the summary columns explicitly instead of slicing the stats dict by
# position: the slice silently depends on key insertion order and would pick
# up the raw per-sequence lists if that order ever changed.
summary_fields = ['mean_len', 'std_len', 'mean_compression', 'std_compression']
result_fields = ['Tokenizer', 'vocab_size'] + summary_fields

# Open the file once, with newline='' as the csv module requires; otherwise
# platforms that translate '\n' emit a blank line after every row.
with open( results_path, 'w', newline='' ) as f:
    writer = csv.writer(f)
    writer.writerow( result_fields )
    for tok in tokenizers:
        tok_stats = stats[tok]
        writer.writerow(
            [tok, tok_stats['vocab_size']] + [tok_stats[field] for field in summary_fields]
        )