In [1]:
import pandas as pd
import json

In [2]:
def _token_at(tokens, idx):
    """Return ``tokens[idx]``, or ``None`` when ``idx`` is past the end."""
    return tokens[idx] if idx < len(tokens) else None


def _safe_ratio(count, total):
    """Return ``count / total``, or ``0.0`` when ``total`` is zero."""
    return count / total if total else 0.0


def _match_chord_tokens(l_split, p_split, start):
    """Compare the run of chord tokens that begins at index ``start``.

    Walks the label tokens until the next '<bar>' or 'position_' token (or the
    end of the labels) and collects the label and prediction tokens found at
    the same indices. Prediction tokens that do not exist (prediction shorter
    than label) are simply not collected.

    Returns ``(i, matched)`` where ``i`` is the index of the last consumed
    token (``len(l_split)`` when the labels ran out) and ``matched`` is True
    when every collected label token also occurs among the collected
    prediction tokens (order is ignored).
    """
    l_chord_buffer = []
    p_chord_buffer = []
    i = start
    while i < len(l_split):
        if l_split[i] == '<bar>' or 'position_' in l_split[i]:
            # already gone too far: step back so the caller's increment
            # lands on the boundary token and it gets evaluated normally
            i -= 1
            break
        l_chord_buffer.append(l_split[i])
        p_tok = _token_at(p_split, i)
        if p_tok is not None:
            p_chord_buffer.append(p_tok)
        i += 1
    return i, set(l_chord_buffer).issubset(p_chord_buffer)


def symbol_basic_eval(df, tokenizer_name='ChordSymbolTokenizer'):
    """Compute token-level accuracies of predicted sequences against labels.

    Labels and predictions are compared token by token at the same index
    after splitting each string on single spaces.

    Parameters
    ----------
    df : object with 'labels' and 'predictions' columns
        Both columns are read positionally with ``.iloc`` and are expected to
        hold space-separated token strings. A cell that is not a string
        (e.g. NaN from an empty CSV cell) is treated as an empty sequence.
    tokenizer_name : str
        Selects how chord tokens are recognised:
        'ChordSymbolTokenizer' (tokens containing ':'),
        'GCTSymbolTokenizer' (tokens containing '['),
        'RootTypeTokenizer' / 'GCTRootTypeTokenizer' (root token and type
        token following a position token),
        'RootPCTokenizer' / 'GCTRootPCTokenizer' (root token, then tokens up
        to the next bar/position),
        'PitchClassTokenizer' (tokens up to the next bar/position).
        For any other name only bar and position metrics are counted.

    Returns
    -------
    dict
        Ratios under the keys 'correct_bar_predictions',
        'correct_new_chord_predictions', 'correct_position_predictions',
        'correct_chord_predictions' and 'correct_root_predictions'.
        A ratio whose denominator is zero is reported as 0.0.
        'correct_root_predictions' is always 0.0 for 'PitchClassTokenizer'
        because that branch never evaluates a root.
    """
    total_pieces = len(df['labels'])

    correct_bar_predictions = 0
    total_bars = 0

    correct_new_chord_predictions = 0
    total_new_chord_predictions = 0

    correct_position_predictions = 0
    total_position_predictions = 0

    correct_chord_predictions = 0
    correct_root_predictions = 0
    total_chord_predictions = 0

    root_type_names = ('RootTypeTokenizer', 'GCTRootTypeTokenizer')
    root_pc_names = ('RootPCTokenizer', 'GCTRootPCTokenizer')

    for p_i in range(total_pieces):
        l = df['labels'].iloc[p_i]
        p = df['predictions'].iloc[p_i]
        # non-string cells (NaN for empty CSV fields) count as empty sequences
        l_split = l.split(' ') if isinstance(l, str) else []
        p_split = p.split(' ') if isinstance(p, str) else []
        i = 0
        while i < len(l_split):
            l_tok = l_split[i]
            # None when the prediction is shorter than the label; a missing
            # token never matches anything
            p_tok = _token_at(p_split, i)

            # how many bars were correctly predicted
            if l_tok == '<bar>':
                total_bars += 1
                if p_tok == '<bar>':
                    correct_bar_predictions += 1

            # A position token announces a new chord. Only the token(s) right
            # after it are treated as a chord, so trailing tokens such as
            # end-of-sequence or padding are never counted as chords.
            starts_chord = 'position_' in l_tok
            if starts_chord:
                # was any position token predicted here?
                total_new_chord_predictions += 1
                if p_tok is not None and 'position_' in p_tok:
                    correct_new_chord_predictions += 1
                # was the exact position predicted?
                total_position_predictions += 1
                if p_tok == l_tok:
                    correct_position_predictions += 1

            # how many exact chords and roots were predicted
            if tokenizer_name == 'ChordSymbolTokenizer':
                if ':' in l_tok:
                    total_chord_predictions += 1
                    if p_tok == l_tok:
                        correct_chord_predictions += 1
                    # root is the part before the first ':'
                    if p_tok is not None and p_tok.split(':')[0] == l_tok.split(':')[0]:
                        correct_root_predictions += 1
            elif tokenizer_name == 'GCTSymbolTokenizer':
                if '[' in l_tok:
                    total_chord_predictions += 1
                    if p_tok == l_tok:
                        correct_chord_predictions += 1
                    # root is the text between the first character and the first 'x'
                    if p_tok is not None and p_tok[1:].split('x')[0] == l_tok[1:].split('x')[0]:
                        correct_root_predictions += 1
            elif tokenizer_name in root_type_names:
                if starts_chord:
                    total_chord_predictions += 1
                    # progress to root
                    i += 1
                    root_ok = i < len(l_split) and _token_at(p_split, i) == l_split[i]
                    if root_ok:
                        correct_root_predictions += 1
                    # progress to type; the chord counts only if the root matched too
                    i += 1
                    if root_ok and i < len(l_split) and _token_at(p_split, i) == l_split[i]:
                        correct_chord_predictions += 1
            elif tokenizer_name in root_pc_names:
                if starts_chord:
                    total_chord_predictions += 1
                    # progress to root
                    i += 1
                    root_ok = i < len(l_split) and _token_at(p_split, i) == l_split[i]
                    if root_ok:
                        correct_root_predictions += 1
                    # progress to the pitch-class tokens that follow the root
                    i, pcs_ok = _match_chord_tokens(l_split, p_split, i + 1)
                    if pcs_ok and root_ok:
                        correct_chord_predictions += 1
            elif tokenizer_name == 'PitchClassTokenizer':
                if starts_chord:
                    total_chord_predictions += 1
                    # progress to the pitch-class tokens after the position
                    i, pcs_ok = _match_chord_tokens(l_split, p_split, i + 1)
                    if pcs_ok:
                        correct_chord_predictions += 1
            i += 1
    results = {
        'correct_bar_predictions': _safe_ratio(correct_bar_predictions, total_bars),
        'correct_new_chord_predictions': _safe_ratio(correct_new_chord_predictions, total_new_chord_predictions),
        'correct_position_predictions': _safe_ratio(correct_position_predictions, total_position_predictions),
        'correct_chord_predictions': _safe_ratio(correct_chord_predictions, total_chord_predictions),
        'correct_root_predictions': _safe_ratio(correct_root_predictions, total_chord_predictions)
    }
    return results
# end symbol_basic_eval

In [3]:
# Tokenizer name -> evaluation routine. Each name is also the stem of the
# CSV file that is read from `tokenized_folder`.
# The GCT variants (GCTRootPCTokenizer, GCTSymbolTokenizer,
# GCTRootTypeTokenizer) are currently left out of the evaluation.
tokenizers = {
    name: symbol_basic_eval
    for name in (
        'ChordSymbolTokenizer',
        'RootTypeTokenizer',
        'PitchClassTokenizer',
        'RootPCTokenizer',
    )
}

# folder holding one CSV per tokenizer
tokenized_folder = 'tokenized/gen/'

# per-tokenizer metrics, keyed by tokenizer name
results = dict()

In [4]:
# Run the registered evaluation routine on each tokenizer's CSV and keep
# the returned metrics under the tokenizer's name.
for tokenizer_name in tokenizers:
    if tokenizers[tokenizer_name] is None:
        # nothing registered for this tokenizer
        continue
    df = pd.read_csv(f'{tokenized_folder}{tokenizer_name}.csv')
    results[tokenizer_name] = tokenizers[tokenizer_name](df, tokenizer_name=tokenizer_name)

In [5]:
# Show the metrics of each evaluated tokenizer, one dict per line.
# The GCT variants (GCTSymbolTokenizer, GCTRootTypeTokenizer,
# GCTRootPCTokenizer) are not printed.
print(
    results['ChordSymbolTokenizer'],
    results['RootTypeTokenizer'],
    results['RootPCTokenizer'],
    results['PitchClassTokenizer'],
    sep='\n',
)

{'correct_bar_predictions': 0.9309529632346947, 'correct_new_chord_predictions': 0.9066162751758121, 'correct_position_predictions': 0.8530354494570157, 'correct_chord_predictions': 0.5773812371429938, 'correct_root_predictions': 0.6157489355594891}
{'correct_bar_predictions': 0.9272845846580255, 'correct_new_chord_predictions': 0.9175238004114242, 'correct_position_predictions': 0.8596373726259389, 'correct_chord_predictions': 0.554756908790571, 'correct_root_predictions': 0.5951604982365284}
{'correct_bar_predictions': 0.9038884812912693, 'correct_new_chord_predictions': 0.9021671530402334, 'correct_position_predictions': 0.8523656891355308, 'correct_chord_predictions': 0.5971870066497632, 'correct_root_predictions': 0.6479931110366933}
{'correct_bar_predictions': 0.9197847884568354, 'correct_new_chord_predictions': 0.8997751518920729, 'correct_position_predictions': 0.8514567286992297, 'correct_chord_predictions': 0.5841266803808066, 'correct_root_predictions': 0.0}


In [6]:
# Echo the full metrics dict, then store it as JSON.
print(results)
with open('results/basic_eval_result.json', mode='w') as fp:
    fp.write(json.dumps(results))

{'ChordSymbolTokenizer': {'correct_bar_predictions': 0.9309529632346947, 'correct_new_chord_predictions': 0.9066162751758121, 'correct_position_predictions': 0.8530354494570157, 'correct_chord_predictions': 0.5773812371429938, 'correct_root_predictions': 0.6157489355594891}, 'RootTypeTokenizer': {'correct_bar_predictions': 0.9272845846580255, 'correct_new_chord_predictions': 0.9175238004114242, 'correct_position_predictions': 0.8596373726259389, 'correct_chord_predictions': 0.554756908790571, 'correct_root_predictions': 0.5951604982365284}, 'PitchClassTokenizer': {'correct_bar_predictions': 0.9197847884568354, 'correct_new_chord_predictions': 0.8997751518920729, 'correct_position_predictions': 0.8514567286992297, 'correct_chord_predictions': 0.5841266803808066, 'correct_root_predictions': 0.0}, 'RootPCTokenizer': {'correct_bar_predictions': 0.9038884812912693, 'correct_new_chord_predictions': 0.9021671530402334, 'correct_position_predictions': 0.8523656891355308, 'correct_chord_predict

In [7]:
# Same evaluation as before, this time on the CSVs in 'tokenized/gen_reg/';
# the metrics go to their own JSON file.
tokenized_folder = 'tokenized/gen_reg/'

# start from an empty metrics dict for this folder
results = dict()

for tokenizer_name in tokenizers:
    if tokenizers[tokenizer_name] is None:
        # nothing registered for this tokenizer
        continue
    df = pd.read_csv(f'{tokenized_folder}{tokenizer_name}.csv')
    results[tokenizer_name] = tokenizers[tokenizer_name](df, tokenizer_name=tokenizer_name)

print(results)
with open('results/basic_eval_reg_result.json', mode='w') as fp:
    fp.write(json.dumps(results))

{'ChordSymbolTokenizer': {'correct_bar_predictions': 0.9357626151463275, 'correct_new_chord_predictions': 0.9121657178395446, 'correct_position_predictions': 0.8528440893651629, 'correct_chord_predictions': 0.5800602784289336, 'correct_root_predictions': 0.6184758168683921}, 'RootTypeTokenizer': {'correct_bar_predictions': 0.9335615880003261, 'correct_new_chord_predictions': 0.9156101994928958, 'correct_position_predictions': 0.8605463330622398, 'correct_chord_predictions': 0.5612304120719674, 'correct_root_predictions': 0.6004732354122951}, 'PitchClassTokenizer': {'correct_bar_predictions': 0.9103285236814217, 'correct_new_chord_predictions': 0.9058029947854375, 'correct_position_predictions': 0.8553796105822131, 'correct_chord_predictions': 0.5976175668564321, 'correct_root_predictions': 0.0}, 'RootPCTokenizer': {'correct_bar_predictions': 0.9241053232249123, 'correct_new_chord_predictions': 0.9010668325120796, 'correct_position_predictions': 0.8503085681481127, 'correct_chord_predic