In [7]:
from app.scoring import extract_pb_notes

# Pitch-class spellings (using sharps), indexed by MIDI pitch modulo 12.
names = ["C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B"]


def pitch_name(midi_pitch: int) -> str:
    """Return the note name plus octave for a MIDI pitch number.

    The octave is ``midi_pitch // 12 - 1``, so pitch 60 is rendered as ``"C4"``.
    """
    octave_index, pitch_class = divmod(midi_pitch, 12)
    return f"{names[pitch_class]}{octave_index - 1}"


# Read both serialized note lists and keep only their ``notes`` field.
with open("../scores/gymnopedie.notelist", "rb") as oemer_file:
    notes_oemer = extract_pb_notes(oemer_file.read()).notes
with open("../scores/gymnopedia audio.notelist", "rb") as transkun_file:
    notes_transkun = extract_pb_notes(transkun_file.read()).notes
# Drop every transkun note whose pitch is not strictly positive.
notes_transkun = [note for note in notes_transkun if note.pitch > 0]

In [24]:
from time import time
import numpy as np
from numba import njit, int32, int8, float32


@njit
def precompute_cost(p1, p2):
    """Build the pairwise substitution-cost matrix for two pitch arrays.

    Entry ``[r, c]`` is ``|p1[r] - p2[c]| - 1`` clamped below at zero, so two
    pitches that differ by at most one cost nothing.
    """
    cost = np.abs(p1[:, None] - p2[None, :])
    rows, cols = cost.shape
    for r in range(rows):
        for c in range(cols):
            # Shift the distance down by one, never letting it go negative.
            shifted = cost[r, c] - 1
            cost[r, c] = 0 if shifted < 0 else shifted
    return cost


# noinspection PyTypeHints,PyTypeChecker
@njit
def _align_numba(
    p1: np.ndarray,
    p2: np.ndarray,
):
    """Align two pitch arrays with a dynamic program and return cost + pairing.

    Four operations are considered for every cell ``(i, j)`` of the table:

    * DIAG - pair ``p1[i-1]`` with ``p2[j-1]`` at the cost from
      ``precompute_cost``.
    * DEL  - leave ``p1[i-1]`` unpaired (cost ``OP_COST``).
    * INS  - leave ``p2[j-1]`` unpaired (cost ``OP_COST``).
    * SWAP - consume a block of ``k + 1`` notes from both sequences in which
      the first and last notes are exchanged (cross-matched at zero cost) and
      the ``k - 1`` interior notes match in place at zero cost. The block costs
      ``MOVE_SWAP_COST`` regardless of ``k``; ``k`` is limited to ``MAX_SWAP``.

    Returns:
        ``(dp[m, n], alignment)`` where ``alignment`` is a list of
        ``(index_in_p1, index_in_p2)`` tuples ordered from the start of the
        sequences; ``None`` in either slot marks an unpaired note.
    """
    m, n = p1.shape[0], p2.shape[0]

    cost = precompute_cost(p1, p2)

    DIAG, DEL, INS, SWAP = 0, 1, 2, 3
    OP_COST = 5
    MOVE_SWAP_COST = 1
    MAX_SWAP = 7  # largest distance between the two exchanged notes

    # DP tables: accumulated cost, chosen operation, and swap length per cell.
    dp = np.zeros((m + 1, n + 1), dtype=float32)
    back = np.zeros((m + 1, n + 1), dtype=int8)
    swap_len = np.zeros((m + 1, n + 1), dtype=int32)

    # NOTE(review): gaps along the table border are charged 1 per step, while
    # gaps inside the table cost OP_COST - confirm this asymmetry is intended.
    dp[1 : m + 1, 0] = np.arange(1, m + 1, dtype=float32)
    dp[0, 1 : n + 1] = np.arange(1, n + 1, dtype=float32)
    back[1 : m + 1, 0] = DEL
    back[0, 1 : n + 1] = INS

    # fill
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            best = dp[i - 1, j - 1] + cost[i - 1, j - 1]
            op = DIAG

            # deletion
            tmp = dp[i - 1, j] + OP_COST
            if tmp < best:
                best, op = tmp, DEL

            # insertion
            tmp = dp[i, j - 1] + OP_COST
            if tmp < best:
                best, op = tmp, INS

            # A swap of distance k needs k + 1 notes on both sides.
            max_k = min(MAX_SWAP, min(i, j) - 1)

            # SWAP: the ends must cross-match and the interior must match in place.
            for k in range(1, max_k + 1):
                if cost[i - 1, j - k - 1] == 0.0 and cost[i - k - 1, j - 1] == 0.0:
                    ok = True
                    for x in range(k - 1):
                        if cost[i - k + x, j - k + x] != 0.0:
                            ok = False
                            break
                    if ok:
                        tmp = dp[i - k - 1, j - k - 1] + MOVE_SWAP_COST
                        if tmp < best:
                            best, op = tmp, SWAP
                            swap_len[i, j] = k
                        # Only the smallest qualifying k is ever considered.
                        break

            dp[i, j] = best
            back[i, j] = op

    # backtrace (pairs are collected from the end towards the start)
    alignment = []
    i, j = m, n
    while i > 0 or j > 0:
        op = back[i, j]
        if op == DIAG:
            alignment.append((i - 1, j - 1))
            i, j = i - 1, j - 1
        elif op == DEL:
            alignment.append((i - 1, None))
            i -= 1
        elif op == INS:
            alignment.append((None, j - 1))
            j -= 1
        elif op == SWAP:
            L = swap_len[i, j]
            # Emit exactly the pairs that the fill step validated: the two
            # ends are cross-matched, the interior notes are matched in place.
            alignment.append((i - 1, j - L - 1))
            for x in range(1, L):
                alignment.append((i - 1 - x, j - 1 - x))
            alignment.append((i - L - 1, j - 1))
            i -= L + 1
            j -= L + 1

    # reverse in-place so the pairs run from the start of the sequences
    for a in range(len(alignment) // 2):
        alignment[a], alignment[-1 - a] = alignment[-1 - a], alignment[a]

    return dp[m, n], alignment


def align(seq1, seq2):
    """Align two note sequences by pitch and report how long each stage took.

    The ``pitch`` of every note is copied into an ``int32`` array, the arrays
    are handed to ``_align_numba``, and its result is returned unchanged.
    Timings for the setup and for the alignment call are printed.
    """
    setup_started = time()
    p1 = np.array([note.pitch for note in seq1], dtype=np.int32)
    p2 = np.array([note.pitch for note in seq2], dtype=np.int32)
    print(f"Setup: {(time() - setup_started) * 1e3:.1f}ms")

    dp_started = time()
    result = _align_numba(p1, p2)
    print(f"DP & Backtrack: {(time() - dp_started) * 1e3:.1f}ms")
    return result


def note_key(note):
    """Sort key for a note: page, start time rounded to one decimal, pitch."""
    page = note.page
    coarse_start = round(note.start_time, 1)
    return (page, coarse_start, note.pitch)


# Order both note lists in place by (page, rounded start time, pitch).
for note_seq in (notes_transkun, notes_oemer):
    note_seq.sort(key=note_key)

# Align the oemer notes (first index of each pair) with the transkun notes
# (second index of each pair).
min_cost, aligned_pairs = align(notes_oemer, notes_transkun)
print("Final alignment pair:", aligned_pairs[-1])
print("Number of notes in transkun:", len(notes_transkun))
print("Number of notes in oemer:", len(notes_oemer))

print("\nMin cost:", min_cost)

Setup: 0.4ms
DP & Backtrack: 620.7ms
Final alignment pair: (466, 450)
Number of notes in transkun: 451
Number of notes in oemer: 467

Min cost: 638.0


In [25]:
import cv2


def putText(img, text, pos, color):
    """Draw ``text`` onto ``img`` at ``pos`` in ``color`` with fixed font settings."""
    font_face = cv2.FONT_HERSHEY_SIMPLEX
    font_scale = 0.5
    thickness = 1
    cv2.putText(img, text, pos, font_face, font_scale, color, thickness, cv2.LINE_AA)


# Pitch-class spellings indexed by ``pitch % 12``.
note_names_list = "C Db D Eb E F F# G Ab A Bb B".split()
# Target size handed to cv2.resize for every score page image.
page_size = [1418, 1836]


def name(note):
    """Return the pitch name with octave for ``note``, or ``"None"`` when it is None."""
    if note is None:
        return "None"
    octave_index, pitch_class = divmod(note.pitch, 12)
    return f"{note_names_list[pitch_class]}{octave_index - 1}"


def conditional_invert(img_bgr, cutoff_value=0.05):
    """Invert only the near-black and near-white pixels of a BGR image.

    Args:
        img_bgr: array whose last axis holds the B, G, R channels with values
            on a 0-255 scale.
        cutoff_value: a pixel is inverted when its luma (on a 0-1 scale) is
            below ``cutoff_value`` or above ``1 - cutoff_value``.

    Returns:
        A ``uint8`` array of the same shape; pixels outside the two luma bands
        keep their original values.
    """
    img = img_bgr.astype(np.float32) / 255.0
    rgb = img[..., ::-1]  # reverse the channel axis: BGR -> RGB
    # Weighted channel sum using the Rec. 709 luma coefficients.
    luma = 0.2126 * rgb[..., 0] + 0.7152 * rgb[..., 1] + 0.0722 * rgb[..., 2]
    mask = (luma < cutoff_value) | (luma > 1 - cutoff_value)
    mask = mask[..., None]  # broadcast over 3 channels

    rgb_out = np.where(mask, 1.0 - rgb, rgb)
    # Round before casting: a plain astype truncates, so a float32 round trip
    # that lands just below an integer would lose one intensity level.
    out_bgr = np.rint(rgb_out[..., ::-1] * 255).astype(np.uint8)
    return out_bgr


def _annotate_mismatch(img, pair_index, x, y, oemer_label, transkun_label):
    """Write the pair index and both pitch labels to the right of a note box."""
    putText(img, str(pair_index), (x, y + 30), (255, 255, 128))
    putText(img, oemer_label, (x, y), (0, 0, 200))
    putText(img, transkun_label, (x, y + 15), (0, 200, 0))


def compare(oemer_seq, transkun_seq, paired):
    """Visualise an alignment on the score pages and print the match rate.

    Every pair whose first index refers to a note in ``oemer_seq`` gets a
    rectangle drawn around that note's ``bbox`` on its page image. Pairs that
    have no usable partner in ``transkun_seq``, or whose pitches differ by two
    or more, are additionally labelled with the pair index and both pitch
    names. The eight page images are then passed through
    ``conditional_invert``, tiled 4 x 2 and written to ``comparison.png``.

    Args:
        oemer_seq: notes providing ``page``, ``bbox`` and ``pitch``.
        transkun_seq: notes providing ``pitch``.
        paired: iterable of ``(index_into_oemer_seq, index_into_transkun_seq)``
            tuples; either index may be ``None``.

    Raises:
        FileNotFoundError: if one of the page images cannot be read.
    """
    correct_cnt = 0

    len_images = 8
    imgs = []
    for page in range(len_images):
        path = f"../scores/score-images/score_{page}.png"
        img = cv2.imread(path)
        if img is None:
            # cv2.imread signals an unreadable file by returning None.
            raise FileNotFoundError(f"Could not read score image: {path}")
        imgs.append(cv2.resize(img, page_size))

    for i, (s1, s2) in enumerate(paired):
        # Valid indices are 0 .. len - 1, so an index equal to len is out of range.
        if s1 is None or s1 >= len(oemer_seq):
            continue

        oemer = oemer_seq[s1]
        p = oemer.page
        x1, y1, x2, y2 = oemer.bbox
        # NOTE(review): for integer coordinates this is simply y2 (the bottom
        # edge); confirm whether the vertical centre was intended.
        y_pos = y1 + round((y2 - y1))
        cv2.rectangle(imgs[p], (x1, y1), (x2, y2), (255, 0, 0), 2)

        if s2 is None or s2 >= len(transkun_seq):
            _annotate_mismatch(imgs[p], i, x2 + 2, y_pos, name(oemer), "None")
            continue

        transkun = transkun_seq[s2]

        # Pitches that differ by at most one count as a match.
        is_correct = abs(transkun.pitch - oemer.pitch) < 2
        if not is_correct:
            _annotate_mismatch(
                imgs[p], i, x2 + 2, y_pos, name(oemer), name(transkun)
            )
        correct_cnt += is_correct

    for i in range(len_images):
        imgs[i] = conditional_invert(imgs[i])

    row1 = np.hstack(imgs[0:4])  # BGR
    row2 = np.hstack(imgs[4:8])
    mosaic = np.vstack([row1, row2])

    cv2.imwrite("comparison.png", mosaic)

    # Guard the percentage against an empty alignment.
    total = len(paired)
    accuracy = correct_cnt / total * 100 if total else 0.0
    print(f"Match: {correct_cnt} / {total}\tAccuracy: {accuracy:.2f}%")


# Draw the alignment onto the score pages (written to comparison.png) and
# print the match statistics.
compare(notes_oemer, notes_transkun, aligned_pairs)

Match: 388 / 519	Accuracy: 74.76%


In [26]:
# Show the pitch name of every note in notes_transkun, in list order.
[pitch_name(n.pitch) for n in notes_transkun]

['G2',
 'B3',
 'D4',
 'F#4',
 'D2',
 'A3',
 'C#4',
 'F#4',
 'G2',
 'B3',
 'D4',
 'F#4',
 'D2',
 'A3',
 'C#4',
 'F#4',
 'G2',
 'B3',
 'D4',
 'F#4',
 'F#5',
 'A5',
 'D2',
 'G5',
 'A3',
 'C#4',
 'F#4',
 'F#5',
 'C#5',
 'G2',
 'B4',
 'B3',
 'D4',
 'F#4',
 'C#5',
 'D5',
 'A4',
 'D2',
 'A3',
 'C#4',
 'F#4',
 'G2',
 'F#4',
 'B3',
 'D4',
 'F#4',
 'D2',
 'A3',
 'C#4',
 'F#4',
 'G2',
 'B3',
 'D4',
 'F#4',
 'D2',
 'A3',
 'C#4',
 'F#4',
 'G2',
 'B3',
 'D4',
 'F#4',
 'F#5',
 'A5',
 'D2',
 'G5',
 'A3',
 'C#4',
 'F#4',
 'F#5',
 'C#5',
 'G2',
 'B4',
 'B3',
 'D4',
 'F#4',
 'C#5',
 'D5',
 'D2',
 'A4',
 'A3',
 'C#4',
 'F#4',
 'F#2',
 'C#5',
 'A3',
 'C#4',
 'F#4',
 'B1',
 'F#5',
 'B3',
 'D4',
 'F#4',
 'E2',
 'E4',
 'G3',
 'B3',
 'E2',
 'B3',
 'D4',
 'G4',
 'D2',
 'F3',
 'A3',
 'D4',
 'A1',
 'A4',
 'A3',
 'C4',
 'E4',
 'B4',
 'C5',
 'E5',
 'G3',
 'B3',
 'E4',
 'D5',
 'B4',
 'D2',
 'D5',
 'D3',
 'G3',
 'B3',
 'E4',
 'C5',
 'B4',
 'D2',
 'D5',
 'C3',
 'E3',
 'A3',
 'D4',
 'D2',
 'C3',
 'F#3',
 'A3',
 'D4',
 

In [1]:
from app.scoring import *

# Build a combined note list: timing/pitch data from the transkun notes, layout
# data (page, track, bbox) from the oemer notes they were aligned with.
# NOTE(review): this cell reads `page_size`, `aligned_pairs`, `notes_oemer` and
# `notes_transkun`, which are defined in other cells - it cannot run on a fresh
# kernel before them.
# NOTE(review): the input below is the "spider dance" note list while the notes
# and the output file are for gymnopedie - confirm this path is intended.
with open("../scores/spider dance oemer.notelist", "rb") as f:
    oemer_data = extract_pb_notes(f.read())

# Copy the lines and voices from the loaded data and record the page size.
note_list = NoteList()
note_list.lines.extend(oemer_data.lines)
note_list.voices.extend(oemer_data.voices)
note_list.size.extend(page_size)

for i, j in aligned_pairs:
    # Skip pairs where either side is unpaired.
    if i is None or j is None:
        continue
    note_o = notes_oemer[i]
    note_t = notes_transkun[j]
    note_list.notes.append(
        Note(
            pitch=note_t.pitch,
            start_time=note_t.start_time,
            duration=note_t.duration,
            velocity=note_t.velocity,
            page=note_o.page,
            track=note_o.track,
            bbox=note_o.bbox,
        )
    )


# Serialize the combined note list to disk.
with open("../scores/gymnopedie combined.scoredata", "wb") as f:
    f.write(note_list.SerializeToString())

# Summary of what was written.
print("Length:", len(note_list.notes))
print("Size:", note_list.size)
print("Voices size:", len(note_list.voices))
print("Lines size:", len(note_list.lines))

  import pkg_resources
