In [1]:
import re
from collections import Counter
from typing import Dict, List


_BAR_TOKENS = ["||", "|]", "|:", ":|", "::", "|"]
_ACCIDENTALS = ["^^", "__", "^", "_", "="]
_RESTS = {"z", "Z", "x"}
_NOTE_LETTERS = set("ABCDEFGabcdefg")
_HDR_RE = re.compile(r'^(?:K|M|L):[^\n]*\n', re.MULTILINE)
_INLINE_HDR_RE = re.compile(r'\[(?:K|M|L):[^\]]+\]') # [K:C], [M:3/4] ...
_TUPLET_RE = re.compile(r"\(\d+")                    # (3   (5   etc.
_CHORD_RE = re.compile(
    r'^"'
    r'(?P<root>[A-G](?:b|#)?)'          # root note
    r'(?P<body>[0-9A-Za-z+\-#]*)'       # quality / extensions
    r'(?:/(?P<bass>[A-G](?:b|#)?))?'    # optional slash bass
    r'"$'
)
_SKIP_Q_RE = re.compile(r'\[Q:[^\]]+\]')
_DENOM_POW2 = r'(?:2|4|8|16)' # power-of-two denominators up to 16
_DUR_RE = re.compile(
    rf'''
    (?:[1-9]\d?/{_DENOM_POW2})  |  # 3/2  7/8  12/16
    (?:/)  |      # / 
    (?:[2-8]|16)  # 2-8 or 16
    ''',
    re.VERBOSE
)
_SQUARE_BRACKET = re.compile(r"\[[^\]]+\]")

def build_token_type_counter(tunes: List[str], tokenizer, re) -> Counter:
    """Tokenise every tune once and count tokens of re's type."""
    counter = Counter()
    for t in tunes:
        for tok in tokenizer.tokenize_abc(t):
            if re.fullmatch(tok):
                counter[tok.strip()] += 1
    return counter

def build_chord_counter(tunes: List[str], tokenizer, re) -> Counter:
    """Tokenise every tune once and count tokens of re's type."""
    counter = Counter()
    for t in tunes:
        for tok in tokenizer.tokenize_abc(t):
            if re.fullmatch(tok):
                counter[tok] += 1
    return counter



def _strip_slash(ch: str) -> str:
    return re.sub(r'/[A-G](?:b|#)?(?=")', "", ch)


def _to_seventh(ch: str, m) -> str:
    """root → root7   or   rootm7   (keep minor flag)"""
    root, body = m.group("root"), m.group("body")
    if body.startswith(("m", "min")) and not body.startswith("maj"):
        return f'"{root}m7"'
    return f'"{root}7"'


def _to_triad(ch: str, m) -> str:
    root, body = m.group("root"), m.group("body")
    if body.startswith(("m", "min")) and not body.startswith("maj"):
        return f'"{root}m"'
    if body.startswith(("dim", "o")):
        return f'"{root}dim"'
    if body.startswith(("aug", "+")):
        return f'"{root}aug"'
    return f'"{root}"' # major / no quality


def make_chord_map(counter: Counter, min_count: int) -> Dict[str, str]:
    """
    Build {rare_chord: mapped_chord} using the layered fallback:
      1. keep if common (count ≥ min_count)
      2. drop slash bass
      3. collapse to 7-chord
      4. collapse to triad (major/minor/dim/aug)
      5. root only
      6. UNK_CHORD
    """
    high_freq = {c for c, n in counter.items() if n >= min_count}
    chord_map: Dict[str, str] = {}
    for chord, cnt in counter.items():
        if cnt >= min_count: # keep as-is
            chord_map[chord] = chord
            continue

        m = _CHORD_RE.fullmatch(chord)
        if not m: # shouldn’t happen
            chord_map[chord] = '"UNK_CHORD"'
            continue

        # 1- strip slash bass
        c1 = _strip_slash(chord) # "D7/F#" → "D7"
        if c1 in high_freq:
            chord_map[chord] = c1; continue

        # 2- collapse to seventh
        c2 = _to_seventh(chord, m) # "Em9" → "Em7"
        if c2 in high_freq:
            chord_map[chord] = c2; continue

        # 3- collapse to triad quality
        c3 = _to_triad(chord, m) # "Cm11b13" → "Cm"
        if c3 in high_freq:
            chord_map[chord] = c3; continue

        # 4- root only
        root_only = f'"{m.group("root")}"' # "F#m" → "F#"
        if root_only in high_freq:
            chord_map[chord] = root_only; continue

        # 5- unknown
        chord_map[chord] = '"UNK_CHORD"'

    # make sure the placeholder itself is mapped to itself
    chord_map['"UNK_CHORD"'] = '"UNK_CHORD"'
    return chord_map


def make_header_map(counter: Counter, min_key: int = 100, min_meter: int = 100) -> Dict[str, str]:
    """
    Collapse rare K:/M: header *lines* to K:RARE\n, M:RARE\n
    Returns {rare_header_line: mapped_header_line}.
    """
    hdr_map: Dict[str, str] = {}
    for hdr, n in counter.items():
        if hdr.startswith("K:"):
            if n >= min_key:
                hdr_map[hdr] = hdr 
            else:
                hdr_map[hdr] = "K:RARE\n"
        elif hdr.startswith("M:"):
            if n >= min_meter:
                hdr_map[hdr] = hdr
            else:
                hdr_map[hdr] =  "M:RARE\n" 

    # ensure placeholders map to themselves so they stay printable
    hdr_map.update({"K:RARE\n":"K:RARE\n",
                    "M:RARE\n":"M:RARE\n"})
    return hdr_map


def inline_from_header_map(header_map: Dict[str, str]) -> Dict[str, str]:
    """
    Derive {inline_src: inline_tgt} from the full-line header_map.
    full-line tokens look like  'K:D\n'     (newline included)
    inline tokens should look like '[K:D]'  (no newline)
    """
    inline_map = {}
    for full, mapped in header_map.items():
        # strip the trailing newline from both src and dst
        src_body = full.rstrip("\n")
        dst_body = mapped.rstrip("\n")
        inline_map[f"[{src_body}]"] = f"[{dst_body}]"

    # ensure the placeholders map to themselves
    for tag in ("K:RARE", "M:RARE"):
        inline_map[f"[{tag}]"] = f"[{tag}]"

    return inline_map


In [2]:
from tokenizer import ABCTokenizer
import pandas as pd
import os

# Read raw ABC file
with open("leadsheets.abc", "r") as f:
    raw_data = f.read()

tunes = raw_data.strip().split("\n\n")
raw_tok = ABCTokenizer()

chord_counts = build_chord_counter(tunes, raw_tok, _CHORD_RE)
hdr_counts = build_token_type_counter(tunes, raw_tok,  _HDR_RE)
meter_counts = {k: v for k, v in hdr_counts.items() if k.startswith('M:')}
key_counts   = {k: v for k, v in hdr_counts.items() if k.startswith('K:')}
length_counts= {k: v for k, v in hdr_counts.items() if k.startswith('L:')}

chord_df  = (pd.DataFrame(chord_counts.items(), columns=["chord", "count"]).sort_values("count", ascending=False))
meter_df  = (pd.DataFrame(meter_counts.items(), columns=["meter", "count"]).sort_values("count", ascending=False))
key_df  = (pd.DataFrame(key_counts.items(), columns=["key", "count"]).sort_values("count", ascending=False))
length_df  = (pd.DataFrame(length_counts.items(), columns=["default_note_length", "count"]).sort_values("count", ascending=False))

os.makedirs("token_frequencies", exist_ok=True)
chord_df.to_csv("token_frequencies/chord_frequencies.csv",  index=False)
meter_df.to_csv("token_frequencies/meter_frequencies.csv",  index=False)
key_df.to_csv("token_frequencies/key_frequencies.csv",  index=False)
length_df.to_csv("token_frequencies/length_frequencies.csv",  index=False)

Based on the counts from the csv files, we determine the min_count for each category. That is, what is the minimum number of occurences which warrants a unique token. Everything that's smaller than min_count should be lumped into a "RARE" token. This is because such tokens will not have the opportunity to be properly learnt in training due to their infrequency. So, rather than polluting the vocabulary, we will either count them as "RARE" or even, when it makes sense (like in the case of chords), "simplify" them to popular tokens. Below we will cache this mapping for convenience of use in other notebooks.

In [3]:
import json, os

chord_map    = make_chord_map(chord_counts, min_count=200)
header_map   = make_header_map(hdr_counts, min_key=100, min_meter=100)
inline_map   = inline_from_header_map(header_map)

maps = {
    "chord_map":  chord_map,
    "header_map": header_map,
    "inline_map": inline_map,
}

os.makedirs("cache", exist_ok=True)
with open("cache/abc_maps.json", "w", encoding="utf-8") as f:
    json.dump(maps, f, ensure_ascii=False, indent=2)

print("Maps cached to cache/abc_maps.json")

Maps cached to cache/abc_maps.json


Here's what our final vocabulary looks like (all of the tokens grouped by category)

In [4]:
tokenizer = ABCTokenizer(chord_map=chord_map, header_map=header_map, inline_hdr_map=inline_map)
tokenizer.build_vocab(tunes)

print("FINAL VOCABULARY")
print("---------------")
print("vocab size →", tokenizer.vocab_size())
tokenizer.print_grouped_tokens(tokenizer.stoi)

FINAL VOCABULARY
---------------
vocab size → 215
Accidental     : = ^ ^^ _ __
Articulation   : - .
Bar/Repeat     : :: :| | |: |] ||
Broken         : < >
Chord          : "A" "A/C#" "A/E" "A7" "Ab" "Ab/Eb" "Ab7" "Am" "Am/C" "Am7" "B" "B7" "Bb" "Bb/F" "Bb7" "Bbm" "Bm" "Bm7" "C" "C#" "C#7" "C#m" "C/E" "C/G" "C7" "Cm" "D" "D/A" "D/F#" "D7" "Db" "Dm" "Dm/F" "Dm7" "E" "E7" "Eb" "Eb/Bb" "Eb7" "Em" "Em7" "F" "F#" "F#/C#" "F#7" "F#m" "F/C" "F7" "Fm" "G" "G#m" "G/B" "G/D" "G7" "Gb" "Gm" "Gm/Bb" "Gm7"
Chord-like melodic groupings: [ ]
Duration       : / 16 2 27/8 3 3/2 3/4 3/8 4 5 5/2 6 7 7/2 7/4 7/8 8 9/2 9/4
Grace Notes    : { }
Header         : 'K:RARE\n' 'L:1/16\n' 'L:1/4\n' 'L:1/8\n' 'M:RARE\n'
Inline Header Change: '[K:A#min]' '[K:A]' '[K:Ab]' '[K:Abmin]' '[K:Ador]' '[K:Amin]' '[K:Amix]' '[K:Bb]' '[K:Bblyd]' '[K:Bmin]' '[K:C#]' '[K:C#min]' '[K:C]' '[K:Cb]' '[K:Clyd]' '[K:Cmin]' '[K:D#min]' '[K:D]' '[K:Db]' '[K:Ddor]' '[K:Dmin]' '[K:Dmix]' '[K:E]' '[K:Eb]' '[K:Edor]' '[K:Emin]' '[K:F#]' '[

Here's an example of how a song would be tokenized (what the model will "see" vs. what the original song looks like)

In [5]:
TUNE_NUMBER = 3
print("ORIGINAL SONG:")
print(tunes[TUNE_NUMBER])
print()
print("WHAT THE MODEL SEES:")
print(tokenizer.decode(tokenizer.encode(tunes[TUNE_NUMBER])))

ORIGINAL SONG:
X:20
L:1/4
M:4/4
K:Emin
"^*""Em" E2"D" D2 |"C" E2"B7" B,2 |"^*""Em" E E"D" F F |"G" G A/G/"B7" F2 |"^*""Em" B B"D" A A | 
"C" G A/G/"B7" F B, |"^*""Em" E2"D" D F |"^Last: E""Em" E4 |]

WHAT THE MODEL SEES:
L:1/4
M:4/4K:Emin"Em" E2"D" D2 |"C" E2"B7" B,2 |"Em" E E"D" F F |"G" G A/G/"B7" F2 |"Em" B B"D" A A | 
"C" G A/G/"B7" F B, |"Em" E2"D" D F |"Em" E4 |]
