# โหลด datasets

In [None]:
!wget https://huggingface.co/datasets/ailsntua/Chordonomicon/resolve/main/chordonomicon_v2.csv

--2025-06-05 04:55:53--  https://huggingface.co/datasets/ailsntua/Chordonomicon/resolve/main/chordonomicon_v2.csv
Resolving huggingface.co (huggingface.co)... 3.166.152.65, 3.166.152.105, 3.166.152.110, ...
Connecting to huggingface.co (huggingface.co)|3.166.152.65|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://cdn-lfs-us-1.hf.co/repos/32/19/3219792b72b684216fa25f158f3ef1a4e35d7672f255c70d6f5d5dc6705c53a4/9d2f4ccdc876a4e816712f128e4772b2af558c2a2923cce5be3a0364fbad9a8d?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27chordonomicon_v2.csv%3B+filename%3D%22chordonomicon_v2.csv%22%3B&response-content-type=text%2Fcsv&Expires=1749102953&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTc0OTEwMjk1M319LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmhmLmNvL3JlcG9zLzMyLzE5LzMyMTk3OTJiNzJiNjg0MjE2ZmEyNWYxNThmM2VmMWE0ZTM1ZDc2NzJmMjU1YzcwZDZmNWQ1ZGM2NzA1YzUzYTQvOWQyZjRjY2RjODc2YTRlODE2NzEyZjEyOGU0NzcyYjJhZjU1

# เตรียม datasets

In [None]:
import pandas as pd, re, random, math, gzip, pickle
from collections import Counter, defaultdict
from itertools import pairwise

# Path of the CSV fetched by the download cell.
CSV_PATH = "chordonomicon_v2.csv"
# Maximum number of chord tokens kept per progression (to_tokens clips the rest).
MAX_LEN  = 128
# When True, to_tokens appends an "<eos>" marker to each progression.
ADD_EOS  = True
# Seed Python's global RNG, which the shuffle and the chord sampling draw from.
random.seed(42)

In [None]:
TAG_REGEX = re.compile(r"<[^>]+>")
SEP_REGEX = re.compile(r"[|,\n]")

def to_tokens(raw: str):
    if pd.isna(raw):
        return []

    s      = TAG_REGEX.sub(" ", raw)        # ลบ <tag>
    s      = SEP_REGEX.sub(" ", s)          # แทนคั่นเป็นช่องว่าง
    s      = re.sub(r"\s+", " ", s).strip()
    tokens = s.split(" ")[:MAX_LEN]         # clip ยาวเกิน

    if ADD_EOS:
        if not tokens or tokens[-1] != "<eos>":
            tokens.append("<eos>")
    return tokens

In [None]:
# Only the chord column is needed.
df   = pd.read_csv(CSV_PATH, usecols=["chords"])
# Tokenise every row exactly once and drop rows that yield no tokens.
seqs = [toks for toks in map(to_tokens, df["chords"]) if toks]
random.shuffle(seqs)

# 80 / 10 / 10 train / validation / test split of the shuffled progressions.
n = len(seqs)
train, val, test = seqs[:int(.8*n)], seqs[int(.8*n):int(.9*n)], seqs[int(.9*n):]

# Vocabulary and unigram counts come from the training split only.
vocab   = sorted({t for s in train for t in s} | {"<unk>"})
V       = len(vocab)
uni_cnt = Counter(t for s in train for t in s)   # reused for every k
total_u = sum(uni_cnt.values())

print(f"raw: {n}, train: {len(train)}, valid: {len(val)}, test: {len(test)}, vocab: {V}")

raw: 679807, train: 543845, valid: 67981, test: 67981, vocab: 4031


# สร้าง Bigram

In [None]:
def ppl_with_k(k):
    """Fit bigram counts on ``train`` and score ``val`` with add-k smoothing.

    Args:
        k: Additive smoothing constant.

    Returns:
        (perplexity on ``val``, bigram count table). The table maps each
        context token to a Counter of its successors.

    Raises:
        ZeroDivisionError: if ``val`` contains no bigram at all.
    """
    bi = defaultdict(Counter)
    for seq in train:
        for w1, w2 in pairwise(seq):
            bi[w1][w2] += 1

    # Everything below is constant for the whole scoring loop, so compute it
    # once instead of once per bigram.
    row_total = {w1: sum(row.values()) for w1, row in bi.items()}
    known     = set(vocab)              # O(1) membership instead of a list scan
    kV        = k * V
    uni_denom = total_u + kV

    # --- perplexity on the validation split ---
    log_sum, N = 0.0, 0
    for seq in val:
        for w1, w2 in pairwise(seq):
            if w2 not in known:
                w2 = "<unk>"
            row = bi.get(w1)
            if row is not None:
                p = (row[w2] + k) / (row_total[w1] + kV)
            else:
                # Context never seen as a bigram start: back off to the
                # smoothed unigram estimate.
                p = (uni_cnt[w2] + k) / uni_denom
            log_sum += math.log(p)
            N += 1
    return math.exp(-log_sum / N), bi   # both the PPL and the bigram counts

In [None]:
# Smoothing constants to try; the one with the lowest validation perplexity wins.
candidates = [0.25, 0.5, 1, 1.5, 2]
best_k, best_ppl, best_bi = None, float('inf'), None
for k in candidates:
    # ppl_with_k returns the validation perplexity and the bigram counts it built.
    ppl, bi = ppl_with_k(k)
    print(f"k={k:<4}  PPL(val)={ppl:.2f}")
    # Strict "<" keeps the earliest candidate when two perplexities tie.
    if ppl < best_ppl: best_k, best_ppl, best_bi = k, ppl, bi

print(f"\n>>> เลือก k={best_k} (PPL(val)={best_ppl:.2f})")

# Names read by the later cells (p_w2_given_w1, sample_next, perplexity, top_k_accuracy).
bi_cnt = best_bi
SMOOTH_K = best_k

k=0.25  PPL(val)=16.56
k=0.5   PPL(val)=16.68
k=1     PPL(val)=16.87
k=1.5   PPL(val)=17.03
k=2     PPL(val)=17.17

>>> เลือก k=0.25 (PPL(val)=16.56)


In [None]:
def p_w2_given_w1(w1, w2):
    """Add-k smoothed bigram probability P(w2 | w1).

    Computes (count(w1, w2) + SMOOTH_K) / (count(w1) + SMOOTH_K * V) from the
    selected bigram table ``bi_cnt``. A context with no row yields the
    uniform value SMOOTH_K / (SMOOTH_K * V).
    """
    # .get() instead of bi_cnt[w1]: indexing the defaultdict would insert an
    # empty row and change later ``w1 in bi_cnt`` checks.
    row = bi_cnt.get(w1)
    if row is None:
        return (0 + SMOOTH_K) / (0 + SMOOTH_K*V)

    # A row may carry a non-numeric "_cached" entry written by sample_next;
    # count only the numeric values so it cannot break the sum.
    c_w1 = sum(c for c in row.values() if isinstance(c, (int, float)))
    c_w1w2 = row.get(w2, 0)
    if not isinstance(c_w1w2, (int, float)):
        c_w1w2 = 0
    return (c_w1w2 + SMOOTH_K) / (c_w1 + SMOOTH_K*V)

from bisect import bisect_left

# Cumulative-probability tables are kept here rather than inside bi_cnt, so
# the count table holds nothing but counts. The table is dropped whenever
# bi_cnt, SMOOTH_K or V is rebound (e.g. the k-selection cell is re-run).
_sample_cache = {"bi": None, "k": None, "V": None, "rows": {}}


def _cumulative_row(w1):
    """Return (tokens, cumulative probabilities) for the observed successors of w1."""
    state = _sample_cache
    if state["bi"] is not bi_cnt or state["k"] != SMOOTH_K or state["V"] != V:
        state.update(bi=bi_cnt, k=SMOOTH_K, V=V, rows={})
    rows = state["rows"]
    if w1 not in rows:
        # Skip non-numeric entries: an older version of this cell may have
        # left a "_cached" tuple inside the Counter.
        counts = [(w2, c) for w2, c in bi_cnt.get(w1, {}).items()
                  if isinstance(c, (int, float))]
        denom = sum(c for _, c in counts) + SMOOTH_K*V   # hoisted out of the loop
        toks, csum, run = [], [], 0.0
        for w2, c in counts:
            run += (c + SMOOTH_K) / denom
            toks.append(w2)
            csum.append(run)
        rows[w1] = (toks, csum)
    return rows[w1]


def sample_next(w1):
    """Sample a successor of ``w1`` from the add-k smoothed bigram model.

    A context with no bigram row is treated as "<unk>". Observed successors
    are drawn by inverting their cumulative smoothed probabilities. When the
    draw lands in the mass reserved for unobserved successors, a token is
    picked uniformly from ``vocab``.

    NOTE(review): that uniform pick can also return an observed successor, so
    the sampled distribution is only approximately the add-k distribution.
    """
    if w1 not in bi_cnt:
        w1 = "<unk>"
    toks, csum = _cumulative_row(w1)
    # First index whose cumulative probability is >= r.
    idx = bisect_left(csum, random.random())
    if idx < len(toks):
        return toks[idx]
    return random.choice(vocab)

In [None]:
def continue_chords(seed, predict_len=8):
    """Extend ``seed`` with up to ``predict_len`` sampled chords.

    Args:
        seed: Iterable of chord tokens to start from (may be empty).
        predict_len: Maximum number of tokens to append.

    Returns:
        A new list: the seed followed by the generated chords. Generation
        stops early when "<eos>" is sampled; the marker itself is not added.
    """
    out = list(seed)
    target_len = len(out) + predict_len
    while len(out) < target_len:
        # No membership test against ``vocab`` here: sample_next already maps
        # any context without a bigram row to "<unk>", and scanning the vocab
        # list on every step is O(V). An empty seed starts from "<unk>".
        prev = out[-1] if out else "<unk>"
        nxt  = sample_next(prev)
        if nxt == "<eos>":
            break
        out.append(nxt)
    return out

# bigram
*   PPL (test): 16.64
*   TOP-1 acc: 0.272941349621017
*   TOP-3 acc: 0.5482081095003312
*   TOP-5 acc: 0.7010817573037015
*   TOP-8 acc: 0.8071234086393406



In [None]:
def perplexity(ds):
    """Perplexity of the selected bigram model on the sequences in ``ds``.

    Out-of-vocabulary targets are scored as "<unk>". A context without a
    bigram row backs off to the add-k smoothed unigram estimate.

    Raises:
        ZeroDivisionError: if ``ds`` contains no bigram at all.
    """
    known     = set(vocab)                  # O(1) lookups instead of a list scan
    uni_denom = total_u + SMOOTH_K*V        # loop-invariant
    log_sum, N = 0.0, 0
    for seq in ds:
        for w1, w2 in pairwise(seq):
            if w2 not in known:
                w2 = "<unk>"
            if w1 in bi_cnt:
                p = p_w2_given_w1(w1, w2)
            else:
                p = (uni_cnt[w2]+SMOOTH_K) / uni_denom
            log_sum += math.log(p)
            N += 1
    return math.exp(-log_sum/N)

# Report perplexity on the held-out test split.
print(f"PPL(test) = {perplexity(test):.2f}")

PPL(test) = 16.64


In [None]:
def top_k_accuracy(dataset, k=3, context_len=4):
    """Share of sequences whose token at index ``context_len`` is among the
    ``k`` most frequent successors of the token just before it.

    Args:
        dataset: Iterable of token sequences.
        k: Number of top-ranked successors that count as a hit.
        context_len: Index of the token to predict; the bigram context is
            the token at ``context_len - 1``.

    Returns:
        hits / evaluated sequences, or 0.0 when nothing could be evaluated.
        Sequences that are too short, or whose context has no recorded
        successor, are skipped and do not enter the denominator.

    Raises:
        ValueError: if ``context_len`` is below 1 (index -1 would silently
            wrap around to the last token).
    """
    if context_len < 1:
        raise ValueError("context_len must be >= 1")

    # Ranking depends only on the context token, so sort each row at most
    # once per call instead of once per sequence.
    top_by_prev = {}
    hit, total = 0, 0
    for seq in dataset:
        if len(seq) <= context_len:
            continue

        prev      = seq[context_len - 1]
        true_next = seq[context_len]

        if prev not in bi_cnt:
            prev = "<unk>"

        if prev not in top_by_prev:
            # .get() avoids inserting an empty "<unk>" row into bi_cnt.
            # --- keep only numeric entries (counts) ---
            numeric_items = [(w2, c) for w2, c in bi_cnt.get(prev, {}).items()
                             if isinstance(c, (int, float))]
            if numeric_items:
                ranked = sorted(numeric_items,
                                key=lambda x: -(x[1] + SMOOTH_K))
                top_by_prev[prev] = frozenset(w for w, _ in ranked[:k])
            else:
                # Context has no counted successor (rare) -> skip it.
                top_by_prev[prev] = None

        top_k = top_by_prev[prev]
        if top_k is None:
            continue

        if true_next in top_k:
            hit += 1
        total += 1

    return hit / total if total else 0.0

In [None]:
# Next-chord accuracy on the test split for several cut-offs
# (default context_len=4: predict the 5th token from the 4th).
print("Top‑1 acc (test):", top_k_accuracy(test, k=1))
print("Top‑3 acc (test):", top_k_accuracy(test, k=3))
print("Top‑5 acc (test):", top_k_accuracy(test, k=5))
print("Top‑8 acc (test):", top_k_accuracy(test, k=8))

Top‑1 acc (test): 0.272941349621017
Top‑3 acc (test): 0.5482081095003312
Top‑5 acc (test): 0.7010817573037015
Top‑8 acc (test): 0.8071234086393406


# Evaluation


In [None]:
ROOT_RE = re.compile(r"^([A-Ga-g])([#b♯♭]?)(.*)$")
PC_NUM  = {"C":0,"C#":1,"Db":1,"D":2,"D#":3,"Eb":3,"E":4,"F":5,"F#":6,"Gb":6,
           "G":7,"G#":8,"Ab":8,"A":9,"A#":10,"Bb":10,"B":11,"Cb":11}

def parse_root(tok:str)->int|None:
    m = ROOT_RE.match(tok.strip())
    if not m: return None
    note = m.group(1).upper() + m.group(2).replace('♯','#').replace('♭','b')
    return PC_NUM.get(note)

def circ_dist(a:int,b:int)->int:
    """Distance between two pitch classes on the 12-step circle: the shorter
    of going up or going down."""
    gap = abs(a - b)
    # gap <= 6 is exactly the case where gap <= 12 - gap.
    if gap <= 6:
        return gap
    return 12 - gap

def avg_step_distance(tokens:list[str])->float|None:
    """Mean circular root distance (in semitones) between consecutive tokens.

    Pairs in which either token has no parsable root are ignored. Returns
    None when no pair could be measured.
    """
    roots = [parse_root(tok) for tok in tokens]
    steps = [circ_dist(left, right)
             for left, right in pairwise(roots)
             if left is not None and right is not None]
    if not steps:
        return None
    return sum(steps) / len(steps)

SEED_LEN   = 4
GEN_LEN    = 32
pred_sum = pred_cnt = 0

for seq in test:
    seed_tokens = seq[:SEED_LEN]
    # Skip progressions too short for a full seed, and seeds that already
    # contain "<eos>": "<eos>" has no bigram row, so continuing from it
    # would only sample chords uniformly at random.
    if len(seed_tokens) < SEED_LEN or "<eos>" in seed_tokens:
        continue

    # continue_chords returns the seed followed by the generated chords.
    pred_tokens = continue_chords(seed_tokens, GEN_LEN)

    d_pred = avg_step_distance(pred_tokens)
    if d_pred is not None:
        pred_sum += d_pred
        pred_cnt += 1

# nan instead of ZeroDivisionError when no sequence could be measured.
avg_pred_step = pred_sum / pred_cnt if pred_cnt else float("nan")
print(f"◾ Bigram predicted progression\n   average root-to-root distance = {avg_pred_step:.3f} semitones")

◾ Bigram predicted progression
   average root-to-root distance = 3.511 semitones


In [None]:
SEED_LEN = 4
GEN_LEN  = 32

from collections import Counter
pred_counter = Counter()
pred_total   = 0

for seq in test:
    seed_tokens = seq[:SEED_LEN]
    # Skip progressions too short for a full seed, and seeds that already
    # contain "<eos>": "<eos>" has no bigram row, so continuing from it
    # would only sample chords uniformly at random.
    if len(seed_tokens) < SEED_LEN or "<eos>" in seed_tokens:
        continue

    # continue_chords returns the seed followed by the generated chords.
    gen_tokens  = continue_chords(seed_tokens, GEN_LEN)

    # Accumulate every token except "<eos>" into the histogram.
    for tok in gen_tokens:
        if tok != "<eos>":
            pred_counter[tok] += 1
            pred_total       += 1


import math

def entropy(counter:Counter, total:int) -> float:
    """ Shannon entropy (base-2) ของฮิสโทแกรมคอร์ด """
    return -sum((c/total)*math.log2(c/total)
                for c in counter.values() if c)

# Entropy of the pooled chord histogram built by the loop above.
che_pred = entropy(pred_counter, pred_total)
print(f"Chord-Histogram Entropy (Bigram predictions) : {che_pred:.3f} bits")

Chord-Histogram Entropy (Bigram predictions) : 5.783 bits


In [None]:
import re, math
import music21 as m21
ROOT_RE = re.compile(r"^([A-Ga-g])([#b♯♭]?)(.*)$")
PC_NUM  = {'C':0,'C#':1,'Db':1,'D':2,'D#':3,'Eb':3,'E':4,'F':5,'F#':6,'Gb':6,
           'G':7,'G#':8,'Ab':8,'A':9,'A#':10,'Bb':10,'B':11,'Cb':11}

def simplify(tok:str)->str|None:
    """เหลือ root + quality หลัก ('', m, dim) เพื่อเช็กกับคีย์"""
    m = ROOT_RE.match(tok)
    if not m: return None
    root, acc, qual = m.groups()
    root = root.upper() + acc.replace('♯','#').replace('♭','b')
    q = ('m' if qual.lower().startswith(('m','min')) else
         'dim' if ('dim' in qual.lower() or 'o' in qual) else '')
    return root + q

def diatonic_set(key_root:str, mode:str) -> set[str]:
    """Return the diatonic triads of a key as root+quality strings
    ('C', 'Dm', 'Bdim', ...).

    ``mode == 'major'`` selects the major scale; any other value selects the
    minor scale.
    """
    # Triad quality on scale degrees 1..7.
    if mode == 'major':
        scale_cls = m21.scale.MajorScale
        qualities = ('', 'm', 'm', '', '', 'm', 'dim')
    else:
        scale_cls = m21.scale.MinorScale
        qualities = ('m', 'dim', '', 'm', 'm', '', '')

    scale = scale_cls(key_root)
    # music21 spells flats with '-'; convert them to 'b'.
    return {
        scale.pitchFromDegree(degree).name.replace('-', 'b') + quality
        for degree, quality in enumerate(qualities, start=1)
    }

# Letter plus optional accidental at the start of a chord token.
# NOTE(review): the Chordonomicon dataset is understood to spell sharps with a
# trailing "s" ("Fsmin") - confirm against the dataset card. The lookahead
# keeps "Csus4" a C chord.
_KEY_ROOT_RE    = re.compile(r"^([A-Ga-g])([#b♯♭]|s(?!us))?")
# Accidentals in music21 spelling ('-' = flat), since the root is handed to
# music21 scale constructors by diatonic_set.
_M21_ACCIDENTAL = {"": "", "#": "#", "♯": "#", "s": "#", "b": "-", "♭": "-"}

def guess_key(chords:list[str])->tuple[str,str]:
    """Rough key guess: most frequent chord root + major/minor mode.

    Returns:
        (root, mode). ``root`` is a note name in music21 spelling ('F#',
        'B-'), defaulting to 'C' when no token has a parsable root. ``mode``
        is 'minor' as soon as any token ends in 'm' or 'min', else 'major'.

    Tokens that do not start with a note letter ("<eos>", "<unk>") are not
    counted. Ties go to the root seen first, so the result does not depend on
    set iteration order.
    """
    tally = {}
    for c in chords:
        m = _KEY_ROOT_RE.match(c) if c else None
        if m:
            name = m.group(1).upper() + _M21_ACCIDENTAL[m.group(2) or ""]
            tally[name] = tally.get(name, 0) + 1
    # max() returns the first maximal key in insertion order.
    root  = max(tally, key=tally.get) if tally else 'C'
    mode  = 'minor' if any(c.lower().endswith(('m','min')) for c in chords) else 'major'
    return root, mode


SEED_LEN = 4
GEN_LEN  = 32

kc_in, kc_total = 0, 0

for seq in test:
    seed = seq[:SEED_LEN]
    # Skip progressions too short for a full seed, and seeds that already
    # contain "<eos>": "<eos>" has no bigram row, so continuing from it
    # would only sample chords uniformly at random.
    if len(seed) < SEED_LEN or "<eos>" in seed:
        continue

    # continue_chords returns the seed followed by the generated chords.
    pred = continue_chords(seed, GEN_LEN)

    key_root, key_mode = guess_key(pred)
    diat_set = diatonic_set(key_root, key_mode)

    for tok in pred:
        simp = simplify(tok)
        if simp is None:
            # Not a chord symbol ("<unk>", ...): excluded from the ratio.
            continue
        kc_total += 1
        if simp in diat_set:
            kc_in += 1

kc_pred = kc_in / kc_total if kc_total else 0.0
print(f"Key-Consistency (Bigram predictions) : {kc_pred:.2%}")

Key-Consistency (Bigram predictions) : 31.05%


In [None]:
import re, numpy as np, music21 as m21
from functools import lru_cache

# Kept unchanged for any code that refers to these names.
ROOT_RE = re.compile(r"^([A-Ga-g])([#b♯♭]?)(.*)$")
PC_NUM  = {'C':0,'C#':1,'Db':1,'D':2,'D#':3,'Eb':3,'E':4,'F':5,'F#':6,'Gb':6,
           'G':7,'G#':8,'Ab':8,'A':9,'A#':10,'Bb':10,'B':11,'Cb':11}
# Circular distance between two pitch classes (shorter way round the octave).
circ_dist = lambda a,b: min(abs(a-b), 12-abs(a-b))

# Private tables for the triad fallback in chord_vec.
# NOTE(review): the Chordonomicon dataset is understood to spell sharps with a
# trailing "s" ("Fsmin") - confirm against the dataset card. The lookahead
# keeps "Csus4" a C chord.
_TRIAD_RE     = re.compile(r"^([A-Ga-g])([#b♯♭]|s(?!us))?(.*)$")
_TRIAD_NAT_PC = {'C':0,'D':2,'E':4,'F':5,'G':7,'A':9,'B':11}
_TRIAD_ACC    = {'': 0, '#': 1, '♯': 1, 's': 1, 'b': -1, '♭': -1}

def _fallback_triad(tok:str)->set[int]:
    """Pitch classes of a plain triad guessed from the token text; {0} when
    the token does not start with a note letter."""
    m = _TRIAD_RE.match(tok)
    if not m:
        return {0}
    # Computed from letter + accidental, so E#, Fb and B# cannot raise KeyError.
    r    = (_TRIAD_NAT_PC[m.group(1).upper()] + _TRIAD_ACC[m.group(2) or '']) % 12
    qual = m.group(3)
    ql   = qual.lower()
    if 'dim' in ql or qual.startswith(('o', '°')):
        return {r, (r+3)%12, (r+6)%12}
    # "maj7" and an upper-case "M7" contain an "m" but are major chords.
    minor = ql.startswith('min') or (qual.startswith('m') and not ql.startswith('maj'))
    return {r, (r+7)%12, (r+3)%12 if minor else (r+4)%12}

@lru_cache(maxsize=4096)
def chord_vec(tok:str)->np.ndarray:
    """Full pitch-class vector (12-dim) of a chord token, scaled to sum to 1.

    music21 parses the chord symbol. If that raises, or yields no pitches,
    a simple triad is guessed from the token text instead.

    The array is cached by lru_cache and shared between callers, so it is
    returned read-only; copy it before modifying.
    """
    try:
        pcs = {p.pitchClass for p in m21.harmony.ChordSymbol(tok).pitches}
    except Exception:                      # fall back to a simple triad
        pcs = set()
    if not pcs:
        # Also covers a symbol that parsed to zero pitches, which would
        # otherwise divide by zero below.
        pcs = _fallback_triad(tok)
    v = np.zeros(12)
    v[list(pcs)] = 1
    v /= len(pcs)
    v.flags.writeable = False
    return v

def cloud_diam(pcs:set[int])->int:
    """Largest circular distance between any two pitch classes in ``pcs``;
    0 when there are fewer than two."""
    if len(pcs) < 2:
        return 0
    spans = [circ_dist(p, q) for p in pcs for q in pcs]
    return max(spans)

# Letter plus optional accidental at the start of a chord token.
# NOTE(review): the Chordonomicon dataset is understood to spell sharps with a
# trailing "s" ("Fsmin") - confirm against the dataset card. The lookahead
# keeps "Csus4" a C chord.
_QUICK_ROOT_RE = re.compile(r"^([A-Ga-g])([#b♯♭]|s(?!us))?")
_QUICK_ACC     = {'': '', '#': '#', '♯': '#', 's': '#', 'b': 'b', '♭': 'b'}

def quick_key(chords:list[str])->np.ndarray:
    """Pitch-class vector of a roughly guessed tonic triad.

    The tonic root is the most frequent chord root, accidental included
    (ties go to the root seen first; 'C' when no token has a root). The
    triad is minor as soon as any token ends in 'm' or 'min', else major.
    Tokens that do not start with a note letter are not counted.
    """
    tally = {}
    for c in chords:
        m = _QUICK_ROOT_RE.match(c) if c else None
        if m:
            name = m.group(1).upper() + _QUICK_ACC[m.group(2) or '']
            tally[name] = tally.get(name, 0) + 1
    # max() returns the first maximal key in insertion order, so the result
    # does not depend on set iteration order.
    root = max(tally, key=tally.get) if tally else 'C'
    mode = 'minor' if any(c.lower().endswith(('m','min')) for c in chords) else 'major'
    return chord_vec(root + ('m' if mode=='minor' else ''))

def bucket():
    """Fresh accumulator: running sums 'cd', 'cm', 'ts' and their sample
    counts 'n_cd', 'n_cm', 'n_ts', all starting at 0."""
    fields = ("cd", "cm", "ts", "n_cd", "n_cm", "n_ts")
    return dict.fromkeys(fields, 0)

def update(tokens:list[str], acc:dict):
    """Add one progression's contributions to the accumulator ``acc``.

    Per token: 'cd' gets the cloud diameter of its pitch classes and 'ts' the
    Euclidean distance of its vector to the guessed tonic vector. From the
    second token on, 'cm' gets the distance to the previous token's vector.
    The matching 'n_*' counters are incremented alongside.
    """
    tonic    = quick_key(tokens)
    last_vec = None
    for tok in tokens:
        vec      = chord_vec(tok)
        sounding = set(np.where(vec > 0)[0])   # indices of the non-zero entries

        acc["cd"]   += cloud_diam(sounding)
        acc["n_cd"] += 1

        acc["ts"]   += np.linalg.norm(vec - tonic)
        acc["n_ts"] += 1

        if last_vec is not None:
            acc["cm"]   += np.linalg.norm(vec - last_vec)
            acc["n_cm"] += 1
        last_vec = vec

SEED_LEN, GEN_LEN = 4, 32
metrics = bucket()

for seq in test:
    seed = seq[:SEED_LEN]
    # Skip progressions too short for a full seed, and seeds that already
    # contain "<eos>": "<eos>" has no bigram row, so continuing from it
    # would only sample chords uniformly at random.
    if len(seed) < SEED_LEN or "<eos>" in seed:
        continue
    # continue_chords returns the seed followed by the generated chords.
    pred = continue_chords(seed, GEN_LEN)
    update(pred, metrics)

# Mean of one metric: its running sum divided by its sample count (0 if none).
avg = lambda key: metrics[key]/metrics[f"n_{key}"] if metrics[f"n_{key}"] else 0
print(f"{'Metric':6} | Bigram-Predictions")
print("-"*32)
print(f"CD     | {avg('cd'):.2f}")
print(f"CM     | {avg('cm'):.2f}")
print(f"TS     | {avg('ts'):.2f}")

Metric | Bigram-Predictions
--------------------------------
CD     | 5.04
CM     | 0.66
TS     | 0.62


# Demo
เดา 8 คอร์ดถัดไป

In [None]:
# NOTE(review): this repeats the continue_chords definition from the sampling
# cells earlier in the notebook; keep the two in sync or delete one of them.
def continue_chords(seed, predict_len=8):
    """Extend ``seed`` with up to ``predict_len`` sampled chords.

    Args:
        seed: Iterable of chord tokens to start from (may be empty).
        predict_len: Maximum number of tokens to append.

    Returns:
        A new list: the seed followed by the generated chords. Generation
        stops early when "<eos>" is sampled; the marker itself is not added.
    """
    out = list(seed)
    target_len = len(out) + predict_len
    while len(out) < target_len:
        # No membership test against ``vocab`` here: sample_next already maps
        # any context without a bigram row to "<unk>", and scanning the vocab
        # list on every step is O(V). An empty seed starts from "<unk>".
        prev = out[-1] if out else "<unk>"
        nxt  = sample_next(prev)
        if nxt == "<eos>":
            break
        out.append(nxt)
    return out

# Demo: continue a three-chord seed by up to 8 sampled chords.
# The second line prints the seed followed by the continuation.
demo = ["C", "D", "G7"]
print("Seed :", " ".join(demo))
print("→    :", " ".join(continue_chords(demo, 8)))

Seed : C D G7
→    : C D G7 C D Emin D Amin7 D9 F Dmin
