# Download datasets

In [None]:
!wget https://huggingface.co/datasets/ailsntua/Chordonomicon/resolve/main/chordonomicon_v2.csv

--2025-06-04 13:52:25--  https://huggingface.co/datasets/ailsntua/Chordonomicon/resolve/main/chordonomicon_v2.csv
Resolving huggingface.co (huggingface.co)... 13.35.202.97, 13.35.202.40, 13.35.202.121, ...
Connecting to huggingface.co (huggingface.co)|13.35.202.97|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://cdn-lfs-us-1.hf.co/repos/32/19/3219792b72b684216fa25f158f3ef1a4e35d7672f255c70d6f5d5dc6705c53a4/9d2f4ccdc876a4e816712f128e4772b2af558c2a2923cce5be3a0364fbad9a8d?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27chordonomicon_v2.csv%3B+filename%3D%22chordonomicon_v2.csv%22%3B&response-content-type=text%2Fcsv&Expires=1749048745&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTc0OTA0ODc0NX19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmhmLmNvL3JlcG9zLzMyLzE5LzMyMTk3OTJiNzJiNjg0MjE2ZmEyNWYxNThmM2VmMWE0ZTM1ZDc2NzJmMjU1YzcwZDZmNWQ1ZGM2NzA1YzUzYTQvOWQyZjRjY2RjODc2YTRlODE2NzEyZjEyOGU0NzcyYjJhZjU1O

# Dataloader



In [None]:
import os, re, math, json, random
import pandas as pd
import torch
from torch.utils.data import Dataset, random_split
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.pre_tokenizers import Whitespace
from transformers import (
    PreTrainedTokenizerFast, AutoModelForCausalLM,
    DataCollatorForLanguageModeling, TrainingArguments, Trainer
)

random.seed(42)
torch.manual_seed(42)

TAG_REGEX = re.compile(r"<[^>]+>")

def split_phrases(line: str):
    if pd.isna(line): return []
    return [p.strip() for p in TAG_REGEX.split(line) if p.strip()]

def clean_tokens(phrase: str):
    cleaned = re.sub(r"[|,\n]+", " ", phrase)
    return re.sub(r"\s+", " ", cleaned).strip().split(" ")

In [None]:
class ChordDataset(Dataset):
    """CSV ('chords' column) → list[int] (token‑id)"""
    def __init__(self, csv_path: str, max_len=256, add_eos=True):
        df = pd.read_csv(csv_path, usecols=["chords"]).drop_duplicates().dropna()

        vocab, seqs = set(), []
        for raw in df["chords"]:
            seq = []
            for ph in split_phrases(raw):
                seq.extend([t for t in clean_tokens(ph) if t])
            if not seq: continue
            if add_eos: seq.append("<eos>")
            seq = seq[:max_len]
            vocab.update(seq)
            seqs.append(seq)

        specials = ["<pad>", "<unk>", "<bos>", "<eos>"]
        self.token2idx = {tok: i for i, tok in enumerate(specials +
                                                          sorted(vocab - set(specials)))}
        self.idx2tok   = {i: t for t, i in self.token2idx.items()}

        self.samples = [torch.tensor([self.token2idx[t] for t in s],
                                     dtype=torch.long)
                        for s in seqs]

    def __len__(self):           return len(self.samples)
    def __getitem__(self, idx):  return {"input_ids": self.samples[idx]}

csv_path = "chordonomicon_v2.csv"
raw_ds   = ChordDataset(csv_path, max_len=128)

In [None]:
n = len(raw_ds)
n_train, n_val   = int(0.8*n), int(0.1*n)
n_test           = n - n_train - n_val
train_ds, val_ds, test_ds = random_split(raw_ds,
    [n_train, n_val, n_test],
    generator=torch.Generator().manual_seed(42)
)

print(f"Total {n}  | Train {len(train_ds)} | Valid {len(val_ds)} | Test {len(test_ds)}")
print(f"Vocab = {len(raw_ds.token2idx)} tokens")

Total 677199  | Train 541759 | Valid 67719 | Test 67721
Vocab = 4264 tokens


# Tokenization

In [None]:
tok_json = "chord_tokenizer.json"
if not os.path.exists(tok_json):
    tk = Tokenizer(WordLevel(raw_ds.token2idx, unk_token="<unk>"))
    tk.pre_tokenizer = Whitespace()
    tk.save(tok_json)

tokenizer = PreTrainedTokenizerFast(
    tokenizer_file=tok_json,
    pad_token="<pad>", unk_token="<unk>",
    bos_token="<bos>", eos_token="<eos>"
)

# Create Model

In [None]:
model = AutoModelForCausalLM.from_pretrained("gpt2")
model.resize_token_embeddings(tokenizer.vocab_size)

# ย่อ positional‑embedding เหลือ 128
old_wpe = model.transformer.wpe.weight.data
model.transformer.wpe = torch.nn.Embedding(128, old_wpe.size(1))
model.transformer.wpe.weight.data = old_wpe[:128].clone()
model.config.n_positions = model.config.n_ctx = 128


model.config.pad_token_id = tokenizer.pad_token_id

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(4264, 768)
    (wpe): Embedding(128, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=4264, bias=False)
)

# Train Model

In [None]:
collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False, pad_to_multiple_of=8
)

In [None]:
args = TrainingArguments(
    output_dir="gpt2-chords",
    c=32,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=2,
    num_train_epochs=3,
    learning_rate=5e-5,
    weight_decay=0.01,
    warmup_ratio=0.1,
    fp16=True,
    logging_steps=100,
    save_strategy="epoch",
    evaluation_strategy="epoch",
    lr_scheduler_type="cosine",
    gradient_checkpointing=True,
    max_grad_norm=1.0,
    report_to="none"
)

In [None]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    data_collator=collator
)

In [None]:
trainer.train()
trainer.save_model("gpt2-chords-final")
tokenizer.save_pretrained("gpt2-chords-final")
print("✅ Model & tokenizer saved to gpt2-chords-final")

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
100,2.914
200,2.2103
300,1.7584
400,1.4834
500,1.337
600,1.2646
700,1.2244
800,1.1973
900,1.1606
1000,1.1329


✅ Model & tokenizer saved to gpt2-chords-final


# Evaluation & Metrics

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import torch, math, torch.nn.functional as F
from torch.utils.data import DataLoader

loader = DataLoader(
    test_ds,
    batch_size=64,
    collate_fn=collator,
    shuffle=False,
    num_workers=8,
    pin_memory=True,
    persistent_workers=True,
)

model_dir = "gpt2-chords-final"
tokenizer  = PreTrainedTokenizerFast.from_pretrained(model_dir)
model      = AutoModelForCausalLM.from_pretrained(model_dir)
collator   = DataCollatorForLanguageModeling(
                 tokenizer=tokenizer, mlm=False, pad_to_multiple_of=8)

device = torch.device("cuda")
model = model.half().to(device).eval()
torch.backends.cudnn.benchmark = True
try:
    model = torch.compile(model, mode="reduce-overhead")
except: pass

total_nll = 0.0
total_tok = 0
top1_hit  = 0
top5_hit  = 0
top8_hit  = 0
pad_id    = tokenizer.pad_token_id

with torch.no_grad():
    for batch in loader:
        inp = batch["input_ids"].to(device, non_blocking=True)
        out = model(inp)
        logits = out.logits[:, :-1, :]
        labels = inp[:, 1:]
        mask   = labels != pad_id

        # NLL
        loss = F.cross_entropy(
            logits.reshape(-1, logits.size(-1)),
            labels.reshape(-1),
            ignore_index=pad_id,
            reduction="sum"
        )
        total_nll += loss.item()
        total_tok += mask.sum().item()

        # Top-k
        topks = logits.topk(8, dim=-1).indices  # [B, L, 8]
        corr  = (topks == labels.unsqueeze(-1)) & mask.unsqueeze(-1)
        top1_hit += corr[..., :1].any(-1).sum().item()
        top5_hit += corr[..., :5].any(-1).sum().item()
        top8_hit += corr.any(-1).sum().item()

# สรุป
ppl  = math.exp(total_nll / total_tok)
acc1 = top1_hit / total_tok
acc5 = top5_hit / total_tok
acc8 = top8_hit / total_tok

print(f"PPL={ppl:.3f} | top1={acc1:.2%} | top5={acc5:.2%} | top8={acc8:.2%}")

W0507 09:34:26.596000 184 torch/_inductor/utils.py:1137] [0/3] Not enough SMs to use max_autotune_gemm mode


PPL=2.393 | top1=76.62% | top5=94.94% | top8=96.83%


Circle of fifts

In [None]:
import re, math, numpy as np, torch, music21 as m21
from torch.utils.data import DataLoader
from transformers import (AutoModelForCausalLM,
                          PreTrainedTokenizerFast,
                          DataCollatorForLanguageModeling)

ROOT_RE = re.compile(r"^([A-Ga-g])([#b♯♭]?)(.*)$")
PC_NUM  = {"C":0,"C#":1,"Db":1,"D":2,"D#":3,"Eb":3,"E":4,"F":5,"F#":6,"Gb":6,
           "G":7,"G#":8,"Ab":8,"A":9,"A#":10,"Bb":10,"B":11,"Cb":11}

def parse_root(tok:str)->int|None:
    """ดึง root-note → 0-11  (None ถ้าไม่รู้จัก)"""
    m = ROOT_RE.match(tok.strip())
    if not m: return None
    note = m.group(1).upper() + m.group(2).replace('♯','#').replace('♭','b')
    return PC_NUM.get(note)

def circ_dist(a:int,b:int)->int:
    """ระยะสั้นสุดบนวง 12-tone (0-6 semitones)"""
    diff = abs(a-b)
    return min(diff, 12-diff)

class Seed4Dataset(torch.utils.data.Dataset):
    """เก็บ (seed_ids[4], target_ids[32]) ต่อเพลง"""
    def __init__(self, base_ds, seed_len=4, gen_len=32):
        self.samples = []
        for d in base_ds:
            ids = d["input_ids"]
            if len(ids) >= seed_len + gen_len:
                self.samples.append((ids[:seed_len], ids[seed_len:seed_len+gen_len]))
    def __len__(self):              return len(self.samples)
    def __getitem__(self, idx):     return self.samples[idx]

model_dir = "drive/MyDrive/project-main/gpt2-chords-final"
tok = PreTrainedTokenizerFast.from_pretrained(model_dir)
tok.padding_side = "left"
tok.pad_token = "<pad>"
model     = AutoModelForCausalLM.from_pretrained(model_dir)

old_wpe = model.transformer.wpe          # [128, 768]
new_len = 512
new_wpe = torch.nn.Embedding(new_len, old_wpe.embedding_dim)
new_wpe.weight.data[:old_wpe.num_embeddings] = old_wpe.weight.data.clone()
model.transformer.wpe = new_wpe
model.config.n_positions = new_len
model.config.n_ctx       = new_len
model.config.max_position_embeddings = new_len

collator  = DataCollatorForLanguageModeling(
               tokenizer=tok, mlm=False, pad_to_multiple_of=8)

device = torch.device("cuda")
model.to(device).eval()

seed_ds = Seed4Dataset(test_ds, 4, 32)
def collate(batch):
    seeds   = [torch.tensor(s[0]) for s in batch]
    targets = [torch.tensor(s[1]) for s in batch]
    seeds   = torch.nn.utils.rnn.pad_sequence(
                  seeds, batch_first=True, padding_value=tok.pad_token_id)
    return {"seed": seeds, "gold": targets}

loader = DataLoader(seed_ds, batch_size=64, collate_fn=collate,
                    shuffle=False, num_workers=4, pin_memory=True)

pred_step_sum = gold_step_sum = 0
pred_step_cnt = gold_step_cnt = 0

with torch.inference_mode():
    for batch in loader:
        seed_ids = batch["seed"].to("cuda")
        gold_ids = batch["gold"]

        gen_ids =model.generate(seed_ids,max_new_tokens=32,do_sample=True,
                           top_p=0.9,temperature=1.1,
                           no_repeat_ngram_size=3,repetition_penalty=1.1,
                           pad_token_id=tok.pad_token_id,
                           eos_token_id=tok.eos_token_id)

        for gen, gold in zip(gen_ids.cpu(), gold_ids):
            pred_tokens = [t for t in
                           tok.convert_ids_to_tokens(gen.tolist(),
                                                     skip_special_tokens=True)
                           if t not in ("<bos>","<pad>","<eos>")][-32:]
            gold_tokens = tok.convert_ids_to_tokens(gold.tolist())

            for a, b in zip(gold_tokens[:-1], gold_tokens[1:]):
                ra, rb = parse_root(a), parse_root(b)
                if ra is not None and rb is not None:
                    gold_step_sum += circ_dist(ra, rb)
                    gold_step_cnt += 1

            for a, b in zip(pred_tokens[:-1], pred_tokens[1:]):
                ra, rb = parse_root(a), parse_root(b)
                if ra is not None and rb is not None:
                    pred_step_sum += circ_dist(ra, rb)
                    pred_step_cnt += 1

avg_gold = gold_step_sum / gold_step_cnt
avg_pred = pred_step_sum / pred_step_cnt

print(f"‣ Gold 32-chord step distance  = {avg_gold:.3f} semitones")
print(f"‣ Pred 32-chord step distance  = {avg_pred:.3f} semitones")

  seeds   = [torch.tensor(s[0]) for s in batch]
  seeds   = [torch.tensor(s[0]) for s in batch]
  seeds   = [torch.tensor(s[0]) for s in batch]
  targets = [torch.tensor(s[1]) for s in batch]
  targets = [torch.tensor(s[1]) for s in batch]
  seeds   = [torch.tensor(s[0]) for s in batch]
  targets = [torch.tensor(s[1]) for s in batch]
  targets = [torch.tensor(s[1]) for s in batch]


‣ Gold 32-chord step distance  = 3.551 semitones
‣ Pred 32-chord step distance  = 3.175 semitones


CD, CM, TS

In [None]:

import re, math, numpy as np, torch, music21 as m21
from functools import lru_cache
from torch.utils.data import DataLoader
from transformers import (AutoModelForCausalLM,
                          PreTrainedTokenizerFast,
                          DataCollatorForLanguageModeling)

ROOT_RE = re.compile(r"^([A-Ga-g])([#b♯♭]?)(.*)$")
PC_NUM  = {'C':0,'C#':1,'Db':1,'D':2,'D#':3,'Eb':3,'E':4,'F':5,'F#':6,'Gb':6,
           'G':7,'G#':8,'Ab':8,'A':9,'A#':10,'Bb':10,'B':11,'Cb':11}

def circ_dist(a:int,b:int)->int:      # 0-6
    d=abs(a-b); return min(d,12-d)

@lru_cache(maxsize=4096)
def chord_vec(token:str)->np.ndarray:
    pcs=set()
    try:
        cs=m21.harmony.ChordSymbol(token)
        pcs={p.pitchClass for p in cs.pitches}
    except Exception:
        m=ROOT_RE.match(token)
        if m:
            root=PC_NUM[m.group(1).upper()+m.group(2).replace('♯','#').replace('♭','b')]
            pcs={root,(root+7)%12}
            pcs.add((root+3)%12 if 'm' in token.lower() else (root+4)%12)
        else:
            pcs={0}
    v=np.zeros(12); v[list(pcs)]=1
    return v/len(pcs)

def cloud_diam(pcs:set[int])->int:
    if len(pcs)<2: return 0
    return max(circ_dist(a,b) for a in pcs for b in pcs)

def quick_key(chords:list[str])->np.ndarray:
    """root-histogram → tonic triad vector"""
    roots=[c[0].upper() for c in chords if c]
    root=max(set(roots),key=roots.count) if roots else 'C'
    mode='minor' if any(c.lower().endswith(('m','min')) for c in chords) else 'major'
    return chord_vec(f"{root}{'m' if mode=='minor' else ''}")

class Seed4(torch.utils.data.Dataset):
    def __init__(self, base, seed=4, gen=32):
        self.items=[(d["input_ids"][:seed],d["input_ids"][seed:seed+gen])
                    for d in base if len(d["input_ids"])>=seed+gen]
    def __len__(self):  return len(self.items)
    def __getitem__(self,i):
        s,g=self.items[i];return{"seed":torch.tensor(s),"gold":torch.tensor(g)}

def collate(batch):
    seeds=[b["seed"] for b in batch]
    golds=[b["gold"] for b in batch]
    seeds=torch.nn.utils.rnn.pad_sequence(
        seeds,batch_first=True,padding_value=tok.pad_token_id)
    return{"seed":seeds,"gold":golds}


model_dir="drive/MyDrive/project-main/gpt2-chords-final"
tok = PreTrainedTokenizerFast.from_pretrained(model_dir)
tok.padding_side, tok.pad_token="left","<pad>"
model = AutoModelForCausalLM.from_pretrained(model_dir).half().to("cuda").eval()

seed_ds  = Seed4(test_ds,4,32)
loader   = DataLoader(seed_ds,batch_size=64,collate_fn=collate,
                      shuffle=False,num_workers=4,pin_memory=True)

def bucket(): return {"cd":0,"cm":0,"ts":0,"n_cd":0,"n_cm":0,"n_ts":0}
gold, pred = bucket(), bucket()

def update(tokens,acc,tonic_vec):
    prev=None
    for t in tokens:
        v=chord_vec(t); pcs=np.where(v>0)[0]
        acc["cd"]+=cloud_diam(pcs); acc["n_cd"]+=1
        acc["ts"]+=np.linalg.norm(v-tonic_vec); acc["n_ts"]+=1
        if prev is not None:
            acc["cm"]+=np.linalg.norm(v-prev); acc["n_cm"]+=1
        prev=v

with torch.inference_mode():
    for batch in loader:
        seed=batch["seed"].to("cuda")
        gen_ids =model.generate(seed,max_new_tokens=32,do_sample=True,
                           top_p=0.9,temperature=1.1,
                           no_repeat_ngram_size=3,repetition_penalty=1.1,
                           pad_token_id=tok.pad_token_id,
                           eos_token_id=tok.eos_token_id)

        for g_ids,p_ids in zip(batch["gold"], gen.cpu()):
            gold_tok=tok.convert_ids_to_tokens(g_ids.tolist())
            pred_tok=[t for t in tok.convert_ids_to_tokens(
                        p_ids.tolist(),skip_special_tokens=True)
                      if t not in ("<bos>","<pad>","<eos>")][-32:]

            update(gold_tok, gold, quick_key(gold_tok))
            update(pred_tok, pred, quick_key(pred_tok))


def avg(acc,key): return acc[key]/acc[f"n_{key}"] if acc[f"n_{key}"] else 0
print(f"{'Metric':6} | Gold | Pred")
print("-"*28)
for m in ("cd","cm","ts"):
    print(f"{m.upper():6} | {avg(gold,m):4.2f} | {avg(pred,m):4.2f}")

  s,g=self.items[i];return{"seed":torch.tensor(s),"gold":torch.tensor(g)}
  s,g=self.items[i];return{"seed":torch.tensor(s),"gold":torch.tensor(g)}
  s,g=self.items[i];return{"seed":torch.tensor(s),"gold":torch.tensor(g)}
  s,g=self.items[i];return{"seed":torch.tensor(s),"gold":torch.tensor(g)}


Metric | Gold | Pred
----------------------------
CD     | 5.04 | 5.15
CM     | 0.66 | 0.56
TS     | 0.57 | 0.42


Key Consistency

In [None]:
import re, numpy as np, torch, music21 as m21
from torch.utils.data import DataLoader
from transformers import AutoModelForCausalLM, PreTrainedTokenizerFast

ROOT_RE = re.compile(r"^([A-Ga-g])([#b♯♭]?)(.*)$")
PC_NUM  = {'C':0,'C#':1,'Db':1,'D':2,'D#':3,'Eb':3,'E':4,'F':5,'F#':6,'Gb':6,
           'G':7,'G#':8,'Ab':8,'A':9,'A#':10,'Bb':10,'B':11,'Cb':11}

def simplify(tok:str)->str|None:
    """
    ตัดเหลือ root+quality หลัก ('' / m / dim)
    Cmaj7(#11) → C,  F#m7 → F#m, Bdim7 → Bdim
    """
    m = ROOT_RE.match(tok)
    if not m: return None
    root, acc, qual = m.groups()
    root = root.upper() + acc.replace('♯','#').replace('♭','b')
    qual = qual.lower()
    if qual.startswith(('m','min')):   q = 'm'
    elif 'dim' in qual or 'o' in qual: q = 'dim'
    else:                              q = ''
    return root + q

def diatonic_set(key:m21.key.Key)->set[str]:
    """สร้างเซ็ต root+qual ('C','Dm','Edim',…) ที่อยู่ในคีย์"""
    major_map = {1:'',2:'m',3:'m',4:'',5:'',6:'m',7:'dim'}
    minor_map = {1:'m',2:'dim',3:'',4:'m',5:'m',6:'',7:''}
    scale = (m21.scale.MajorScale if key.mode=='major'
             else m21.scale.MinorScale)(key.tonic)
    mapping = major_map if key.mode=='major' else minor_map
    diat = set()
    for deg,q in mapping.items():
        root = scale.pitchFromDegree(deg).name.replace('-', 'b')
        diat.add(root+q)
    return diat

def guess_key(chords:list[str])->m21.key.Key:
    roots = [c[0].upper() for c in chords if c]
    root  = max(set(roots), key=roots.count) if roots else 'C'
    mode  = 'minor' if any(c.lower().endswith(('m','min')) for c in chords) else 'major'
    return m21.key.Key(root, mode)

seed_ds  = Seed4(test_ds,4,32)
def coll(batch):
    seeds=[b["seed"] for b in batch]
    golds=[b["gold"] for b in batch]
    seeds=torch.nn.utils.rnn.pad_sequence(
        seeds,batch_first=True,padding_value=tok.pad_token_id)
    return {"seed":seeds,"gold":golds}

model_dir = "drive/MyDrive/project-main/gpt2-chords-final"
tok   = PreTrainedTokenizerFast.from_pretrained(model_dir)
tok.padding_side, tok.pad_token = "left","<pad>"
model = AutoModelForCausalLM.from_pretrained(model_dir).half().to("cuda").eval()

loader = DataLoader(seed_ds,batch_size=64,collate_fn=coll,
                    shuffle=False,num_workers=4,pin_memory=True)

gold_in = gold_all = pred_in = pred_all = 0  # ตัวนับ

with torch.inference_mode():
    for batch in loader:
        seed = batch["seed"].to("cuda")
        gen_ids =model.generate(seed,max_new_tokens=32,do_sample=True,
                           top_p=0.9,temperature=1.1,
                           no_repeat_ngram_size=3,repetition_penalty=1.1,
                           pad_token_id=tok.pad_token_id,
                           eos_token_id=tok.eos_token_id)

        for g_ids,p_ids in zip(batch["gold"], gen.cpu()):
            gold_tok = tok.convert_ids_to_tokens(g_ids.tolist())
            pred_tok = [t for t in tok.convert_ids_to_tokens(
                          p_ids.tolist(),skip_special_tokens=True)
                        if t not in ("<bos>","<pad>","<eos>")][-32:]

            key_g   = guess_key(gold_tok)
            diat_g  = diatonic_set(key_g)
            for t in gold_tok:
                simp = simplify(t)
                if simp is None: continue
                gold_all += 1
                if simp in diat_g:
                    gold_in += 1

            key_p   = guess_key(pred_tok)
            diat_p  = diatonic_set(key_p)
            for t in pred_tok:
                simp = simplify(t)
                if simp is None: continue
                pred_all += 1
                if simp in diat_p:
                    pred_in += 1

kc_gold = gold_in / gold_all if gold_all else 0.0
kc_pred = pred_in / pred_all if pred_all else 0.0
print(f"Key-Consistency  |  Gold = {kc_gold:.2%}   Pred = {kc_pred:.2%}")

  s,g=self.items[i];return{"seed":torch.tensor(s),"gold":torch.tensor(g)}
  s,g=self.items[i];return{"seed":torch.tensor(s),"gold":torch.tensor(g)}
  s,g=self.items[i];return{"seed":torch.tensor(s),"gold":torch.tensor(g)}
  s,g=self.items[i];return{"seed":torch.tensor(s),"gold":torch.tensor(g)}


Key-Consistency  |  Gold = 36.30%   Pred = 69.23%


CHE (Chords Histrogram entropy
)

In [None]:
import math, torch
from torch.utils.data import DataLoader
from collections import Counter
from transformers import (AutoModelForCausalLM,
                          PreTrainedTokenizerFast)

seed_ds  = Seed4(test_ds, 4, 32)

def collate(batch):
    seeds=[b["seed"] for b in batch]
    golds=[b["gold"] for b in batch]
    seeds=torch.nn.utils.rnn.pad_sequence(
        seeds, batch_first=True, padding_value=tok.pad_token_id)
    return {"seed":seeds, "gold":golds}

model_dir = "drive/MyDrive/project-main/gpt2-chords-final"
tok   = PreTrainedTokenizerFast.from_pretrained(model_dir)
tok.padding_side, tok.pad_token = "left","<pad>"
model = AutoModelForCausalLM.from_pretrained(model_dir).half().to("cuda").eval()

loader = DataLoader(seed_ds, batch_size=64, collate_fn=collate,
                    shuffle=False, num_workers=4, pin_memory=True)

gold_counter = Counter()
pred_counter = Counter()
gold_total = pred_total = 0

with torch.inference_mode():
    for batch in loader:
        seed = batch["seed"].to("cuda")
        gen = model.generate(seed,max_new_tokens=32,do_sample=True,
                           top_p=0.9,temperature=1.1,
                           no_repeat_ngram_size=3,repetition_penalty=1.1,
                           pad_token_id=tok.pad_token_id,
                           eos_token_id=tok.eos_token_id)

        for g_ids, p_ids in zip(batch["gold"], gen.cpu()):
            gold_tok = tok.convert_ids_to_tokens(g_ids.tolist())
            gold_counter.update(gold_tok);  gold_total += len(gold_tok)

            pred_tok = [t for t in tok.convert_ids_to_tokens(
                          p_ids.tolist(), skip_special_tokens=True)
                        if t not in ("<bos>","<pad>","<eos>")][-32:]
            pred_counter.update(pred_tok);  pred_total += len(pred_tok)

def entropy(counter, total):
    return -sum((c/total) * math.log2(c/total)
                for c in counter.values() if c)

che_gold = entropy(gold_counter, gold_total)
che_pred = entropy(pred_counter, pred_total)

print(f"Chord-Histogram Entropy")
print(f"  Gold-32 : {che_gold:.3f} bits")
print(f"  Pred-32 : {che_pred:.3f} bits")

  s,g=self.items[i];return{"seed":torch.tensor(s),"gold":torch.tensor(g)}
  s,g=self.items[i];return{"seed":torch.tensor(s),"gold":torch.tensor(g)}
  s,g=self.items[i];return{"seed":torch.tensor(s),"gold":torch.tensor(g)}
  s,g=self.items[i];return{"seed":torch.tensor(s),"gold":torch.tensor(g)}


Chord-Histogram Entropy
  Gold-32 : 5.039 bits
  Pred-32 : 6.863 bits
