### **FINAL PIPELINE**

In [1]:
# =============================================================
# NLBSE'26 – FINAL DISTILLATION PIPELINE
# OFFICIAL-COMPATIBLE RUNTIME + GFLOPs MEASUREMENT
# =============================================================

import os
import time
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F

from datasets import load_dataset, concatenate_datasets, Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
)

from torch.profiler import profile, ProfilerActivity

# -----------------------------
# ENV
# -----------------------------
os.environ["WANDB_MODE"] = "disabled"
os.environ["WANDB_DISABLED"] = "true"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# -----------------------------
# CONFIG
# -----------------------------
TEACHER_MODEL = "microsoft/codebert-base"
STUDENT_MODEL = "sentence-transformers/all-MiniLM-L6-v2"

SEED = 42
VAL_SPLIT = 0.12
LABEL_SMOOTHING = 0.03

TEACHER_EPOCHS = {"java": 12, "python": 25, "pharo": 25}
STUDENT_EPOCHS = {"java": 25, "python": 40, "pharo": 50}

MAX_LEN_T = 128
MAX_LEN_S = 64
BATCH_SIZE_INFER = 32
N_RUNS = 10

langs = ["java", "python", "pharo"]

LABELS = {
    "java": ["summary","Ownership","Expand","usage","Pointer","deprecation","rational"],
    "python": ["Usage","Parameters","DevelopmentNotes","Expand","Summary"],
    "pharo": ["Keyimplementationpoints","Example","Responsibilities","Intent","Keymessages","Collaborators"],
}
SEMANTIC_LABELS = {
    "java": {"Expand", "rational"},
    "python": {"DevelopmentNotes", "Expand"},
    "pharo": {"Responsibilities", "Intent", "Collaborators"},
}
BOOSTERS = {
    "java": "/content/java_semantic_boosters.csv",
    "python": "/content/python_semantic_boosters.csv",
    "pharo": "/content/pharo_semantic_boosters.csv",
}

# -----------------------------
# LOAD DATA
# -----------------------------
ds_all = load_dataset("NLBSE/nlbse26-code-comment-classification")

# -----------------------------
# UTILS
# -----------------------------
def sigmoid_np(x):
    return 1 / (1 + np.exp(-x))

def compute_pos_weight(labels, power=0.75):
    arr = np.array(labels)
    pos = arr.sum(axis=0)
    neg = len(arr) - pos
    return torch.tensor((neg / (pos + 1e-9)) ** power, dtype=torch.float32)

def macro_f1(y_true, y_pred):
    f1s = []
    for j in range(y_true.shape[1]):
        yt, yp = y_true[:, j], y_pred[:, j]
        tp = np.sum((yt==1)&(yp==1))
        fp = np.sum((yt==0)&(yp==1))
        fn = np.sum((yt==1)&(yp==0))
        denom = 2*tp+fp+fn
        f1s.append((2*tp/denom) if denom>0 else 0)
    return float(np.mean(f1s))

def tune_thresholds_two_stage(logits, y_true, steps1=15, steps2=20):
    probs = sigmoid_np(logits)
    y_true = np.array(y_true)

    best_t, best_f1 = 0.5, -1
    for t in np.linspace(0.2, 0.6, steps1):
        preds = (probs >= t).astype(int)
        f1 = macro_f1(y_true, preds)
        if f1 > best_f1:
            best_f1, best_t = f1, t

    thresholds = np.full(probs.shape[1], best_t)
    for j in range(probs.shape[1]):
        for t in np.linspace(max(0.05, best_t - 0.2), min(0.95, best_t + 0.2), steps2):
            tmp = thresholds.copy()
            tmp[j] = t
            preds = (probs >= tmp).astype(int)
            f1 = macro_f1(y_true, preds)
            if f1 > best_f1:
                best_f1 = f1
                thresholds[j] = t

    return thresholds

# -----------------------------
# KD TRAINER
# -----------------------------
class DistillTrainer(Trainer):
    def __init__(self, pos_weight, kd_alpha=0.6, kd_label_weights=None, **kw):
        super().__init__(**kw)
        self.pos_weight = pos_weight
        self.kd_alpha = kd_alpha
        self.kd_label_weights = kd_label_weights

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        teacher_logits = inputs.pop("teacher_logits", None)
        labels = inputs.pop("labels").float().to(model.device)

        out = model(**inputs)
        logits = out.logits

        smooth = labels * (1-LABEL_SMOOTHING) + 0.5 * LABEL_SMOOTHING
        loss_hard = F.binary_cross_entropy_with_logits(
            logits, smooth, pos_weight=self.pos_weight.to(logits.device)
        )

        if teacher_logits is not None:
            t = teacher_logits.to(logits.device)

            # <<< FIX 1: PER-SAMPLE STANDARDIZATION >>>
            t = (t - t.mean(dim=1, keepdim=True)) / (t.std(dim=1, keepdim=True) + 1e-6)
            s = (logits - logits.mean(dim=1, keepdim=True)) / (logits.std(dim=1, keepdim=True) + 1e-6)

            # <<< FIX 2: MSE DIRECTLY ON STANDARDIZED LOGITS >>>
            kd_raw = (s - t) ** 2

            if self.kd_label_weights is not None:
                kd_raw = kd_raw * self.kd_label_weights.to(kd_raw.device)

            loss_kd = kd_raw.mean()
            loss = self.kd_alpha * loss_kd + (1 - self.kd_alpha) * loss_hard
        else:
            loss = loss_hard

        return (loss, out) if return_outputs else loss

# -----------------------------
# TOKENIZERS
# -----------------------------
tok_t = AutoTokenizer.from_pretrained(TEACHER_MODEL)
tok_s = AutoTokenizer.from_pretrained(STUDENT_MODEL)

def tok_teacher(b):
    out = tok_t(
        b["combo"],
        truncation=True,
        padding="max_length",
        max_length=MAX_LEN_T
    )
    out["labels"] = [[float(x) for x in lbl] for lbl in b["labels"]]
    return out

# -----------------------------
# OFFICIAL-STYLE MEASUREMENT
# -----------------------------
def measure_runtime_and_gflops(model, dataset):
    model.eval()
    torch.cuda.empty_cache()
    if device.type == "cuda":
        torch.cuda.synchronize()

    loader = torch.utils.data.DataLoader(
        dataset,
        batch_size=BATCH_SIZE_INFER,
        shuffle=False,
        pin_memory=True
    )

    begin = time.time()
    with profile(
        activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
        with_flops=True,
    ) as prof:
        with torch.no_grad():
            for _ in range(N_RUNS):
                for batch in loader:
                    inputs = {
                        "input_ids": batch["input_ids"].to(device, non_blocking=True),
                        "attention_mask": batch["attention_mask"].to(device, non_blocking=True),
                    }
                    _ = model(**inputs)

    if device.type == "cuda":
        torch.cuda.synchronize()

    total_time = time.time() - begin
    total_flops = sum(e.flops for e in prof.key_averages()) / 1e9

    return total_time / N_RUNS, total_flops / N_RUNS

# -----------------------------
# MAIN LOOP
# -----------------------------
all_scores = []
total_runtime = 0.0
total_gflops = 0.0

for lang in langs:
    print(f"\n=== {lang.upper()} ===")

    labels = LABELS[lang]

    #base = concatenate_datasets([ds_all[f"{lang}_train"], ds_aug[f"{lang}_train"]])
    base = ds_all[f"{lang}_train"]
    split = base.train_test_split(test_size=VAL_SPLIT, seed=SEED)
    train_raw, val_raw = split["train"], split["test"]
    test_raw = ds_all[f"{lang}_test"]

    # =============================
    # TEACHER TRAINING (+ BOOSTERS)
    # =============================
    train_teacher_raw = train_raw
    path = BOOSTERS.get(lang)
    if path and os.path.exists(path):
        dfb = pd.read_csv(path)
        dfb["labels"] = dfb["labels"].apply(eval)
        booster = Dataset.from_pandas(dfb).select_columns(["combo", "labels"])
        train_teacher_raw = concatenate_datasets([train_raw, booster])

    pos_t = compute_pos_weight(train_teacher_raw["labels"])

    tt = train_teacher_raw.map(tok_teacher, batched=True, remove_columns=train_teacher_raw.column_names)
    from datasets import Sequence, Value
    tt = tt.cast_column("labels", Sequence(Value("float32")))
    tt.set_format("torch")


    class TeacherTrainer(Trainer):
        def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
            labels = inputs.pop("labels").to(model.device)
            outputs = model(**inputs)
            logits = outputs.logits
            loss = F.binary_cross_entropy_with_logits(
                logits, labels, pos_weight=pos_t.to(logits.device)
            )
            return (loss, outputs) if return_outputs else loss

    teacher = AutoModelForSequenceClassification.from_pretrained(
        TEACHER_MODEL,
        num_labels=len(labels),
        problem_type="multi_label_classification"
    ).to(device)

    t_trainer = TeacherTrainer(
        model=teacher,
        args=TrainingArguments(
            output_dir=f"teacher_{lang}",
            per_device_train_batch_size=16,
            num_train_epochs=TEACHER_EPOCHS[lang],
            fp16=True,
            save_strategy="no",
            report_to=[]
        ),
        train_dataset=tt
    )

    t_trainer.train()
    teacher_logits = t_trainer.predict(tt).predictions

    # =============================
    # STUDENT DATA (KD)
    # =============================
    def tok_student_kd(b, idx):
        out = tok_s(
            b["combo"],
            truncation=True,
            padding="max_length",
            max_length=MAX_LEN_S
        )
        out["labels"] = b["labels"]
        out["teacher_logits"] = teacher_logits[idx]
        return out

    ts = train_raw.map(tok_student_kd, with_indices=True, batched=True, remove_columns=train_raw.column_names)

    vs = val_raw.map(
        lambda b, i: tok_student_kd(b, i),
        with_indices=True,
        batched=True,
        remove_columns=val_raw.column_names
    )

    te = test_raw.map(
        lambda b: tok_s(
            b["combo"],
            truncation=True,
            padding="max_length",
            max_length=MAX_LEN_S
        ) | {"labels": b["labels"]},
        batched=True,
        remove_columns=test_raw.column_names
    )

    ts.set_format("torch")
    vs.set_format("torch")
    te.set_format("torch")

    pos_s = compute_pos_weight(train_raw["labels"])

    kd_weights = torch.ones(len(labels))
    for i, lbl in enumerate(labels):
        if lbl in SEMANTIC_LABELS.get(lang, set()):
            kd_weights[i] = 1.5

    student = AutoModelForSequenceClassification.from_pretrained(
        STUDENT_MODEL,
        num_labels=len(labels),
        problem_type="multi_label_classification"
    ).to(device)

    trainer = DistillTrainer(
        model=student,
        args=TrainingArguments(
            output_dir=f"student_{lang}",
            per_device_train_batch_size=32,
            num_train_epochs=STUDENT_EPOCHS[lang],
            fp16=True,
            save_strategy="no",
            report_to=[]
        ),
        train_dataset=ts,
        pos_weight=pos_s,
        kd_alpha=0.65,
        kd_label_weights=kd_weights
    )

    trainer.train()

    avg_rt, avg_fl = measure_runtime_and_gflops(student, te)
    total_runtime += avg_rt
    total_gflops += avg_fl

    # <<< FIX 4: THRESHOLDS TUNED ON VAL >>>
    val_logits = trainer.predict(vs).predictions
    thresholds = tune_thresholds_two_stage(val_logits, val_raw["labels"])

    test_logits = trainer.predict(te).predictions
    preds = (sigmoid_np(test_logits) >= thresholds).astype(int)

    y_true = np.array(test_raw["labels"])
    for i, lbl in enumerate(labels):
        yt, yp = y_true[:, i], preds[:, i]
        tp = np.sum((yt==1)&(yp==1))
        fp = np.sum((yt==0)&(yp==1))
        fn = np.sum((yt==1)&(yp==0))
        f1 = (2*tp)/(2*tp+fp+fn) if (tp+fp+fn)>0 else 0
        all_scores.append({"language": lang, "category": lbl, "f1": f1})

# -----------------------------
# FINAL REPORT
# -----------------------------
df = pd.DataFrame(all_scores)
avg_f1 = df["f1"].mean()

print(df)
print("\nMACRO F1:", avg_f1)
print("Avg runtime (s):", total_runtime / len(langs))
print("Avg GFLOPs:", total_gflops / len(langs))


Using device: cuda


README.md: 0.00B [00:00, ?B/s]

data/java_train-00000-of-00001.parquet:   0%|          | 0.00/500k [00:00<?, ?B/s]

data/java_test-00000-of-00001.parquet:   0%|          | 0.00/129k [00:00<?, ?B/s]

data/pharo_train-00000-of-00001.parquet:   0%|          | 0.00/79.6k [00:00<?, ?B/s]

data/pharo_test-00000-of-00001.parquet:   0%|          | 0.00/23.4k [00:00<?, ?B/s]

data/python_train-00000-of-00001.parquet:   0%|          | 0.00/95.8k [00:00<?, ?B/s]

data/python_test-00000-of-00001.parquet:   0%|          | 0.00/27.2k [00:00<?, ?B/s]

Generating java_train split:   0%|          | 0/5394 [00:00<?, ? examples/s]

Generating java_test split:   0%|          | 0/1201 [00:00<?, ? examples/s]

Generating pharo_train split:   0%|          | 0/900 [00:00<?, ? examples/s]

Generating pharo_test split:   0%|          | 0/208 [00:00<?, ? examples/s]

Generating python_train split:   0%|          | 0/1368 [00:00<?, ? examples/s]

Generating python_test split:   0%|          | 0/290 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/498 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]


=== JAVA ===


Map:   0%|          | 0/4786 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/4786 [00:00<?, ? examples/s]

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Step,Training Loss
500,0.3576
1000,0.1465
1500,0.0759
2000,0.0318
2500,0.022
3000,0.0095
3500,0.0054


Map:   0%|          | 0/4746 [00:00<?, ? examples/s]

Map:   0%|          | 0/648 [00:00<?, ? examples/s]

Map:   0%|          | 0/1201 [00:00<?, ? examples/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at sentence-transformers/all-MiniLM-L6-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
500,0.6073
1000,0.4205
1500,0.3759
2000,0.3612
2500,0.3561
3000,0.3535
3500,0.3522



=== PYTHON ===


Map:   0%|          | 0/1223 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1223 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
500,0.4065
1000,0.0561
1500,0.0078


Map:   0%|          | 0/1203 [00:00<?, ? examples/s]

Map:   0%|          | 0/165 [00:00<?, ? examples/s]

Map:   0%|          | 0/290 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at sentence-transformers/all-MiniLM-L6-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
500,0.4399
1000,0.1809
1500,0.1654



=== PHARO ===


Map:   0%|          | 0/812 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/812 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
500,0.2882
1000,0.0194


Map:   0%|          | 0/792 [00:00<?, ? examples/s]

Map:   0%|          | 0/108 [00:00<?, ? examples/s]

Map:   0%|          | 0/208 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at sentence-transformers/all-MiniLM-L6-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
500,0.4374
1000,0.2261


   language                 category        f1
0      java                  summary  0.887490
1      java                Ownership  0.982456
2      java                   Expand  0.409091
3      java                    usage  0.860034
4      java                  Pointer  0.870504
5      java              deprecation  0.615385
6      java                 rational  0.318182
7    python                    Usage  0.717949
8    python               Parameters  0.793103
9    python         DevelopmentNotes  0.475000
10   python                   Expand  0.563636
11   python                  Summary  0.620690
12    pharo  Keyimplementationpoints  0.567568
13    pharo                  Example  0.868132
14    pharo         Responsibilities  0.660377
15    pharo                   Intent  0.863636
16    pharo              Keymessages  0.591549
17    pharo            Collaborators  0.375000

MACRO F1: 0.6688767561556634
Avg runtime (s): 0.8244324366251629
Avg GFLOPs: 769.9720276480001


In [2]:
max_avg_runtime = 5
max_avg_flops = 5000
avg_runtime = total_runtime / len(langs)
avg_flops = total_gflops / len(langs)
# s𝑢𝑏𝑚𝑖𝑠𝑠𝑖𝑜𝑛_𝑠𝑐𝑜𝑟𝑒(𝑚𝑜𝑑𝑒𝑙)=(𝑎𝑣𝑔. 𝐹1)×0.60+((𝑚𝑎𝑥_𝑎𝑣𝑔_𝑟𝑢𝑛𝑡𝑖𝑚𝑒−𝑚𝑒𝑎𝑠𝑢𝑟𝑒𝑑_𝑎𝑣𝑔_𝑟𝑢𝑛𝑡𝑖𝑚𝑒)/𝑚𝑎𝑥_𝑎𝑣𝑔_𝑟𝑢𝑛𝑡𝑖𝑚𝑒)×0.2+((𝑚𝑎𝑥_GFLOPs−𝑚𝑒𝑎𝑠𝑢𝑟𝑒𝑑_GFLOPs)/𝑚𝑎𝑥_GFLOPs)×0.2
def score(avg_f1, avg_runtime, avg_flops):
    return (0.6 * avg_f1 +
      0.2 * ((max_avg_runtime - avg_runtime) / max_avg_runtime) +
      0.2 * ((max_avg_flops - avg_flops) / max_avg_flops))

print("FINAL SUBMISSION SCORE   :", {round(score(avg_f1, avg_runtime, avg_flops),2)})
print("-" * 40)



FINAL SUBMISSION SCORE   : {np.float64(0.74)}
----------------------------------------
