## Data Preparation

In [18]:
import json
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoConfig

In [19]:
class NERDataset(Dataset):
    def __init__(self, data_path, tokenizer, label_pad_id=-100, max_length=128):
        with open(data_path, "r", encoding="utf-8") as f:
            raw = json.load(f)["examples"]
        self.data = raw
        self.tokenizer = tokenizer
        self.label_pad_id = label_pad_id
        self.max_length = max_length

    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        tokens = self.data[idx]["tokens"]
        ner_tags = self.data[idx]["ner_tags"]

        # buat encoding untuk tokens 
        encoding = self.tokenizer(
            tokens,
            is_split_into_words=True,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors="pt"
        )

        # align labels dengan tokens yang sudah diencoding (jadi kepotong2 sesuai tokenization)
        word_ids = encoding.word_ids(batch_index=0)
        aligned_labels = []
        previous_word_idx = None
        for word_idx in word_ids:
            if word_idx is None:
                aligned_labels.append(self.label_pad_id)
            elif word_idx != previous_word_idx:
                aligned_labels.append(ner_tags[word_idx])
            else:
                aligned_labels.append(self.label_pad_id)
            previous_word_idx = word_idx
        
        item = {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": torch.tensor(aligned_labels, dtype=torch.long)
        }

        return item

In [20]:
def load_label_info(model_name):
    config = AutoConfig.from_pretrained(model_name)
    id2label = config.id2label
    label2id = config.label2id
    num_labels = config.num_labels

    label_info = {
        "id2label": id2label,
        "label2id": label2id,
        "num_labels": num_labels
    }

    return label_info

def create_dataloaders(
        train_path, val_path, test_path,
        model_name,
        batch_size=32,
        max_length=128
):
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    train_dataset = NERDataset(train_path, tokenizer, max_length=max_length)
    val_dataset = NERDataset(val_path, tokenizer, max_length=max_length)
    test_dataset = NERDataset(test_path, tokenizer, max_length=max_length)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    return train_loader, val_loader, test_loader

In [21]:
train_loader, val_loader, test_loader = create_dataloaders(
    train_path=r"D:\Dafa\Project\queryner-kd\data\processed\train.json",
    val_path=r"D:\Dafa\Project\queryner-kd\data\processed\validation.json",
    test_path=r"D:\Dafa\Project\queryner-kd\data\processed\test.json",
    model_name="bltlab/queryner-augmented-data-bert-base-uncased",
    batch_size=16,
    max_length=128
)

label_info = load_label_info("bltlab/queryner-augmented-data-bert-base-uncased")

In [38]:
batch = next(iter(train_loader))
for key, value in batch.items():
    print(f"{key}: {value.shape}")

input_ids: torch.Size([16, 128])
attention_mask: torch.Size([16, 128])
labels: torch.Size([16, 128])


## Model Architecture

In [22]:
from torch import nn
from torchcrf import CRF
from transformers import AutoModel, AutoConfig

In [23]:
class CRFOutputLayer(nn.Module):
    def __init__(self, hidden_dim, num_labels):
        super().__init__()
        self.fc = nn.Linear(hidden_dim, num_labels)
        self.crf = CRF(num_tags=num_labels, batch_first=True)

    def forward(self, outputs, labels=None, mask=None):
        emissions = self.fc(outputs)

        if labels is not None:
            # CRF requires first token to be valid, so we create a modified mask
            # that ensures first token is always included
            if mask is None:
                mask = torch.ones_like(labels, dtype=torch.bool)
            else:
                mask = mask.bool()
            
            # Ensure first position is always valid for CRF
            mask[:, 0] = True
            
            # Replace -100 with 0 (dummy label) to avoid index issues
            labels_crf = labels.clone()
            labels_crf[labels == -100] = 0
            
            # Calculate loss
            log_likelihood = self.crf(emissions, tags=labels_crf, mask=mask, reduction="mean")
            loss = -log_likelihood
            return {"logits": emissions, "loss": loss}
        else:
            if mask is None:
                mask = torch.ones(outputs.shape[:2], dtype=torch.bool, device=outputs.device)
            pred = self.crf.decode(emissions, mask=mask.bool())
            return {"logits": emissions, "pred": pred}


In [24]:
class BaseNERModel(nn.Module):
    def __init__(self, num_labels, use_crf=False):
        super().__init__()
        self.num_labels = num_labels
        self.use_crf = use_crf

    def forward(self, input_ids, attention_mask, labels=None):
        raise NotImplementedError("Forward method must be implemented in subclass.")

In [25]:
class QueryNERTeacher(BaseNERModel):
    def __init__(self, model_name, label_info, use_crf=False):
        super().__init__(num_labels=label_info["num_labels"], use_crf=use_crf)

        self.config = AutoConfig.from_pretrained(
            model_name,
            num_labels=label_info["num_labels"],
            id2label=label_info["id2label"],
            label2id=label_info["label2id"]
        )

        self.bert = AutoModel.from_pretrained(model_name, config=self.config)
        self.dropout = nn.Dropout(0.1)

        if self.use_crf:
            self.crf_output = CRFOutputLayer(self.config.hidden_size, self.config.num_labels)
        else:
            self.classifier = nn.Linear(self.config.hidden_size, self.config.num_labels)
            self.loss_fn = nn.CrossEntropyLoss(ignore_index=-100)

    def forward(self, input_ids, attention_mask, labels=None):

        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        sequence_output = self.dropout(outputs.last_hidden_state)

        if self.use_crf:
            mask = attention_mask.bool()
            result = self.crf_output(sequence_output, labels=labels, mask=mask)
            return result

        else:
            logits = self.classifier(sequence_output)
            if labels is not None:
                loss = self.loss_fn(logits.view(-1, self.num_labels), labels.view(-1))
                return {"logits": logits, "loss": loss}
            else:
                pred = logits.argmax(dim=-1)
                return {"logits": logits, "pred": pred}

In [26]:
class DistilBERTStudent(BaseNERModel):
    def __init__(self, model_name="distilbert-base-uncased", label_info=None, use_crf=False):
        self.use_crf = use_crf
        self.num_labels = label_info["num_labels"]
        super().__init__(num_labels=self.num_labels, use_crf=self.use_crf)

        self.config = AutoConfig.from_pretrained(
            model_name,
            num_labels=label_info["num_labels"],
            id2label=label_info["id2label"],
            label2id=label_info["label2id"]
        )

        self.bert = AutoModel.from_pretrained(model_name, config=self.config)
        self.dropout = nn.Dropout(0.1)

        if self.use_crf:
            self.crf_output = CRFOutputLayer(self.config.hidden_size, self.num_labels)
        else:
            self.classifier = nn.Linear(self.config.hidden_size, self.num_labels)
            self.loss_fn = nn.CrossEntropyLoss(ignore_index=-100)

    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        sequence_output = self.dropout(outputs.last_hidden_state)

        if self.use_crf:
            mask = attention_mask.bool()
            result = self.crf_output(sequence_output, labels=labels, mask=mask)
            return result
        else:
            logits = self.classifier(sequence_output)
            if labels is not None:
                loss = self.loss_fn(logits.view(-1, self.num_labels), labels.view(-1))
                return {"logits": logits, "loss": loss}
            else:
                pred = logits.argmax(dim=-1)
                return {"logits": logits, "pred": pred}


In [27]:
class TinyBertStudent(BaseNERModel):
    def __init__(self, model_name="huawei-noah/TinyBERT_General_4L_312D", label_info=None, use_crf=False):
        self.use_crf = use_crf
        self.num_labels = label_info["num_labels"]
        super().__init__(num_labels=self.num_labels, use_crf=self.use_crf)

        self.config = AutoConfig.from_pretrained(
            model_name,
            num_labels=label_info["num_labels"],
            id2label=label_info["id2label"],
            label2id=label_info["label2id"]
        )

        self.bert = AutoModel.from_pretrained(model_name, config=self.config)
        self.dropout = nn.Dropout(0.1)

        if self.use_crf:
            self.crf_output = CRFOutputLayer(self.config.hidden_size, self.num_labels)
        else:
            self.classifier = nn.Linear(self.config.hidden_size, self.num_labels)
            self.loss_fn = nn.CrossEntropyLoss(ignore_index=-100)

    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        sequence_output = self.dropout(outputs.last_hidden_state)

        if self.use_crf:
            mask = attention_mask.bool()
            result = self.crf_output(sequence_output, labels=labels, mask=mask)
            return result
        else:
            logits = self.classifier(sequence_output)
            if labels is not None:
                loss = self.loss_fn(logits.view(-1, self.num_labels), labels.view(-1))
                return {"logits": logits, "loss": loss}
            else:
                pred = logits.argmax(dim=-1)
                return {"logits": logits, "pred": pred}

In [54]:
class BiLSTMStudent(BaseNERModel):
    def __init__(
            self, 
            num_labels, 
            use_crf=False,
            model_name_for_vocab = 'bert-base-uncased',
            emb_dim = 300,
            lstm_hidden = 300,
            label_info = None,
            pad_token_id = 0
        ):
        super().__init__(num_labels, use_crf)
        self.use_crf = use_crf
        self.num_labels = num_labels

        self.config = AutoConfig.from_pretrained(model_name_for_vocab)
        vocab_size = self.config.vocab_size
        pad_token_id = self.config.pad_token_id

        self.embedding = nn.Embedding(vocab_size, emb_dim, padding_idx=pad_token_id)
        self.dropout = nn.Dropout(0.1)
        self.lstm = nn.LSTM(
            input_size=emb_dim,
            hidden_size=lstm_hidden,
            num_layers=1,
            batch_first=True,
            bidirectional=True
        )
        self.classifier = nn.Linear(lstm_hidden * 2, num_labels)
        self.loss_fn = nn.CrossEntropyLoss(ignore_index=-100)

        if self.use_crf:
            self.crf_output = CRFOutputLayer(hidden_dim=lstm_hidden * 2, num_labels=num_labels)

    def forward(self, input_ids, attention_mask, labels=None):
        emb = self.embedding(input_ids)
        emb = self.dropout(emb)
        outputs, _ = self.lstm(emb)
        sequence_output = outputs

        if self.use_crf:
            mask = attention_mask.bool()
            result = self.crf_output(sequence_output, labels=labels, mask=mask)
            return result
        else:
            logits = self.classifier(sequence_output)
            if labels is not None:
                loss = self.loss_fn(logits.view(-1, self.num_labels), labels.view(-1))
                return {"logits": logits, "loss": loss}
            else:
                pred = logits.argmax(dim=-1)
                return {"logits": logits, "pred": pred}


In [60]:
teacher = QueryNERTeacher(model_name="bltlab/queryner-augmented-data-bert-base-uncased", label_info=label_info, use_crf=True)
student_distilbert = DistilBERTStudent(model_name="distilbert-base-uncased", label_info=label_info, use_crf=True)
student_tinybert = TinyBertStudent(model_name="huawei-noah/TinyBERT_General_4L_312D", label_info=label_info, use_crf=True)
student_bilstm = BiLSTMStudent(num_labels=label_info["num_labels"], use_crf=True, label_info=label_info)

Some weights of BertModel were not initialized from the model checkpoint at bltlab/queryner-augmented-data-bert-base-uncased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Efficiency Evaluation

In [49]:
import torch

def count_parameters(model):
    total = sum(p.numel() for p in model.parameters())
    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    return total, trainable

# Example:
# total, trainable = count_parameters(teacher)
# print(f"Total params: {total:,}, Trainable: {trainable:,}")


In [50]:
from ptflops import get_model_complexity_info
import torch
import torch.nn as nn

# Create a wrapper that accepts input shape (batch, seq_len)
class ModelWrapper(nn.Module):
    def __init__(self, model):
        super().__init__()
        self.model = model

    def forward(self, input_ids, attention_mask):
        # model returns dict with "logits"
        out = self.model(input_ids=input_ids, attention_mask=attention_mask)
        # return logits (B, L, C) -> ptflops expects tensor outputs
        return out["logits"]

# Example usage:
def compute_flops_ptflops(model, seq_len=128, batch_size=1, device='cuda'):
    wrapper = ModelWrapper(model).to(device)
    # ptflops expects input resolution shape: (channels, H, W) style; but it supports custom input constructor
    # We'll pass a custom input constructor via lambda (1D sequence)
    # But simpler: create a fake input and call get_model_complexity_info with input_res=(seq_len,)
    macs, params = get_model_complexity_info(
        wrapper, (seq_len,), as_strings=False,
        print_per_layer_stat=False, verbose=False
    )
    # ptflops returns MACs (multiply-adds) usually, convert to FLOPs: FLOPs â‰ˆ 2 * MACs (approx)
    flops = 2 * macs
    return flops, params

# NOTE: ptflops may require the model forward signature to accept a single tensor; the wrapper above returns logits from dict.
# If ptflops fails with your transformer forward signature, use thop.profile with custom inputs instead.

from thop import profile
import torch

def compute_flops_thop(model, seq_len=128, batch_size=1, device='cuda', vocab_size=None):
    model = model.to(device).eval()
    input_ids = torch.randint(0, 1000, (batch_size, seq_len), dtype=torch.long, device=device)
    attention_mask = torch.ones_like(input_ids, device=device)
    macs, params = profile(model, inputs=(input_ids, attention_mask), verbose=False)
    flops = 2 * macs
    return flops, params


In [51]:
def estimate_crf_flops(batch_size, seq_len, num_tags, factor=2):
    # factor=2 for multiply+add per transition
    flops_per_seq = factor * seq_len * (num_tags ** 2)
    return batch_size * flops_per_seq


In [None]:
import time
import torch
import numpy as np

def measure_latency(model, tokenizer_or_none, device='cuda', seq_len=128, batch_size=1,
                    n_warmup=20, n_iters=200, use_attention_mask=True):
    model = model.to(device).eval()
    # prepare random inputs (use tokenizer for realistic ids if available)
    # but random ints in vocab range is fine for measuring compute
    vocab_size = getattr(model.config, "vocab_size", 30522)
    input_ids = torch.randint(0, vocab_size, (batch_size, seq_len), dtype=torch.long, device=device)
    attention_mask = torch.ones_like(input_ids, device=device)

    # warmup
    with torch.no_grad():
        for _ in range(n_warmup):
            _ = model(input_ids=input_ids, attention_mask=attention_mask)

    # timed runs
    timings = []
    with torch.no_grad():
        for _ in range(n_iters):
            t0 = time.time()
            _ = model(input_ids=input_ids, attention_mask=attention_mask)
            if device.startswith('cuda'):
                torch.cuda.synchronize()
            t1 = time.time()
            timings.append((t1 - t0) * 1000.0)

    timings = np.array(timings)
    avg_ms = timings.mean()
    p50, p90, p99 = np.percentile(timings, [50, 90, 99])
    throughput = batch_size / (avg_ms / 1000.0)
    return {"avg_ms": avg_ms, "p50_ms": p50, "p90_ms": p90, "p99_ms": p99, "throughput_s": throughput}


In [61]:
models = {
    "teacher": teacher,
    "distil": student_distilbert,
    "tiny": student_tinybert,
    "bilstm": student_bilstm
}

seq_len = 128
batch_size = 1
device = 'cuda' if torch.cuda.is_available() else 'cpu'

for name, m in models.items():
    total_params, trainable = count_parameters(m)
    print(f"{name} params: total={total_params:,}, trainable={trainable:,}")

    flops, params = compute_flops_thop(m, seq_len=seq_len, batch_size=batch_size, device=device)
    num_tags = m.num_labels
    crf_flops = estimate_crf_flops(batch_size, seq_len, num_tags, factor=2) if getattr(m, 'use_crf', False) else 0
    print(f"{name} approx flops: {flops + crf_flops:,}")

    latency = measure_latency(m, None, device=device, seq_len=seq_len, batch_size=batch_size)
    print(f"{name} latency (ms/sample): avg {latency['avg_ms']:.2f}, p90 {latency['p90_ms']:.2f}, throughput {latency['throughput_s']:.1f} samples/s")

    print("="*80)

teacher params: total=109,511,218, trainable=109,511,218
teacher approx flops: 21,771,307,264.0
teacher latency (ms/sample): avg 22.31, p90 25.95, throughput 44.8 samples/s
distil params: total=66,391,090, trainable=66,391,090
distil approx flops: 10,889,054,464.0
distil latency (ms/sample): avg 19.26, p90 22.37, throughput 51.9 samples/s
tiny params: total=14,362,498, trainable=14,362,498
tiny approx flops: 1,171,671,424.0
tiny latency (ms/sample): avg 17.24, p90 20.70, throughput 58.0 samples/s
bilstm params: total=10,644,765, trainable=10,644,765
bilstm approx flops: 376,787,200.0
bilstm latency (ms/sample): avg 19.96, p90 22.82, throughput 50.1 samples/s


In [62]:
import json

results = {}   # store all models' results here

models = {
    "teacher": teacher,
    "distil": student_distilbert,
    "tiny": student_tinybert,
    "bilstm": student_bilstm
}

seq_len = 128
batch_size = 1
device = 'cuda' if torch.cuda.is_available() else 'cpu'

for name, m in models.items():
    total_params, trainable = count_parameters(m)

    # FLOPs via thop
    flops, params = compute_flops_thop(
        m,
        seq_len=seq_len,
        batch_size=batch_size,
        device=device
    )

    # CRF FLOPs
    num_tags = m.num_labels
    crf_flops = estimate_crf_flops(
        batch_size,
        seq_len,
        num_tags,
        factor=2
    ) if getattr(m, 'use_crf', False) else 0

    total_flops = flops + crf_flops

    # Latency measurement
    latency = measure_latency(
        m,
        None,
        device=device,
        seq_len=seq_len,
        batch_size=batch_size
    )

    # Store results in dict
    results[name] = {
        "parameters": {
            "total": total_params,
            "trainable": trainable,
        },
        "flops": {
            "base_flops": flops,
            "crf_flops": crf_flops,
            "total_flops": total_flops
        },
        "latency_ms": {
            "avg": latency["avg_ms"],
            "p50": latency["p50_ms"],
            "p90": latency["p90_ms"],
            "p99": latency["p99_ms"],
            "throughput_samples_per_sec": latency["throughput_s"]
        }
    }

    # Print JSON output per model
    # print(json.dumps({name: results[name]}, indent=4))
    # print("="*80)

# After the loop, you can save all results:
with open(r"results\efficiency\eff-results-crf.json", "w") as f:
    json.dump(results, f, indent=4)