## Data Preparation

In [1]:
import json
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoConfig

### Steps

In [2]:
# Load the training split; the JSON file keeps its records under the top-level "examples" key.
path = r"D:\Dafa\Project\queryner-kd\data\processed\train.json"
with open(path, "r", encoding="utf-8") as f:
    raw = json.load(f)["examples"]

# Sanity check: number of examples and a peek at the first two records.
print(len(raw))
for item in raw[:2]:
    print(item)

7823
{'tokens': ['teeth', 'whitening', 'sensitive', 'teeth'], 'ner_tags': [9, 10, 17, 18]}
{'tokens': ['white', 'duvet', 'cover', 'queen'], 'ner_tags': [3, 9, 10, 1]}


In [3]:
# Take the first example as a running sample for the step-by-step walkthrough below.
tokens = raw[0]["tokens"]
ner_tags = raw[0]["ner_tags"]
print(tokens)
print(ner_tags)

['teeth', 'whitening', 'sensitive', 'teeth']
[9, 10, 17, 18]


In [4]:
# Checkpoint whose tokenizer (and, in the next cell, config) is used for the walkthrough.
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [5]:
# Read the label mapping advertised by the checkpoint's config.
# NOTE(review): the output recorded below shows only two placeholder labels
# (LABEL_0 / LABEL_1), while the ner_tags printed earlier contain ids up to 18.
# This mapping therefore does not describe the dataset's tag set - confirm where
# the real label vocabulary should come from.
config = AutoConfig.from_pretrained(model_name)

id2label = config.id2label
label2id = config.label2id
num_labels = config.num_labels

print(id2label)
print(label2id)
print(num_labels)

{0: 'LABEL_0', 1: 'LABEL_1'}
{'LABEL_0': 0, 'LABEL_1': 1}
2


In [6]:
# Encode the sample. `tokens` is already a list of words, hence is_split_into_words=True.
# The sequence is truncated/padded to 128 positions and returned as PyTorch tensors;
# the recorded output shows a leading batch dimension of size 1.
encodings = tokenizer(
            tokens,
            is_split_into_words=True,
            truncation=True,
            padding="max_length",
            max_length=128,
            return_tensors="pt",
        )
encodings

{'input_ids': tensor([[ 101, 4091, 2317, 5582, 7591, 4091,  102,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0,

In [7]:
# For every encoded position, the index of the source word it came from.
# In the recorded output the first position, the position after the last word
# and all padding positions are None, and a word split into several pieces
# repeats its index.
word_ids = encodings.word_ids(batch_index=0)
print(word_ids)

[None, 0, 1, 1, 2, 3, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None]


In [8]:
# First (and only) row of the batch, i.e. the ids without the batch dimension.
encodings["input_ids"][0]

tensor([ 101, 4091, 2317, 5582, 7591, 4091,  102,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0])

In [9]:
# Align the word-level tags with the encoded positions:
#   - positions without a source word (word id None) get -100,
#   - the first piece of each word gets that word's tag,
#   - every following piece of the same word gets -100.
aligned_labels = []
previous_word_idx = None
for word_idx in word_ids:
    if word_idx is None:
        aligned_labels.append(-100)
    elif word_idx != previous_word_idx:
        # First piece of a new word: carry over the word-level tag.
        aligned_labels.append(ner_tags[word_idx])
    else:
        # Continuation piece of the same word.
        aligned_labels.append(-100)
    previous_word_idx = word_idx

print(aligned_labels)

[-100, 9, 10, -100, 17, 18, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100]


In [10]:
# Assemble one training item; squeeze(0) drops the batch dimension added by
# return_tensors="pt" so that each tensor is one-dimensional.
item = {
    "input_ids": encodings["input_ids"].squeeze(0),
    "attention_mask": encodings["attention_mask"].squeeze(0),
    "labels": torch.tensor(aligned_labels, dtype=torch.long)
}
print(item)

{'input_ids': tensor([ 101, 4091, 2317, 5582, 7591, 4091,  102,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0

In [12]:
class NERDataset(Dataset):
    """Token-classification dataset read from a JSON file of pre-split queries.

    The file must contain a top-level ``"examples"`` list. Every entry is a
    mapping with a ``"tokens"`` list (the words) and a ``"ner_tags"`` list
    (one label id per word). Items are tokenized on access, not up front.

    Args:
        data_path: Path to the JSON file.
        tokenizer: Callable tokenizer; its result must be indexable by
            ``"input_ids"`` / ``"attention_mask"`` and expose
            ``word_ids(batch_index=...)``.
        label_pad_id: Label written at positions that carry no tag.
        max_length: Length each encoded sequence is padded/truncated to.
    """

    def __init__(self, data_path, tokenizer, label_pad_id=-100, max_length=128):
        # JSON input only.
        with open(data_path, "r", encoding="utf-8") as handle:
            payload = json.load(handle)
        self.data = payload["examples"]
        self.tokenizer = tokenizer
        self.label_pad_id = label_pad_id
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples loaded from the file."""
        return len(self.data)

    def __getitem__(self, idx):
        """Encode example ``idx`` and return its tensors.

        Returns:
            dict with ``"input_ids"``, ``"attention_mask"`` (batch dimension
            squeezed away) and ``"labels"`` (``torch.long``), where only the
            first piece of every word carries the word's tag and all other
            positions hold ``label_pad_id``.
        """
        example = self.data[idx]
        words, tags = example["tokens"], example["ner_tags"]

        # Encode the pre-split words.
        encoded = self.tokenizer(
            words,
            is_split_into_words=True,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )

        # Align the word-level tags with the encoded pieces (words may have
        # been split into several pieces by the tokenizer).
        positions = encoded.word_ids(batch_index=0)
        # Each position paired with the word id of the position before it.
        predecessors = [None, *positions[:-1]]
        pad = self.label_pad_id
        labels = [
            tags[current] if current is not None and current != before else pad
            for current, before in zip(positions, predecessors)
        ]

        return {
            "input_ids": encoded["input_ids"].squeeze(0),
            "attention_mask": encoded["attention_mask"].squeeze(0),
            "labels": torch.tensor(labels, dtype=torch.long),
        }

In [13]:
def load_label_info(model_name, num_labels=None):
    """Collect the label vocabulary exposed by a pretrained model's config.

    A base checkpoint that was never fine-tuned for token classification only
    carries the library's default placeholder labels, which will generally not
    match the tag ids stored in a dataset. Pass ``num_labels`` to request a
    mapping of the size the dataset actually needs.

    Args:
        model_name: Model identifier or path given to
            ``AutoConfig.from_pretrained``.
        num_labels: Optional number of classes. When given, it is forwarded to
            ``AutoConfig.from_pretrained`` as a config override; when ``None``
            (default) the checkpoint's own values are returned unchanged.

    Returns:
        dict with the keys ``"id2label"``, ``"label2id"`` and ``"num_labels"``
        read from the loaded config.

    Raises:
        ValueError: If ``num_labels`` is given but smaller than 1.
    """
    if num_labels is not None and num_labels < 1:
        raise ValueError(f"num_labels must be a positive integer, got {num_labels!r}")

    # Only override when explicitly requested so the default call is unchanged.
    overrides = {} if num_labels is None else {"num_labels": num_labels}
    config = AutoConfig.from_pretrained(model_name, **overrides)

    return {
        "id2label": config.id2label,
        "label2id": config.label2id,
        "num_labels": config.num_labels,
    }

def create_dataloaders(
        train_path, val_path, test_path,
        model_name,
        batch_size=16,
        max_length=128
):
    """Build the train/validation/test ``DataLoader`` objects for one tokenizer.

    Args:
        train_path: JSON file with the training examples.
        val_path: JSON file with the validation examples.
        test_path: JSON file with the test examples.
        model_name: Checkpoint whose tokenizer encodes all three splits.
        batch_size: Batch size shared by the three loaders.
        max_length: Sequence length passed on to every ``NERDataset``.

    Returns:
        Tuple ``(train_loader, val_loader, test_loader)``; only the training
        loader shuffles.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # All datasets are created before any loader, in train/val/test order.
    datasets = [
        NERDataset(split_path, tokenizer, max_length=max_length)
        for split_path in (train_path, val_path, test_path)
    ]

    # Shuffle flag per split, matching the order of `datasets`.
    shuffle_flags = (True, False, False)
    loaders = [
        DataLoader(dataset, batch_size=batch_size, shuffle=flag)
        for dataset, flag in zip(datasets, shuffle_flags)
    ]

    return tuple(loaders)

In [11]:
# Build the training dataset from the JSON file with the tokenizer loaded earlier.
# NOTE(review): the NameError recorded below this cell stems from execution order:
# this cell carries In [11] while the cell defining NERDataset carries In [12],
# i.e. it was run before the class existed. In top-to-bottom order the class
# definition precedes this cell.
train_dataset = NERDataset(
    data_path=r"D:\Dafa\Project\queryner-kd\data\processed\train.json",
    tokenizer=tokenizer
)
train_dataset

NameError: name 'NERDataset' is not defined

In [None]:
# Batch the training dataset (16 items per batch, reshuffled on every pass).
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
train_loader

<torch.utils.data.dataloader.DataLoader at 0x20d240cf500>

In [None]:
# Pull a single batch to inspect the collated tensors.
batch = next(iter(train_loader))
batch

{'input_ids': tensor([[  101,  5519, 28712,  ...,     0,     0,     0],
         [  101,  1017,  3204,  ...,     0,     0,     0],
         [  101,  2148,  7363,  ...,     0,     0,     0],
         ...,
         [  101,  8731,  1004,  ...,     0,     0,     0],
         [  101,  2304,  3898,  ...,     0,     0,     0],
         [  101, 24501, 19960,  ...,     0,     0,     0]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 'labels': tensor([[-100,    1,    1,  ..., -100, -100, -100],
         [-100,   13,   14,  ..., -100, -100, -100],
         [-100,    7, -100,  ..., -100, -100, -100],
         ...,
         [-100,   11,   12,  ..., -100, -100, -100],
         [-100,    3,    9,  ..., -100, -100, -100],
         [-100,   11,   12,  ..., -100, -100, -100]])}

## Model Architecture

In [None]:
from torch import nn
from torchcrf import CRF
from transformers import AutoModel, AutoConfig

In [None]:
class CRFOutputLayer(nn.Module):
    """Linear emission projection followed by a CRF layer.

    Args:
        hidden_dim: Size of the last dimension of the incoming features.
        num_labels: Number of tags; used both as the projection width and as
            the CRF's ``num_tags``.

    NOTE(review): NERDataset in this notebook writes -100 at special-token and
    continuation positions, while callers pass the attention mask (which is
    still active at those positions) as ``mask``. Presumably the CRF has no
    ignore-index and would treat -100 as a tag id - confirm that labels are
    remapped or compacted before this layer is trained with them.
    """

    def __init__(self, hidden_dim, num_labels):
        super().__init__()
        self.fc = nn.Linear(hidden_dim, num_labels)
        self.crf = CRF(num_tags=num_labels, batch_first=True)

    def forward(self, outputs, labels=None, mask=None):
        """Score ``outputs`` and either compute the loss or decode tags.

        Args:
            outputs: Features fed to the linear projection.
            labels: Gold tags; when given, the loss is computed instead of
                decoding.
            mask: Optional mask forwarded unchanged to the CRF.

        Returns:
            ``{"logits", "loss"}`` when ``labels`` is given (the loss is the
            negated value returned by the CRF with ``reduction="mean"``),
            otherwise ``{"logits", "pred"}`` with the CRF's decoded output.
        """
        emissions = self.fc(outputs)

        # Inference path: no gold tags, so decode the best path.
        if labels is None:
            decoded = self.crf.decode(emissions, mask=mask)
            return {"logits": emissions, "pred": decoded}

        # Training path: the CRF returns a log-likelihood; negate it for a loss.
        negative_ll = -self.crf(emissions, tags=labels, mask=mask, reduction="mean")
        return {"logits": emissions, "loss": negative_ll}

In [None]:
class BaseNERModel(nn.Module):
    """Common base for the NER models in this notebook.

    Stores the two settings every subclass shares and leaves ``forward``
    abstract.

    Args:
        num_labels: Number of output classes.
        use_crf: Whether the subclass should use a CRF output layer.
    """

    def __init__(self, num_labels, use_crf=False):
        super().__init__()
        self.use_crf = use_crf
        self.num_labels = num_labels

    def forward(self, input_ids, attention_mask, labels=None):
        """Abstract hook; subclasses must override it.

        Raises:
            NotImplementedError: Always.
        """
        message = "Forward method must be implemented in subclass."
        raise NotImplementedError(message)

In [None]:
class QueryNER(BaseNERModel):
    """Pretrained encoder with a token-classification head (linear or CRF).

    Args:
        model_name: Checkpoint loaded through ``AutoConfig`` / ``AutoModel``.
        label_info: Mapping with the keys ``"num_labels"``, ``"id2label"`` and
            ``"label2id"``; all three are forwarded to the config.
        use_crf: Use ``CRFOutputLayer`` instead of a linear classifier with
            cross-entropy loss.
    """

    def __init__(self, model_name, label_info, use_crf=False):
        n_classes = label_info["num_labels"]
        super().__init__(num_labels=n_classes, use_crf=use_crf)

        self.config = AutoConfig.from_pretrained(
            model_name,
            num_labels=n_classes,
            id2label=label_info["id2label"],
            label2id=label_info["label2id"],
        )

        self.bert = AutoModel.from_pretrained(model_name, config=self.config)
        self.dropout = nn.Dropout(0.1)

        hidden_size = self.config.hidden_size
        out_size = self.config.num_labels
        if not self.use_crf:
            self.classifier = nn.Linear(hidden_size, out_size)
            # Positions labelled -100 do not contribute to the loss.
            self.loss_fn = nn.CrossEntropyLoss(ignore_index=-100)
        else:
            self.crf_output = CRFOutputLayer(hidden_size, out_size)

    def forward(self, input_ids, attention_mask, labels=None):
        """Encode the batch and return logits plus loss or predictions.

        Args:
            input_ids: Token ids passed to the encoder.
            attention_mask: Mask passed to the encoder; in CRF mode its
                boolean form is also used as the CRF mask.
            labels: Optional gold labels; selects the loss path when given.

        Returns:
            In CRF mode, whatever ``CRFOutputLayer`` returns. Otherwise
            ``{"logits", "loss"}`` when ``labels`` is given, else
            ``{"logits", "pred"}`` with the arg-max over the last dimension.
        """
        encoder_out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        hidden_states = self.dropout(encoder_out.last_hidden_state)

        if self.use_crf:
            crf_mask = attention_mask.bool()
            return self.crf_output(hidden_states, labels=labels, mask=crf_mask)

        logits = self.classifier(hidden_states)
        if labels is None:
            return {"logits": logits, "pred": logits.argmax(dim=-1)}

        # Flatten to (positions, classes) vs (positions,) for cross-entropy.
        flat_logits = logits.view(-1, self.num_labels)
        loss = self.loss_fn(flat_logits, labels.view(-1))
        return {"logits": logits, "loss": loss}

In [None]:
class DistilBERT(BaseNERModel):
    """DistilBERT-style encoder with a token-classification head (linear or CRF).

    Args:
        model_name: Checkpoint loaded through ``AutoConfig`` / ``AutoModel``.
        label_info: Mapping with the keys ``"num_labels"``, ``"id2label"`` and
            ``"label2id"``. Required despite the ``None`` default, which is
            kept only so existing keyword-style calls keep working.
        use_crf: Use ``CRFOutputLayer`` instead of a linear classifier with
            cross-entropy loss.

    Raises:
        TypeError: If ``label_info`` is not provided.
    """

    def __init__(self, model_name="distilbert-base-uncased", label_info=None, use_crf=False):
        # Fail with an explicit message instead of the opaque
        # "'NoneType' object is not subscriptable" the default used to cause.
        if label_info is None:
            raise TypeError(
                "DistilBERT requires `label_info` with the keys "
                "'num_labels', 'id2label' and 'label2id'."
            )

        # The base constructor stores num_labels / use_crf; nothing is assigned
        # on the module before nn.Module.__init__ has run.
        super().__init__(num_labels=label_info["num_labels"], use_crf=use_crf)

        self.config = AutoConfig.from_pretrained(
            model_name,
            num_labels=label_info["num_labels"],
            id2label=label_info["id2label"],
            label2id=label_info["label2id"]
        )

        self.bert = AutoModel.from_pretrained(model_name, config=self.config)
        self.dropout = nn.Dropout(0.1)

        if self.use_crf:
            self.crf_output = CRFOutputLayer(self.config.hidden_size, self.num_labels)
        else:
            self.classifier = nn.Linear(self.config.hidden_size, self.num_labels)
            # Positions labelled -100 do not contribute to the loss.
            self.loss_fn = nn.CrossEntropyLoss(ignore_index=-100)

    def forward(self, input_ids, attention_mask, labels=None):
        """Encode the batch and return logits plus loss or predictions.

        Args:
            input_ids: Token ids passed to the encoder.
            attention_mask: Mask passed to the encoder; in CRF mode its
                boolean form is also used as the CRF mask.
            labels: Optional gold labels; selects the loss path when given.

        Returns:
            In CRF mode, whatever ``CRFOutputLayer`` returns. Otherwise
            ``{"logits", "loss"}`` when ``labels`` is given, else
            ``{"logits", "pred"}`` with the arg-max over the last dimension.
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        sequence_output = self.dropout(outputs.last_hidden_state)

        if self.use_crf:
            mask = attention_mask.bool()
            return self.crf_output(sequence_output, labels=labels, mask=mask)

        logits = self.classifier(sequence_output)
        if labels is not None:
            # Flatten to (positions, classes) vs (positions,) for cross-entropy.
            loss = self.loss_fn(logits.view(-1, self.num_labels), labels.view(-1))
            return {"logits": logits, "loss": loss}

        pred = logits.argmax(dim=-1)
        return {"logits": logits, "pred": pred}


In [None]:
# Instantiate the teacher (BERT) and the student (DistilBERT default checkpoint),
# both with the plain linear head.
# NOTE(review): `label_info` is not assigned in any cell visible in this notebook;
# it has to exist in the kernel before this cell runs - confirm where it is built
# (presumably via load_label_info).
teacher = QueryNER("bert-base-uncased", label_info, use_crf=False)
student = DistilBERT(label_info=label_info, use_crf=False)

In [None]:
# Fetch one batch and unpack the three tensors the models take.
batch = next(iter(train_loader))

input_ids = batch["input_ids"]
attention_mask = batch["attention_mask"]
labels = batch["labels"]

In [None]:
# All three tensors should share the same (batch, sequence) shape.
print("input_ids shape:", input_ids.shape)
print("attention_mask shape:", attention_mask.shape)
print("labels shape:", labels.shape)

input_ids shape: torch.Size([16, 128])
attention_mask shape: torch.Size([16, 128])
labels shape: torch.Size([16, 128])


In [None]:
# Forward pass through the teacher with labels, i.e. the loss path.
# NOTE(review): the recorded "Target 9 is out of bounds" error presumably means the
# classifier has fewer output classes than the tag ids present in the batch -
# verify the num_labels carried by `label_info`.
t_result = teacher(input_ids=input_ids, attention_mask=attention_mask, labels=labels)

IndexError: Target 9 is out of bounds.

In [None]:
# Forward pass through the student with labels, i.e. the loss path.
# NOTE(review): the recorded "index out of range in self" error is not explained by
# the code visible here - TODO investigate which lookup receives the bad index.
s_result = student(input_ids=input_ids, attention_mask=attention_mask, labels=labels)

IndexError: index out of range in self

## CRF Trial

In [None]:
import torch
from torchcrf import CRF

In [None]:
# Stand-alone CRF with 5 tags; batch_first is not passed here, unlike in CRFOutputLayer.
num_tags = 5
model = CRF(num_tags)

In [None]:
# Random emission scores laid out as (seq_length, batch_size, num_tags).
seq_length = 3
batch_size = 2
emissions = torch.randn(seq_length, batch_size, num_tags)
print(emissions.shape)
print(emissions)

torch.Size([3, 2, 5])
tensor([[[-0.9194, -0.4299, -2.3956,  0.7575,  0.5029],
         [ 0.3142, -0.8432, -0.7544, -0.6260, -0.8528]],

        [[-1.1581, -0.1062, -0.3376,  0.0401, -0.3961],
         [ 1.1090, -1.3114, -0.1248, -0.4729,  0.4600]],

        [[-0.5804,  0.7643,  0.8325, -0.6142, -1.6916],
         [-0.8408, -0.5836,  0.6385, -1.1154,  1.0496]]])


In [None]:
# Tag ids laid out as (seq_length, batch_size) = (3, 2), matching `emissions`.
tags = torch.tensor([
    [0, 1], [2, 4], [3, 1]], dtype=torch.long
    )

In [None]:
# Score the tag sequences under the CRF; the recorded result is a scalar tensor
# (presumably the log-likelihood summed over the batch - confirm against the
# torchcrf documentation).
model(emissions, tags)

tensor(-12.1175, grad_fn=<SumBackward0>)