## Data Preparation

In [1]:
import json
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoConfig

In [11]:
def getMinUpgradationTime(req1, t1, req2, t2):
    # Find minimum time T where both services have completed their required upgrades
    # Service 1 needs t1 upgrade opportunities (time % req1 != 0)
    # Service 2 needs t2 upgrade opportunities (time % req2 != 0)
    
    count1 = 0  # upgrade opportunities for service 1
    count2 = 0  # upgrade opportunities for service 2
    time = 1
    
    while count1 < t1 or count2 < t2:
        if time % req1 != 0:
            count1 += 1
        if time % req2 != 0:
            count2 += 1
        
        if count1 >= t1 and count2 >= t2:
            return time
        
        time += 1
    
    return time

In [25]:
getMinUpgradationTime(3, 2, 4, 1)

[1, 2]
[1]


2

In [26]:

# Let's trace through test 1 manually to understand the pattern
# Test 1: req1=3, t1=2, req2=4, t2=1, expected=3

print("Manual trace for test 1: (3, 2, 4, 1)")
print("Service 1 can upgrade when time % 3 != 0")
print("Service 2 can upgrade when time % 4 != 0")
print()

# Generate more times to understand the pattern
arr1_full = []
time = 1
while len(arr1_full) < 5:
    if time % 3 != 0:
        arr1_full.append(time)
    time += 1

arr2_full = []
time = 1
while len(arr2_full) < 5:
    if time % 4 != 0:
        arr2_full.append(time)
    time += 1

print(f"Service 1 first 5 upgrade times: {arr1_full}")
print(f"Service 2 first 5 upgrade times: {arr2_full}")
print(f"For t1=2, t2=1: we need arr1_full[0:2]={arr1_full[0:2]} and arr2_full[0:1]={arr2_full[0:1]}")


Manual trace for test 1: (3, 2, 4, 1)
Service 1 can upgrade when time % 3 != 0
Service 2 can upgrade when time % 4 != 0

Service 1 first 5 upgrade times: [1, 2, 4, 5, 7]
Service 2 first 5 upgrade times: [1, 2, 3, 5, 6]
For t1=2, t2=1: we need arr1_full[0:2]=[1, 2] and arr2_full[0:1]=[1]


In [27]:

# Detailed trace for test 1
print("\nDetailed trace for (3, 2, 4, 1):")
print("Finding min T where req1=3 has >=2 chances AND req2=4 has >=1 chance\n")

count1 = 0
count2 = 0
for t in range(1, 10):
    if t % 3 != 0:
        count1 += 1
    if t % 4 != 0:
        count2 += 1
    print(f"Time {t}: S1_count={count1}, S2_count={count2}")
    if count1 >= 2 and count2 >= 1:
        print(f"✓ Both requirements met at time {t}")
        break



Detailed trace for (3, 2, 4, 1):
Finding min T where req1=3 has >=2 chances AND req2=4 has >=1 chance

Time 1: S1_count=1, S2_count=1
Time 2: S1_count=2, S2_count=2
✓ Both requirements met at time 2


In [28]:

# Trace test 2: (2, 1, 3, 3)
print("\nTrace for (2, 1, 3, 3), expected 4:")
count1 = 0
count2 = 0
for t in range(1, 10):
    if t % 2 != 0:
        count1 += 1
    if t % 3 != 0:
        count2 += 1
    print(f"Time {t}: S1_count={count1}, S2_count={count2}")
    if count1 >= 1 and count2 >= 3:
        print(f"✓ Found at time {t}")
        break



Trace for (2, 1, 3, 3), expected 4:
Time 1: S1_count=1, S2_count=1
Time 2: S1_count=1, S2_count=2
Time 3: S1_count=2, S2_count=2
Time 4: S1_count=2, S2_count=3
✓ Found at time 4


### Steps

In [132]:
path = r"D:\Dafa\Project\queryner-kd\data\processed\train.json"
with open(path, "r", encoding="utf-8") as f:
    raw = json.load(f)["examples"]

print(len(raw))
for item in raw[:2]:
    print(item)

7823
{'tokens': ['teeth', 'whitening', 'sensitive', 'teeth'], 'ner_tags': [9, 10, 17, 18]}
{'tokens': ['white', 'duvet', 'cover', 'queen'], 'ner_tags': [3, 9, 10, 1]}


In [133]:
tokens = raw[0]["tokens"]
ner_tags = raw[0]["ner_tags"]
print(tokens)
print(ner_tags)

['teeth', 'whitening', 'sensitive', 'teeth']
[9, 10, 17, 18]


In [134]:
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [135]:
encodings = tokenizer(
            tokens,
            is_split_into_words=True,
            truncation=True,
            padding="max_length",
            max_length=128,
            return_tensors="pt",
        )
encodings

{'input_ids': tensor([[ 101, 4091, 2317, 5582, 7591, 4091,  102,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0,

In [136]:
word_ids = encodings.word_ids(batch_index=0)
print(word_ids)

[None, 0, 1, 1, 2, 3, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None]


In [137]:
encodings["input_ids"][0]

tensor([ 101, 4091, 2317, 5582, 7591, 4091,  102,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0])

In [138]:
aligned_labels = []
previous_word_idx = None
for word_idx in word_ids:
    if word_idx is None:
        aligned_labels.append(-100)
    elif word_idx != previous_word_idx:
        aligned_labels.append(ner_tags[word_idx])
    else:
        aligned_labels.append(-100)
    previous_word_idx = word_idx

print(aligned_labels)

[-100, 9, 10, -100, 17, 18, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100]


In [139]:
item = {
    "input_ids": encodings["input_ids"].squeeze(0),
    "attention_mask": encodings["attention_mask"].squeeze(0),
    "labels": torch.tensor(aligned_labels, dtype=torch.long)
}
print(item)

{'input_ids': tensor([ 101, 4091, 2317, 5582, 7591, 4091,  102,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0

In [140]:
class NERDataset(Dataset):
    def __init__(self, data_path, tokenizer, label_pad_id=-100, max_length=128):
        ## khusus untuk data json
        with open(data_path, "r", encoding="utf-8") as f:
            raw = json.load(f)["examples"]
        self.data = raw
        self.tokenizer = tokenizer
        self.label_pad_id = label_pad_id
        self.max_length = max_length

    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        tokens = self.data[idx]["tokens"]
        ner_tags = self.data[idx]["ner_tags"]

        # buat encoding untuk tokens 
        encoding = self.tokenizer(
            tokens,
            is_split_into_words=True,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors="pt"
        )

        # align labels dengan tokens yang sudah diencoding (jadi kepotong2 sesuai tokenization)
        word_ids = encoding.word_ids(batch_index=0)
        aligned_labels = []
        previous_word_idx = None
        for word_idx in word_ids:
            if word_idx is None:
                aligned_labels.append(self.label_pad_id)
            elif word_idx != previous_word_idx:
                aligned_labels.append(ner_tags[word_idx])
            else:
                aligned_labels.append(self.label_pad_id)
            previous_word_idx = word_idx
        
        item = {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": torch.tensor(aligned_labels, dtype=torch.long)
        }

        return item

In [141]:
def load_label_info(model_name):
    config = AutoConfig.from_pretrained(model_name)
    id2label = config.id2label
    label2id = config.label2id
    num_labels = config.num_labels

    label_info = {
        "id2label": id2label,
        "label2id": label2id,
        "num_labels": num_labels
    }

    return label_info

def create_dataloaders(
        train_path, val_path, test_path,
        model_name,
        batch_size=16,
        max_length=128
):
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    train_dataset = NERDataset(train_path, tokenizer, max_length=max_length)
    val_dataset = NERDataset(val_path, tokenizer, max_length=max_length)
    test_dataset = NERDataset(test_path, tokenizer, max_length=max_length)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    return train_loader, val_loader, test_loader

In [142]:
train_dataset = NERDataset(
    data_path=r"D:\Dafa\Project\queryner-kd\data\processed\train.json",
    tokenizer=tokenizer
)
train_dataset

<__main__.NERDataset at 0x1e09721a5a0>

In [143]:
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
train_loader

<torch.utils.data.dataloader.DataLoader at 0x1e094517950>

In [144]:
batch = next(iter(train_loader))
batch

{'input_ids': tensor([[  101,  8441,  3345,  ...,     0,     0,     0],
         [  101, 18144, 25309,  ...,     0,     0,     0],
         [  101, 13653,  9242,  ...,     0,     0,     0],
         ...,
         [  101, 11584,  3514,  ...,     0,     0,     0],
         [  101,  3680, 11527,  ...,     0,     0,     0],
         [  101, 16215,  9307,  ...,     0,     0,     0]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 'labels': tensor([[-100,   31,   32,  ..., -100, -100, -100],
         [-100,    9,   10,  ..., -100, -100, -100],
         [-100,   11,   31,  ..., -100, -100, -100],
         ...,
         [-100,   17,    9,  ..., -100, -100, -100],
         [-100,   11,   12,  ..., -100, -100, -100],
         [-100,   11, -100,  ..., -100, -100, -100]])}

### Run

In [145]:
import json
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoConfig

In [146]:
class NERDataset(Dataset):
    def __init__(self, data_path, tokenizer, label_pad_id=-100, max_length=128):
        with open(data_path, "r", encoding="utf-8") as f:
            raw = json.load(f)["examples"]
        self.data = raw
        self.tokenizer = tokenizer
        self.label_pad_id = label_pad_id
        self.max_length = max_length

    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        tokens = self.data[idx]["tokens"]
        ner_tags = self.data[idx]["ner_tags"]

        # buat encoding untuk tokens 
        encoding = self.tokenizer(
            tokens,
            is_split_into_words=True,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors="pt"
        )

        # align labels dengan tokens yang sudah diencoding (jadi kepotong2 sesuai tokenization)
        word_ids = encoding.word_ids(batch_index=0)
        aligned_labels = []
        previous_word_idx = None
        for word_idx in word_ids:
            if word_idx is None:
                aligned_labels.append(self.label_pad_id)
            elif word_idx != previous_word_idx:
                aligned_labels.append(ner_tags[word_idx])
            else:
                aligned_labels.append(self.label_pad_id)
            previous_word_idx = word_idx
        
        item = {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": torch.tensor(aligned_labels, dtype=torch.long)
        }

        return item

In [147]:
def load_label_info(model_name):
    config = AutoConfig.from_pretrained(model_name)
    id2label = config.id2label
    label2id = config.label2id
    num_labels = config.num_labels

    label_info = {
        "id2label": id2label,
        "label2id": label2id,
        "num_labels": num_labels
    }

    return label_info

def create_dataloaders(
        train_path, val_path, test_path,
        model_name,
        batch_size=32,
        max_length=128
):
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    train_dataset = NERDataset(train_path, tokenizer, max_length=max_length)
    val_dataset = NERDataset(val_path, tokenizer, max_length=max_length)
    test_dataset = NERDataset(test_path, tokenizer, max_length=max_length)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    return train_loader, val_loader, test_loader

In [148]:
train_loader, val_loader, test_loader = create_dataloaders(
    train_path=r"D:\Dafa\Project\queryner-kd\data\processed\train.json",
    val_path=r"D:\Dafa\Project\queryner-kd\data\processed\validation.json",
    test_path=r"D:\Dafa\Project\queryner-kd\data\processed\test.json",
    model_name="bert-base-uncased",
    batch_size=16,
    max_length=128
)

label_info = load_label_info("bltlab/queryner-augmented-data-bert-base-uncased")

In [149]:
batch = next(iter(train_loader))
batch

{'input_ids': tensor([[  101,  2273,  2015,  ...,     0,     0,     0],
         [  101, 21299,  9949,  ...,     0,     0,     0],
         [  101, 12745, 19958,  ...,     0,     0,     0],
         ...,
         [  101, 11608,  4524,  ...,     0,     0,     0],
         [  101,  1046, 16558,  ...,     0,     0,     0],
         [  101, 22894,  9541,  ...,     0,     0,     0]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 'labels': tensor([[-100,   13, -100,  ..., -100, -100, -100],
         [-100,   11,   17,  ..., -100, -100, -100],
         [-100,   17,    9,  ..., -100, -100, -100],
         ...,
         [-100,   11,    9,  ..., -100, -100, -100],
         [-100,   11, -100,  ..., -100, -100, -100],
         [-100,   11, -100,  ..., -100, -100, -100]])}

In [150]:
label_info.keys()

dict_keys(['id2label', 'label2id', 'num_labels'])

In [151]:
from torch import nn
from torchcrf import CRF
from transformers import AutoModel, AutoConfig

In [152]:
# BaseNERModel
label_info = label_info
use_crf = False

# QueryNER
model_name = "bert-base-uncased"
config = AutoConfig.from_pretrained(
    model_name,
    num_labels=label_info["num_labels"],
    id2label=label_info["id2label"],
    label2id=label_info["label2id"]
)

bert = AutoModel.from_pretrained(model_name, config=config)
dropout = nn.Dropout(0.1)
classifier = nn.Linear(bert.config.hidden_size, bert.config.num_labels)
loss_fn = nn.CrossEntropyLoss(ignore_index=-100)

In [153]:
# forward
outputs = bert(input_ids=batch["input_ids"], attention_mask=batch["attention_mask"])
outputs.last_hidden_state.shape  # (batch_size, seq_len, hidden_size)

torch.Size([16, 128, 768])

In [154]:
sequence_output = dropout(outputs.last_hidden_state)
sequence_output.shape

torch.Size([16, 128, 768])

#### Use CRF False

In [155]:
logits = classifier(sequence_output)  # (batch_size, seq_len, num_labels)
logits.shape

torch.Size([16, 128, 35])

In [156]:
loss = loss_fn(
    logits.view(-1, label_info["num_labels"]),
    batch["labels"].view(-1)
)
loss

tensor(3.6777, grad_fn=<NllLossBackward0>)

#### Use CRF True

In [157]:
# self.crf_output = CRFOutputLayer(self.config.hidden_size, self.num_labels)
# result = self.crf_output(sequence_output, labels=labels, mask=mask)

In [158]:
from torchcrf import CRF
# input nya sequence_output, labels, mask
fc = nn.Linear(bert.config.hidden_size, bert.config.num_labels)
crf = CRF(label_info["num_labels"], batch_first=True)

In [159]:
emissions = fc(sequence_output)  # (batch_size, seq_len, num_labels)
emissions.shape

torch.Size([16, 128, 35])

In [160]:
batch["labels"] is not None

True

In [161]:
batch["labels"]

tensor([[-100,   13, -100,  ..., -100, -100, -100],
        [-100,   11,   17,  ..., -100, -100, -100],
        [-100,   17,    9,  ..., -100, -100, -100],
        ...,
        [-100,   11,    9,  ..., -100, -100, -100],
        [-100,   11, -100,  ..., -100, -100, -100],
        [-100,   11, -100,  ..., -100, -100, -100]])

In [162]:
mask = batch["attention_mask"]
mask = mask.bool()
mask[:, 0] = True

In [163]:
labels_crf = batch["labels"]
labels_crf[batch["labels"] == -100] = 0
labels_crf

tensor([[ 0, 13,  0,  ...,  0,  0,  0],
        [ 0, 11, 17,  ...,  0,  0,  0],
        [ 0, 17,  9,  ...,  0,  0,  0],
        ...,
        [ 0, 11,  9,  ...,  0,  0,  0],
        [ 0, 11,  0,  ...,  0,  0,  0],
        [ 0, 11,  0,  ...,  0,  0,  0]])

In [164]:
log_likehood = crf(emissions, batch["labels"], mask=batch["attention_mask"].bool(), reduction='token_mean')
loss = -log_likehood
loss

tensor(3.5357, grad_fn=<NegBackward0>)

In [165]:
pred = crf.decode(emissions, mask=batch["attention_mask"].bool())
pred

[[23, 7, 3, 25, 25, 29, 25, 6],
 [2, 7, 2, 1, 22, 3, 4],
 [23, 0, 0, 3],
 [23, 23, 32, 2, 32, 1, 1, 23, 21],
 [17, 2, 25, 1, 19, 14, 11, 25],
 [23, 20, 29, 33, 18, 1, 28, 29],
 [17, 32, 23, 14, 28, 6],
 [2, 20, 13, 9],
 [27, 6, 13, 26, 29, 27, 1, 29, 26],
 [17, 2, 27, 3, 13, 3, 29, 3],
 [23, 6, 28, 13, 1, 28],
 [25, 27, 13, 25, 29, 3],
 [30, 13, 23, 13, 21],
 [30, 11, 20, 6, 5],
 [30, 2, 11, 0, 28],
 [23, 2, 27, 23, 27, 23, 3]]

## Model Architecture

### Class

In [166]:
from torch import nn
from torchcrf import CRF
from transformers import AutoModel, AutoConfig

In [167]:
class CRFOutputLayer(nn.Module):
    def __init__(self, hidden_dim, num_labels):
        super().__init__()
        self.fc = nn.Linear(hidden_dim, num_labels)
        self.crf = CRF(num_tags=num_labels, batch_first=True)

    def forward(self, outputs, labels=None, mask=None):
        emissions = self.fc(outputs)

        if labels is not None:
            if mask is None:
                mask = torch.ones_like(labels, dtype=torch.bool)
            else:
                mask = mask.bool()
            
            mask[:, 0] = True
            
            labels_crf = labels.clone()
            labels_crf[labels == -100] = 0
            
            log_likelihood = self.crf(emissions, tags=labels_crf, mask=mask, reduction="token_mean")
            loss = -log_likelihood
            return {"logits": emissions, "loss": loss}
        else:
            if mask is None:
                mask = torch.ones(outputs.shape[:2], dtype=torch.bool, device=outputs.device)
            pred = self.crf.decode(emissions, mask=mask.bool())
            return {"logits": emissions, "pred": pred}

In [168]:
class BaseNERModel(nn.Module):
    def __init__(self, num_labels, use_crf=False):
        super().__init__()
        self.num_labels = num_labels
        self.use_crf = use_crf

    def forward(self, input_ids, attention_mask, labels=None):
        raise NotImplementedError("Forward method must be implemented in subclass.")

In [169]:
class QueryNERTeacher(BaseNERModel):
    def __init__(self, model_name, label_info, use_crf=False):
        super().__init__(num_labels=label_info["num_labels"], use_crf=use_crf)

        self.config = AutoConfig.from_pretrained(
            model_name,
            num_labels=label_info["num_labels"],
            id2label=label_info["id2label"],
            label2id=label_info["label2id"]
        )

        self.bert = AutoModel.from_pretrained(model_name, config=self.config)
        self.dropout = nn.Dropout(0.3)

        if self.use_crf:
            self.crf_output = CRFOutputLayer(self.config.hidden_size, self.config.num_labels)
        else:
            self.classifier = nn.Linear(self.config.hidden_size, self.config.num_labels)
            self.loss_fn = nn.CrossEntropyLoss(ignore_index=-100)

    def forward(self, input_ids, attention_mask, labels=None):

        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        sequence_output = self.dropout(outputs.last_hidden_state)

        if self.use_crf:
            mask = attention_mask.bool()
            result = self.crf_output(sequence_output, labels=labels, mask=mask)
            return result

        else:
            logits = self.classifier(sequence_output)
            if labels is not None:
                loss = self.loss_fn(logits.view(-1, self.num_labels), labels.view(-1))
                return {"logits": logits, "loss": loss}
            else:
                pred = logits.argmax(dim=-1)
                return {"logits": logits, "pred": pred}

In [170]:
class DistilBERTStudent(BaseNERModel):
    def __init__(self, model_name="distilbert-base-uncased", label_info=None, use_crf=False):
        self.use_crf = use_crf
        self.num_labels = label_info["num_labels"]
        super().__init__(num_labels=self.num_labels, use_crf=self.use_crf)

        self.config = AutoConfig.from_pretrained(
            model_name,
            num_labels=label_info["num_labels"],
            id2label=label_info["id2label"],
            label2id=label_info["label2id"]
        )

        self.bert = AutoModel.from_pretrained(model_name, config=self.config)
        self.dropout = nn.Dropout(0.3)

        if self.use_crf:
            self.crf_output = CRFOutputLayer(self.config.hidden_size, self.num_labels)
        else:
            self.classifier = nn.Linear(self.config.hidden_size, self.num_labels)
            self.loss_fn = nn.CrossEntropyLoss(ignore_index=-100)

    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        sequence_output = self.dropout(outputs.last_hidden_state)

        if self.use_crf:
            mask = attention_mask.bool()
            result = self.crf_output(sequence_output, labels=labels, mask=mask)
            return result
        else:
            logits = self.classifier(sequence_output)
            if labels is not None:
                loss = self.loss_fn(logits.view(-1, self.num_labels), labels.view(-1))
                return {"logits": logits, "loss": loss}
            else:
                pred = logits.argmax(dim=-1)
                return {"logits": logits, "pred": pred}

In [171]:
class TinyBertStudent(BaseNERModel):
    def __init__(self, model_name="huawei-noah/TinyBERT_General_4L_312D", label_info=None, use_crf=False):
        self.use_crf = use_crf
        self.num_labels = label_info["num_labels"]
        super().__init__(num_labels=self.num_labels, use_crf=self.use_crf)

        self.config = AutoConfig.from_pretrained(
            model_name,
            num_labels=label_info["num_labels"],
            id2label=label_info["id2label"],
            label2id=label_info["label2id"]
        )

        self.bert = AutoModel.from_pretrained(model_name, config=self.config)
        self.dropout = nn.Dropout(0.3)

        if self.use_crf:
            self.crf_output = CRFOutputLayer(self.config.hidden_size, self.num_labels)
        else:
            self.classifier = nn.Linear(self.config.hidden_size, self.num_labels)
            self.loss_fn = nn.CrossEntropyLoss(ignore_index=-100)

    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        sequence_output = self.dropout(outputs.last_hidden_state)

        if self.use_crf:
            mask = attention_mask.bool()
            result = self.crf_output(sequence_output, labels=labels, mask=mask)
            return result
        else:
            logits = self.classifier(sequence_output)
            if labels is not None:
                loss = self.loss_fn(logits.view(-1, self.num_labels), labels.view(-1))
                return {"logits": logits, "loss": loss}
            else:
                pred = logits.argmax(dim=-1)
                return {"logits": logits, "pred": pred}

In [172]:
class BiLSTMStudent(BaseNERModel):
    def __init__(
            self,  
            use_crf=False,
            model_name_for_vocab = 'bert-base-uncased',
            emb_dim = 300,
            lstm_hidden = 300,
            label_info = None,
            pad_token_id = 0,
            teacher_model = None
        ):
        self.use_crf = use_crf
        self.num_labels = label_info["num_labels"]
        super().__init__(self.num_labels, use_crf)

        config = AutoConfig.from_pretrained(model_name_for_vocab)
        vocab_size = config.vocab_size
        pad_token_id = config.pad_token_id

        if teacher_model is not None:
            self.embedding = teacher_model.bert.embeddings.word_embeddings
            for p in self.embedding.parameters():
                p.requires_grad = False
            emb_dim_eff = self.embedding.embedding_dim
        else:
            emb_dim_eff = emb_dim
            self.embedding = nn.Embedding(vocab_size, emb_dim, padding_idx=pad_token_id)
        self.dropout = nn.Dropout(0.1)
        self.lstm = nn.LSTM(
            input_size=emb_dim_eff,
            hidden_size=lstm_hidden,
            num_layers=1,
            batch_first=True,
            bidirectional=True
        )
        self.classifier = nn.Linear(lstm_hidden * 2, self.num_labels)
        self.loss_fn = nn.CrossEntropyLoss(ignore_index=-100)

        if self.use_crf:
            self.crf_output = CRFOutputLayer(hidden_dim=lstm_hidden * 2, num_labels=self.num_labels)

    def forward(self, input_ids, attention_mask, labels=None):
        emb = self.embedding(input_ids)
        emb = self.dropout(emb)
        outputs, _ = self.lstm(emb)
        sequence_output = outputs

        if self.use_crf:
            mask = attention_mask.bool()
            result = self.crf_output(sequence_output, labels=labels, mask=mask)
            return result
        else:
            logits = self.classifier(sequence_output)
            if labels is not None:
                loss = self.loss_fn(logits.view(-1, self.num_labels), labels.view(-1))
                return {"logits": logits, "loss": loss}
            else:
                pred = logits.argmax(dim=-1)
                return {"logits": logits, "pred": pred}

### Run

In [173]:
import torch.nn.functional as F

In [174]:
teacher = QueryNERTeacher("bert-base-uncased", label_info, use_crf=False)
student = DistilBERTStudent(label_info=label_info, use_crf=False)

In [175]:
batch = next(iter(train_loader))

input_ids = batch["input_ids"]
attention_mask = batch["attention_mask"]
labels = batch["labels"]

In [176]:
print("input_ids shape:", input_ids.shape)
print("attention_mask shape:", attention_mask.shape)
print("labels shape:", labels.shape)

input_ids shape: torch.Size([16, 128])
attention_mask shape: torch.Size([16, 128])
labels shape: torch.Size([16, 128])


In [177]:
t_result = teacher(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
print(t_result["logits"].shape)
print(t_result["logits"][:1, :1])
print(t_result["loss"])

torch.Size([16, 128, 35])
tensor([[[ 0.5233,  0.1475,  0.6589,  0.1023, -0.3250, -0.0386,  0.3790,
          -0.1430,  0.4473,  0.1549, -0.5810,  0.0186, -0.6553,  0.1660,
          -0.1445, -0.2970, -0.1980,  0.3350, -0.2192, -1.0898, -0.0928,
          -0.1468,  0.1173, -0.2140, -0.8575,  0.0469,  0.2112,  0.1077,
          -0.1394,  0.3507, -0.0735,  0.3273, -0.3279, -0.5097, -0.8232]]],
       grad_fn=<SliceBackward0>)
tensor(3.8230, grad_fn=<NllLossBackward0>)


In [178]:
s_result = student(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
s_result.keys()

dict_keys(['logits', 'loss'])

#### KL Div

In [179]:
T = 2.0

In [180]:
p_teacher = F.log_softmax(t_result["logits"]/T, dim=-1)
# print("T2: ", p_teacher[:1, :1])
p_student = F.softmax(s_result["logits"] / T, dim=-1)
# print("Student", p_student[:1, :1])

In [203]:
loss = F.kl_div(p_teacher, p_student, reduction="batchmean")
loss = loss * (2**2)
loss

tensor(8.3002, grad_fn=<MulBackward0>)

#### KL Div Masked

In [182]:
kl_elem = F.kl_div(p_teacher, p_student, reduction="none")
kl_elem.shape

torch.Size([16, 128, 35])

In [183]:
kl_token = kl_elem.sum(dim=-1)
# dim = -1 > the summation working on the last dimension (B,L,C) > sum for each class
kl_token.shape

torch.Size([16, 128])

In [192]:
mask = batch["attention_mask"]
mask = mask.bool()
mask.float().shape

torch.Size([16, 128])

In [186]:
valid_sum = mask.float().sum()
valid_sum

tensor(107.)

In [202]:
if valid_sum == 0:
    print(torch.tensor(0.0, device=s_result["logits"].device))
kl_sum = (kl_token * mask.float()).sum()
print((kl_sum/valid_sum)*(T*T))

tensor(0.1099, grad_fn=<MulBackward0>)


In [201]:
kl_token.mean()*(T*T)

tensor(0.0648, grad_fn=<MulBackward0>)

### Loss Function

In [None]:
import torch.nn.functional as F

In [None]:
for T in [0.5, 1, 2, 4, 8]:
    softmax_res = F.softmax(logits/T, dim=-1)
    print(f"With T={T}:\n" + str(softmax_res[:1, :1, :10]))

With T=0.5:
tensor([[[0.0314, 0.0862, 0.0725, 0.0089, 0.0369, 0.0183, 0.0583, 0.0331,
          0.0164, 0.0687]]], grad_fn=<SliceBackward0>)
With T=1:
tensor([[[0.0317, 0.0525, 0.0482, 0.0169, 0.0344, 0.0242, 0.0432, 0.0325,
          0.0229, 0.0469]]], grad_fn=<SliceBackward0>)
With T=2:
tensor([[[0.0305, 0.0393, 0.0376, 0.0223, 0.0318, 0.0267, 0.0356, 0.0309,
          0.0259, 0.0371]]], grad_fn=<SliceBackward0>)
With T=4:
tensor([[[0.0296, 0.0336, 0.0329, 0.0253, 0.0303, 0.0277, 0.0320, 0.0298,
          0.0273, 0.0327]]], grad_fn=<SliceBackward0>)
With T=8:
tensor([[[0.0291, 0.0310, 0.0307, 0.0269, 0.0294, 0.0282, 0.0303, 0.0292,
          0.0280, 0.0306]]], grad_fn=<SliceBackward0>)


In [None]:

F.kl_div()

## CRF Trial

In [None]:
import torch
from torchcrf import CRF

In [None]:
num_tags = 5
model = CRF(num_tags)

In [None]:
seq_length = 3
batch_size = 2
emissions = torch.randn(seq_length, batch_size, num_tags)
print(emissions.shape)
print(emissions)

torch.Size([3, 2, 5])
tensor([[[-0.9194, -0.4299, -2.3956,  0.7575,  0.5029],
         [ 0.3142, -0.8432, -0.7544, -0.6260, -0.8528]],

        [[-1.1581, -0.1062, -0.3376,  0.0401, -0.3961],
         [ 1.1090, -1.3114, -0.1248, -0.4729,  0.4600]],

        [[-0.5804,  0.7643,  0.8325, -0.6142, -1.6916],
         [-0.8408, -0.5836,  0.6385, -1.1154,  1.0496]]])


In [None]:
tags = torch.tensor([
    [0, 1], [2, 4], [3, 1]], dtype=torch.long
    )

In [None]:
model(emissions, tags)

tensor(-12.1175, grad_fn=<SumBackward0>)