In [1]:
import torch
import pandas as pd
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertModel
from kobert_tokenizer import KoBERTTokenizer

In [2]:
# Tokenizer for the KoBERT checkpoint. The class-mismatch warning shown in this
# cell's output comes from this call: the checkpoint declares 'XLNetTokenizer'
# while it is being loaded through KoBERTTokenizer.
KOBERT_CHECKPOINT = 'skt/kobert-base-v1'
tokenizer = KoBERTTokenizer.from_pretrained(KOBERT_CHECKPOINT)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'XLNetTokenizer'. 
The class this function is called from is 'KoBERTTokenizer'.


In [3]:
class koBertClassifier(nn.Module):
    """BERT encoder followed by a two-layer classification head.

    The head reads the encoder output at sequence position 0, applies a
    hidden-size -> hidden-size linear layer, ReLU and dropout, then projects
    to ``num_labels`` logits.

    Args:
        num_labels: Number of output classes (size of the final linear layer).
        model_name: Checkpoint identifier passed to ``BertModel.from_pretrained``.
            Defaults to the KoBERT checkpoint this notebook was written for.
    """

    def __init__(self, num_labels, model_name='skt/kobert-base-v1'):
        super().__init__()
        self.bert = BertModel.from_pretrained(model_name)
        hidden_size = self.bert.config.hidden_size
        self.pre_classifier = nn.Linear(hidden_size, hidden_size)
        self.classifier = nn.Linear(hidden_size, num_labels)
        # NOTE(review): this reads the *attention* dropout probability; a head
        # like this would more commonly use ``hidden_dropout_prob``. Kept as-is
        # so training-time behaviour matches existing checkpoints - confirm.
        self.dropout = nn.Dropout(self.bert.config.attention_probs_dropout_prob)

    def forward(self, input_ids, attention_mask=None):
        """Return unnormalised class logits for a batch of token ids.

        Args:
            input_ids: Token id tensor forwarded to the encoder.
            attention_mask: Optional mask forwarded to the encoder unchanged.

        Returns:
            The output of the final linear layer (one row of ``num_labels``
            logits per input sequence).
        """
        # Element 0 of the encoder output is the per-token hidden states.
        hidden_state = self.bert(input_ids=input_ids, attention_mask=attention_mask)[0]
        # Position 0 along the sequence axis is used as the sentence summary
        # (the [CLS] token, per the original author's comment).
        cls_repr = hidden_state[:, 0]
        # Functional ReLU: no need to build a fresh nn.ReLU module per call.
        cls_repr = nn.functional.relu(self.pre_classifier(cls_repr))
        cls_repr = self.dropout(cls_repr)
        return self.classifier(cls_repr)

In [4]:
# Size of the classification head. It has to agree with the classifier weights
# in the checkpoint loaded in the next cell, or load_state_dict will reject them.
NUM_LABELS = 775
model = koBertClassifier(num_labels=NUM_LABELS)

In [5]:
model_path = "koBERT-ko-wikipedia-classifier.prm"

# Prefer the first GPU, but keep the notebook runnable on CPU-only machines.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# map_location="cpu" lets a checkpoint that was saved from a GPU be read on any
# host; the weights are moved to the target device once, by model.to() below.
# NOTE(review): torch.load unpickles the file - only load checkpoints you
# trust (recent PyTorch versions offer weights_only=True for plain state dicts).
state_dict = torch.load(model_path, map_location="cpu")
model.load_state_dict(state_dict)
model.to(device)  # last expression: the cell displays the model structure

koBertClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(8002, 768, padding_idx=1)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_a

In [6]:
# Evaluation data; the later cells read its "식품오타" and "label" columns.
TEST_CSV_PATH = "data/food_aging_test.csv"
test = pd.read_csv(TEST_CSV_PATH)

In [7]:
# Pull the input-text column and the target column out as plain Python lists.
test_list_names, test_list_labels = (
    test[column].values.tolist() for column in ("식품오타", "label")
)

In [8]:
# Short aliases used by the tokenisation cell.
words, labels = test_list_names, test_list_labels

In [9]:
# Fixed encoded length: tokenize_data below passes this as max_length with
# truncation enabled and padding='max_length', so every text ends up with
# exactly this many token ids.
MAX_LENGTH = 100  # Adjust as needed

def tokenize_data(texts, labels, encoder=None, max_length=None):
    """Encode texts into fixed-length token ids and attention masks.

    Args:
        texts: Iterable of strings to encode, one call to ``encode_plus`` each.
        labels: Labels aligned with ``texts``. Returned unchanged so the three
            results can be unpacked together.
        encoder: Object exposing ``encode_plus``. Defaults to the notebook-level
            ``tokenizer``.
        max_length: Length every encoding is truncated/padded to. Defaults to
            the notebook-level ``MAX_LENGTH``.

    Returns:
        ``(input_ids, attention_masks, labels)`` where the first two are lists
        holding one entry per text, taken from the encoder's ``'input_ids'``
        and ``'attention_mask'`` fields.

    Raises:
        ValueError: If ``labels`` has a length and it differs from the number
            of texts.
    """
    # Fall back to the notebook globals only when the caller supplied nothing,
    # so existing two-argument calls behave exactly as before.
    if encoder is None:
        encoder = tokenizer
    if max_length is None:
        max_length = MAX_LENGTH

    texts = list(texts)
    # Catch misaligned inputs here rather than as a size error further down.
    if hasattr(labels, "__len__") and len(labels) != len(texts):
        raise ValueError(
            f"texts and labels must be the same length, got {len(texts)} and {len(labels)}"
        )

    input_ids = []
    attention_masks = []
    for text in texts:
        encoding = encoder.encode_plus(
            text,
            add_special_tokens=True,
            max_length=max_length,
            truncation=True,
            padding='max_length',
            return_attention_mask=True
        )
        input_ids.append(encoding['input_ids'])
        attention_masks.append(encoding['attention_mask'])

    return input_ids, attention_masks, labels


# Encode the whole test set; the labels come back untouched.
test_input_ids, test_attention_masks, test_labels = tokenize_data(
    texts=words,
    labels=labels,
)

In [10]:
# Turn the encoded lists into tensors, in the order the dataset expects them.
input_ids, attention_masks, labels = (
    torch.tensor(values)
    for values in (test_input_ids, test_attention_masks, test_labels)
)

# Each dataset item is an (input_ids, attention_mask, label) triple.
dataset = TensorDataset(input_ids, attention_masks, labels)
# Adjust batch size as needed.
dataloader = DataLoader(dataset=dataset, batch_size=16)


In [11]:
# Inference over the whole dataloader.
# Results:
#   all_predictions - one row of raw logits per example (despite the name,
#                     these are scores, not argmax class ids; top_k_accuracy
#                     needs the full score vector)
#   all_true_labels - the matching ground-truth labels
model.eval()  # inference mode for layers such as dropout

# Run on whatever device the model already lives on instead of assuming cuda:0.
device = next(model.parameters()).device

logit_chunks = []
label_chunks = []

with torch.no_grad():
    for batch_input_ids, batch_attention_masks, batch_labels in dataloader:
        logits = model(batch_input_ids.to(device), batch_attention_masks.to(device))
        logit_chunks.append(logits.cpu())
        # Labels are only collected, never fed to the model, so they are not
        # sent to the device.
        label_chunks.append(batch_labels.cpu())

if not logit_chunks:
    raise ValueError("dataloader produced no batches; nothing to evaluate")

# One contiguous array each, rather than a Python list of per-row arrays:
# torch.tensor(...) in the metric cells then converts in a single fast copy
# instead of taking PyTorch's slow list-of-ndarrays path.
all_predictions = torch.cat(logit_chunks).numpy()
all_true_labels = torch.cat(label_chunks).numpy()

In [12]:
def top_k_accuracy(preds, labels, k=1):
    """Fraction of rows whose label is among the k highest-scoring columns.

    Args:
        preds: 2-D tensor of scores, one row per example (top-k is taken
            along dim 1).
        labels: Tensor of target column indices, one per row of ``preds``.
        k: How many of the highest-scoring columns count as a hit.

    Returns:
        Number of hits divided by ``labels.size(0)``, as a Python float.
    """
    ranked_classes = torch.topk(preds, k, dim=1).indices
    # Repeat each label across its row so it can be compared with every one of
    # the k candidates at once.
    targets = labels.view(-1, 1).expand_as(ranked_classes)
    hits = torch.eq(ranked_classes, targets)
    hit_count = hits.flatten().float().sum().item()
    return hit_count / labels.size(0)

In [14]:
# Top-5 accuracy on the test set.
top_k_accuracy(
    preds=torch.tensor(all_predictions),
    labels=torch.tensor(all_true_labels),
    k=5,
)

  top_k_accuracy(torch.tensor(all_predictions), torch.tensor(all_true_labels), k=5)


0.9503424657534246

In [15]:
# Top-3 accuracy on the test set.
top_k_accuracy(
    preds=torch.tensor(all_predictions),
    labels=torch.tensor(all_true_labels),
    k=3,
)

0.9434931506849316

In [16]:
# Top-1 (exact-match) accuracy on the test set.
top_k_accuracy(
    preds=torch.tensor(all_predictions),
    labels=torch.tensor(all_true_labels),
    k=1,
)

0.8767123287671232