In [1]:
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from transformers import DistilBertModel
from transformers import DistilBertTokenizerFast

2023-09-08 19:19:50.320211: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
class DistilBertClassifier(nn.Module):
    """Multilingual DistilBERT encoder with a small feed-forward classification head.

    The head is ``Linear -> ReLU -> Dropout -> Linear`` applied to the encoder
    output at sequence position 0. Attribute names (``bert``, ``pre_classifier``,
    ``classifier``, ``dropout``) are the keys used by ``state_dict``.

    Args:
        num_labels: Size of the output layer, i.e. number of classes (default 775).
    """

    def __init__(self, num_labels=775):
        super().__init__()
        self.bert = DistilBertModel.from_pretrained('distilbert-base-multilingual-cased')
        encoder_cfg = self.bert.config
        width = encoder_cfg.hidden_size
        self.pre_classifier = nn.Linear(width, width)
        self.classifier = nn.Linear(width, num_labels)
        # The dropout probability is read from the encoder's `attention_dropout` setting.
        self.dropout = nn.Dropout(encoder_cfg.attention_dropout)

    def forward(self, input_ids, attention_mask=None):
        """Compute unnormalised class scores (logits) for a batch of token ids.

        Args:
            input_ids: Token ids forwarded to the encoder.
            attention_mask: Optional mask forwarded to the encoder unchanged.

        Returns:
            The output of the final linear layer: one row of ``num_labels``
            scores per input sequence.
        """
        encoder_out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # First element of the encoder output is the per-token hidden state;
        # position 0 along the sequence axis is the [CLS] token.
        cls_vector = encoder_out[0][:, 0]
        activated = nn.functional.relu(self.pre_classifier(cls_vector))
        return self.classifier(self.dropout(activated))

In [3]:
# The tokenizer is loaded from the same checkpoint name the classifier's encoder uses.
tokenizer = DistilBertTokenizerFast.from_pretrained(
    pretrained_model_name_or_path='distilbert-base-multilingual-cased'
)
# Built with the class default for `num_labels`; fine-tuned weights are loaded separately.
model = DistilBertClassifier()

In [4]:
model_path = "distilBERT-ko-wikipedia-classifier.prm"

# Use the first GPU when one is present, otherwise fall back to the CPU so the
# notebook still runs on machines without CUDA.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# `map_location` remaps tensors that were saved from a GPU onto `device`, which
# is required to load a CUDA checkpoint on a CPU-only host.
# `weights_only=True` restricts unpickling to tensors and plain containers;
# the payload is consumed by `load_state_dict`, so nothing else is needed.
state_dict = torch.load(model_path, map_location=device, weights_only=True)
model.load_state_dict(state_dict)
model.to(device)  # Make sure to move the model to the desired device

DistilBertClassifier(
  (bert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): L

In [5]:
# Evaluation split; later cells read its "식품오타" and "label" columns.
test = pd.read_csv(filepath_or_buffer="data/food_aging_test.csv")

In [6]:
# Pull the two columns used for evaluation out of the frame as plain Python lists:
# "식품오타" is the text fed to the tokenizer, "label" the target compared against
# the model's predicted class indices.
test_list_names, test_list_labels = (
    test[column].values.tolist() for column in ("식품오타", "label")
)

In [7]:
# Shorter aliases for the same list objects (no copy is made).
words, labels = test_list_names, test_list_labels

In [8]:
MAX_LENGTH = 100  # Default padded/truncated sequence length; adjust as needed


def tokenize_data(texts, labels, max_length=None, text_tokenizer=None):
    """Tokenize a collection of strings into fixed-length id and mask lists.

    All texts are encoded in a single batched tokenizer call instead of one
    deprecated ``encode_plus`` call per string.

    Args:
        texts: Iterable of strings to encode.
        labels: Returned unchanged as the third element, so the call site can
            unpack inputs and targets together.
        max_length: Length every sequence is truncated/padded to. ``None``
            (the default) uses the module-level ``MAX_LENGTH``.
        text_tokenizer: Callable tokenizer to use. ``None`` (the default) uses
            the module-level ``tokenizer``.

    Returns:
        ``(input_ids, attention_masks, labels)`` where the first two are lists
        with one list of ints per input text.
    """
    if max_length is None:
        max_length = MAX_LENGTH
    if text_tokenizer is None:
        text_tokenizer = tokenizer  # notebook-level tokenizer

    # Materialise so any iterable (tuple, Series, generator) is accepted; the
    # batched tokenizer call expects a list of strings.
    texts = list(texts)
    if not texts:
        # Nothing to encode; skip the tokenizer call entirely.
        return [], [], labels

    encoded = text_tokenizer(
        texts,
        add_special_tokens=True,
        max_length=max_length,
        truncation=True,
        padding='max_length',
        return_attention_mask=True,
    )
    return list(encoded['input_ids']), list(encoded['attention_mask']), labels


# Encode the test strings; the labels come back unchanged as the third element.
test_input_ids, test_attention_masks, test_labels = tokenize_data(
    texts=words,
    labels=labels,
)

In [9]:
# Turn the tokenised test split (plain Python lists) into tensors.
input_ids, attention_masks, labels = (
    torch.tensor(values)
    for values in (test_input_ids, test_attention_masks, test_labels)
)

# Each dataset item is an (input_ids, attention_mask, label) triple; the loader
# yields them in batches of 16 without shuffling (the DataLoader default).
dataset = TensorDataset(input_ids, attention_masks, labels)
dataloader = DataLoader(dataset=dataset, batch_size=16)  # Adjust batch size as needed


In [97]:
# Inference mode: disables dropout inside the model.
model.eval()
all_predictions = []  # one numpy row of raw logits per test example
all_true_labels = []  # matching ground-truth labels, in the same order

# Run on whichever device the model's weights currently live on, rather than
# assuming a GPU is present.
model_device = next(model.parameters()).device

with torch.no_grad():
    for batch_input_ids, batch_attention_masks, batch_labels in dataloader:
        logits = model(
            batch_input_ids.to(model_device),
            batch_attention_masks.to(model_device),
        )

        # Keep the full logit rows (not just the argmax) so top-k metrics can
        # be computed afterwards. Labels are never needed on the model device.
        all_predictions.extend(logits.cpu().numpy())
        all_true_labels.extend(batch_labels.cpu().numpy())

In [100]:
def top_k_accuracy(preds, labels, k=1):
    """Compute top-k accuracy for predictions and labels.

    A sample counts as correct when its true label is among the ``k`` classes
    with the highest scores.

    Args:
        preds: 2-D scores, one row per sample and one column per class. A
            tensor or anything ``torch.as_tensor`` accepts.
        labels: One class index per sample (tensor or array-like); flattened
            before use.
        k: Number of top-scoring classes to consider. Values larger than the
            number of classes are clamped to it.

    Returns:
        The fraction of correctly classified samples as a Python float.

    Raises:
        ValueError: If ``preds`` is not 2-D, if the number of labels differs
            from the number of rows in ``preds``, or if there are no samples.
    """
    preds = torch.as_tensor(preds)
    if preds.dim() != 2:
        raise ValueError(
            f"preds must be 2-D (samples x classes), got {preds.dim()} dimension(s)"
        )
    labels = torch.as_tensor(labels).reshape(-1).to(preds.device)

    num_samples, num_classes = preds.shape
    # Without this check a single label would silently broadcast against every
    # row and the result would be divided by 1 instead of the sample count.
    if labels.numel() != num_samples:
        raise ValueError(
            f"got {labels.numel()} labels for {num_samples} prediction rows"
        )
    if num_samples == 0:
        raise ValueError("cannot compute accuracy on an empty batch")

    # topk raises when k exceeds the class count; asking for more classes than
    # exist is equivalent to asking for all of them.
    k = min(k, num_classes)

    top_k_preds = preds.topk(k, dim=1)[1]  # Get top-k predicted classes
    # Top-k indices are distinct within a row, so each row has at most one hit.
    hits = top_k_preds.eq(labels.view(-1, 1)).sum().item()
    return hits / num_samples

In [106]:
# Top-5 accuracy over the collected logits and labels.
top_k_accuracy(
    preds=torch.tensor(all_predictions),
    labels=torch.tensor(all_true_labels),
    k=5,
)

0.6952054794520548

In [107]:
# Top-3 accuracy over the collected logits and labels.
top_k_accuracy(
    preds=torch.tensor(all_predictions),
    labels=torch.tensor(all_true_labels),
    k=3,
)

0.6832191780821918

In [108]:
# Top-1 (standard) accuracy over the collected logits and labels.
top_k_accuracy(
    preds=torch.tensor(all_predictions),
    labels=torch.tensor(all_true_labels),
    k=1,
)

0.6301369863013698