In [1]:
import torch.nn as nn
from kobert_tokenizer import KoBERTTokenizer
from transformers import BertModel
import torch
import pandas as pd
import os
import numpy as np

In [2]:
os.environ["WANDB_DISABLED"] = "true"

In [3]:
train = pd.read_csv("data/food_aging_train.csv")
test = pd.read_csv("data/food_aging_test.csv")

In [4]:
train_list_names = train["식품오타"].values.tolist()
train_list_labels = train["label"].values.tolist()

test_list_names = test["식품오타"].values.tolist()
test_list_labels = test["label"].values.tolist()

In [5]:
tokenizer = KoBERTTokenizer.from_pretrained('skt/kobert-base-v1')

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'XLNetTokenizer'. 
The class this function is called from is 'KoBERTTokenizer'.


In [6]:
words = train_list_names 
labels = train_list_labels 

In [10]:
MAX_LENGTH = 100  # Adjust as needed

def tokenize_data(texts, labels):
    input_ids = []
    attention_masks = []

    for text in texts:
        encoding = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=MAX_LENGTH,
            truncation=True,
            padding='max_length',
            return_attention_mask=True
        )
        input_ids.append(encoding['input_ids'])
        attention_masks.append(encoding['attention_mask'])

    return input_ids, attention_masks, labels

input_ids, attention_masks, labels = tokenize_data(words, labels)

In [11]:
bert = BertModel.from_pretrained('skt/kobert-base-v1')
bert.config

BertConfig {
  "_name_or_path": "skt/kobert-base-v1",
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "author": "Heewon Jeon(madjakarta@gmail.com)",
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "kobert_version": 1.0,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.30.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 8002
}

In [12]:
class koBertClassifier(nn.Module):
    def __init__(self, num_labels):
        super(koBertClassifier, self).__init__()
        self.bert = BertModel.from_pretrained('skt/kobert-base-v1')
        self.pre_classifier = nn.Linear(self.bert.config.hidden_size, self.bert.config.hidden_size)
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)
        self.dropout = nn.Dropout(self.bert.config.attention_probs_dropout_prob)

    def forward(self, input_ids, attention_mask=None):
        hidden_state = self.bert(input_ids=input_ids, attention_mask=attention_mask)[0]
        pooled_output = hidden_state[:, 0]  # take [CLS] token representation
        pooled_output = self.pre_classifier(pooled_output)
        pooled_output = nn.ReLU()(pooled_output)
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        
        return logits

In [13]:
num_labels = len(set(labels))
num_labels

775

In [14]:
model = koBertClassifier(num_labels=num_labels)

In [15]:
from torch.utils.data import DataLoader, TensorDataset
import torch.optim as optim

# Convert data to tensors
input_ids = torch.tensor(input_ids)
attention_masks = torch.tensor(attention_masks)
labels = torch.tensor(labels)

dataset = TensorDataset(input_ids, attention_masks, labels)
dataloader = DataLoader(dataset, batch_size=16)  # Adjust batch size as needed

# Setup GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Optimizer and loss
optimizer = optim.AdamW(model.parameters(), lr=1e-5)
loss_fn = nn.CrossEntropyLoss()

In [16]:
%%time 
# Training loop
NUM_EPOCHS = 400  # Adjust as needed

for epoch in range(NUM_EPOCHS):
    for batch in dataloader:
        batch_input_ids, batch_attention_masks, batch_labels = [b.to(device) for b in batch]
        
        optimizer.zero_grad()
        logits = model(batch_input_ids, batch_attention_masks)
        
        loss = loss_fn(logits, batch_labels)
        loss.backward()
        optimizer.step()

        print(f"Epoch: {epoch}, Loss: {loss.item()}")

Epoch: 0, Loss: 6.6580963134765625
Epoch: 0, Loss: 6.651524543762207
Epoch: 0, Loss: 6.665482044219971
Epoch: 0, Loss: 6.657695293426514
Epoch: 0, Loss: 6.622956275939941
Epoch: 0, Loss: 6.636784553527832
Epoch: 0, Loss: 6.652699947357178
Epoch: 0, Loss: 6.68912410736084
Epoch: 0, Loss: 6.6610846519470215
Epoch: 0, Loss: 6.620724678039551
Epoch: 0, Loss: 6.650066375732422
Epoch: 0, Loss: 6.671238422393799
Epoch: 0, Loss: 6.674171447753906
Epoch: 0, Loss: 6.644669532775879
Epoch: 0, Loss: 6.651818752288818
Epoch: 0, Loss: 6.628479957580566
Epoch: 0, Loss: 6.647660732269287
Epoch: 0, Loss: 6.6516499519348145
Epoch: 0, Loss: 6.687712669372559
Epoch: 0, Loss: 6.635111331939697
Epoch: 0, Loss: 6.650241851806641
Epoch: 0, Loss: 6.637895107269287
Epoch: 0, Loss: 6.657815933227539
Epoch: 0, Loss: 6.689864158630371
Epoch: 0, Loss: 6.661291599273682
Epoch: 0, Loss: 6.652357578277588
Epoch: 0, Loss: 6.652719497680664
Epoch: 0, Loss: 6.690530300140381
Epoch: 0, Loss: 6.66546630859375
Epoch: 0, Los

In [28]:
save_path = "koBERT-ko-wikipedia-classifier.pth"
torch.save(model.state_dict(), save_path)

In [29]:
save_path = "koBERT-ko-wikipedia-classifier.prm"
torch.save(model.state_dict(), save_path, pickle_protocol=4)