In [1]:
import os
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from transformers import GPT2LMHeadModel
from transformers import GPT2Tokenizer

2023-09-10 04:44:36.667451: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
os.environ["WANDB_DISABLED"] = "true"

In [3]:
train = pd.read_csv("data/food_aging_train.csv")
test = pd.read_csv("data/food_aging_test.csv")

In [4]:
train_list_names = train["식품오타"].values.tolist()
train_list_labels = train["label"].values.tolist()

test_list_names = test["식품오타"].values.tolist()
test_list_labels = test["label"].values.tolist()

In [5]:
tokenizer = GPT2Tokenizer.from_pretrained("eaglewatch/gpt2-ko-wikipedia")

In [6]:
words = train_list_names 
labels = train_list_labels 

In [10]:
MAX_LENGTH = 100  # Adjust as needed

def tokenize_data(texts, labels):
    input_ids = []
    attention_masks = []

    for text in texts:
        encoding = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=MAX_LENGTH,
            truncation=True,
            padding='max_length',
            return_attention_mask=True
        )
        input_ids.append(encoding['input_ids'])
        attention_masks.append(encoding['attention_mask'])

    return input_ids, attention_masks, labels

input_ids, attention_masks, labels = tokenize_data(words, labels)

In [12]:
class GPT2Classifier(nn.Module):
    def __init__(self, model_name, num_labels=775):
        super(GPT2Classifier, self).__init__()
        self.gpt2 = GPT2LMHeadModel.from_pretrained(model_name)
        self.drop = nn.Dropout(0.1)
        self.out = nn.Linear(self.gpt2.config.vocab_size, num_labels)

    def forward(self, input_ids, attention_mask=None):
        outputs = self.gpt2(input_ids=input_ids, attention_mask=attention_mask)
        hidden_states = outputs[0]  # Here's the change. Use the first item in the tuple.
        output = hidden_states[:, 0, :]
        output = self.drop(output)
        logits = self.out(output)
        return logits

In [13]:
num_labels = len(set(labels))
num_labels

775

In [14]:
model = GPT2Classifier("eaglewatch/gpt2-ko-wikipedia", num_labels=num_labels)

In [15]:
# Convert data to tensors
input_ids = torch.tensor(input_ids)
attention_masks = torch.tensor(attention_masks)
labels = torch.tensor(labels)

dataset = TensorDataset(input_ids, attention_masks, labels)
dataloader = DataLoader(dataset, batch_size=16)  # Adjust batch size as needed

# Setup GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Optimizer and loss
optimizer = optim.AdamW(model.parameters(), lr=1e-5)
loss_fn = nn.CrossEntropyLoss()

In [16]:
%%time 
# Training loop
NUM_EPOCHS = 400  # Adjust as needed

for epoch in range(NUM_EPOCHS):
    for batch in dataloader:
        batch_input_ids, batch_attention_masks, batch_labels = [b.to(device) for b in batch]
        
        optimizer.zero_grad()
        logits = model(batch_input_ids, batch_attention_masks)
        
        loss = loss_fn(logits, batch_labels)
        loss.backward()
        optimizer.step()

        print(f"Epoch: {epoch}, Loss: {loss.item()}")

Epoch: 0, Loss: 7.135964870452881
Epoch: 0, Loss: 6.733551979064941
Epoch: 0, Loss: 6.597350120544434
Epoch: 0, Loss: 6.82240629196167
Epoch: 0, Loss: 7.245760440826416
Epoch: 0, Loss: 7.263490200042725
Epoch: 0, Loss: 7.163374900817871
Epoch: 0, Loss: 7.091363906860352
Epoch: 0, Loss: 7.367548942565918
Epoch: 0, Loss: 6.974165439605713
Epoch: 0, Loss: 6.592055320739746
Epoch: 0, Loss: 7.368448257446289
Epoch: 0, Loss: 7.309906482696533
Epoch: 0, Loss: 7.314796447753906
Epoch: 0, Loss: 7.268977165222168
Epoch: 0, Loss: 7.080490589141846
Epoch: 0, Loss: 7.424739837646484
Epoch: 0, Loss: 7.645434856414795
Epoch: 0, Loss: 7.6883158683776855
Epoch: 0, Loss: 7.481388568878174
Epoch: 0, Loss: 6.996306896209717
Epoch: 0, Loss: 6.713103771209717
Epoch: 0, Loss: 6.987492084503174
Epoch: 0, Loss: 7.76828145980835
Epoch: 0, Loss: 7.789834499359131
Epoch: 0, Loss: 7.740433216094971
Epoch: 0, Loss: 7.517552375793457
Epoch: 0, Loss: 7.40606164932251
Epoch: 0, Loss: 8.034395217895508
Epoch: 0, Loss: 

In [17]:
save_path = "gpt2-ko-wikipedia-classifier.pth"
torch.save(model.state_dict(), save_path)

In [18]:
save_path = "gpt2-ko-wikipedia-classifier.prm"
torch.save(model.state_dict(), save_path, pickle_protocol=4)