In [14]:
# 목표 : torchtext 라이브러리로 텍스트 분류
# 데이터 준비
import torch
from torchtext.datasets import AG_NEWS

In [15]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
tokenizer = get_tokenizer("basic_english")
train_iter, test_iter = AG_NEWS()

In [16]:
def yield_tokens(data_iter):
    for _, text in data_iter:
        yield tokenizer(text) 
vocab = build_vocab_from_iterator(yield_tokens(train_iter), specials=["<unk>","<pad>"])
vocab.set_default_index(vocab["<unk>"])

In [17]:
# 텍스트 > 정수 인코딩
text_pipeline = lambda x: vocab(tokenizer(x))

# 레이블 > 정수 인코딩
label_pipeline = lambda x: int(x) - 1

In [18]:
def collate_batch(batch):
    label_list, text_list, offsets = [], [], [0]
    for (_label, _text) in batch:
        label_list.append(label_pipeline(_label))
        processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)
        text_list.append(processed_text)
        offsets.append(processed_text.size(0))
    label_list = torch.tensor(label_list, dtype=torch.int64)
    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
    text_list = torch.cat(text_list)
    return label_list, text_list, offsets

In [20]:
type(vocab)

torchtext.vocab.vocab.Vocab

In [21]:
# https://pytorch.org/text/stable/vocab.html
# 역시 문서 찾는게 AI보다 낫다....
for index, token in enumerate(vocab.get_itos()):
    print(f"Index: {index}, Token: {token}")
    break

Index: 0, Token: <unk>


In [22]:
token_to_id = {token: idx for idx, token in enumerate(vocab.get_itos())}
id_to_token = {idx: token for idx, token in enumerate(vocab.get_itos())}

In [33]:
import numpy as np

def pad_sequences(sequences, max_length, pad_value, start="R"):
    result = list()
    for sequence in sequences:
        sequence = sequence[:max_length] if start == "R" else sequence[-1*max_length:]
        pad_length = max_length - len(sequence)
        padded_sequence = sequence + [pad_value] * pad_length if start == "R" else  [pad_value] * pad_length + sequence
        result.append(padded_sequence)
    return np.asarray(result)

unk_id = token_to_id["<unk>"]

train_ids = [
    [token_to_id.get(token, unk_id) for token in review] for _, review in train_iter
]
test_ids = [
    [token_to_id.get(token, unk_id) for token in review] for _, review in test_iter
]



[0,
 6,
 1868,
 1868,
 0,
 0,
 84,
 2,
 0,
 0,
 2884,
 6,
 1995,
 10,
 0,
 0,
 1868,
 6,
 1091,
 0,
 0,
 6,
 987,
 3369,
 0,
 0,
 138,
 84,
 372,
 0,
 84,
 2960,
 2884,
 0,
 0,
 1868,
 6,
 987,
 3369,
 0,
 14,
 0,
 2884,
 52,
 84,
 2884,
 1995,
 10,
 15,
 0,
 0,
 2884,
 52,
 84,
 2884,
 1995,
 10,
 0,
 16,
 0,
 0,
 2960,
 372,
 1995,
 84,
 16,
 10,
 2884,
 1868,
 1868,
 2884,
 1995,
 10,
 4,
 0,
 0,
 6,
 1868,
 1868,
 0,
 0,
 84,
 1995,
 2884,
 2884,
 84,
 17,
 10,
 0,
 902,
 1091,
 283,
 138,
 902,
 1868,
 283,
 138,
 3366,
 1705,
 2207,
 6,
 138,
 902,
 0,
 372,
 2339,
 0,
 52,
 1868,
 84,
 1995,
 6,
 16,
 987,
 5001,
 138,
 283,
 987,
 10,
 4,
 0,
 6,
 1995,
 2884,
 0,
 10,
 2884,
 2884,
 283,
 138,
 3366,
 0,
 3366,
 1995,
 2884,
 2884,
 138,
 0,
 6,
 3366,
 6,
 283,
 138,
 2]

In [34]:
max_length = 32
pad_id = token_to_id["<pad>"]

train_ids = pad_sequences(train_ids, max_length, pad_id, start="R")
test_ids = pad_sequences(test_ids, max_length, pad_id, start="R")

print(train_ids[0])
print(test_ids[0])

[   0    6 1868 1868    0    0   84    2    0    0 2884    6 1995   10
    0    0 1868    6 1091    0    0    6  987 3369    0    0  138   84
  372    0   84 2960]
[   0 2884    6 1995   10    0 2339  372 1995    0    0    0    0    0
  954 2884  138   10  283  372  138    0    6 2339   84 2884 1995    0
   84    6 1868 3369]


In [37]:
from torch.utils.data import TensorDataset, DataLoader

train_ids = torch.tensor(train_ids)
test_ids = torch.tensor(test_ids)


  train_ids = torch.tensor(train_ids)
  test_ids = torch.tensor(test_ids)


In [50]:

train_labels = torch.tensor([label-1 for label, _ in train_iter], dtype=torch.float32)
test_labels = torch.tensor([label-1 for label, _ in test_iter], dtype=torch.float32)

train_dataset = TensorDataset(train_ids, train_labels)
test_dataset = TensorDataset(test_ids, test_labels)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

In [51]:
from torch import nn


class SentenceClassifier(nn.Module):
    def __init__(
        self,
        n_vocab,
        hidden_dim,
        embedding_dim,
        n_layers,
        dropout=0.5,
        bidirectional=True,
        model_type="lstm"
    ):
        super().__init__()

        self.embedding = nn.Embedding(
            num_embeddings=n_vocab,
            embedding_dim=embedding_dim,
            padding_idx=0
        )
        if model_type == "rnn":
            self.model = nn.RNN(
                input_size=embedding_dim,
                hidden_size=hidden_dim,
                num_layers=n_layers,
                bidirectional=bidirectional,
                dropout=dropout,
                batch_first=True,
            )
        elif model_type == "lstm":
            self.model = nn.LSTM(
                input_size=embedding_dim,
                hidden_size=hidden_dim,
                num_layers=n_layers,
                bidirectional=bidirectional,
                dropout=dropout,
                batch_first=True,
            )

        if bidirectional:
            self.classifier = nn.Linear(hidden_dim * 2, 4)
        else:
            self.classifier = nn.Linear(hidden_dim, 4)
        self.dropout = nn.Dropout(dropout)

    def forward(self, inputs):
        embeddings = self.embedding(inputs)
        output, _ = self.model(embeddings)
        last_output = output[:, -1, :]
        last_output = self.dropout(last_output)
        logits = self.classifier(last_output)
        return logits

In [52]:
from torch import optim
import torch.nn as nn

n_vocab = len(token_to_id)
hidden_dim = 64 
embedding_dim = 128
n_layers = 2

device = "cuda" if torch.cuda.is_available() else "cpu"
classifier = SentenceClassifier( 
    n_vocab=n_vocab, 
    hidden_dim=hidden_dim, 
    embedding_dim=embedding_dim, 
    n_layers=n_layers
).to(device)

criterion = nn.CrossEntropyLoss().to(device)
optimizer = optim.AdamW(classifier.parameters(), lr=0.001)  # 학습률이 빠른 AdamW로 옵티마이저 설정 

In [54]:
def train(model, datasets, criterion, optimizer, device, interval):
    model.train()
    losses = list()
    corrects = 0
    total = 0

    for step, (input_ids, labels) in enumerate(datasets): 
        input_ids = input_ids.to(device)
        labels = labels.to(device)

        logits = model(input_ids)
        loss = criterion(logits, labels.long())
        losses.append(loss.item())

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        _, predicted = torch.max(logits, 1)
        corrects += (predicted == labels).sum().item()
        total += labels.size(0)

        if step % interval == 0:
            accuracy = corrects / total
            print(f"Train Loss {step} : {np.mean(losses)}, Train Accuracy : {accuracy}")


epochs = 5
interval = 500

for epoch in range(epochs):
    train(classifier, train_loader, criterion, optimizer, device, interval)
# 힘내라 내 모델아.... 

Train Loss 0 : 1.3392443656921387, Train Accuracy : 0.4375


Train Loss 500 : 1.3809785226623932, Train Accuracy : 0.28642714570858285
Train Loss 1000 : 1.379379356300438, Train Accuracy : 0.29164585414585414
Train Loss 1500 : 1.3706254418256838, Train Accuracy : 0.30654563624250497
Train Loss 2000 : 1.3597825100158585, Train Accuracy : 0.3241816591704148
Train Loss 2500 : 1.346627144802098, Train Accuracy : 0.3398890443822471
Train Loss 3000 : 1.3327666244003147, Train Accuracy : 0.3551316227924025
Train Loss 3500 : 1.3192014002146226, Train Accuracy : 0.3680734075978292
Train Loss 4000 : 1.3079322463898204, Train Accuracy : 0.37940514871282177
Train Loss 4500 : 1.295165769419811, Train Accuracy : 0.39130193290379917
Train Loss 5000 : 1.2826194867709235, Train Accuracy : 0.40293191361727654
Train Loss 5500 : 1.271259783636633, Train Accuracy : 0.4124704599163788
Train Loss 6000 : 1.2588925342562993, Train Accuracy : 0.42210673221129813
Train Loss 6500 : 1.246810514144651, Train Accuracy : 0.43127018920166127
Train Loss 7000 : 1.2349359751547972

In [None]:
# 55m 30.7s...많이도 걸렸다!
# 근데 빅분기 과락나왔다...큰일났다 