In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.legacy import data

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
import MeCab

tagger = MeCab.Tagger("-Owakati")


def tokenizer(text):
    text = tagger.parse(text)
    return text.split()


tokenizer("私は日本人です。")

['私', 'は', '日本人', 'です', '。']

In [3]:
TEXT = data.Field(sequential=True, use_vocab=True, tokenize=tokenizer)
LABEL = data.Field(sequential=False, use_vocab=False)

train_dataset, val_dataset, test_dataset = data.TabularDataset.splits(
    path="./data",
    train="train.tsv",
    validation="val.tsv",
    test="test.tsv",
    format="tsv",
    fields=[("text", TEXT), ("label", LABEL)],
    skip_header=True,
)

vars(train_dataset[0])

{'text': ['ぼけ',
  'っと',
  'し',
  'て',
  'たら',
  'こんな',
  '時間',
  '｡',
  'チャリ',
  'ある',
  'から',
  '食べ',
  'に',
  'で',
  'たい',
  'のに',
  '…'],
 'label': '0'}

In [4]:
TEXT.build_vocab(train_dataset, min_freq=1)

In [5]:
batch_size = 32

train_iter = data.Iterator(
    train_dataset, batch_size=batch_size, device=device, train=True
)

val_iter = data.Iterator(
    val_dataset, batch_size=batch_size, device=device, train=False, sort=False
)

test_iter = data.Iterator(
    test_dataset, batch_size=batch_size, device=device, train=False, sort=False
)

batch = next(iter(train_iter))
print(batch.text)
print(batch.label)

tensor([[ 2190,   163, 23390,  ...,  1371,   394, 36898],
        [ 1697,   175,     6,  ...,    35,     2,  1409],
        [ 3997,   285, 13875,  ...,    27,   457,   942],
        ...,
        [    1,     1,     1,  ...,     1,     1,     1],
        [    1,     1,     1,  ...,     1,     1,     1],
        [    1,     1,     1,  ...,     1,     1,     1]])
tensor([0, 1, 1, 0, 0, 2, 0, 0, 0, 2, 0, 1, 1, 0, 2, 0, 1, 0, 0, 0, 3, 0, 0, 1,
        0, 0, 1, 0, 0, 0, 0, 1])


In [6]:
class LSTM(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        self.linear = nn.Linear(hidden_dim, output_dim)

    def forward(self, text):
        output = self.embedding(text)
        output, (hn, cn) = self.lstm(output)
        output = self.linear(output[-1])
        return output

In [7]:
def train(model, iterator, optimizer, criterion):
    model.train()
    epoch_loss = 0

    for batch in iterator:
        outputs = model(batch.text)
        loss = criterion(outputs, batch.label)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    return epoch_loss / len(iterator)

In [8]:
def evaluate(model, iterator, criterion):
    model.eval()
    epoch_loss = 0

    with torch.no_grad():
        for batch in iterator:
            outputs = model(batch.text)
            loss = criterion(outputs, batch.label)

            epoch_loss += loss.item()

    return epoch_loss / len(iterator)

In [9]:
model = LSTM(len(TEXT.vocab), 32, 32, 4).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=2e-5)

num_epochs = 10
for epoch in range(num_epochs):
    train_loss = train(model, train_iter, optimizer, criterion)
    val_loss = evaluate(model, val_iter, criterion)
    print(f"Epoch {epoch + 1:>2}/{num_epochs}", end=" ")
    print(f"train_loss {train_loss:.4f}", end=" ")
    print(f"val_loss {val_loss:.4f}")

Epoch  1/10 train_loss 1.1825 val_loss 1.0054
Epoch  2/10 train_loss 0.9204 val_loss 0.9713
Epoch  3/10 train_loss 0.9117 val_loss 0.9658
Epoch  4/10 train_loss 0.9088 val_loss 0.9603
Epoch  5/10 train_loss 0.9061 val_loss 0.9572
Epoch  6/10 train_loss 0.9033 val_loss 0.9541
Epoch  7/10 train_loss 0.8994 val_loss 0.9529
Epoch  8/10 train_loss 0.8963 val_loss 0.9476
Epoch  9/10 train_loss 0.8950 val_loss 0.9480
Epoch 10/10 train_loss 0.8946 val_loss 0.9477


In [10]:
from sklearn.metrics import mean_absolute_error

model.eval()
joy_true = []
joy_pred = []

with torch.no_grad():
    for batch in test_iter:
        outputs = model(batch.text)
        _, preds = torch.max(outputs, 1)
        joy_true += batch.label.cpu()
        joy_pred += preds.cpu()

mean_absolute_error(joy_true, joy_pred)

0.4245