In [1]:
import polars as pl
import re
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
import numpy as np

# Extract only the headline text from the training data as a list.
# NOTE(review): pl.read_csv treats the first line as a header by default --
# confirm train.txt really starts with a header row, otherwise the first
# sample is consumed as column names.
FILE_PATH = "../第6章/news+aggregator/train.txt"
# The second column name is misspelled ("categoory"); it is kept as-is because
# other cells may look the column up by this exact name.
df = pl.read_csv(FILE_PATH, separator="\t", new_columns=["text", "categoory"])
text_list = df["text"].to_list()

# Drop every character that is not an ASCII letter or whitespace, then split
# each headline on whitespace into a list of tokens.
cleaned_text_list = [re.sub(r'[^a-zA-Z\s]', "", text) for text in text_list]
word_list = [text.split() for text in cleaned_text_list]

# Count how often each token occurs across all headlines.
word_frequency_dict = {}
for words in word_list:
    for word in words:
        word_frequency_dict[word] = word_frequency_dict.get(word, 0) + 1

# Sort by frequency, highest first. sorted() is stable, so tokens with equal
# counts keep their first-seen order.
sorted_word_frequency_dict = dict(sorted(word_frequency_dict.items(), key=lambda item: item[1], reverse=True))

# Map every token to an integer id: tokens seen exactly once all share id 0,
# every other token gets the next free id in descending-frequency order.
word_id_map = {}
next_id = 1  # deliberately not named `id`, which would shadow the builtin in the kernel
for key, value in sorted_word_frequency_dict.items():
    if value == 1:
        word_id_map[key] = 0
    else:
        word_id_map[key] = next_id
        next_id += 1
# Ids run from 0 up to the largest assigned id, so the vocabulary size
# (the one-hot dimension) is that largest id plus one.
# (The original author noted ids 0-9509, i.e. 9510 entries, for this data.)
onehot_dim = max(word_id_map.values()) + 1

def get_index_vector(words, word_id_map):
    """Convert a list of tokens into a 1-D tensor of word ids.

    Args:
        words: Sequence of token strings.
        word_id_map: Mapping from token string to integer id.

    Returns:
        A ``torch.long`` tensor of shape ``(len(words),)`` holding the id of
        each token in order. Tokens that are missing from ``word_id_map``
        are mapped to id 0, the same id this notebook gives to words that
        occur only once, instead of raising ``KeyError``.
    """
    # Build the integer tensor directly; going through a float tensor and
    # casting afterwards would be lossy for very large ids.
    ids = [word_id_map.get(word, 0) for word in words]
    return torch.tensor(ids, dtype=torch.long)

In [2]:
from torch.nn.utils.rnn import pad_sequence

class TextDataset(Dataset):
    """Map-style dataset that pairs each encoded text with its label.

    ``texts`` and ``labels`` are stored as given and indexed in parallel;
    item ``i`` is the tuple ``(texts[i], labels[i])``.
    """

    def __init__(self, texts, labels):
        self.texts, self.labels = texts, labels

    def __len__(self):
        # The sample count is taken from the text container only.
        return len(self.texts)

    def __getitem__(self, idx):
        return self.texts[idx], self.labels[idx]


# Encode every tokenised headline as a tensor of word ids and right-pad the
# sequences with 0 so they share one length (batch dimension first).
# Note that 0 is also the id given to words that occur only once.
X_train = pad_sequence(
    [get_index_vector(words, word_id_map) for words in word_list],
    batch_first=True,
    padding_value=0,
)

# Load the labels (stored as one-hot vectors) and wrap them in a tensor.
Y_train = torch.from_numpy(np.load("../第8章/matrix/y_train.npy"))

# Bundle inputs and labels, then serve them in shuffled mini-batches of 64.
datasets = TextDataset(X_train, Y_train)
train_dataloader = DataLoader(datasets, shuffle=True, batch_size=64)

In [3]:
from torch.nn.utils.rnn import pack_padded_sequence
from gensim.models.keyedvectors import KeyedVectors

#　最終層にsoftmaxは不要(クロスエントロピーの内部でsoftmaxをかけてくれるので)
class LSTMModel(nn.Module):
    """Embedding -> 2-layer LSTM -> linear classifier on the last hidden state.

    The model returns raw logits. No softmax is applied because the notebook
    trains with ``nn.CrossEntropyLoss``, which handles that internally.

    Args:
        onehot_dim: Vocabulary size (number of rows in the embedding table).
        embedding_dim: Size of each word embedding.
        hidden_dim: Size of the LSTM hidden state.
        output_dim: Number of output classes.
    """

    def __init__(self, onehot_dim=onehot_dim, embedding_dim=300, hidden_dim=50, output_dim=4):
        super().__init__()
        self.emb = nn.Embedding(num_embeddings=onehot_dim, embedding_dim=embedding_dim)
        self.LSTM = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=2,
            batch_first=True,
        )
        self.linear = nn.Linear(in_features=hidden_dim, out_features=output_dim, bias=True)

    def forward(self, x, h=None):
        """Return class logits for a batch of word-id sequences.

        Args:
            x: Integer tensor of word ids; in this notebook it is a padded
                batch laid out batch-first.
            h: Optional initial state forwarded unchanged to ``nn.LSTM``
                (``None`` lets the LSTM start from zeros).

        Returns:
            The linear layer applied to the final hidden state of the top
            LSTM layer.
        """
        embedded = self.emb(x)
        # Only the final hidden states are needed; the per-step outputs and
        # the cell states are discarded.
        _, (hidden_state, _) = self.LSTM(embedded, h)
        return self.linear(hidden_state[-1])

In [4]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Build the classifier with its default sizes and move it to the device.
model = LSTMModel()
model = model.to(device)

# Training hyperparameters.
# batch_size is recorded here, but the DataLoader built earlier hard-codes
# its own batch size and does not read this variable.
learning_rate, batch_size, epochs = 1e-2, 64, 500

# Cross-entropy over raw logits, optimised with plain SGD.
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

In [5]:
# 82,83を同時にやった(GPU上でバッチ処理)


from tqdm import tqdm

# Number of training samples, used as the accuracy denominator. It does not
# change between epochs, so it is computed once up front.
size = len(train_dataloader.dataset)

for t in tqdm(range(epochs)):
    correct = 0  # correctly classified samples seen in this epoch
    for X, y in train_dataloader:
        # Prediction and loss.
        X = X.to(device)
        # Labels are stored one-hot; the loss function expects class indices.
        y = y.argmax(dim=1).to(device)
        # Call the module itself rather than .forward() so that any
        # registered hooks run as PyTorch intends.
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        correct += (pred.argmax(dim=1) == y).sum().item()

    if (t + 1) % 100 == 0:
        # The reported loss is that of the last mini-batch of the epoch,
        # not an average over the epoch. `loss` itself stays a tensor.
        print(f"epoch:{t+1}, loss: {loss.item():>7f}, accuracy: {correct/size}")

 20%|██        | 100/500 [00:46<02:31,  2.64it/s]

epoch:100, loss: 1.287115, accuracy: 0.42708528584817246


 40%|████      | 200/500 [01:41<03:37,  1.38it/s]

epoch:200, loss: 1.183626, accuracy: 0.5096532333645736


 60%|██████    | 300/500 [02:50<02:22,  1.40it/s]

epoch:300, loss: 0.731721, accuracy: 0.760543580131209


 80%|████████  | 400/500 [03:59<01:07,  1.49it/s]

epoch:400, loss: 0.171943, accuracy: 0.9337394564198688


100%|██████████| 500/500 [05:08<00:00,  1.62it/s]

epoch:500, loss: 0.035202, accuracy: 0.9650421743205249



