In [1]:
import polars as pl
import re
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
import numpy as np

# 学習データからテキストのみをリストで抽出
FILE_PATH = "../ch6/news+aggregator/train.txt"
df = pl.read_csv(FILE_PATH, separator="\t", new_columns=["text", "categoory"])
text_list = df["text"].to_list()

#　記号を取り除く
cleaned_text_list = [re.sub(r'[^a-zA-Z\s]', "", text) for text in text_list]
word_list = [text.split() for text in cleaned_text_list]

word_frequency_dict = {}
# 単語リストをループして出現頻度をカウント
for words in word_list:
    for word in words:
        if word in word_frequency_dict:
            word_frequency_dict[word] += 1
        else:
            word_frequency_dict[word] = 1

sorted_word_frequency_dict = dict(sorted(word_frequency_dict.items(), key=lambda item: item[1], reverse=True)) #降順にソート


word_id_map = {}    #単語とidの辞書型
id = 1
for key, value in sorted_word_frequency_dict.items():
    if value == 1: #出現頻度が1のidは0
        word_id_map[key] = 0
    else:
        word_id_map[key] = id
        id += 1
# wordのidは0~9509の計9510個のidが存在する
vocab_size = max(word_id_map.values()) + 1

def get_index_vector(words, word_id_map):
    # まず、必要な形状のテンソルを初期化
    vectors = torch.zeros(len(words))
    # 各単語の位置に1をセット
    for i, word in enumerate(words):
        if word in word_id_map:    
            vectors[i] = word_id_map[word]
    return vectors.long()

In [2]:
from torch.nn.utils.rnn import pad_sequence

class TextDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        return text, label


X_train = [get_index_vector(words, word_id_map) for words in word_list] #文章をidに変換したリスト
X_train = pad_sequence(X_train, batch_first=True, padding_value=0) #paddingをして系列長を揃える
Y_train = np.load("../ch8/matrix/y_train.npy")    #ラベルのロード(onehot vector)
Y_train =torch.from_numpy(Y_train)  #tensorに変換
datasets = TextDataset(X_train, Y_train)    #データセットとする
train_dataloader = DataLoader(datasets, shuffle=True, batch_size=64)    #データローダーの定義

In [3]:
from torch.nn.utils.rnn import pack_padded_sequence
from gensim.models.keyedvectors import KeyedVectors

#　最終層にsoftmaxは不要(クロスエントロピーの内部でsoftmaxをかけてくれるので)
class BiLSTMModel(nn.Module):
    def __init__(self, vocab_size=vocab_size, embedding_dim=300, hidden_dim=50, output_dim=4, num_layers = 2):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=vocab_size-1)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        self.softmax = nn.Softmax(dim=1)

        # Xavierの方法を用いて重みを初期化します
        for name, param in self.lstm.named_parameters():
            if 'weight' in name:
                nn.init.xavier_normal_(param)
        nn.init.xavier_normal_(self.fc.weight)

    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, (h_n, c_n) = self.lstm(embedded)
        # 順方向と逆方向の最後の隠れ状態を結合します
        hidden = torch.cat((h_n[-2,:,:], h_n[-1,:,:]), dim=1)
        output = self.fc(hidden)
        output = self.softmax(output)
        return output

In [4]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = BiLSTMModel().to(device)
learning_rate = 1e-2
epochs = 500
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

In [5]:
from tqdm import tqdm

for t in tqdm(range(epochs)):
    size = len(train_dataloader.dataset)
    correct = 0
    for batch, (X, y) in enumerate(train_dataloader):
        # 予測と損失の計算
        X = X.to(device)
        y = y.to(device)
        y = y.argmax(dim=1) #loss_fnのyにはクラスインデックスが期待されている
        pred = model.forward(X)
        loss = loss_fn(pred, y)

        # バックプロパゲーション
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        correct += (pred.argmax(dim=1) == y).sum().item()

    loss = loss.item()
    if (t+1)%100 == 0:
        print(f"epoch:{t+1}, loss: {loss:>7f}, accuracy: {correct/size}")

 20%|██        | 100/500 [01:17<04:49,  1.38it/s]

epoch:100, loss: 1.270332, accuracy: 0.452389878163074


 40%|████      | 200/500 [02:35<04:11,  1.19it/s]

epoch:200, loss: 1.112777, accuracy: 0.6018744142455482


 60%|██████    | 300/500 [03:53<02:47,  1.19it/s]

epoch:300, loss: 1.046397, accuracy: 0.7416119962511715


 80%|████████  | 400/500 [05:10<01:16,  1.30it/s]

epoch:400, loss: 0.932808, accuracy: 0.7439550140581068


100%|██████████| 500/500 [06:28<00:00,  1.29it/s]

epoch:500, loss: 0.996527, accuracy: 0.7452671040299906





In [6]:
def create_word_list(FILE_PATH):
    df = pl.read_csv(FILE_PATH, separator="\t", new_columns=["text", "categoory"])
    text_list = df["text"].to_list()

    #　記号を取り除く
    cleaned_text_list = [re.sub(r'[^a-zA-Z\s]', "", text) for text in text_list]
    word_list = [text.split() for text in cleaned_text_list]
    return word_list

TEST_FILE_PATH = "../ch6/news+aggregator/test.txt"
word_list = create_word_list(TEST_FILE_PATH)
X_test = [get_index_vector(words, word_id_map) for words in word_list] #文章をidに変換したリスト
X_test = pad_sequence(X_test, batch_first=True, padding_value=0) #paddingをして系列長を揃える
Y_test = np.load("../ch8/matrix/y_test.npy")    #ラベルのロード(onehot vector)
Y_test =torch.from_numpy(Y_test)  #tensorに変換
datasets = TextDataset(X_test, Y_test)    #データセットとする
test_dataloader = DataLoader(datasets, shuffle=True, batch_size=64)    #データローダーの定義

In [7]:
size = len(train_dataloader.dataset)
correct = 0
for batch, (X, y) in enumerate(train_dataloader):
    # 予測と損失の計算
    X = X.to(device)
    y = y.argmax(dim=1) #loss_fnのyにはクラスインデックスが期待されている
    y = y.to(device)
    pred = model.forward(X)
    correct += (pred.argmax(dim=1) == y).sum().item()
print(f"testdata_accuracy: {correct/size}")

testdata_accuracy: 0.7452671040299906
