In [5]:
import polars as pl
import re
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
import numpy as np

# 学習データからテキストのみをリストで抽出
FILE_PATH = "../ch6/news+aggregator/train.txt"
df = pl.read_csv(FILE_PATH, separator="\t", new_columns=["text", "categoory"])
text_list = df["text"].to_list()

#　記号を取り除く
cleaned_text_list = [re.sub(r'[^a-zA-Z\s]', "", text) for text in text_list]
word_list = [text.split() for text in cleaned_text_list]

word_frequency_dict = {}
# 単語リストをループして出現頻度をカウント
for words in word_list:
    for word in words:
        if word in word_frequency_dict:
            word_frequency_dict[word] += 1
        else:
            word_frequency_dict[word] = 1

sorted_word_frequency_dict = dict(sorted(word_frequency_dict.items(), key=lambda item: item[1], reverse=True)) #降順にソート


word_id_map = {}    #単語とidの辞書型
id = 1
for key, value in sorted_word_frequency_dict.items():
    if value == 1: #出現頻度が1のidは0
        word_id_map[key] = 0
    else:
        word_id_map[key] = id
        id += 1
# wordのidは0~9509の計9510個のidが存在する
vocab_size = max(word_id_map.values()) + 1

def get_index_vector(words, word_id_map):
    # まず、必要な形状のテンソルを初期化
    vectors = torch.zeros(len(words))
    # 各単語の位置に1をセット
    for i, word in enumerate(words):
        if word in word_id_map:    
            vectors[i] = word_id_map[word]
    return vectors.long()

from torch.nn.utils.rnn import pad_sequence

class TextDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        return text, label


X_train = [get_index_vector(words, word_id_map) for words in word_list] #文章をidに変換したリスト
X_train = pad_sequence(X_train, batch_first=True, padding_value=0) #paddingをして系列長を揃える
Y_train = np.load("../ch8/matrix/y_train.npy")    #ラベルのロード(onehot vector)
Y_train =torch.from_numpy(Y_train)  #tensorに変換
datasets = TextDataset(X_train, Y_train)    #データセットとする
train_dataloader = DataLoader(datasets, shuffle=True, batch_size=64)    #データローダーの定義

In [6]:
from torch.nn.utils.rnn import pack_padded_sequence
from gensim.models.keyedvectors import KeyedVectors

# softmaxを無くしたら高性能になった
class CNNModel(nn.Module):
    def __init__(self, vocab_size=vocab_size, embedding_dim=300, hidden_dim=50, output_dim=4, kernel_size=3):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.content_conv = nn.Sequential(
            nn.Conv1d(in_channels=embedding_dim,
                      out_channels=hidden_dim,
                      kernel_size=kernel_size),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=(18- kernel_size + 1))
        )
        self.linear = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        x = self.emb(x)
        content_out = self.content_conv(x.permute(0, 2, 1))
        reshaped = content_out.view(content_out.size(0), -1)
        x = self.linear(reshaped)
        return x

In [7]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
loss_fn = nn.CrossEntropyLoss()

In [8]:
def train_epoch(model, optimizer, dataloader, device):
    model.train
    size = len(train_dataloader.dataset)
    min_loss = 100
    correct = 0
    for batch, (X, y) in enumerate(train_dataloader):
        # 予測と損失の計算
        X = X.to(device)
        y = y.to(device)
        y = y.argmax(dim=1) #loss_fnのyにはクラスインデックスが期待されている
        pred = model(X)
        loss = loss_fn(pred, y)
        # バックプロパゲーション
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if min_loss>loss:
            min_loss=loss
        correct += (pred.argmax(dim=1) == y).sum().item()

    min_loss = min_loss.item()
    return min_loss


In [14]:
import optuna

def trial_optimizer(trial, model):
    optimizer_names = ['Adam', 'AdamW', 'RAdam']
    optimizer_name = trial.suggest_categorical('optimizer', optimizer_names)

    if optimizer_name == optimizer_names[0]:
        adam_lr = trial.suggest_loguniform('Adam_lr', 1e-5, 1e-1)
        optimizer = torch.optim.Adam(model.parameters(), lr=adam_lr)
    elif optimizer_name == optimizer_names[1]:
        adam_lr = trial.suggest_loguniform('AdamW_lr', 1e-5, 1e-1)
        optimizer = torch.optim.Adam(model.parameters(), lr=adam_lr)
    else:
        adam_lr = trial.suggest_loguniform('RAdam_lr', 1e-5, 1e-1)
        optimizer = torch.optim.Adam(model.parameters(), lr=adam_lr)
    return optimizer

EPOCH = 10
def objective(trial):
    device = "cuda" if torch.cuda.is_available() else "cpu"
    # 最適化対象のコード
    lr = trial.suggest_loguniform('lr', 1e-5, 1e-1)      # 1e-5~1e-1
    batch_size = trial.suggest_categorical('batch_size', [16, 32, 64]) # 3 or 5 or 7
    model = CNNModel()
    optimizer = trial_optimizer(trial, model)
    train_dataloader =  DataLoader(datasets, shuffle=True, batch_size=batch_size)
    min_losses = []
    for i in range(EPOCH):
        loss = train_epoch(model, optimizer, train_dataloader, device)
        min_losses.append(loss)
    min_loss = min(min_losses)
    return min_loss

# 最適化（score：最小化, トライアル数：100）
study = optuna.create_study()
study.optimize(objective, n_trials=5)
study.best_params

[I 2024-06-28 17:39:24,147] A new study created in memory with name: no-name-5016dfc4-a369-4b21-935c-cdefeecf993e
  lr = trial.suggest_loguniform('lr', 1e-5, 1e-1)      # 1e-5~1e-1
  adam_lr = trial.suggest_loguniform('Adam_lr', 1e-5, 1e-1)
[I 2024-06-28 17:39:54,333] Trial 0 finished with value: 0.43838366866111755 and parameters: {'lr': 7.76669424153887e-05, 'batch_size': 32, 'optimizer': 'Adam', 'Adam_lr': 0.02807059479780017}. Best is trial 0 with value: 0.43838366866111755.
  adam_lr = trial.suggest_loguniform('RAdam_lr', 1e-5, 1e-1)
[I 2024-06-28 17:40:25,832] Trial 1 finished with value: 0.8061231374740601 and parameters: {'lr': 0.00044008759513932983, 'batch_size': 16, 'optimizer': 'RAdam', 'RAdam_lr': 0.045013088226226225}. Best is trial 0 with value: 0.43838366866111755.
  adam_lr = trial.suggest_loguniform('AdamW_lr', 1e-5, 1e-1)
[I 2024-06-28 17:41:01,478] Trial 2 finished with value: 0.9298996329307556 and parameters: {'lr': 0.04181872812866105, 'batch_size': 64, 'optimize

{'lr': 7.76669424153887e-05,
 'batch_size': 32,
 'optimizer': 'Adam',
 'Adam_lr': 0.02807059479780017}