## 80. ID番号への変換
1. ~~単語をIDに変換する辞書を作成・保存(重いため,2回目以降はセル2を実行)~~
2. 辞書の読み込み
3. 単語列をID列に変換する関数

In [2]:
# pklファイルから読み込み
import pickle

with open('../data/ch09/name_to_id.pkl', 'rb') as tf:
    name_to_id = pickle.load(tf)

In [3]:
# 与えられた単語列に対し, ID番号の列を返す関数
# ch06ではCountVectorizerを利用したため, 今回もCountVectorizerを活用する
from sklearn.feature_extraction.text import CountVectorizer

def convert_words_to_ids(words):
    '''
    input :words(単語列)
    output:ids(ID番号列)
    '''
    # analyzer: 単語列に前処理を加え, listに変換する関数
    analyzer = CountVectorizer().build_analyzer()
    word_list = analyzer(words)
    
    ids = []
    for word in word_list:
        if word in name_to_id:
            ids.append(name_to_id[word])
        else:
            ids.append(0)  # 未知語の場合, IDを0とする
    
    return ids

## GPU prepare
1. 使用可能GPUの確認
2. GPUの指定
3. PyTorchで利用できるGPU数の確認

In [4]:
# 使用可能GPUの確認
!nvidia-smi

Wed Jun 15 21:44:41 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.57.02    Driver Version: 470.57.02    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA RTX A6000    On   | 00000000:01:00.0 Off |                  Off |
| 30%   28C    P8    16W / 300W |   2181MiB / 48685MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA RTX A6000    On   | 00000000:25:00.0 Off |                  Off |
| 30%   28C    P8    15W / 300W |      8MiB / 48685MiB |      0%      Default |
|       

In [5]:
# GPUの指定
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '1' #0番を使用するとき

In [6]:
# 確認
import torch
print(torch.cuda.device_count()) #Pytorchで使用できるGPU数を取得

1


## prepare
1. 語彙数の取得
2. 学習データの用意(ラベル)
3. 学習データの用意(特徴量)
4. 乱数の種を固定

In [7]:
# 語彙数の取得(ID:0の単語はまとめて1語とする), 未知語, paddingを考慮
vocab_size = max(name_to_id.values())+2

In [8]:
# 訓練・検証・評価データの用意
import torch
import torch.nn as nn
import numpy as np
import torch.nn.functional as F

# ラベル: ch08の出力を利用
Y_train = np.loadtxt('../data/ch08/Y_train.txt')
Y_valid = np.loadtxt('../data/ch08/Y_valid.txt')
Y_test = np.loadtxt('../data/ch08/Y_test.txt')

# pytorch用に変換
Y_train_long = torch.tensor(Y_train, dtype=torch.int64)
Y_valid_long = torch.tensor(Y_valid, dtype=torch.int64)
Y_test_long = torch.tensor(Y_test, dtype=torch.int64)

In [9]:
# 特徴量: convert_words_to_ids(80)を利用
def convert_text_to_features(fname):
    '''
    input :fname
    output:features(tensor)
    '''
    with open(fname, encoding='utf-8') as f:
        lines = f.readlines()
    
    # id列(list)のリストに変換
    ids_list = [convert_words_to_ids(line) for line in lines]
    
    # id列(tensor)のリストに変換
    ids_tensor = [torch.tensor(ids, dtype=torch.int64) for ids in ids_list]
    
    # 最大のid+1(vocab_size-1)でパディング
    features = torch.nn.utils.rnn.pad_sequence(ids_tensor, batch_first=True, padding_value=vocab_size-1)
    
    return features

# 特徴量抽出
X_train_long = convert_text_to_features('../data/ch06/train.txt')
X_valid_long = convert_text_to_features('../data/ch06/valid.txt')
X_test_long = convert_text_to_features('../data/ch06/test.txt')

In [10]:
# 乱数シードの固定
import random

def fix_seed(seed):
    # random
    random.seed(seed)
    # Numpy
    np.random.seed(seed)
    # Pytorch
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True

## 86. 畳み込みニューラルネットワーク (CNN)
1. CNNの定義
2. ~~CNNの予測~~

In [11]:
# CNNの定義
class CNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, output_size, hidden_size, kernel_size, batch_size):
        # 層の定義
        super().__init__()
        self.batch_size = batch_size
        
        self.emb = nn.Embedding(vocab_size, embedding_dim, padding_idx=vocab_size-1)
        self.conv = nn.Conv2d(1, hidden_size, kernel_size=(kernel_size, embedding_dim), padding=(1,0))
        self.relu = nn.ReLU()
        self.fc = nn.Linear(hidden_size, output_size)

        # GPUに移す
        self.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
        self = self.to(self.device)
    
    def forward(self, x):
        # embedding
        emb_out = self.emb(x)          # batch_size, sentence_length, emb_dim

        # conv
        conv_in = emb_out.unsqueeze(1) # batch_size, input_channels=1, sentence_length, emb_dim
        conv_out = self.conv(conv_in)  # batch_size, output_channels, sentence_length+2*pad-kernel+1, 1

        # relu
        relu_in = conv_out.squeeze(3)  # batch_size, output_channels, sentence_length+2*pad-kernel+1, 1
        relu_out = self.relu(relu_in)  # batch_size, output_channels, sentence_length+2*pad-kernel+1

        # pool
        pool_out = F.max_pool2d(relu_out, kernel_size=(1, relu_out.size(2))) # batch_size, output_channels, 1

        # fc
        fc_in = pool_out.squeeze(2)    # batch_size, output_channels
        fc_out = self.fc(fc_in)        # batch_size, output_size
        
        return fc_out

## 88. パラメータチューニング
1. train, validの定義
2. objectiveの定義
3. optunaによるパラメータチューニング
4. 結果表示

In [12]:
def train(model, device, train_loader, optimizer):
    model.train()
    for X, Y in train_loader:
        X, Y = X.to(device), Y.to(device)
        # forward
        optimizer.zero_grad()
        predict_y = model(X)
        loss = F.cross_entropy(predict_y, Y)
        # backward
        loss.backward()
        # 更新
        optimizer.step()

def valid(model, device, valid_loader):
    model.eval()
    correct = 0
    with torch.no_grad():
        for X, Y in valid_loader:
            X, Y = X.to(device), Y.to(device)
            predict_y = model(X)
            predict_label = torch.max(predict_y, 1)[1]
            correct += predict_label.eq(Y.view_as(predict_label)).sum().item()
    return 1 - correct/len(valid_loader.dataset)

In [26]:
import torch.optim as optim

# batch_sizeは32とする
batch_size = 32

# dataloaderの定義
train_dataset2 = torch.utils.data.TensorDataset(X_train_long, Y_train_long)
train_dataloader2 = torch.utils.data.DataLoader(train_dataset2, batch_size=batch_size, drop_last=True)

valid_dataset2 = torch.utils.data.TensorDataset(X_valid_long, Y_valid_long)
valid_dataloader2 = torch.utils.data.DataLoader(valid_dataset2, batch_size=batch_size, drop_last=True)

def objective(trial):
    device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

    # hidden_size
    hidden_size = int(trial.suggest_discrete_uniform('hidden_size', 50, 100, 10))

    # lr
    lr = trial.suggest_loguniform('lr', 1e-5, 1e-1)

    # kernel_size
    kernel_size = trial.suggest_int('kernel_size', 3, 9)

    # モデルの作成
    model = CNN(vocab_size, 300, 4, hidden_size, kernel_size, batch_size)
    optimizer = optim.Adam(model.parameters(), lr)

    for epoch in range(10):
        train(model, device, train_dataloader2, optimizer)
        error_rate = valid(model, device, valid_dataloader2)
    
    return error_rate


In [27]:
import optuna

study = optuna.create_study()
study.optimize(objective, n_trials=10)

[32m[I 2022-06-16 01:39:37,613][0m A new study created in memory with name: no-name-16b3320d-55c1-4c60-af44-53a617abfb35[0m
[32m[I 2022-06-16 02:11:53,901][0m Trial 0 finished with value: 0.3967065868263473 and parameters: {'hidden_size': 90.0, 'lr': 0.0641889460746234, 'kernel_size': 8}. Best is trial 0 with value: 0.3967065868263473.[0m
[32m[I 2022-06-16 02:18:39,673][0m Trial 1 finished with value: 0.31586826347305386 and parameters: {'hidden_size': 50.0, 'lr': 1.8645219781154473e-05, 'kernel_size': 3}. Best is trial 1 with value: 0.31586826347305386.[0m
[32m[I 2022-06-16 02:40:12,903][0m Trial 2 finished with value: 0.18862275449101795 and parameters: {'hidden_size': 60.0, 'lr': 9.909975592654636e-05, 'kernel_size': 8}. Best is trial 2 with value: 0.18862275449101795.[0m
[32m[I 2022-06-16 02:54:35,706][0m Trial 3 finished with value: 0.16017964071856283 and parameters: {'hidden_size': 80.0, 'lr': 0.00034206115368394415, 'kernel_size': 4}. Best is trial 3 with value: 0

In [28]:
print(f'best params: {study.best_params}')
print(f'best value: {study.best_value}')

best params: {'hidden_size': 50.0, 'lr': 0.00855755300540021, 'kernel_size': 8}
best value: 0.1302395209580839
