# 97. ハイパー・パラメータの調整
ニューラルネットワークのモデルや，そのハイパーパラメータを変更しつつ，開発データにおけるBLEUスコアが最大となるモデルとハイパーパラメータを求めよ．

## GPUの準備
1. 使用可能GPUの確認
2. GPUの指定
3. PyTorchで利用できるGPU数の確認

In [1]:
# Check which GPUs are available (shell escape: runs nvidia-smi and shows its report)
!nvidia-smi

Sat Aug  6 07:11:34 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.57.02    Driver Version: 470.57.02    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA RTX A6000    On   | 00000000:01:00.0 Off |                  Off |
| 30%   31C    P8    24W / 300W |      8MiB / 48685MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA RTX A6000    On   | 00000000:25:00.0 Off |                  Off |
| 37%   66C    P2   262W / 300W |  28586MiB / 48685MiB |     98%      Default |
|       

In [2]:
# Select the GPU to use
import os

# Use device number 0; this runs before torch is imported in the following cell.
os.environ.update({'CUDA_VISIBLE_DEVICES': '0'})

In [3]:
# Sanity check
import torch

# Number of GPUs that PyTorch can use
available_gpus = torch.cuda.device_count()
print(available_gpus)

1


## パラメータ探索
0. layers 6 ffn-embed-dim 2048 dropout 0.2 (base 91_trained) loss 4.812
1. layers 6 ffn-embed-dim 1024 normalize-before dropout 0.2  loss 3.721
2. layers 7 ffn-embed-dim 1024 normalize-before dropout 0.2  loss 3.700
3. layers 8 ffn-embed-dim 1024 normalize-before dropout 0.2  loss 3.694
4. layers 5 ffn-embed-dim 1024 normalize-before dropout 0.2  loss 3.785
5. layers 8 ffn-embed-dim 1024 normalize-before dropout 0.3  loss 3.946
6. layers 8 ffn-embed-dim 1024 normalize-before dropout 0.1  loss 3.343
7. layers 8 ffn-embed-dim 2048 normalize-before dropout 0.1  loss 3.235

#### trained_1: layers 6, ffn-embed-dim 1024, normalize-before
best: epoch20  loss 3.721 | nll_loss 1.862 | ppl 3.63 | wps 62663.4 | ups 9.2 | wpb 6809.5 | bsz 243.4 | num_updates 36176 | lr 0.000235128 | gnorm 0.387 | clip 0.3 | loss_scale 32 | train_wall 190 | gb_free 39.9 | wall 3912  3911.8 seconds

In [None]:
# Run 1: train a transformer with 6 encoder/decoder layers and dropout 0.2.
# Change ffn-embed-dim to 1024
# Add normalize-before (both encoder and decoder)
# Checkpoints are written to ../data/ch10/97_trained_1; training stops after 20 epochs.
!fairseq-train ../data/ch10/91_preprocessed \
    --save-dir ../data/ch10/97_trained_1 \
    --arch transformer --share-decoder-input-output-embed \
    --encoder-layers 6 --decoder-layers 6 \
    --encoder-embed-dim 512 --decoder-embed-dim 512 \
    --encoder-ffn-embed-dim 1024 --decoder-ffn-embed-dim 1024 \
    --encoder-attention-heads 8 --decoder-attention-heads 8 \
    --encoder-normalize-before --decoder-normalize-before \
    --lr-scheduler inverse_sqrt --warmup-updates 2000 --warmup-init-lr 1e-7 \
    --lr 1e-3 \
    --dropout 0.2 \
    --weight-decay 0.0001 \
    --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \
    --optimizer adam --clip-norm 1.0 \
    --max-tokens 8000 \
    --max-epoch 20 \
    --fp16

#### trained_2: layers 7, ffn-embed-dim 1024, normalize-before
best: epoch20  loss 3.7 | nll_loss 1.838 | ppl 3.58 | wps 57178.1 | ups 8.4 | wpb 6809.5 | bsz 243.4 | num_updates 36175 | lr 0.000235131 | gnorm 0.381 | clip 0.2 | loss_scale 8 | train_wall 208 | gb_free 39.6 | wall 4312  4312.3 seconds

In [None]:
# Run 2: same flags as the 6-layer run (97_trained_1) except for the layer count.
# Change the number of encoder/decoder layers to 7
# Change ffn-embed-dim to 1024
# Add normalize-before (both encoder and decoder)
# Checkpoints are written to ../data/ch10/97_trained_2; training stops after 20 epochs.
!fairseq-train ../data/ch10/91_preprocessed \
    --save-dir ../data/ch10/97_trained_2 \
    --arch transformer --share-decoder-input-output-embed \
    --encoder-layers 7 --decoder-layers 7 \
    --encoder-embed-dim 512 --decoder-embed-dim 512 \
    --encoder-ffn-embed-dim 1024 --decoder-ffn-embed-dim 1024 \
    --encoder-attention-heads 8 --decoder-attention-heads 8 \
    --encoder-normalize-before --decoder-normalize-before \
    --lr-scheduler inverse_sqrt --warmup-updates 2000 --warmup-init-lr 1e-7 \
    --lr 1e-3 \
    --dropout 0.2 \
    --weight-decay 0.0001 \
    --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \
    --optimizer adam --clip-norm 1.0 \
    --max-tokens 8000 \
    --max-epoch 20 \
    --fp16

#### trained_3: layers 8, ffn-embed-dim 1024, normalize-before
best: epoch20  loss 3.694 | nll_loss 1.832 | ppl 3.56 | wps 53274.9 | ups 7.82 | wpb 6809.5 | bsz 243.4 | num_updates 36176 | lr 0.000235128 | gnorm 0.376 | clip 0.2 | loss_scale 32 | train_wall 223 | gb_free 39.2 | wall 4686  4685.5 seconds

In [None]:
# Run 3: same flags as the 6-layer run (97_trained_1) except for the layer count.
# Change the number of encoder/decoder layers to 8
# Change ffn-embed-dim to 1024
# Add normalize-before (both encoder and decoder)
# Checkpoints are written to ../data/ch10/97_trained_3; training stops after 20 epochs.
!fairseq-train ../data/ch10/91_preprocessed \
    --save-dir ../data/ch10/97_trained_3 \
    --arch transformer --share-decoder-input-output-embed \
    --encoder-layers 8 --decoder-layers 8 \
    --encoder-embed-dim 512 --decoder-embed-dim 512 \
    --encoder-ffn-embed-dim 1024 --decoder-ffn-embed-dim 1024 \
    --encoder-attention-heads 8 --decoder-attention-heads 8 \
    --encoder-normalize-before --decoder-normalize-before \
    --lr-scheduler inverse_sqrt --warmup-updates 2000 --warmup-init-lr 1e-7 \
    --lr 1e-3 \
    --dropout 0.2 \
    --weight-decay 0.0001 \
    --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \
    --optimizer adam --clip-norm 1.0 \
    --max-tokens 8000 \
    --max-epoch 20 \
    --fp16

#### trained_4: layers 5, ffn-embed-dim 1024, normalize-before
best: epoch20  loss 3.785 | nll_loss 1.934 | ppl 3.82 | wps 68689.2 | ups 10.09 | wpb 6809.5 | bsz 243.4 | num_updates 36176 | lr 0.000235128 | gnorm 0.396 | clip 0.3 | loss_scale 32 | train_wall 173 | gb_free 40.3 | wall 3616  3615.7 seconds

In [None]:
# Run 4: same flags as the 6-layer run (97_trained_1) except for the layer count.
# Change the number of encoder/decoder layers to 5
# Change ffn-embed-dim to 1024
# Add normalize-before (both encoder and decoder)
# Checkpoints are written to ../data/ch10/97_trained_4; training stops after 20 epochs.
!fairseq-train ../data/ch10/91_preprocessed \
    --save-dir ../data/ch10/97_trained_4 \
    --arch transformer --share-decoder-input-output-embed \
    --encoder-layers 5 --decoder-layers 5 \
    --encoder-embed-dim 512 --decoder-embed-dim 512 \
    --encoder-ffn-embed-dim 1024 --decoder-ffn-embed-dim 1024 \
    --encoder-attention-heads 8 --decoder-attention-heads 8 \
    --encoder-normalize-before --decoder-normalize-before \
    --lr-scheduler inverse_sqrt --warmup-updates 2000 --warmup-init-lr 1e-7 \
    --lr 1e-3 \
    --dropout 0.2 \
    --weight-decay 0.0001 \
    --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \
    --optimizer adam --clip-norm 1.0 \
    --max-tokens 8000 \
    --max-epoch 20 \
    --fp16

#### trained_5: layers 8, ffn-embed-dim 1024, normalize-before, dropout 0.3
best: epoch20  loss 3.946 | nll_loss 2.121 | ppl 4.35 | wps 45994 | ups 6.75 | wpb 6809.5 | bsz 243.4 | num_updates 36176 | lr 0.000235128 | gnorm 0.36 | clip 0.3 | loss_scale 32 | train_wall 259 | gb_free 39.2 | wall 4937  4936.4 seconds

In [None]:
# Run 5: same flags as the 8-layer run (97_trained_3) except for dropout.
# Change the dropout rate to 0.3
# Change the number of encoder/decoder layers to 8
# Change ffn-embed-dim to 1024
# Add normalize-before (both encoder and decoder)
# Checkpoints are written to ../data/ch10/97_trained_5; training stops after 20 epochs.
!fairseq-train ../data/ch10/91_preprocessed \
    --save-dir ../data/ch10/97_trained_5 \
    --arch transformer --share-decoder-input-output-embed \
    --encoder-layers 8 --decoder-layers 8 \
    --encoder-embed-dim 512 --decoder-embed-dim 512 \
    --encoder-ffn-embed-dim 1024 --decoder-ffn-embed-dim 1024 \
    --encoder-attention-heads 8 --decoder-attention-heads 8 \
    --encoder-normalize-before --decoder-normalize-before \
    --lr-scheduler inverse_sqrt --warmup-updates 2000 --warmup-init-lr 1e-7 \
    --lr 1e-3 \
    --dropout 0.3 \
    --weight-decay 0.0001 \
    --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \
    --optimizer adam --clip-norm 1.0 \
    --max-tokens 8000 \
    --max-epoch 20 \
    --fp16

#### trained_6: layers 8, ffn-embed-dim 1024, normalize-before, dropout 0.1
best: epoch20  loss 3.343 | nll_loss 1.424 | ppl 2.68 | wps 53495.8 | ups 7.86 | wpb 6809.5 | bsz 243.4 | num_updates 36175 | lr 0.000235131 | gnorm 0.421 | clip 0.1 | loss_scale 16 | train_wall 222 | gb_free 39.2 | wall 4607  4606.7 seconds

In [None]:
# Run 6: same flags as the 8-layer run (97_trained_3) except for dropout.
# Change the dropout rate to 0.1
# Change the number of encoder/decoder layers to 8
# Change ffn-embed-dim to 1024
# Add normalize-before (both encoder and decoder)
# Checkpoints are written to ../data/ch10/97_trained_6; training stops after 20 epochs.
!fairseq-train ../data/ch10/91_preprocessed \
    --save-dir ../data/ch10/97_trained_6 \
    --arch transformer --share-decoder-input-output-embed \
    --encoder-layers 8 --decoder-layers 8 \
    --encoder-embed-dim 512 --decoder-embed-dim 512 \
    --encoder-ffn-embed-dim 1024 --decoder-ffn-embed-dim 1024 \
    --encoder-attention-heads 8 --decoder-attention-heads 8 \
    --encoder-normalize-before --decoder-normalize-before \
    --lr-scheduler inverse_sqrt --warmup-updates 2000 --warmup-init-lr 1e-7 \
    --lr 1e-3 \
    --dropout 0.1 \
    --weight-decay 0.0001 \
    --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \
    --optimizer adam --clip-norm 1.0 \
    --max-tokens 8000 \
    --max-epoch 20 \
    --fp16

#### trained_7: layers 8, ffn-embed-dim 2048, normalize-before, dropout 0.1
best: epoch20  loss 3.235 | nll_loss 1.301 | ppl 2.46 | wps 48725.4 | ups 7.16 | wpb 6809.5 | bsz 243.4 | num_updates 36176 | lr 0.000235128 | gnorm 0.421 | clip 0.1 | loss_scale 32 | train_wall 243 | gb_free 38.6 | wall 5780  5779.6 seconds

In [None]:
# Run 7: same flags as the dropout-0.1 run (97_trained_6) but with ffn-embed-dim 2048.
# Change the dropout rate to 0.1
# Change the number of encoder/decoder layers to 8
# Add normalize-before (both encoder and decoder)
# Checkpoints are written to ../data/ch10/97_trained_7; training stops after 20 epochs.
!fairseq-train ../data/ch10/91_preprocessed \
    --save-dir ../data/ch10/97_trained_7 \
    --arch transformer --share-decoder-input-output-embed \
    --encoder-layers 8 --decoder-layers 8 \
    --encoder-embed-dim 512 --decoder-embed-dim 512 \
    --encoder-ffn-embed-dim 2048 --decoder-ffn-embed-dim 2048 \
    --encoder-attention-heads 8 --decoder-attention-heads 8 \
    --encoder-normalize-before --decoder-normalize-before \
    --lr-scheduler inverse_sqrt --warmup-updates 2000 --warmup-init-lr 1e-7 \
    --lr 1e-3 \
    --dropout 0.1 \
    --weight-decay 0.0001 \
    --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \
    --optimizer adam --clip-norm 1.0 \
    --max-tokens 8000 \
    --max-epoch 20 \
    --fp16

## BLEUスコアの比較
1. devデータの翻訳
2. BLEUスコアの計測
3. BLEUスコアの比較

In [None]:
%%bash
# Translate the dev-set Japanese tokens with the best checkpoint of each of the
# seven runs, using beam size 20.
# Only output lines starting with "H" are kept, and only their third
# tab-separated field is written out (presumably the hypothesis text; confirm
# against the fairseq-interactive output format).
# Results go to ../data/ch10/97_dev_transformed_<N>.en, one file per run.
export CUDA_VISIBLE_DEVICES=0
for N in 1 2 3 4 5 6 7
do
fairseq-interactive --path ../data/ch10/97_trained_$N/checkpoint_best.pt --beam 20 ../data/ch10/91_preprocessed < ../data/ch10/90_dev_tokens.ja | grep '^H' | cut -f3 > ../data/ch10/97_dev_transformed_$N.en
done

In [None]:
%%bash
# Score each run's dev translation against the English dev reference and save
# the fairseq-score output to ../data/ch10/97_bleu_<N>.txt, one file per run.
# NOTE(review): this cell selects device 1, while the other cells in this
# notebook select device 0; confirm whether the difference is intentional.
export CUDA_VISIBLE_DEVICES=1
for N in 1 2 3 4 5 6 7
do
fairseq-score --sys ../data/ch10/97_dev_transformed_$N.en --ref ../data/ch10/90_dev_tokens.en > ../data/ch10/97_bleu_$N.txt
done

In [16]:
import re

# Matches the score in a result line of the form "BLEU4 = 20.61, ..." and
# captures the number between "BLEU4 = " and the following comma.
_BLEU4_PATTERN = re.compile(r'BLEU4 = (\d*\.\d*),')

# One summary prefix per model, in the same order as the collected scores.
# The hyperparameters and losses are the ones recorded for each run above.
MODEL_SUMMARIES = (
    'model_0: dropout 0.2, encoder(decoder)-layers 6 ffn-embed-dim 2048                  => loss is 4.812',
    'model_1: dropout 0.2, encoder(decoder)-layers 6 ffn-embed-dim 1024 normalize-before => loss is 3.721',
    'model_2: dropout 0.2, encoder(decoder)-layers 7 ffn-embed-dim 1024 normalize-before => loss is 3.700',
    'model_3: dropout 0.2, encoder(decoder)-layers 8 ffn-embed-dim 1024 normalize-before => loss is 3.694',
    'model_4: dropout 0.2, encoder(decoder)-layers 5 ffn-embed-dim 1024 normalize-before => loss is 3.785',
    'model_5: dropout 0.3, encoder(decoder)-layers 8 ffn-embed-dim 1024 normalize-before => loss is 3.946',
    'model_6: dropout 0.1, encoder(decoder)-layers 8 ffn-embed-dim 1024 normalize-before => loss is 3.343',
    'model_7: dropout 0.1, encoder(decoder)-layers 8 ffn-embed-dim 2048 normalize-before => loss is 3.235',
)


def read_bleu4(path):
    """Return the BLEU4 score stored on the second line of a score file.

    Args:
        path: Path to a text file whose second line contains
            ``BLEU4 = <number>,`` (the files written by the fairseq-score
            cell above have this layout).

    Returns:
        The score as a float.

    Raises:
        ValueError: If the file has fewer than two lines or its second line
            does not contain a BLEU4 score, so the failing file is named
            instead of surfacing as an IndexError/TypeError.
    """
    with open(path) as f:
        lines = f.readlines()
    if len(lines) < 2:
        raise ValueError(f'{path}: expected the BLEU4 score on line 2, '
                         f'but the file has only {len(lines)} line(s)')
    match = _BLEU4_PATTERN.search(lines[1])
    if match is None:
        raise ValueError(f'{path}: no "BLEU4 = <score>," found on line 2')
    return float(match[1])


# Jupyter executes cells with __name__ == '__main__', so this still runs in the
# notebook while keeping read_bleu4 importable without touching the data files.
if __name__ == '__main__':
    # Model 0 is the baseline scored earlier (94_bleu_20.txt); models 1-7 are
    # the runs trained in this notebook.
    bleu_files = ['../data/ch10/94_bleu_20.txt']
    bleu_files += ['../data/ch10/97_bleu_' + str(i) + '.txt' for i in range(1, 8)]

    bleus = [read_bleu4(bleu_file) for bleu_file in bleu_files]

    for summary, bleu in zip(MODEL_SUMMARIES, bleus):
        print(f'{summary}  dev bleu is {bleu}')

model_0: dropout 0.2, encoder(decoder)-layers 6 ffn-embed-dim 2048                  => loss is 4.812  dev bleu is 6.05
model_1: dropout 0.2, encoder(decoder)-layers 6 ffn-embed-dim 1024 normalize-before => loss is 3.721  dev bleu is 20.61
model_2: dropout 0.2, encoder(decoder)-layers 7 ffn-embed-dim 1024 normalize-before => loss is 3.700  dev bleu is 21.28
model_3: dropout 0.2, encoder(decoder)-layers 8 ffn-embed-dim 1024 normalize-before => loss is 3.694  dev bleu is 20.13
model_4: dropout 0.2, encoder(decoder)-layers 5 ffn-embed-dim 1024 normalize-before => loss is 3.785  dev bleu is 20.39
model_5: dropout 0.3, encoder(decoder)-layers 8 ffn-embed-dim 1024 normalize-before => loss is 3.946  dev bleu is 20.36
model_6: dropout 0.1, encoder(decoder)-layers 8 ffn-embed-dim 1024 normalize-before => loss is 3.343  dev bleu is 19.42
model_7: dropout 0.1, encoder(decoder)-layers 8 ffn-embed-dim 2048 normalize-before => loss is 3.235  dev bleu is 19.17
