## 97. ハイパー・パラメータの調整
<p>ニューラルネットワークのモデルや，そのハイパーパラメータを変更しつつ，開発データにおけるBLEUスコアが最大となるモデルとハイパーパラメータを求めよ．</p>

In [1]:
from fastai_model import Model, get_param, get_elms
from Chapter10_91 import Translator, DataSet, Trainer
from Chapter10_91 import Print, EarlyStopping, Save, Tensorboard, BleuCallback
import pickle
# Load the two pickled dictionaries (presumably the Japanese and English
# vocabularies; they are later handed to Translator).
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open('model_logs/ja_dic.pickle', 'rb') as f:
    ja_dic = pickle.load(f)
with open('model_logs/en_dic.pickle', 'rb') as f:
    en_dic = pickle.load(f)

# Each size is the largest value stored in the dictionary plus one.
ja_dicsize = 1 + max(ja_dic.values())
en_dicsize = 1 + max(en_dic.values())

In [2]:
# Build the "train" and "dev" datasets (in that order) with the same
# maxlength setting.
trainset, devset = [DataSet(split, maxlength=60) for split in ("train", "dev")]

In [3]:
def run(params, tags: "list[str]", preprinter: "Print.Printer | None", tbsf: str,
        device_name: str = 'cuda:7'):
    """Train one hyperparameter configuration and return its printer.

    Args:
        params: Hyperparameter mapping forwarded to ``get_elms``; it must
            also provide ``params['batch_size']`` for the trainer.
        tags: Labels forwarded to the ``Print`` callback.
        preprinter: Printer returned by a previous call (or ``None`` for the
            first run); forwarded to the ``Print`` callback.
        tbsf: Run suffix used in the checkpoint path
            (``model_logs/grid_<tbsf>``) and the Tensorboard run name
            (``97_<tbsf>``).
        device_name: Device string handed to ``get_elms``. Defaults to
            ``'cuda:7'``, the value that used to be hard-coded.

    Returns:
        The ``printer`` attribute of the ``Print`` callback after training,
        so it can be passed as ``preprinter`` to the next call.
    """
    from random import choices
    from string import ascii_letters

    # Token id that is rendered as a random 5-letter string instead of being
    # looked up in the reverse dictionary.
    # NOTE(review): presumably the unknown-word id -- confirm against en_dic.
    placeholder_id = 12

    model, optimizer, scheduler, device = get_elms(params, device_name, ja_dicsize, en_dicsize)
    translator = Translator(ja_dic, en_dic)

    def ids_to_sentence(seq):
        """Join the token ids in *seq* into a space-separated string."""
        return " ".join([
            ''.join(choices(ascii_letters, k=5)) if s == placeholder_id
            else translator.en_dic_rev[s]
            for s in seq
        ])

    bleu_cb = BleuCallback(ids_to_sentence)
    es_cb = EarlyStopping(count=10)
    save_cb = Save(model, 'model_logs/grid_' + tbsf)
    tb_cb = Tensorboard('./tb_logs/', '97_' + tbsf, bleu_cb)
    print_cb = Print(model, translator, device, print_example=False, early_stopping=es_cb,
                     bleu_callback=bleu_cb, tags=tags, preprinter=preprinter)
    callbacks = [es_cb, bleu_cb, save_cb, tb_cb, print_cb]

    trainer = Trainer(translator, trainset, devset, device=device)
    trainer.run(model, optimizer, epoch=300, device=device, batch_size=params['batch_size'],
                shuffle=True, scheduler=scheduler, callbacks=callbacks)
    return print_cb.printer

In [4]:
# Print every entry returned by get_param() as "key: value", one per line.
for k, v in get_param().items():
    print(f"{k}: {v}")

cuda_optim: False
n_layers: 6
n_heads: 8
d_model: 256
d_head: 32
d_inner: 1024
drop: 0.1
bias: True
scale: True
double_drop: True
lrate_coef: 1
warmup_steps: 4000


#### 学習率

In [None]:
# Sweep lrate_coef over four candidates, from the largest down to the smallest.
preprinter = None  # no previous printer before the first run

batch_size = 128
mm = 25000 / batch_size
for coef in [mm * 4, mm * 2, mm, mm / 2]:
    params = get_param(lrate_coef=coef)
    params['batch_size'] = batch_size
    # Chain the printer returned by each run into the next one.
    preprinter = run(params, tags=[f"coef: {coef}"], preprinter=preprinter, tbsf=f"lr={coef}")

coef: 781.25
>10[  1/300]epoch: loss: 5.0056849, 3.5814937 | acc: 27.59806%, 38.62557% | bleu: 8.12692
>10[  2/300]epoch: loss: 2.9448651, 2.7910606 | acc: 45.54768%, 47.28510% | bleu: 16.11501
>10[  3/300]epoch: loss: 2.4472223, 2.5858688 | acc: 51.34888%, 49.91033% | bleu: 18.61977
>10[  4/300]epoch: loss: 2.2526839, 2.5256665 | acc: 53.84561%, 50.84934% | bleu: 19.90933
>10[  5/300]epoch: loss: 2.1359252, 2.4899038 | acc: 55.38074%, 51.49300% | bleu: 19.93839
>10[  6/300]epoch: loss: 2.0537251, 2.4648615 | acc: 56.48514%, 52.00372% | bleu: 20.65718
>10[  7/300]epoch: loss: 1.9911473, 2.4228295 | acc: 57.35015%, 52.69336% | bleu: 20.56033
  9[  8/300]epoch: loss: 1.9404115, 2.4406689 | acc: 58.06638%, 52.51771% | bleu: 20.58176
  8[  9/300]epoch: loss: 1.8985973, 2.4397270 | acc: 58.66586%, 52.59391% | bleu: 19.73388
  7[ 10/300]epoch: loss: 1.8632702, 2.4516230 | acc: 59.16791%, 53.26219% | bleu: 20.90524
  6[ 11/300]epoch: loss: 1.8325963, 2.4267539 | acc: 59.61973%, 53.03571% | bl

In [1]:
# Extend the lrate_coef sweep with two larger candidates (8x and 16x of mm).
preprinter = None  # no previous printer before the first run

batch_size = 128
mm = 25000 / batch_size
for coef in (mm * 8, mm * 16):
    params = get_param(lrate_coef=coef)
    params['batch_size'] = batch_size
    # Chain the printer returned by each run into the next one.
    preprinter = run(params, tags=[f"coef: {coef}"], preprinter=preprinter, tbsf=f"lr={coef}")

coef: 1562.5
>10[  1/300]epoch: loss: 4.5597286, 3.2549544 | acc: 30.90973%, 41.32296% | bleu: 10.73021
>10[  2/300]epoch: loss: 2.9109816, 2.8357891 | acc: 45.31664%, 46.42694% | bleu: 13.96774
>10[  3/300]epoch: loss: 2.5248783, 2.6479896 | acc: 49.90306%, 49.38682% | bleu: 17.63416
>10[  4/300]epoch: loss: 2.3449820, 2.5850359 | acc: 52.20555%, 50.03968% | bleu: 18.67045
>10[  5/300]epoch: loss: 2.2342428, 2.5176594 | acc: 53.68307%, 51.04338% | bleu: 18.75140
>10[  6/300]epoch: loss: 2.1551417, 2.5085322 | acc: 54.77070%, 51.18486% | bleu: 18.24711
>10[  7/300]epoch: loss: 2.0949881, 2.4812724 | acc: 55.61117%, 51.47829% | bleu: 19.69604
>10[  8/300]epoch: loss: 2.0465202, 2.4729025 | acc: 56.28689%, 51.89367% | bleu: 19.09868
>10[  9/300]epoch: loss: 2.0060579, 2.4561306 | acc: 56.86289%, 52.20875% | bleu: 20.45520
>10[ 10/300]epoch: loss: 1.9708843, 2.4486791 | acc: 57.37173%, 52.35896% | bleu: 20.30135
>10[ 11/300]epoch: loss: 1.9419949, 2.4390411 | acc: 57.78025%, 52.13435% | b

#### n_heads

In [3]:
# Sweep n_heads while keeping lrate_coef fixed.
preprinter = None  # no previous printer before the first run

batch_size = 128
lrate_coef = 25000 / batch_size * 4  # same for every run in this sweep
for n_heads in (1, 2, 3, 8, 12, 24):
    params = get_param(n_heads=n_heads, lrate_coef=lrate_coef)
    params['batch_size'] = batch_size
    # Chain the printer returned by each run into the next one.
    preprinter = run(params, tags=[f"n_heads:{n_heads}"], preprinter=preprinter,
                     tbsf=f"n_heads={n_heads}")

n_heads:1
>10[  1/300]epoch: loss: 5.0635228, 3.6812657 | acc: 26.97561%, 36.98171% | bleu: 7.58031
>10[  2/300]epoch: loss: 3.1356584, 2.9731640 | acc: 42.83549%, 44.58800% | bleu: 13.42361
>10[  3/300]epoch: loss: 2.6978646, 2.8036626 | acc: 47.68635%, 47.16020% | bleu: 15.88228
>10[  4/300]epoch: loss: 2.5185692, 2.7514082 | acc: 49.90861%, 47.50606% | bleu: 15.53850
>10[  5/300]epoch: loss: 2.4094214, 2.7064816 | acc: 51.34185%, 48.84439% | bleu: 17.24788
>10[  6/300]epoch: loss: 2.3345092, 2.6535198 | acc: 52.34793%, 49.22242% | bleu: 16.91345
>10[  7/300]epoch: loss: 2.2780357, 2.6329643 | acc: 53.12014%, 49.58332% | bleu: 17.57351
>10[  8/300]epoch: loss: 2.2337657, 2.6163961 | acc: 53.71612%, 49.80411% | bleu: 17.89081
>10[  9/300]epoch: loss: 2.1977492, 2.5993046 | acc: 54.20759%, 50.29335% | bleu: 18.07466
  9[ 10/300]epoch: loss: 2.1670577, 2.6255682 | acc: 54.64531%, 50.19777% | bleu: 18.38610
  8[ 11/300]epoch: loss: 2.1407144, 2.6041052 | acc: 55.01282%, 50.45333% | bleu:

#### d_head

In [None]:
# Sweep d_head while keeping lrate_coef fixed.
preprinter = None  # no previous printer before the first run

batch_size = 128
lrate_coef = 25000 / batch_size * 4  # same for every run in this sweep
for d_head in (1, 4, 16, 48):
    params = get_param(d_head=d_head, lrate_coef=lrate_coef)
    params['batch_size'] = batch_size
    # Chain the printer returned by each run into the next one.
    preprinter = run(params, tags=[f"d_head:{d_head}"], preprinter=preprinter,
                     tbsf=f"d_head={d_head}")

d_head:1
>10[  1/300]epoch: loss: 5.1833167, 4.0328631 | acc: 25.24699%, 32.62761% | bleu: 4.79851
>10[  2/300]epoch: loss: 3.5842983, 3.5348748 | acc: 36.68676%, 36.71297% | bleu: 6.72088
>10[  3/300]epoch: loss: 3.2546384, 3.3799939 | acc: 39.61449%, 37.90380% | bleu: 7.50329
[  4/300]epoch [ 654/3163]steps:[ja:58|en:59] loss: 0.0249528 | acc: 39.63576%
