In [1]:
import pickle
# Load the Japanese and English vocabularies saved by an earlier step.
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
with open('model_logs/ja_dic.pickle', 'rb') as ja_file:
    ja_dic = pickle.load(ja_file)
with open('model_logs/en_dic.pickle', 'rb') as en_file:
    en_dic = pickle.load(en_file)

In [2]:
from fairseq.data.dictionary import Dictionary

In [3]:
def get_fairseq_dict(dic:dict):
    """Build a fairseq ``Dictionary`` that contains every token of ``dic``.

    Args:
        dic: vocabulary mapping each token string to its integer id.

    Returns:
        A fairseq ``Dictionary`` whose special symbols are "[PAD]", "[EOS]",
        "[UNK]" and "[BOS]", with the tokens of ``dic`` added in ascending
        id order.

    Note:
        The second positional argument of ``Dictionary.add_symbol`` is an
        occurrence count (``n``), not an index, so an id cannot be forced
        through it. Tokens are therefore added in id order and fairseq
        assigns the indices itself.
        NOTE(review): the resulting fairseq indices may differ from the ids
        in ``dic`` (fairseq registers its special symbols first) -- verify
        before relying on ``f_dic.index(token)``.
    """
    f_dic = Dictionary(pad="[PAD]", eos="[EOS]", unk="[UNK]", bos="[BOS]")
    # Sort by id so insertion order no longer depends on how ``dic`` was built.
    for w, _ in sorted(dic.items(), key=lambda item: item[1]):
        f_dic.add_symbol(w)
    return f_dic

In [4]:
# Wrap both vocabularies in fairseq Dictionary objects.
f_dic_ja, f_dic_en = get_fairseq_dict(ja_dic), get_fairseq_dict(en_dic)
# Despite the "embdim" names these are vocabulary sizes (largest id + 1);
# they are used below as the number of embedding rows.
ja_embdim, en_embdim = (1 + max(vocab.values()) for vocab in (ja_dic, en_dic))
print(ja_embdim, en_embdim)

23767 20176


----

In [5]:
from fairseq.models.transformer import TransformerModel, TransformerEncoder, TransformerDecoder
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [6]:
# What a hassle (original author's note).
# Parse an empty argument list to obtain the parser's default values;
# the resulting namespace is printed in the next cell.
from fairseq.models import transformer_parser
args = transformer_parser.parse_args([])
# Abandoned experiments with registering task arguments and overriding
# individual options, kept commented out for reference:
# from fairseq.tasks.translation import TranslationTask
# transformer_parser.conflict_handler='resolve'
# print(transformer_parser)
# TranslationTask.add_args(transformer_parser)
# args = transformer_parser.parse_args(['--dropout', '0.1'], args)
# args = transformer_parser.parse_args(['--encoder-layerdrop', '0.1'], args)

In [7]:
# Show every (name, value) pair of the parsed defaults on one comma-separated line.
print(",".join(str(pair) for pair in args._get_kwargs()))

('activation_dropout', None),('activation_fn', None),('adaptive_softmax_cutoff', None),('adaptive_softmax_dropout', None),('arch', None),('attention_dropout', None),('cross_self_attention', False),('decoder_attention_heads', None),('decoder_embed_dim', None),('decoder_embed_path', None),('decoder_ffn_embed_dim', None),('decoder_layerdrop', 0),('decoder_layers', None),('decoder_layers_to_keep', None),('decoder_learned_pos', False),('decoder_normalize_before', False),('dropout', None),('encoder_attention_heads', None),('encoder_embed_dim', None),('encoder_embed_path', None),('encoder_ffn_embed_dim', None),('encoder_layerdrop', 0),('encoder_layers', None),('encoder_layers_to_keep', None),('encoder_learned_pos', False),('encoder_normalize_before', False),('layer_wise_attention', False),('layernorm_embedding', False),('no_cross_attention', False),('no_scale_embedding', False),('no_token_positional_embeddings', False),('share_all_embeddings', False),('share_decoder_input_output_embed', False

In [8]:
class transformer_params(dict):
    """Dict of transformer hyper-parameters that also allows attribute access.

    The instance starts from the defaults produced by
    ``transformer_parser.parse_args([])`` and then applies the caller's
    overrides. Precedence, lowest to highest:

    1. parser defaults,
    2. the optional positional mapping (or iterable of pairs),
    3. keyword arguments.

    ``max_source_positions`` and ``max_target_positions`` are set to 1024
    unless the caller supplies them explicitly.

    Because ``self.__dict__`` is the dict itself, ``p['dropout']`` and
    ``p.dropout`` refer to the same entry.
    """

    DEFAULT_MAX_POSITIONS = 1024

    def __init__(self, *args, **kwargs):
        from fairseq.models import transformer_parser
        merged = dict(vars(transformer_parser.parse_args([])))
        # dict() accepts zero or one positional argument, so calling with no
        # overrides works, and keywords win over the positional mapping.
        overrides = dict(*args, **kwargs)
        merged.update(overrides)
        for key in ('max_source_positions', 'max_target_positions'):
            if key not in overrides:
                merged[key] = self.DEFAULT_MAX_POSITIONS
        super().__init__(merged)
        # Alias the attribute namespace to the dict so values read as attributes.
        self.__dict__ = self

In [9]:
# Hyper-parameters for the encoder/decoder; anything not listed keeps the
# parser default supplied by transformer_params.
args = transformer_params(dict(
    dropout=0.1,
    encoder_layerdrop=0.1,
    # Options left at their defaults (kept for reference):
    # no_scale_embedding=False,
    # no_token_positional_embeddings=False,
    # encoder_learned_pos=False,
    encoder_layers=6,
    encoder_embed_dim=512,
    encoder_attention_heads=8,  # must be a divisor of encoder_embed_dim
    attention_dropout=0.1,
    encoder_normalize_before=True,
    encoder_ffn_embed_dim=2048,
    decoder_layerdrop=0.1,
    share_decoder_input_output_embed=True,
    decoder_embed_dim=512,
    # NOTE(review): set to the English vocabulary size rather than the model
    # width -- confirm this is intended.
    decoder_output_dim=en_embdim,
    decoder_learned_pos=False,
    decoder_layers=6,
    decoder_attention_heads=8,
    decoder_normalize_before=True,
    decoder_ffn_embed_dim=2048,
    tie_adaptive_weights=True,
    activation_fn='relu',
    activation_dropout=0.1,
))

In [10]:
class FairseqModel(nn.Module):
    """fairseq Transformer (Japanese encoder, English decoder) with its loss.

    The constructor reads the notebook-level globals ``args``, ``ja_embdim``,
    ``en_embdim``, ``f_dic_ja`` and ``f_dic_en``; they must be defined before
    the model is instantiated. Token id ``PAD_IDX`` is treated as padding by
    the embeddings, the source-length computation, the loss and the accuracy.
    """

    PAD_IDX = 0

    def __init__(self):
        super().__init__()
        # Embedding widths come from ``args`` so they cannot drift from the
        # encoder/decoder configuration.
        ja_emb = nn.Embedding(num_embeddings=ja_embdim,
                              embedding_dim=args.encoder_embed_dim,
                              padding_idx=self.PAD_IDX)
        en_emb = nn.Embedding(num_embeddings=en_embdim,
                              embedding_dim=args.decoder_embed_dim,
                              padding_idx=self.PAD_IDX)
        encoder = TransformerEncoder(args=args, dictionary=f_dic_ja, embed_tokens=ja_emb)
        decoder = TransformerDecoder(args=args, dictionary=f_dic_en, embed_tokens=en_emb)
        self.model = TransformerModel(args, encoder, decoder)
        from fastai.text import CrossEntropyFlat
        self.criterion = CrossEntropyFlat(ignore_index=self.PAD_IDX)

    def predict(self, ja_seq, en_seq, *, mask=True):
        """Run the transformer and return the first element of its output.

        Args:
            ja_seq: source token ids; lengths are counted along dim 1 as the
                number of non-padding entries.
            en_seq: decoder input token ids.
            mask: currently unused; kept so existing callers keep working.
        """
        src_lengths = torch.sum(ja_seq != self.PAD_IDX, dim=1)
        pred = self.model(ja_seq, src_lengths, en_seq)[0]
        return pred

    def loss(self, pred, en_ans):
        """Return ``(loss, accuracy)`` for predictions against the targets.

        Args:
            pred: scores whose argmax over dim 2 yields predicted token ids.
            en_ans: target token ids; padding positions are ignored.

        Returns:
            The criterion's loss and the token accuracy over non-padding
            positions as a Python float (0.0 if there are none).
        """
        ### train data:   eg. [BOS] I am a ... [EOS] [PAD] ###
        ### ground truth: eg. I am a ... [EOS] [PAD] [PAD] ###
        with torch.no_grad():
            valid = en_ans != self.PAD_IDX
            n_valid = float(torch.sum(valid))
            n_correct = float(torch.sum((pred.argmax(dim=2) == en_ans) & valid))
            # An all-padding target would otherwise divide by zero.
            accuracy = n_correct / n_valid if n_valid else 0.0

        loss = self.criterion(pred, en_ans)
        return loss, accuracy

    def forward(self, ja_seq, en_seq, en_ans):
        """Predict from ``(ja_seq, en_seq)`` and score against ``en_ans``."""
        pred = self.predict(ja_seq, en_seq)
        return self.loss(pred, en_ans)

In [11]:
# Training hyper-parameters shared by the optimiser schedule and the trainer.
params = {
    'embdim': 512,
    'batch_size': 64 * 2,
    'warmup_steps': 4000,
}
# Learning-rate coefficient is scaled inversely with the batch size.
params['lrate_coef'] = 25000 / params['batch_size']
params.update(cuda_optim=False, embinit=False)

In [12]:
model = FairseqModel().float()
optimizer = optim.Adam(model.parameters(), betas=(0.9, 0.98), eps=1e-09)


def lrate_lambda(step_num):
    """Multiplier applied to the optimiser's base learning rate at ``step_num``.

    Linear warm-up for ``params['warmup_steps']`` steps followed by
    inverse-square-root decay, scaled by ``embdim ** -0.5`` and
    ``params['lrate_coef']``. Steps are counted from 1 to avoid ``0 ** -0.5``.
    """
    step = step_num + 1
    return params['embdim']**(-0.5)*params['lrate_coef']*min(step**(-0.5), step*params['warmup_steps']**(-1.5))


scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=[lrate_lambda])
if torch.cuda.is_available():
    # Prefer GPU 8 (used for the recorded run), but fall back to GPU 0 on
    # hosts with fewer cards instead of selecting a device that does not exist.
    gpu_index = 8 if torch.cuda.device_count() > 8 else 0
    device = torch.device(f"cuda:{gpu_index}")
else:
    device = torch.device("cpu")
# device = torch.device("cpu")  # uncomment to force CPU
print(device)
print(sum(p.numel() for p in model.parameters()))

cuda:8
66639360


In [13]:
from Chapter10_91 import DataSet, Translator, Trainer
# Translator is built from the two vocabularies loaded above.
translator = Translator(ja_dic, en_dic)
# Train and dev splits share the same settings (maxlength=60, pathkey="ids").
trainset, devset = (
    DataSet(split, maxlength=60, pathkey="ids") for split in ("train", "dev")
)

In [14]:
from string import ascii_letters
from random import choices
from Chapter10_91 import EarlyStopping, Save, Tensorboard, BleuCallback, Print
# Sentences translated after each epoch as a qualitative check.
examples = ["私以外私じゃないの", "私は猫です。", "日本の水墨画を一変させた。"]
translator = Translator(ja_dic, en_dic)


def _ids_to_sentence(seq):
    """Join the English words for ``seq`` with single spaces.

    Id 12 is rendered as five random ASCII letters instead of its dictionary
    entry -- presumably the unknown-token id, so that unknown tokens never
    match a reference during BLEU scoring; verify against ``en_dic``.
    """
    words = []
    for token_id in seq:
        if token_id != 12:
            words.append(translator.en_dic_rev[token_id])
        else:
            words.append(''.join(choices(ascii_letters, k=5)))
    return " ".join(words)


# Construction order is kept: Save and Tensorboard print as they are created.
bleu_cb = BleuCallback(_ids_to_sentence)
es_cb = EarlyStopping(count=10)
save_cb = Save(model, 'model_logs/fairseq')
tb_cb = Tensorboard('./tb_logs/', 'fairseq_91', bleu_cb)
print_cb = Print(
    model,
    translator,
    device,
    examples=examples,
    early_stopping=es_cb,
    bleu_callback=bleu_cb,
)
callbacks = [es_cb, bleu_cb, save_cb, tb_cb, print_cb]

savedir:model_logs/fairseq_20200804_142746
tensorboard tagname: 20200804_142749_fairseq_91


In [None]:
# Train for up to 300 epochs; the callbacks handle early stopping,
# checkpointing, TensorBoard logging and the per-epoch printout.
trainer = Trainer(translator, trainset, devset, device=device)
trainer.run(
    model,
    optimizer,
    epoch=300,
    device=device,
    batch_size=params['batch_size'],
    shuffle=True,
    scheduler=scheduler,
    callbacks=callbacks,
)

>10[  1/300]epoch: loss: 78.3105287, 7.3106433 | acc: 5.58691%, 15.89397% | bleu: 0.08098
   私以外私じゃないの　　　　	[UNK] . [EOS]
   私は猫です。　　　　　　　	[UNK] . [EOS]
   日本の水墨画を一変させた。	[UNK] . [EOS]
>10[  2/300]epoch: loss: 7.3614282, 5.9120545 | acc: 12.69357%, 21.00773% | bleu: 0.56890
   私以外私じゃないの　　　　	[UNK] ( [UNK] ) [EOS]
   私は猫です。　　　　　　　	[UNK] . [EOS]
   日本の水墨画を一変させた。	He was the [UNK] . [EOS]
>10[  3/300]epoch: loss: 6.4479790, 5.6665649 | acc: 16.78947%, 22.76005% | bleu: 0.61845
   私以外私じゃないの　　　　	[UNK] [EOS]
   私は猫です。　　　　　　　	He was the [UNK] . [EOS]
   日本の水墨画を一変させた。	It is said that the [UNK] . [EOS]
>10[  4/300]epoch: loss: 5.9410471, 5.4402046 | acc: 20.05725%, 25.29055% | bleu: 1.58508
   私以外私じゃないの　　　　	[UNK] [EOS]
   私は猫です。　　　　　　　	His name was [UNK] . [EOS]
   日本の水墨画を一変させた。	It is said that the [UNK] . [EOS]
>10[  5/300]epoch: loss: 5.5207765, 5.2880964 | acc: 22.86878%, 25.99651% | bleu: 2.13219
   私以外私じゃないの　　　　	[UNK] [EOS]
   私は猫です。　　　　　　　	His name was [UNK] . [EOS]
   日本の水墨画を一変させた。	It is sai