本 notebook 使用标准的 Transformer（encoder-decoder）模型完成中文到英文的机器翻译，包含数据预处理、模型搭建、训练、预测与 BLEU 评估。

## 1、导入相关包

In [1]:
import os
import time
from functools import partial
from pprint import pprint
import yaml
import logging
import argparse

from attrdict import AttrDict
import numpy as np

import paddle
import paddle.nn as nn
from paddle.nn import TransformerDecoder,TransformerDecoderLayer,TransformerEncoder,TransformerEncoderLayer
from paddle.nn import functional as F
import paddle.distributed as dist
from paddle.io import DataLoader,BatchSampler
from paddlenlp.data import Vocab, Pad
from paddlenlp.datasets import load_dataset
from paddlenlp.transformers import TransformerModel, InferTransformerModel, CrossEntropyCriterion, position_encoding_init
from paddlenlp.utils.log import logger

from helper.utils import post_process_seq

  import pkg_resources
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
  declare_namespace(pkg)
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
  declare_namespace(pkg)
  from .autonotebook import tqdm as notebook_tqdm


## 2、准备数据集

  中文语料需要先用 Jieba 分词，再做 BPE 切分；英文语料只需要做 BPE 切分。  

数据预处理

In [2]:
# Data preprocessing: jieba tokenization, BPE segmentation and vocabulary building.
!bash ./helper/preprocess.sh

jieba tokenize...


Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.822 seconds.
Prefix dict has been built successfully.
source learn-bpe and apply-bpe...
no pair has frequency >= 2. Stopping
target learn-bpe and apply-bpe...
no pair has frequency >= 2. Stopping
source get-vocab. if loading pretrained model, use its vocab.
target get-vocab. if loading pretrained model, use its vocab.
Over.


构造Dataloader

In [3]:
# Custom reader that yields examples from local text files.
def read(src_path, tgt_path, is_predict=False):
    """Yield ``{'src': ..., 'tgt': ...}`` examples from local UTF-8 text files.

    Args:
        src_path: Path of the source-language file, one sentence per line.
        tgt_path: Path of the target-language file, paired line by line with
            ``src_path``. It is not opened when ``is_predict`` is True.
        is_predict: When True, only the source file is read and every example
            carries an empty ``'tgt'`` string.

    Yields:
        dict: ``{'src': <stripped source line>, 'tgt': <stripped target line>}``.
        In prediction mode blank source lines are skipped; otherwise a pair is
        skipped as soon as either side is blank after stripping.
    """
    if is_predict:
        with open(src_path, 'r', encoding='utf8') as src_f:
            # Iterate over the file object so lines are streamed instead of
            # being loaded into memory all at once by readlines().
            for src_line in src_f:
                src_line = src_line.strip()
                if src_line:
                    yield {'src': src_line, 'tgt': ''}
        return

    with open(src_path, 'r', encoding='utf8') as src_f, \
            open(tgt_path, 'r', encoding='utf8') as tgt_f:
        # zip() pairs the files line by line and stops at the shorter one.
        for src_line, tgt_line in zip(src_f, tgt_f):
            src_line = src_line.strip()
            tgt_line = tgt_line.strip()
            if src_line and tgt_line:
                yield {'src': src_line, 'tgt': tgt_line}
# Keep only samples whose source and target lengths (+1 for a special token) lie within [min_len, max_len].
def min_max_filer(data, max_len, min_len=0):
    """Tell whether a (source, target) pair fits inside the length bounds.

    Args:
        data: Indexable whose items 0 and 1 are the source and target sequences.
        max_len: Largest accepted length, counted with one extra special token.
        min_len: Smallest accepted length, counted the same way. Defaults to 0.

    Returns:
        bool: True when the shorter sequence (+1) is at least ``min_len`` and
        the longer sequence (+1) is at most ``max_len``.
    """
    # Every length is increased by one to account for a special token.
    padded_lengths = [len(data[0]) + 1, len(data[1]) + 1]
    shortest = min(padded_lengths)
    longest = max(padded_lengths)
    return min_len <= shortest and longest <= max_len

In [4]:
# Build the DataLoaders for the training and validation sets.
def create_data_loader(args):
    """Create the training and validation DataLoaders.

    Args:
        args: Configuration object. Reads ``training_file`` and
            ``validation_file`` (each ``"<src_path>,<tgt_path>"``),
            ``src_vocab_fpath``, ``trg_vocab_fpath``, ``special_token``
            (``[bos, eos, unk]``), ``pad_factor``, ``max_length``,
            ``batch_size``, ``bos_idx`` and ``eos_idx``.
            ``args.src_vocab_size`` and ``args.trg_vocab_size`` are overwritten
            with the vocabulary sizes rounded up to a multiple of
            ``pad_factor``.

    Returns:
        list: ``[train_loader, dev_loader]``.
    """
    # Each setting holds "<source file>,<target file>"; split it only once.
    train_paths = args.training_file.split(',')
    dev_paths = args.validation_file.split(',')
    train_dataset = load_dataset(
        read, src_path=train_paths[0], tgt_path=train_paths[1], lazy=False)
    dev_dataset = load_dataset(
        read, src_path=dev_paths[0], tgt_path=dev_paths[1], lazy=False)

    src_vocab = Vocab.load_vocabulary(
        args.src_vocab_fpath,
        bos_token=args.special_token[0],
        eos_token=args.special_token[1],
        unk_token=args.special_token[2])
    trg_vocab = Vocab.load_vocabulary(
        args.trg_vocab_fpath,
        bos_token=args.special_token[0],
        eos_token=args.special_token[1],
        unk_token=args.special_token[2])

    def padding_vocab(size):
        # Round the vocabulary size up to the next multiple of pad_factor.
        return (size + args.pad_factor - 1) // args.pad_factor * args.pad_factor

    args.src_vocab_size = padding_vocab(len(src_vocab))
    args.trg_vocab_size = padding_vocab(len(trg_vocab))

    def convert_samples(sample):
        # Whitespace-split both sides and map the tokens to vocabulary ids.
        source = sample['src'].split()
        target = sample['tgt'].split()

        source = src_vocab.to_indices(source)
        target = trg_vocab.to_indices(target)

        return source, target

    # Training loader first, validation loader second.
    # Only the training data is shuffled; the validation set is read in a
    # fixed order so evaluation passes are deterministic.
    data_loaders = []
    for dataset, shuffle in ((train_dataset, True), (dev_dataset, False)):
        dataset = dataset.map(convert_samples, lazy=False).filter(
            partial(min_max_filer, max_len=args.max_length))

        # BatchSampler: https://www.paddlepaddle.org.cn/documentation/docs/zh/api/paddle/io/BatchSampler_cn.html
        batch_sampler = BatchSampler(
            dataset,
            batch_size=args.batch_size,
            shuffle=shuffle,
            drop_last=False)

        # DataLoader: https://www.paddlepaddle.org.cn/documentation/docs/zh/api/paddle/io/DataLoader_cn.html
        data_loader = DataLoader(
            dataset=dataset,
            batch_sampler=batch_sampler,
            collate_fn=partial(
                prepare_train_input,
                bos_idx=args.bos_idx,
                eos_idx=args.eos_idx,
                # Padding reuses the bos index.
                pad_idx=args.bos_idx),
            num_workers=0,
            return_list=True)
        data_loaders.append(data_loader)

    return data_loaders


def prepare_train_input(insts, bos_idx, eos_idx, pad_idx):
    """
    Collate (source ids, target ids) pairs into the padded arrays used for training.

    Args:
        insts: Batch of samples; item 0 of each sample is the source id list
            and item 1 is the target id list.
        bos_idx: Id prepended to every target sequence (decoder input).
        eos_idx: Id appended to every source sequence and every label sequence.
        pad_idx: Id used by ``Pad`` to fill shorter sequences.

    Returns:
        list: ``[src_word, trg_word, lbl_word]``, where ``lbl_word`` is the
        padded label array with one extra axis added at position 2.
    """
    pad_batch = Pad(pad_idx)

    encoder_inputs, decoder_inputs, labels = [], [], []
    for inst in insts:
        encoder_inputs.append(inst[0] + [eos_idx])
        decoder_inputs.append([bos_idx] + inst[1])
        labels.append(inst[1] + [eos_idx])

    src_word = pad_batch(encoder_inputs)
    trg_word = pad_batch(decoder_inputs)
    # Labels receive a trailing axis on top of the padded 2-D batch.
    lbl_word = np.expand_dims(pad_batch(labels), axis=2)

    return [src_word, trg_word, lbl_word]


In [5]:
# Build the DataLoader for the test set; the steps mirror the training/validation loaders.
def create_infer_loader(args):
    """Create the prediction DataLoader and the id-to-token converter.

    Args:
        args: Configuration object. Reads ``predict_file``,
            ``src_vocab_fpath``, ``trg_vocab_fpath``, ``special_token``
            (``[bos, eos, unk]``), ``pad_factor``, ``infer_batch_size``,
            ``bos_idx`` and ``eos_idx``. ``args.src_vocab_size`` and
            ``args.trg_vocab_size`` are overwritten with the vocabulary sizes
            rounded up to a multiple of ``pad_factor``.

    Returns:
        tuple: ``(data_loader, trg_vocab.to_tokens)``.
    """
    dataset = load_dataset(
        read,
        src_path=args.predict_file,
        tgt_path=None,
        is_predict=True,
        lazy=False,
    )

    def load_vocab(vocab_path):
        # Both vocabularies share the same special tokens.
        return Vocab.load_vocabulary(
            vocab_path,
            bos_token=args.special_token[0],
            eos_token=args.special_token[1],
            unk_token=args.special_token[2],
        )

    src_vocab = load_vocab(args.src_vocab_fpath)
    trg_vocab = load_vocab(args.trg_vocab_fpath)

    def round_up_to_pad_factor(size):
        # Smallest multiple of pad_factor that is >= size.
        factor = args.pad_factor
        return (size + factor - 1) // factor * factor

    args.src_vocab_size = round_up_to_pad_factor(len(src_vocab))
    args.trg_vocab_size = round_up_to_pad_factor(len(trg_vocab))

    def convert_samples(sample):
        # Whitespace-split both sides, then map the tokens to vocabulary ids.
        src_tokens = sample['src'].split()
        tgt_tokens = sample['tgt'].split()
        return src_vocab.to_indices(src_tokens), trg_vocab.to_indices(tgt_tokens)

    dataset = dataset.map(convert_samples, lazy=False)

    # BatchSampler: https://www.paddlepaddle.org.cn/documentation/docs/zh/api/paddle/io/BatchSampler_cn.html
    sampler = BatchSampler(
        dataset,
        batch_size=args.infer_batch_size,
        drop_last=False,
    )

    collate = partial(
        prepare_infer_input,
        bos_idx=args.bos_idx,
        eos_idx=args.eos_idx,
        pad_idx=args.bos_idx,  # padding reuses the bos index
    )

    # DataLoader: https://www.paddlepaddle.org.cn/documentation/docs/zh/api/paddle/io/DataLoader_cn.html
    data_loader = DataLoader(
        dataset=dataset,
        batch_sampler=sampler,
        collate_fn=collate,
        num_workers=0,
        return_list=True,
    )
    return data_loader, trg_vocab.to_tokens

def prepare_infer_input(insts, bos_idx, eos_idx, pad_idx):
    """
    Collate source id lists into the single padded array used for decoding.

    Args:
        insts: Batch of samples; item 0 of each sample is the source id list.
        bos_idx: Accepted for signature parity with the training collate
            function; not used here.
        eos_idx: Id appended to every source sequence.
        pad_idx: Id used by ``Pad`` to fill shorter sequences.

    Returns:
        list: One-element list ``[src_word]``.
    """
    pad_batch = Pad(pad_idx)
    sources_with_eos = []
    for inst in insts:
        sources_with_eos.append(inst[0] + [eos_idx])
    src_word = pad_batch(sources_with_eos)
    return [src_word]

## 3、准备网络模型

搭建模型

In [6]:
# Basic building blocks of the network.
class WordEmbedding(nn.Layer):
    """Token embedding whose output is multiplied by ``sqrt(emb_dim)``.

    Args:
        vocab_size: Number of rows in the embedding table.
        emb_dim: Embedding width.
        bos_id: Id passed to ``nn.Embedding`` as ``padding_idx``. Defaults to 0.
    """

    def __init__(self, vocab_size, emb_dim, bos_id=0):
        super().__init__()
        self.emb_dim = emb_dim

        # Weights start from N(0, emb_dim ** -0.5).
        init_std = emb_dim ** (-0.5)
        table_attr = paddle.ParamAttr(
            initializer=nn.initializer.Normal(0.0, init_std))
        self.word_embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=emb_dim,
            padding_idx=bos_id,
            weight_attr=table_attr,
        )

    def forward(self, word):
        """Look up ``word`` ids and scale the result by ``emb_dim ** 0.5``."""
        scale = self.emb_dim**0.5
        return scale * self.word_embedding(word)


class PositionalEmbedding(nn.Layer):
    """Position embedding initialised from ``position_encoding_init``.

    Args:
        emb_dim: Embedding width.
        max_length: Number of positions in the table.
    """

    def __init__(self, emb_dim, max_length):
        super().__init__()
        self.emb_dim = emb_dim

        # The table is filled with the values from position_encoding_init.
        initial_table = position_encoding_init(max_length, self.emb_dim)
        table_attr = paddle.ParamAttr(
            initializer=paddle.nn.initializer.Assign(initial_table))
        self.pos_encoder = nn.Embedding(
            num_embeddings=max_length,
            embedding_dim=self.emb_dim,
            weight_attr=table_attr,
        )

    def forward(self, pos):
        """Look up position ids; the result is marked ``stop_gradient``."""
        encoded = self.pos_encoder(pos)
        # No gradient flows back through the looked-up position encodings.
        encoded.stop_gradient = True
        return encoded


In [7]:
class TransformerModel(nn.Layer):
    """
    Transformer encoder-decoder that maps source/target token ids to logits
    over the target vocabulary.

    NOTE: the notebook's import cell also imports a ``TransformerModel`` from
    ``paddlenlp.transformers``; after this cell runs, the name refers to this
    local definition.

    Args:
        src_vocab_size (int):
            The size of source vocabulary.
        trg_vocab_size (int):
            The size of target vocabulary.
        max_length (int):
            The maximum length of input sequences.
        num_encoder_layers (int):
            The number of sub-layers to be stacked in the encoder.
        num_decoder_layers (int):
            The number of sub-layers to be stacked in the decoder.
        n_head (int):
            The number of head used in multi-head attention.
        d_model (int):
            The dimension for word embeddings, which is also the last dimension of
            the input and output of multi-head attention, position-wise feed-forward
            networks, encoder and decoder.
        d_inner_hid (int):
            Size of the hidden layer in position-wise feed-forward networks.
        dropout (float):
            Dropout rates. Used for pre-process, activation and inside attention.
        weight_sharing (bool):
            Whether to use weight sharing.
        attn_dropout (float):
            The dropout probability used in MHA to drop some attention target.
            If None, use the value of dropout. Defaults to None.
        act_dropout (float):
            The dropout probability used after FFN activation. If None, use
            the value of dropout. Defaults to None.
        bos_id (int, optional):
            The start token id and also be used as padding id. Defaults to 0.
        eos_id (int, optional):
            The end token id. Defaults to 1.
        pad_id (int, optional):
            The pad token id. Defaults to None. If it's None, the bos_id will be used as pad_id.
        activation (str, optional):
            The activation used in FFN. Defaults to "relu".
        normalize_before (bool, optional):
            Whether to apply pre-normalization. Defaults to True.
    """

    def __init__(
        self,
        src_vocab_size,
        trg_vocab_size,
        max_length,
        num_encoder_layers,
        num_decoder_layers,
        n_head,
        d_model,
        d_inner_hid,
        dropout,
        weight_sharing,
        attn_dropout=None,
        act_dropout=None,
        bos_id=0,
        eos_id=1,
        pad_id=None,
        activation="relu",
        normalize_before=True,
    ):
        super().__init__()
        self.trg_vocab_size = trg_vocab_size
        self.emb_dim = d_model
        self.bos_id = bos_id
        self.eos_id = eos_id
        # Fall back to the bos id when no dedicated pad id is given.
        self.pad_id = pad_id if pad_id is not None else self.bos_id
        self.dropout = dropout

        self.src_word_embedding = WordEmbedding(vocab_size=src_vocab_size, emb_dim=d_model, bos_id=self.pad_id)
        self.src_pos_embedding = PositionalEmbedding(emb_dim=d_model, max_length=max_length)
        if weight_sharing:
            assert (
                src_vocab_size == trg_vocab_size
            ), "Vocabularies in source and target should be same for weight sharing."
            # Source and target sides reuse the very same embedding layers.
            self.trg_word_embedding = self.src_word_embedding
            self.trg_pos_embedding = self.src_pos_embedding
        else:
            self.trg_word_embedding = WordEmbedding(vocab_size=trg_vocab_size, emb_dim=d_model, bos_id=self.pad_id)
            self.trg_pos_embedding = PositionalEmbedding(emb_dim=d_model, max_length=max_length)

        # For post-normalization, hand explicitly built encoder/decoder stacks
        # to paddle.nn.Transformer; otherwise let it build its own.
        if not normalize_before:
            encoder_layer = TransformerEncoderLayer(
                d_model=d_model,
                nhead=n_head,
                dim_feedforward=d_inner_hid,
                dropout=dropout,
                activation=activation,
                attn_dropout=attn_dropout,
                act_dropout=act_dropout,
                normalize_before=normalize_before,
            )
            encoder_with_post_norm = TransformerEncoder(encoder_layer, num_encoder_layers)

            decoder_layer = TransformerDecoderLayer(
                d_model=d_model,
                nhead=n_head,
                dim_feedforward=d_inner_hid,
                dropout=dropout,
                activation=activation,
                attn_dropout=attn_dropout,
                act_dropout=act_dropout,
                normalize_before=normalize_before,
            )
            decoder_with_post_norm = TransformerDecoder(decoder_layer, num_decoder_layers)

        self.transformer = paddle.nn.Transformer(
            d_model=d_model,
            nhead=n_head,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers,
            dim_feedforward=d_inner_hid,
            dropout=dropout,
            attn_dropout=attn_dropout,
            act_dropout=act_dropout,
            activation=activation,
            normalize_before=normalize_before,
            custom_encoder=None if normalize_before else encoder_with_post_norm,
            custom_decoder=None if normalize_before else decoder_with_post_norm,
        )

        if weight_sharing:
            # Output projection reuses the (transposed) target embedding table.
            self.linear = lambda x: paddle.matmul(
                x=x, y=self.trg_word_embedding.word_embedding.weight, transpose_y=True
            )
        else:
            self.linear = nn.Linear(in_features=d_model, out_features=trg_vocab_size, bias_attr=False)

    def forward(self, src_word, trg_word):
        """Compute target-vocabulary logits for a batch.

        Args:
            src_word: Source token ids; positions equal to ``pad_id`` are
                masked out of attention. Presumably shaped
                ``[batch, src_len]`` -- TODO confirm against the data loader.
            trg_word: Target (decoder input) token ids, presumably
                ``[batch, trg_len]`` -- TODO confirm.

        Returns:
            The output of the final projection applied to the decoder output.
        """
        src_max_len = paddle.shape(src_word)[-1]
        trg_max_len = paddle.shape(trg_word)[-1]
        # Additive bias: -1e4 on padded source positions, 0 elsewhere.
        src_slf_attn_bias = (
            paddle.cast(src_word == self.pad_id, dtype=paddle.get_default_dtype()).unsqueeze([1, 2]) * -1e4
        )
        src_slf_attn_bias.stop_gradient = True
        trg_slf_attn_bias = self.transformer.generate_square_subsequent_mask(trg_max_len)
        trg_slf_attn_bias.stop_gradient = True
        trg_src_attn_bias = src_slf_attn_bias
        # Position ids: 0..len-1, forced to 0 wherever the token is padding.
        src_pos = paddle.cast(src_word != self.pad_id, dtype=src_word.dtype) * paddle.arange(
            start=0, end=src_max_len, dtype=src_word.dtype
        )
        # Cast the target mask with the target dtype so both operands of the
        # product agree even when source and target ids use different dtypes.
        trg_pos = paddle.cast(trg_word != self.pad_id, dtype=trg_word.dtype) * paddle.arange(
            start=0, end=trg_max_len, dtype=trg_word.dtype
        )

        with paddle.static.amp.fp16_guard():
            src_emb = self.src_word_embedding(src_word)
            src_pos_emb = self.src_pos_embedding(src_pos)
            src_emb = src_emb + src_pos_emb
            enc_input = F.dropout(src_emb, p=self.dropout, training=self.training) if self.dropout else src_emb

            trg_emb = self.trg_word_embedding(trg_word)
            trg_pos_emb = self.trg_pos_embedding(trg_pos)
            trg_emb = trg_emb + trg_pos_emb
            dec_input = F.dropout(trg_emb, p=self.dropout, training=self.training) if self.dropout else trg_emb

            dec_output = self.transformer(
                enc_input,
                dec_input,
                src_mask=src_slf_attn_bias,
                tgt_mask=trg_slf_attn_bias,
                memory_mask=trg_src_attn_bias,
            )

            predict = self.linear(dec_output)

        return predict


查看模型

In [8]:
# Build a model instance with example hyper-parameters so its structure can be inspected.
model = TransformerModel(
    src_vocab_size=10000,
    trg_vocab_size=10000,
    max_length=256 + 1,
    num_encoder_layers=6,
    num_decoder_layers=6,
    n_head=8,
    d_model=512,
    d_inner_hid=2048,
    dropout=0.1,
    weight_sharing=False,
    bos_id=0,
    eos_id=1)
# Uncomment to print the layer structure:
# print(model)


W1229 08:10:04.669833 73320 gpu_resources.cc:61] Please NOTE: device: 0, GPU Compute Capability: 8.0, Driver API Version: 12.2, Runtime API Version: 11.7
W1229 08:10:04.677749 73320 gpu_resources.cc:91] device: 0, cuDNN Version: 8.5.


<center><img src="https://ai-studio-static-online.cdn.bcebos.com/fb181b57c2d347b884502d5d11d8c61e918ee803069d4e26bcf4c6533cf948c6" width="1000" height="500" ></center>


下载预训练模型

In [9]:
# Download the pretrained model.
!bash ./helper/get_data_and_model.sh

Over.


## 4、训练模型


In [10]:
# Visualization: VisualDL writer that logs under ./visualdl/transformer.
# `logwriter` is a notebook-level global that do_train() writes its scalars to.
from visualdl import LogWriter
logwriter = LogWriter(logdir='./visualdl/transformer')

In [11]:
def _save_checkpoint(model, optimizer, model_dir):
    """Save the model and optimizer state dicts into ``model_dir`` (created if missing)."""
    os.makedirs(model_dir, exist_ok=True)
    paddle.save(model.state_dict(),
                os.path.join(model_dir, "transformer.pdparams"))
    paddle.save(optimizer.state_dict(),
                os.path.join(model_dir, "transformer.pdopt"))


def do_train(args):
    """Train the Transformer, validating and checkpointing periodically.

    Args:
        args: Configuration object. Reads ``use_gpu``, ``random_seed``, the
            model hyper-parameters (``max_length``, ``n_layer``, ``n_head``,
            ``d_model``, ``d_inner_hid``, ``dropout``, ``weight_sharing``,
            ``bos_idx``, ``eos_idx``), the optimisation settings
            (``label_smooth_eps``, ``warmup_steps``, ``learning_rate``,
            ``beta1``, ``beta2``, ``eps``, ``epoch``), the reporting settings
            (``print_step``, ``save_step``, ``save_model``) and everything
            ``create_data_loader`` needs.

    Side effects:
        Logs through ``logger``, writes scalars to the notebook-level
        ``logwriter``, and saves checkpoints under ``args.save_model``
        (``step_<n>`` every ``save_step`` steps, plus ``step_final``).
    """
    import ast  # local import: only needed to parse the seed setting

    place = "gpu" if args.use_gpu else "cpu"
    paddle.set_device(place)

    # Set seed for CE. The setting may be a string such as 'None'; parse it as
    # a Python literal instead of eval() so configuration text is never executed.
    random_seed = ast.literal_eval(str(args.random_seed))
    if random_seed is not None:
        paddle.seed(random_seed)

    # Define data loader
    train_loader, eval_loader = create_data_loader(args)

    # Define model (used for training)
    transformer = TransformerModel(
        src_vocab_size=args.src_vocab_size,
        trg_vocab_size=args.trg_vocab_size,
        max_length=args.max_length + 1,
        num_encoder_layers=args.n_layer,
        num_decoder_layers=args.n_layer,
        n_head=args.n_head,
        d_model=args.d_model,
        d_inner_hid=args.d_inner_hid,
        dropout=args.dropout,
        weight_sharing=args.weight_sharing,
        bos_id=args.bos_idx,
        eos_id=args.eos_idx)

    # Define loss
    criterion = CrossEntropyCriterion(args.label_smooth_eps, args.bos_idx)

    scheduler = paddle.optimizer.lr.NoamDecay(
        args.d_model, args.warmup_steps, args.learning_rate, last_epoch=0)

    # Define optimizer
    optimizer = paddle.optimizer.Adam(
        learning_rate=scheduler,
        beta1=args.beta1,
        beta2=args.beta2,
        epsilon=float(args.eps),
        parameters=transformer.parameters())

    # Global step counter; it is never reset between epochs, so it is used
    # directly as the x-axis of every logged scalar.
    step_idx = 0

    # Train loop
    for pass_id in range(args.epoch):
        batch_id = 0
        for input_data in train_loader:
            (src_word, trg_word, lbl_word) = input_data

            logits = transformer(src_word=src_word, trg_word=trg_word)
            _, avg_cost, _ = criterion(logits, lbl_word)

            # Compute gradients, update the parameters, then clear gradients.
            avg_cost.backward()
            optimizer.step()
            optimizer.clear_grad()

            if (step_idx + 1) % args.print_step == 0 or step_idx == 0:
                # Convert to a Python float before formatting / logging.
                train_loss = avg_cost.numpy().item()
                # The exponent is capped at 100 before computing perplexity.
                train_ppl = float(np.exp(min(train_loss, 100)))
                logger.info(
                    "step_idx: %d, epoch: %d, batch: %d, avg loss: %f, "
                    " ppl: %f " %
                    (step_idx, pass_id, batch_id, train_loss, train_ppl))
                logwriter.add_scalar("train_loss", value=train_loss, step=step_idx)
                logwriter.add_scalar("train_perplexity", value=train_ppl, step=step_idx)

            if (step_idx + 1) % args.save_step == 0:
                # Validation
                transformer.eval()
                total_sum_cost = 0.0
                total_token_num = 0.0
                with paddle.no_grad():
                    for eval_batch in eval_loader:
                        (eval_src, eval_trg, eval_lbl) = eval_batch
                        eval_logits = transformer(
                            src_word=eval_src, trg_word=eval_trg)
                        eval_sum_cost, _, eval_token_num = criterion(
                            eval_logits, eval_lbl)
                        total_sum_cost += eval_sum_cost.numpy().item()
                        total_token_num += eval_token_num.numpy().item()
                transformer.train()

                # Average once, after the whole validation set has been seen;
                # skip reporting when there was nothing to evaluate.
                if total_token_num > 0:
                    valid_loss = total_sum_cost / total_token_num
                    valid_ppl = float(np.exp(min(valid_loss, 100)))
                    logger.info("validation, step_idx: %d, avg loss: %f, "
                                " ppl: %f" %
                                (step_idx, valid_loss, valid_ppl))
                    logwriter.add_scalar("valid_loss", value=valid_loss, step=step_idx)
                    logwriter.add_scalar("valid_perplexity", value=valid_ppl, step=step_idx)
                else:
                    logger.warning(
                        "validation, step_idx: %d, no tokens to evaluate" % step_idx)

                if args.save_model:
                    _save_checkpoint(
                        transformer, optimizer,
                        os.path.join(args.save_model, "step_" + str(step_idx)))

            batch_id += 1
            step_idx += 1
            scheduler.step()

    if args.save_model:
        _save_checkpoint(
            transformer, optimizer,
            os.path.join(args.save_model, "step_final"))

In [12]:
# Load the configuration / hyper-parameters from the YAML file.
yaml_file = './helper/transformer.base.yaml'
with open(yaml_file, 'rt') as f:
    # AttrDict lets the parsed keys be read as attributes (e.g. args.batch_size).
    args = AttrDict(yaml.safe_load(f))
    pprint(args)

{'batch_size': 50,
 'beam_size': 5,
 'beta1': 0.9,
 'beta2': 0.997,
 'bos_idx': 0,
 'd_inner_hid': 2048,
 'd_model': 512,
 'dropout': 0.1,
 'eos_idx': 1,
 'epoch': 5,
 'eps': '1e-9',
 'infer_batch_size': 50,
 'init_from_params': 'trained_models/CWMT2021_step_345000/',
 'label_smooth_eps': 0.1,
 'learning_rate': 2.0,
 'max_length': 256,
 'max_out_len': 256,
 'n_best': 1,
 'n_head': 8,
 'n_layer': 6,
 'output_file': './data/train_dev_test/predict.txt',
 'pad_factor': 8,
 'predict_file': './data/train_dev_test/ccmt2019-news.zh2en.source_cut.txt',
 'print_step': 10,
 'random_seed': 'None',
 'save_model': 'model/transformer',
 'save_step': 20,
 'special_token': ['<s>', '<e>', '<unk>'],
 'src_vocab_fpath': './data/train_dev_test/vocab.ch.src',
 'src_vocab_size': 10000,
 'training_file': './data/train_dev_test/train.ch.bpe,./data/train_dev_test/train.en.bpe',
 'trg_vocab_fpath': './data/train_dev_test/vocab.en.tgt',
 'trg_vocab_size': 10000,
 'unk_idx': 2,
 'use_gpu': True,
 'validation_file'

In [13]:
# Train the model with the configuration loaded from the YAML file.
do_train(args)

[32m[2023-12-29 08:10:08,575] [    INFO][0m - step_idx: 0, epoch: 0, batch: 0, avg loss: 10.515485,  ppl: 36882.218750 [0m
[32m[2023-12-29 08:10:09,026] [    INFO][0m - step_idx: 9, epoch: 0, batch: 9, avg loss: 10.504098,  ppl: 36464.628906 [0m
[32m[2023-12-29 08:10:09,455] [    INFO][0m - step_idx: 19, epoch: 0, batch: 19, avg loss: 10.464189,  ppl: 35038.000000 [0m
[32m[2023-12-29 08:10:09,502] [    INFO][0m - validation, step_idx: 19, avg loss: 10.472076,  ppl: 35315.468750[0m
[32m[2023-12-29 08:10:13,117] [    INFO][0m - step_idx: 29, epoch: 1, batch: 9, avg loss: 10.407406,  ppl: 33103.882812 [0m
[32m[2023-12-29 08:10:13,609] [    INFO][0m - step_idx: 39, epoch: 1, batch: 19, avg loss: 10.341732,  ppl: 30999.675781 [0m
[32m[2023-12-29 08:10:13,658] [    INFO][0m - validation, step_idx: 39, avg loss: 10.389604,  ppl: 32519.775391[0m
[32m[2023-12-29 08:10:16,296] [    INFO][0m - step_idx: 49, epoch: 2, batch: 9, avg loss: 10.278871,  ppl: 29110.976562 [0m
[

## 5、预测和评估

模型预测

In [14]:
def do_predict(args):
    """Translate ``args.predict_file`` with beam search and write the results.

    Args:
        args: Configuration object. Reads ``use_gpu``, the model
            hyper-parameters (``max_length``, ``n_layer``, ``n_head``,
            ``d_model``, ``d_inner_hid``, ``dropout``, ``weight_sharing``,
            ``bos_idx``, ``eos_idx``, ``beam_size``, ``max_out_len``),
            ``init_from_params`` (directory holding ``transformer.pdparams``),
            ``n_best``, ``output_file`` and everything
            ``create_infer_loader`` needs.

    Raises:
        AssertionError: If ``args.init_from_params`` is empty.

    Side effects:
        Writes the space-joined tokens of each kept hypothesis, one per line,
        to ``args.output_file`` (UTF-8).
    """
    place = "gpu" if args.use_gpu else "cpu"
    paddle.set_device(place)

    # Define data loader
    test_loader, to_tokens = create_infer_loader(args)

    # Define model (used for generation)
    transformer = InferTransformerModel(
        src_vocab_size=args.src_vocab_size,
        trg_vocab_size=args.trg_vocab_size,
        max_length=args.max_length + 1,
        num_encoder_layers=args.n_layer,
        num_decoder_layers=args.n_layer,
        n_head=args.n_head,
        d_model=args.d_model,
        d_inner_hid=args.d_inner_hid,
        dropout=args.dropout,
        weight_sharing=args.weight_sharing,
        bos_id=args.bos_idx,
        eos_id=args.eos_idx,
        beam_size=args.beam_size,
        max_out_len=args.max_out_len)

    # Load the trained model
    assert args.init_from_params, (
        "Please set init_from_params to load the infer model.")

    model_dict = paddle.load(
        os.path.join(args.init_from_params, "transformer.pdparams"))

    # To avoid a longer length than training, reset the size of position
    # encoding to max_length
    model_dict["encoder.pos_encoder.weight"] = position_encoding_init(
        args.max_length + 1, args.d_model)
    model_dict["decoder.pos_encoder.weight"] = position_encoding_init(
        args.max_length + 1, args.d_model)
    transformer.load_dict(model_dict)

    # Set evaluate mode
    transformer.eval()

    # The context manager closes the file even if decoding raises; UTF-8 is
    # set explicitly so the output does not depend on the platform default
    # (the input files are read as UTF-8 as well).
    with open(args.output_file, "w", encoding="utf-8") as f, paddle.no_grad():
        for (src_word, ) in test_loader:
            finished_seq = transformer(src_word=src_word)
            # Swap the last two axes so that iterating one instance yields one
            # beam at a time (presumably (batch, seq, beam) -> (batch, beam, seq)
            # -- TODO confirm against InferTransformerModel's output layout).
            finished_seq = finished_seq.numpy().transpose([0, 2, 1])
            for ins in finished_seq:
                for beam_idx, beam in enumerate(ins):
                    # Keep only the first n_best beams of every instance.
                    if beam_idx >= args.n_best:
                        break
                    id_list = post_process_seq(beam, args.bos_idx, args.eos_idx)
                    word_list = to_tokens(id_list)
                    sequence = " ".join(word_list) + "\n"
                    f.write(sequence)

In [15]:
# Translate the test set and write the hypotheses to args.output_file.
do_predict(args)

模型评估

> 预测结果的每一行是对应输入行得分最高的翻译。由于数据经过了 BPE 切分，预测出的译文同样是 BPE 形式，必须先还原成原始文本（这里指 tokenize 后的文本），才能进行正确的评估。

In [16]:
# Undo BPE in predict.txt (strip the "@@ " joiners) to recover the tokenized text.
! sed -r 's/(@@ )|(@@ ?$)//g' ./data/train_dev_test/predict.txt > ./data/train_dev_test/predict.tok.txt
# The BLEU script comes from https://github.com/moses-smt/mosesdecoder.git
# Compute multi-bleu against the reference translations.
! perl ./helper/mosesdecoder/scripts/generic/multi-bleu.perl ./data/train_dev_test/ccmt2019-news.zh2en.ref*.txt < ./data/train_dev_test/predict.tok.txt

BLEU = 28.73, 65.5/39.6/25.1/16.0 (BP=0.898, ratio=0.903, hyp_len=20687, ref_len=22902)
It is not advisable to publish scores from multi-bleu.perl.  The scores depend on your tokenizer, which is unlikely to be reproducible from your paper or consistent across research groups.  Instead you should detokenize then use mteval-v14.pl, which has a standard tokenization.  Scores from multi-bleu.perl can still be used for internal purposes when you have a consistent tokenizer.
