In [7]:
import torch
import spacy
 
# from torchtext.data import Field,BucketIterator
# from torchtext.datasets import Multi30k

from functools import partial
from torch.utils.data import DataLoader, Dataset
import numpy as np

In [6]:
# de_seq=spacy.load("de_core_news_sm")
# en_seq=spacy.load("en_core_web_sm")
 
# def de_tokenizer(text):
#     return [word.text for word in de_seq.tokenizer(text)]
 
# def en_tokenizer(text):
#     return [word.text for word in en_seq.tokenizer(text)]

# SRC=Field(tokenize=de_tokenizer,
#          init_token="<sos>",
#          eos_token="<eos>",
#          lower=True,
#          batch_first=True)
 
# TRG=Field(tokenize=en_tokenizer,
#          init_token="<sos>",
#          eos_token="<eos>",
#          lower=True,
#          batch_first=True)

# 自定义读取本地数据的方法
def read(src_path, tgt_path, is_predict=False):
    """Yield translation samples read from local plain-text files.

    Args:
        src_path: Path of the UTF-8 source-language file, one sentence per line.
        tgt_path: Path of the UTF-8 target-language file, line-aligned with
            ``src_path``. It is not opened when ``is_predict`` is true.
        is_predict: When true, only the source file is read and every sample
            carries an empty ``'tgt'`` string.

    Yields:
        dict: ``{'src': <stripped source line>, 'tgt': <stripped target line>}``.
        Lines that are empty after stripping are skipped; in training mode a
        pair is skipped as soon as either side is empty.
    """
    if is_predict:
        with open(src_path, 'r', encoding='utf8') as src_f:
            # Iterate over the file object so lines are streamed instead of
            # materialising the whole file with readlines().
            for src_line in src_f:
                src_line = src_line.strip()
                if src_line:
                    yield {'src': src_line, 'tgt': ''}
        return

    with open(src_path, 'r', encoding='utf8') as src_f, \
            open(tgt_path, 'r', encoding='utf8') as tgt_f:
        # zip() pairs the files line by line and stops at the shorter one.
        for src_line, tgt_line in zip(src_f, tgt_f):
            src_line = src_line.strip()
            tgt_line = tgt_line.strip()
            if src_line and tgt_line:
                yield {'src': src_line, 'tgt': tgt_line}
# Keep only samples whose source and target lengths (plus one special token) fall within [min_len, max_len].
def min_max_filer(data, max_len, min_len=0):
    """Tell whether a (source, target) pair fits the allowed length range.

    One is added to both lengths to account for a special token.

    Args:
        data: Indexable whose items 0 and 1 are the source and target sequences.
        max_len: Largest allowed length (inclusive) for the longer sequence.
        min_len: Smallest allowed length (inclusive) for the shorter sequence.

    Returns:
        bool: True when the sample should be kept.
    """
    src_len, tgt_len = len(data[0]), len(data[1])
    shortest = min(src_len, tgt_len) + 1  # +1 for the special token
    longest = max(src_len, tgt_len) + 1
    if shortest < min_len:
        return False
    return longest <= max_len


# 创建训练集、验证集的dataloader
def create_data_loader(args):
    """Build the data loaders for the training and validation sets.

    ``args.training_file`` and ``args.validation_file`` are each read as a
    ``"src_path,tgt_path"`` string. Both vocabularies are loaded, every sample
    is converted to a pair of index lists, samples rejected by
    ``min_max_filer`` are dropped, and each dataset is wrapped in a shuffled
    batch sampler and a ``DataLoader`` that collates with
    ``prepare_train_input``.

    Side effects:
        Sets ``args.src_vocab_size`` and ``args.trg_vocab_size`` to the
        vocabulary sizes rounded up to a multiple of ``args.pad_factor``.

    Returns:
        list: ``[train_loader, dev_loader]``.

    NOTE(review): ``load_dataset``, ``Vocab`` and ``BatchSampler`` are not
    imported in the visible part of this file, and ``return_list`` is passed
    to ``DataLoader`` although the visible import is
    ``torch.utils.data.DataLoader`` -- confirm which framework's loader is
    intended.
    """
    train_paths = args.training_file.split(',')
    train_dataset = load_dataset(
        read, src_path=train_paths[0], tgt_path=train_paths[1], lazy=False)
    dev_paths = args.validation_file.split(',')
    dev_dataset = load_dataset(
        read, src_path=dev_paths[0], tgt_path=dev_paths[1], lazy=False)

    # Both vocabularies share the same special tokens.
    special_tokens = dict(
        bos_token=args.special_token[0],
        eos_token=args.special_token[1],
        unk_token=args.special_token[2])
    src_vocab = Vocab.load_vocabulary(args.src_vocab_fpath, **special_tokens)
    trg_vocab = Vocab.load_vocabulary(args.trg_vocab_fpath, **special_tokens)

    def round_up_to_pad_factor(size):
        # Smallest multiple of args.pad_factor that is >= size.
        return (size + args.pad_factor - 1) // args.pad_factor * args.pad_factor

    args.src_vocab_size = round_up_to_pad_factor(len(src_vocab))
    args.trg_vocab_size = round_up_to_pad_factor(len(trg_vocab))

    def to_index_pair(sample):
        # Whitespace-tokenise both sides, then map tokens to vocabulary ids.
        src_ids = src_vocab.to_indices(sample['src'].split())
        tgt_ids = trg_vocab.to_indices(sample['tgt'].split())
        return src_ids, tgt_ids

    keep_sample = partial(min_max_filer, max_len=args.max_length)
    # The start-of-sequence index is also used as the padding index.
    collate = partial(
        prepare_train_input,
        bos_idx=args.bos_idx,
        eos_idx=args.eos_idx,
        pad_idx=args.bos_idx)

    # One loader for the training set, one for the validation set.
    loaders = []
    for split in (train_dataset, dev_dataset):
        prepared = split.map(to_index_pair, lazy=False).filter(keep_sample)

        # BatchSampler: https://www.paddlepaddle.org.cn/documentation/docs/zh/api/paddle/io/BatchSampler_cn.html
        sampler = BatchSampler(
            prepared,
            batch_size=args.batch_size,
            shuffle=True,
            drop_last=False)

        # DataLoader: https://www.paddlepaddle.org.cn/documentation/docs/zh/api/paddle/io/DataLoader_cn.html
        loaders.append(
            DataLoader(
                dataset=prepared,
                batch_sampler=sampler,
                collate_fn=collate,
                num_workers=0,
                return_list=True))

    return loaders


def prepare_train_input(insts, bos_idx, eos_idx, pad_idx):
    """Collate (source, target) index lists into padded training inputs.

    Args:
        insts: Sequence of pairs; item 0 is the source id list and item 1 the
            target id list.
        bos_idx: Id prepended to every target to form the decoder input.
        eos_idx: Id appended to every source and to every label sequence.
        pad_idx: Id handed to ``Pad`` for padding.

    Returns:
        list: ``[src_word, trg_word, lbl_word]`` where ``lbl_word`` has an
        extra axis inserted at position 2.

    NOTE(review): ``Pad`` is not imported in the visible part of this file;
    it is presumably a batch-padding helper -- confirm its origin.
    """
    pad_batch = Pad(pad_idx)

    encoder_inputs = [pair[0] + [eos_idx] for pair in insts]
    decoder_inputs = [[bos_idx] + pair[1] for pair in insts]
    labels = [pair[1] + [eos_idx] for pair in insts]

    src_word = pad_batch(encoder_inputs)
    trg_word = pad_batch(decoder_inputs)
    lbl_word = np.expand_dims(pad_batch(labels), axis=2)

    return [src_word, trg_word, lbl_word]


In [None]:
# 创建测试集的dataloader，原理步骤同上（创建训练集、验证集的dataloader）
def create_infer_loader(args):
    """Build the data loader used for inference on ``args.predict_file``.

    Mirrors the training/validation loader construction, except that the
    dataset is read in prediction mode (no target file), no length filtering
    is applied, the batch sampler uses ``args.infer_batch_size`` without
    shuffling arguments, and batches are collated by ``prepare_infer_input``.

    Side effects:
        Sets ``args.src_vocab_size`` and ``args.trg_vocab_size`` to the
        vocabulary sizes rounded up to a multiple of ``args.pad_factor``.

    Returns:
        tuple: ``(data_loader, trg_vocab.to_tokens)`` -- the loader and the
        target vocabulary's id-to-token conversion method.

    NOTE(review): ``load_dataset``, ``Vocab`` and ``BatchSampler`` are not
    imported in the visible part of this file, and ``return_list`` is passed
    to ``DataLoader`` although the visible import is
    ``torch.utils.data.DataLoader`` -- confirm which framework's loader is
    intended.
    """
    dataset = load_dataset(
        read,
        src_path=args.predict_file,
        tgt_path=None,
        is_predict=True,
        lazy=False)

    # Both vocabularies share the same special tokens.
    special_tokens = dict(
        bos_token=args.special_token[0],
        eos_token=args.special_token[1],
        unk_token=args.special_token[2])
    src_vocab = Vocab.load_vocabulary(args.src_vocab_fpath, **special_tokens)
    trg_vocab = Vocab.load_vocabulary(args.trg_vocab_fpath, **special_tokens)

    def round_up_to_pad_factor(size):
        # Smallest multiple of args.pad_factor that is >= size.
        return (size + args.pad_factor - 1) // args.pad_factor * args.pad_factor

    args.src_vocab_size = round_up_to_pad_factor(len(src_vocab))
    args.trg_vocab_size = round_up_to_pad_factor(len(trg_vocab))

    def to_index_pair(sample):
        # Whitespace-tokenise both sides, then map tokens to vocabulary ids.
        src_ids = src_vocab.to_indices(sample['src'].split())
        tgt_ids = trg_vocab.to_indices(sample['tgt'].split())
        return src_ids, tgt_ids

    dataset = dataset.map(to_index_pair, lazy=False)

    # BatchSampler: https://www.paddlepaddle.org.cn/documentation/docs/zh/api/paddle/io/BatchSampler_cn.html
    sampler = BatchSampler(
        dataset,
        batch_size=args.infer_batch_size,
        drop_last=False)

    # The start-of-sequence index is also used as the padding index.
    collate = partial(
        prepare_infer_input,
        bos_idx=args.bos_idx,
        eos_idx=args.eos_idx,
        pad_idx=args.bos_idx)

    # DataLoader: https://www.paddlepaddle.org.cn/documentation/docs/zh/api/paddle/io/DataLoader_cn.html
    data_loader = DataLoader(
        dataset=dataset,
        batch_sampler=sampler,
        collate_fn=collate,
        num_workers=0,
        return_list=True)
    return data_loader, trg_vocab.to_tokens

def prepare_infer_input(insts, bos_idx, eos_idx, pad_idx):
    """Collate source index lists into the padded input used at inference.

    Args:
        insts: Sequence of samples; only item 0 (the source id list) is used.
        bos_idx: Unused here; kept so the signature matches the training
            collate function.
        eos_idx: Id appended to every source sequence.
        pad_idx: Id handed to ``Pad`` for padding.

    Returns:
        list: A one-element list holding the padded source batch.

    NOTE(review): ``Pad`` is not imported in the visible part of this file;
    it is presumably a batch-padding helper -- confirm its origin.
    """
    pad_batch = Pad(pad_idx)
    sources = [sample[0] + [eos_idx] for sample in insts]
    return [pad_batch(sources)]

In [None]:
# Load the Multi30k German (.de) / English (.en) splits and build the source
# and target vocabularies from the training split with min_freq=2.
# NOTE(review): Multi30k, BucketIterator, SRC and TRG come from the torchtext
# imports and Field definitions that are commented out earlier in this file;
# they must be restored (or replaced) before this cell can run.
train_data,val_data,test_data=Multi30k.splits(exts=(".de",".en"),
                                             fields=(SRC,TRG))
SRC.build_vocab(train_data,min_freq=2)
TRG.build_vocab(train_data,min_freq=2)

# Batch size shared by the three iterators below.
batch=128
# Computation is pinned to the CPU; the CUDA selection is left commented out.
device=torch.device("cpu")#"cuda" if torch.cuda.is_available() else 
 
# One batch iterator per split, all placed on `device`.
train_iter,val_iter,test_iter=BucketIterator.splits(
    (train_data,val_data,test_data),
    device=device,
    batch_size=batch
)

In [None]:
import torch.nn as nn
import torch.nn.functional as F

class Encoder(nn.Module):
    """Convolutional sequence-to-sequence encoder.

    Token and learned position embeddings are summed, projected to the
    convolution width, passed through ``n_layers`` residual GLU convolution
    blocks and projected back to the embedding width.

    Args:
        src_vocab_size: Size of the source vocabulary.
        emb_size: Embedding dimension.
        hid_size: Channel width of the convolution blocks.
        kernel_size: Convolution kernel width. Must be odd so that the
            symmetric padding keeps the sequence length unchanged.
        n_layers: Number of convolution blocks.
        dropout: Dropout probability.
        max_len: Number of position embeddings; longer inputs cannot be
            embedded.

    Raises:
        ValueError: If ``kernel_size`` is even.
    """

    def __init__(self, src_vocab_size, emb_size, hid_size, kernel_size, n_layers, dropout=0.25, max_len=100):
        super().__init__()
        if kernel_size % 2 == 0:
            # With padding=(kernel_size-1)//2 an even kernel shortens the
            # sequence by one, which breaks the residual additions.
            raise ValueError("kernel_size must be odd, got %r" % (kernel_size,))

        self.token_emb = nn.Embedding(src_vocab_size, emb_size)
        self.pos_emb = nn.Embedding(max_len, emb_size)

        self.emb2hid = nn.Linear(emb_size, hid_size)
        self.hid2emb = nn.Linear(hid_size, emb_size)

        # Each block outputs 2*hid_size channels because GLU halves them.
        self.convs = nn.ModuleList([
            nn.Conv1d(in_channels=hid_size,
                      out_channels=hid_size * 2,
                      kernel_size=kernel_size,
                      padding=(kernel_size - 1) // 2)
            for _ in range(n_layers)
        ])
        self.dropout = nn.Dropout(dropout)
        # sqrt(0.5) rescales the sum of two terms (an averaging step).
        # Registered as a non-persistent buffer so it follows the module
        # across devices without depending on a global `device` and without
        # adding a key to the state dict.
        self.register_buffer("scale", torch.sqrt(torch.tensor([0.5])), persistent=False)

    def forward(self, src):
        """Encode a batch of source token ids.

        Args:
            src: Integer tensor of shape [batch, src_len].

        Returns:
            tuple: ``(conved, combined)``, both [batch, src_len, emb_size].
            ``conved`` is the convolution output (attention keys) and
            ``combined`` is ``conved`` plus the input embedding, rescaled
            (attention values).
        """
        batch_size, src_len = src.shape[0], src.shape[1]

        # Position ids 0..src_len-1 for every row, created on the input's device.
        pos = torch.arange(src_len, device=src.device).unsqueeze(0).repeat(batch_size, 1)
        # pos: [batch, src_len]

        # Token embedding plus position embedding.
        src_pos_embed = self.dropout(self.token_emb(src) + self.pos_emb(pos))
        # src_pos_embed: [batch, src_len, emb_size]

        # Conv1d expects [batch, channels, seq_len].
        conv_input = self.emb2hid(src_pos_embed).permute(0, 2, 1)
        # conv_input: [batch, hid_size, src_len]

        for conv in self.convs:
            conved = conv(self.dropout(conv_input))
            # conved: [batch, 2*hid_size, src_len]
            conved = F.glu(conved, dim=1)
            # conved: [batch, hid_size, src_len]
            # Residual connection; the result feeds the next block.
            conv_input = (conved + conv_input) * self.scale

        # With zero blocks the projected embedding passes straight through.
        conved = self.hid2emb(conv_input.permute(0, 2, 1))
        # conved: [batch, src_len, emb_size]

        # Residual connection with the input embedding.
        combined = (conved + src_pos_embed) * self.scale

        return conved, combined

In [None]:
# Vocabulary sizes taken from the torchtext fields.
src_vocab_size=len(SRC.vocab)
trg_vocab_size=len(TRG.vocab)
 
# Model hyper-parameters.
emb_size=256
hid_size=512
kernel_size=3
n_layers=10

# `src` (and later `trg`) were used without ever being defined; take one
# training batch for the shape smoke tests. The batch attributes match the
# ones the training loop reads (`example.src` / `example.trg`).
example=next(iter(train_iter))
src=example.src
trg=example.trg

# Build the encoder and check the shapes of its two outputs.
enModel=Encoder(src_vocab_size,emb_size,hid_size,kernel_size,n_layers).to(device)
conved,combined=enModel(src)
print(conved.shape,combined.shape)

In [None]:
class Attention(nn.Module):
    """Dot-product attention between decoder states and encoder outputs.

    The decoder convolution output is projected to the embedding width and
    summed with the decoder embedding to form the query. The query is matched
    against the encoder's ``conved`` output (keys), the resulting weights
    average the encoder's ``combined`` output (values), and the context is
    projected back to the hidden width and added residually to the decoder
    convolution output.

    The encoder and decoder are assumed to share the same embedding and hidden
    dimensions.

    Args:
        emb_size: Embedding dimension.
        hid_size: Hidden (convolution channel) dimension.
    """

    def __init__(self, emb_size, hid_size):
        super().__init__()
        self.emb2hid = nn.Linear(emb_size, hid_size)
        self.hid2emb = nn.Linear(hid_size, emb_size)
        # Non-persistent buffer: follows the module across devices without
        # relying on a global `device` and without changing the state dict.
        self.register_buffer("scale", torch.sqrt(torch.tensor([0.5])), persistent=False)

    def forward(self, dec_conved, embedd, en_conved, en_combined):
        """Attend over the encoder outputs.

        Args:
            dec_conved: [batch, hid_size, trg_len] decoder convolution output.
            embedd: [batch, trg_len, emb_size] decoder input embedding.
            en_conved: [batch, src_len, emb_size] encoder keys.
            en_combined: [batch, src_len, emb_size] encoder values.

        Returns:
            tuple: ``(attended, attention)`` where ``attended`` is
            [batch, hid_size, trg_len] and ``attention`` is
            [batch, trg_len, src_len] with rows summing to one.
        """
        dec_conved = dec_conved.permute(0, 2, 1)
        # dec_conved: [batch, trg_len, hid_size]

        # Project to the embedding width and add the embedding to get the query.
        query = (self.hid2emb(dec_conved) + embedd) * self.scale
        # query: [batch, trg_len, emb_size]

        # Similarity of every target position with every source position.
        energy = torch.matmul(query, en_conved.permute(0, 2, 1))
        # energy: [batch, trg_len, src_len]
        attention = F.softmax(energy, dim=2)

        # Weighted sum of the values, projected back to the hidden width.
        context = self.emb2hid(torch.matmul(attention, en_combined))
        # context: [batch, trg_len, hid_size]

        # Residual connection with the attention layer's input.
        attended = (context + dec_conved) * self.scale
        return attended.permute(0, 2, 1), attention

In [None]:
attModel=Attention(emb_size,hid_size).to(device)
# Fabricate a decoder convolution output for the smoke test. The batch size
# is taken from the encoder output instead of being hard-coded to 128 so the
# shapes agree whatever batch the encoder was run on.
dec_conved=torch.randn(conved.shape[0],hid_size,26).to(device)
# Fabricate a target-side embedding (token + position information).
embedded=torch.randn(conved.shape[0],26,emb_size).to(device)
dec_conved,a=attModel(dec_conved,embedded,conved,combined)
print(dec_conved.shape,a.shape)

In [None]:
class Decoder(nn.Module):
    """Convolutional sequence-to-sequence decoder with attention.

    Target embeddings pass through ``n_layers`` causal GLU convolution blocks;
    after each block the attention module mixes in encoder information and a
    residual connection is applied. The final states are projected to
    vocabulary logits.

    Args:
        trg_vocab_size: Size of the target vocabulary.
        emb_size: Embedding dimension.
        hid_size: Channel width of the convolution blocks.
        kernel_size: Convolution kernel width; ``kernel_size - 1`` padding
            steps are prepended to the sequence in every block.
        n_layers: Number of convolution blocks.
        attnModel: Callable invoked as
            ``attnModel(conved, embedd, en_conved, en_combined)`` returning
            ``(conved, attention)``.
        dropout: Dropout probability.
        max_len: Number of position embeddings; longer targets cannot be
            embedded.
    """

    def __init__(self, trg_vocab_size, emb_size, hid_size, kernel_size, n_layers, attnModel, dropout=0.25, max_len=50):
        super().__init__()
        self.attnModel = attnModel
        # Needed in forward() to build the kernel_size-1 left padding.
        self.kernel_size = kernel_size

        self.token_embed = nn.Embedding(trg_vocab_size, emb_size)
        self.pos_embed = nn.Embedding(max_len, emb_size)

        self.emb2hid = nn.Linear(emb_size, hid_size)
        self.hid2emb = nn.Linear(hid_size, emb_size)

        self.fc = nn.Linear(emb_size, trg_vocab_size)

        # Non-persistent buffer: follows the module across devices without
        # relying on a global `device` and without changing the state dict.
        self.register_buffer("scale", torch.sqrt(torch.tensor([0.5])), persistent=False)

        # No built-in padding: the sequence is padded on the left by hand.
        self.convs = nn.ModuleList([
            nn.Conv1d(in_channels=hid_size,
                      out_channels=2 * hid_size,
                      kernel_size=kernel_size)
            for _ in range(n_layers)])
        self.dropout = nn.Dropout(dropout)

    def forward(self, trg, en_conved, en_combined):
        """Decode a batch of target token ids.

        Args:
            trg: Integer tensor of shape [batch, trg_len].
            en_conved: [batch, src_len, emb_size] encoder keys.
            en_combined: [batch, src_len, emb_size] encoder values.

        Returns:
            tuple: ``(output, attention)`` where ``output`` is
            [batch, trg_len, trg_vocab_size] and ``attention`` is the
            attention returned by the last block (``None`` when the decoder
            has no blocks).
        """
        batch_size, trg_len = trg.shape[0], trg.shape[1]

        # Position ids 0..trg_len-1 for every row, created on the input's device.
        pos = torch.arange(trg_len, device=trg.device).unsqueeze(0).repeat(batch_size, 1)
        # pos: [batch, trg_len]

        # Token embedding plus position embedding.
        embedd = self.dropout(self.token_embed(trg) + self.pos_embed(pos))
        # embedd: [batch, trg_len, emb_size]

        # Project to the hidden width and move channels first for Conv1d.
        input_conv = self.emb2hid(embedd).permute(0, 2, 1)
        # input_conv: [batch, hid_size, trg_len]
        hid_size = input_conv.shape[1]

        # Defaults so a decoder with zero blocks still returns something.
        conved, attention = input_conv, None
        for conv in self.convs:
            input_conv = self.dropout(input_conv)
            # Prepend kernel_size-1 steps (filled with ones) so that position t
            # only sees positions <= t and the answer cannot leak.
            padding = input_conv.new_ones(batch_size, hid_size, self.kernel_size - 1)
            pad_input_conv = torch.cat((padding, input_conv), dim=2)
            # pad_input_conv: [batch, hid_size, trg_len + kernel_size - 1]

            conved = F.glu(conv(pad_input_conv), dim=1)
            # conved: [batch, hid_size, trg_len]
            conved, attention = self.attnModel(conved, embedd, en_conved, en_combined)
            # conved: [batch, hid_size, trg_len], attention: [batch, trg_len, src_len]

            # Residual connection; the result feeds the next block.
            conved = (conved + input_conv) * self.scale
            input_conv = conved

        # Back to the embedding width, then onto the target vocabulary.
        output = self.hid2emb(conved.permute(0, 2, 1))
        # output: [batch, trg_len, emb_size]
        output = self.fc(self.dropout(output))
        return output, attention

In [None]:
class Seq2Seq(nn.Module):
    """Glue module that runs the encoder and feeds its outputs to the decoder.

    Args:
        encoder: Callable mapping ``src`` to a pair
            ``(conved, combined)``.
        decoder: Callable invoked as ``decoder(trg, conved, combined)`` and
            returning a pair ``(output, attention)``.
    """

    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder, self.decoder = encoder, decoder

    def forward(self, src, trg):
        """Return ``(output, attention)`` produced by the decoder for ``trg``."""
        keys, values = self.encoder(src)
        logits, attention = self.decoder(trg, keys, values)
        return logits, attention

# `deModel` was referenced without ever being created: build the decoder
# around the attention module defined above.
deModel=Decoder(trg_vocab_size,emb_size,hid_size,kernel_size,n_layers,attModel).to(device)
model=Seq2Seq(enModel,deModel).to(device)

# Take one training batch for the end-to-end shape check; the batch attributes
# match the ones the training loop reads (`example.src` / `example.trg`).
example=next(iter(train_iter))
src=example.src
trg=example.trg
output,a=model(src,trg)
print(output.shape)

In [None]:
import math,time
from torch.optim import Adam

def epoch_time(start_time, end_time):
    """Split the elapsed time between two timestamps into minutes and seconds.

    Args:
        start_time: Start timestamp in seconds.
        end_time: End timestamp in seconds.

    Returns:
        tuple: ``(minutes, seconds)`` as integers, both truncated toward zero.
    """
    elapsed = end_time - start_time
    minutes = int(elapsed / 60)
    seconds = int(elapsed - 60 * minutes)
    return minutes, seconds

# Training configuration.
# Number of passes over the training data.
epochs=10
# Maximum gradient norm handed to clip_grad_norm_ in train().
clip=0.1
# Targets equal to 1 are excluded from the loss.
# NOTE(review): 1 is presumably the <pad> index of the target vocabulary -- confirm.
criterion=nn.CrossEntropyLoss(ignore_index=1)
# Adam with its default hyper-parameters over all model parameters.
optim=Adam(model.parameters())

def train(model,data_iter,criterion,optim,clip):
    """Run one training epoch and return the mean loss per batch.

    Args:
        model: Callable module invoked as ``model(src, trg_input)`` and
            returning ``(logits, attention)`` with logits of shape
            [batch, trg_len-1, vocab].
        data_iter: Sized iterable of batches exposing ``.src`` and ``.trg``
            tensors of shape [batch, len].
        criterion: Loss taking ``(logits_2d, targets_1d)``.
        optim: Optimizer over the model parameters.
        clip: Maximum gradient norm applied before each optimizer step.

    Returns:
        float: Sum of the batch losses divided by ``len(data_iter)``.
    """
    model.train()
    total_loss = 0.0
    for example in data_iter:
        src, trg = example.src, example.trg

        optim.zero_grad()
        # Teacher forcing: feed the target without its last token...
        output, _ = model(src, trg[:, :-1])
        # output: [batch, trg_len-1, vocab]
        # Flatten using the logits' own last dimension rather than a
        # module-level vocabulary-size global.
        output = output.reshape(-1, output.shape[-1])
        # ...and predict the target shifted left by one.
        target = trg[:, 1:].reshape(-1)
        # output: [batch*(trg_len-1), vocab], target: [batch*(trg_len-1)]
        loss = criterion(output, target)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optim.step()

        total_loss += loss.item()
    return total_loss / len(data_iter)

def evaluate(model,data_iter,criterion):
    """Compute the mean loss per batch without updating the model.

    Args:
        model: Callable module invoked as ``model(src, trg_input)`` and
            returning ``(logits, attention)`` with logits of shape
            [batch, trg_len-1, vocab].
        data_iter: Sized iterable of batches exposing ``.src`` and ``.trg``
            tensors of shape [batch, len].
        criterion: Loss taking ``(logits_2d, targets_1d)``.

    Returns:
        float: Sum of the batch losses divided by ``len(data_iter)``.
    """
    model.eval()
    total_loss = 0.0

    with torch.no_grad():
        for example in data_iter:
            src, trg = example.src, example.trg

            # Feed the target without its last token...
            output, _ = model(src, trg[:, :-1])
            # output: [batch, trg_len-1, vocab]
            # Flatten using the logits' own last dimension rather than a
            # module-level vocabulary-size global.
            output = output.reshape(-1, output.shape[-1])
            # ...and score against the target shifted left by one.
            target = trg[:, 1:].reshape(-1)
            # output: [batch*(trg_len-1), vocab], target: [batch*(trg_len-1)]
            total_loss += criterion(output, target).item()
    return total_loss / len(data_iter)

