In [None]:
import torch
import torch.nn as nn
import math
import torch.nn.functional as F
from typing import Optional, Tuple
from torch import Tensor
from torch.autograd import Variable
import copy
import jieba
import torch.nn.functional as F


Past = Tuple[Tensor, Tensor]

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

class FeedForward(nn.Module):

    def __init__(self, d_model, middle_dim = 2048):
        super(FeedForward, self).__init__()
        
        self.fc1 = nn.Linear(d_model, middle_dim)
        self.fc2 = nn.Linear(middle_dim, d_model)
        self.dropout = nn.Dropout(0.1)

    def forward(self, x):
        out = F.relu(self.fc1(x))
        out = self.fc2(self.dropout(out))
        return out



class PositionalEncoding(nn.Module):
    "Implement the PE function."

    def __init__(self, d_model, dropout, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        # Compute the positional encodings once in log space.
        pe = torch.zeros(max_len, d_model).to(device)
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) *
                             -(math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.PE = pe
        self.register_buffer('pe', pe)

    def forward(self, x):

        x += Variable(self.pe[:, :x.size(1)],requires_grad=False)        
        return self.dropout(x)


class BaseAttention(nn.Module):
    """
    Tensor          Type            Shape
    ===========================================================================
    q               float           (..., query_len, dims)
    k               float           (..., kv_len, dims)
    v               float           (..., kv_len, dims)
    mask            bool            (..., query_len, kv_len)
    ---------------------------------------------------------------------------
    output          float           (..., query_len, dims)
    ===========================================================================
    """

    def __init__(self, dropout: float = 0.1, scale=True):
        super().__init__()
        self.scale = scale
        self.dropout = nn.Dropout(dropout)

    def forward(self,
                q: Tensor,
                k: Tensor,
                v: Tensor,
                mask: Optional[Tensor] = None) -> Tensor:

        x = torch.matmul(q, k.transpose(-2, -1))
        if self.scale: x = x / math.sqrt(k.size(-1))
        if mask is not None:
            x += mask.type_as(x) * x.new_tensor(-1e6)
        x = self.dropout(x.softmax(-1))

        return torch.matmul(x, v)


class MultiHeadAttention(BaseAttention):
    """
    Tensor          Type            Shape
    ===========================================================================
    q               float           (..., query_len, dims)
    k               float           (..., kv_len, dims)
    v               float           (..., kv_len, dims)
    mask            bool            (..., query_len, kv_len)
    ---------------------------------------------------------------------------
    output          float           (bs, query_len, dims)
    ===========================================================================
    """

    def __init__(self, heads: int, dropout: float = 0.1):
        super().__init__(dropout)
        self.heads = heads

    def forward(self,
                q: torch.Tensor,
                k: torch.Tensor,
                v: torch.Tensor,
                mask: Optional[torch.Tensor] = None) -> torch.Tensor:
        # Split the tensors to multi-heads.
        q = q.view(q.size()[:-1] + (self.heads, q.size(-1) // self.heads))  # [batch_size, query_len, heads, dim]
        k = k.view(k.size()[:-1] + (self.heads, k.size(-1) // self.heads))  # [batch_size, key_len, heads, dim]
        v = v.view(v.size()[:-1] + (self.heads, v.size(-1) // self.heads))  # [batch_size, key_len, heads, dim]

        q = q.transpose(-3, -2)  # [batch_size, heads, query_len, dim]
        k = k.transpose(-3, -2)
        v = v.transpose(-3, -2)

        if mask is not None:
            mask = mask.unsqueeze(-3)  # [batch_size,1, query_len, key_len]

        # Calculate multi-headed attentions and merge them into one.
        return (super().forward(q, k, v, mask)
                .transpose(-3, -2)
                .contiguous()
                .view(q.size()[:-3] + (q.size(-2), v.size(-1) * self.heads)))


class AttentionLayer(nn.Module):
    """
    Tensor          Type            Shape
    ===========================================================================
    q               float           (..., query_len, dims)
    k               float           (..., kv_len, dims)
    v               float           (..., kv_len, dims)
    past (*)        float           (..., past_len, dims)
    mask            bool            (..., query_len, past_len + kv_len)
    ---------------------------------------------------------------------------
    output 1        float           (..., query_len, dims)
    output 2 (*)    float           (..., past_len + kv_len, dims)
    ===========================================================================
    """

    def __init__(self, heads: int, dims: int, dropout: float = 0.1):
        super().__init__()
        d_head, remainder = divmod(dims, heads)
        if remainder:
            raise ValueError(" incompatible `dims` and `heads` ")
        self.attn = MultiHeadAttention(heads, dropout)
        self.proj_q = nn.Linear(dims, dims)
        self.proj_k = nn.Linear(dims, dims)
        self.proj_v = nn.Linear(dims, dims)
        self.linear = nn.Linear(dims, dims)

    def forward(self,
                q: torch.Tensor,
                k: torch.Tensor,
                v: torch.Tensor,
                past: Optional[Past] = None,
                mask: Optional[Tensor] = None
                ) -> Tuple[torch.Tensor, Past]:
        q, k, v = self.proj_q(q), self.proj_k(k), self.proj_v(v)

        # Reuse attention keys and values by concatenating to the current ones.
        if past is not None:
            k = torch.cat((past[0], k), dim=-2)
            v = torch.cat((past[1], v), dim=-2)

        x = self.linear(self.attn(q, k, v, mask))
        return x, (k, v)


class EncoderLayer(nn.Module):

    def __init__(self, d_model, heads):
        super(EncoderLayer, self).__init__()
        self.layernorm = nn.LayerNorm(d_model)
        
        self.self_multihead = AttentionLayer(heads,d_model)
        self.feed_forward = FeedForward(d_model)
        self.dropout = nn.Dropout(0.1)

    def forward(self, embeddings, mask):
        att_out, _ = self.self_multihead(embeddings, embeddings, embeddings, mask=mask)
        interacted = self.dropout(att_out)
        interacted = self.layernorm(interacted + embeddings)
        feed_forward_out = self.dropout(self.feed_forward(interacted))
        encoded = self.layernorm(feed_forward_out + interacted)
        return encoded



class DecoderLayer(nn.Module):
    
    def __init__(self, d_model, heads):
        super(DecoderLayer, self).__init__()
        self.layernorm = nn.LayerNorm(d_model)
        self.dec_layer_norm = nn.LayerNorm(d_model)
        self.self_multihead = AttentionLayer(heads, d_model)
        self.src_multihead = AttentionLayer(heads, d_model)
        self.feed_forward = FeedForward(d_model)
        self.dropout = nn.Dropout(0.1)
        
    def forward(self, embeddings, encoded, src_mask, target_mask):
        att_target_out, _ = self.self_multihead(embeddings, embeddings, embeddings, mask=target_mask)
        query = self.dropout(att_target_out)
        query = self.layernorm(query + embeddings)
        en_den_att, _ = self.src_multihead(query, encoded, encoded, mask=src_mask)
        interacted = self.dropout(en_den_att)
        interacted = self.layernorm(interacted + query)
        feed_forward_out = self.dropout(self.feed_forward(interacted))
        decoded = self.dec_layer_norm(feed_forward_out + interacted)
        return decoded



class Transformer(nn.Module):
    
    def __init__(self, d_model, heads, num_layers, src_vocab_size,target_vocab_size):
        super(Transformer, self).__init__()
        
        self.d_model = d_model
        self.vocab_size = target_vocab_size
        # self.embed = nn.Embedding(vocab_size, d_model)
        self.src_embed = nn.Embedding(src_vocab_size, d_model)
        self.tgt_embed = nn.Embedding(target_vocab_size, d_model)
        self.srcpe = PositionalEncoding(d_model,0, src_vocab_size)
        self.tgtpe = PositionalEncoding(d_model,0, target_vocab_size)
        self.encoder = nn.ModuleList([EncoderLayer(d_model, heads) for _ in range(num_layers)])
        self.decoder = nn.ModuleList([DecoderLayer(d_model, heads) for _ in range(num_layers)])
        self.logit = nn.Linear(d_model, self.vocab_size)
        self.src_masking = PadMasking(pad_or_ahead="pad",idx=0)
        self.tgt_masking = PadMasking(pad_or_ahead="ahead",idx=0)
        # self.cross_masking = PadMasking(pad_or_ahead="ahead",idx=0)
        
    def encode(self, src_words, src_mask):
        src_embeddings = self.src_embed(src_words) * math.sqrt(self.d_model)
        src_embeddings = self.srcpe(src_embeddings)
        for layer in self.encoder:
            src_embeddings = layer(src_embeddings, src_mask)
        return src_embeddings
    
    def decode(self, target_words, target_mask, src_embeddings, src_mask):
        tgt_embeddings = self.tgt_embed(target_words) * math.sqrt(self.d_model)
        tgt_embeddings = self.tgtpe(tgt_embeddings)
        for layer in self.decoder:
            tgt_embeddings = layer(tgt_embeddings, src_embeddings, src_mask, target_mask)
        return tgt_embeddings
        
    def forward(self, src_words, target_words,training=True):
        src_mask = self.src_masking(src_words) if training else None
        target_mask = self.tgt_masking(target_words) if training else None
        
        encoded = self.encode(src_words, src_mask)
        # print(encoded.shape)
        decoded = self.decode(target_words, target_mask, encoded, src_mask)
        out = F.log_softmax(self.logit(decoded), dim = -1)
        return out



class PadMasking(nn.Module):
    def __init__(self, pad_or_ahead="ahead", idx=0):
        """
        :param pad_or_ahead: 选择哪种masking的方式 只有以下两种方式
        """
        super(PadMasking, self).__init__()
        self.pad_or_ahead = pad_or_ahead
        self.idx = idx

    def create_padding_mask(self, x: torch.Tensor, idx=0):
        """
        input shape: [batch_size, seq_len]
        return [batch_size,1,1,seq_len]
        :param idx token 为PAD的id值
        """
        # zeros = torch.zeros_like(x)
        mask = torch.eq(x, idx).type(torch.float32).to(x.device)
        return mask[:, None, :]

    def create_look_ahead_mask(self, x, idx=0):
        """
        input_shape:[batch_size, seq_len]
        return : [batch_size, 1, seq_len, seq_len]
        掩盖后面的token
        """
        seq_len = x.shape[1]
        look_ahead_mask = 1 - torch.tril(torch.ones(seq_len, seq_len), diagonal=0)
        look_ahead_mask = look_ahead_mask.to(x.device)
        padding_mask = self.create_padding_mask(x, idx)
        return torch.maximum(look_ahead_mask, padding_mask)

    def forward(self, x):
        if self.pad_or_ahead == "pad":
            return self.create_padding_mask(x, self.idx)
        elif self.pad_or_ahead == "ahead":
            return self.create_look_ahead_mask(x, self.idx)

In [11]:
class AdamWarmup:
    
    def __init__(self, model_size, warmup_steps, optimizer):
        
        self.model_size = model_size
        self.warmup_steps = warmup_steps
        self.optimizer = optimizer
        self.current_step = 0
        self.lr = 0
        
    def get_lr(self):
        return self.model_size ** (-0.5) * min(self.current_step ** (-0.5), self.current_step * self.warmup_steps ** (-1.5))
        
    def step(self):
        # Increment the number of steps each time we call the step function
        self.current_step += 1
        lr = self.get_lr()
        for param_group in self.optimizer.param_groups:
            param_group['lr'] = lr
        # update the learning rate
        self.lr = lr
        self.optimizer.step()   


class LossWithLS(nn.Module):

    def __init__(self, size, smooth):
        super(LossWithLS, self).__init__()
        self.criterion = nn.KLDivLoss(reduction="none")
        self.confidence = 1.0 - smooth
        self.smooth = smooth
        self.size = size
        
    def forward(self, prediction, target):
        """
        prediction of shape: (batch_size, max_words, vocab_size)
        target and mask of shape: (batch_size, max_words)
        """
        prediction = prediction.view(-1, prediction.size(-1))   # (batch_size * max_words, vocab_size)
        target = target.contiguous().view(-1)   # (batch_size * max_words)
        
        mask = (target > 0).float()       # (batch_size * max_words)
        labels = prediction.data.clone()
        labels.fill_(self.smooth / (self.size - 1))
        labels.scatter_(1, target.data.unsqueeze(1), self.confidence)
        loss = self.criterion(prediction, labels)    # (batch_size * max_words, vocab_size)
        loss = (loss.sum(1) * mask).sum() / mask.sum()
        return loss


In [4]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import jieba
import json
import os
import yaml
def load_yaml_text(file_path):
   
    with open(file_path) as f:
        lines = f.read()
    lines = yaml.safe_load(lines)
    return lines


def load_all_data(directory):
    
    file_names = os.listdir(directory)
    questions, answers = [], []
    for f_name in file_names:
        if f_name.endswith(".yml"):
            file_path = os.path.join(directory, f_name)
            conversions = load_yaml_text(file_path)['conversations']
            ques = [i[0] for i in conversions]
            ans = [i[1] for i in conversions]
            questions.extend(ques)
            answers.extend(ans)
    print("load data finished,the size of data is ", len(questions))
    questions = [" ".join(jieba.cut(str(s), cut_all=False, HMM=True)) for s in questions]
    answers = [" ".join(jieba.cut(str(s), cut_all=False, HMM=True)) for s in answers]
    return questions, answers

#questions, answers = load_all_data("../chatterbot-corpus/chatterbot_corpus/data/chinese")

In [1]:
import numpy as np
# import nltk

def load_trans_data(data_dir):
    with open(data_dir) as f:
        lines_train = f.read().splitlines()
    source = [l.split("\t")[0] for l in lines_train]
    target = [l.split("\t")[1] for l in lines_train]
    return source,target

def ja_zh_data(data_file):
    with open(data_file) as f:
        lines = f.read().splitlines()
    return lines
    
# train_en,train_zh = load_trans_data("./data/train.txt")
# test_en,test_zh = load_trans_data("./data/test.txt")
# dev_en,dev_zh = load_trans_data("./data/dev.txt")


# train_en = [" ".join(nltk.word_tokenize(l)).lower() for l in train_en]
# test_en = [" ".join(nltk.word_tokenize(l)).lower() for l in test_en]
# dev_en = [" ".join(nltk.word_tokenize(l)).lower() for l in dev_en]

# train_zh = [" ".join(jieba.cut(str(s), cut_all=False, HMM=True)) for s in train_zh]
# test_zh = [" ".join(jieba.cut(str(s), cut_all=False, HMM=True)) for s in test_zh]
# dev_zh = [" ".join(jieba.cut(str(s), cut_all=False, HMM=True)) for s in dev_zh]

train_ja = ja_zh_data("../translation/data/finall/train.ja")
train_zh = ja_zh_data("../translation/data/finall/train.zh")

test_ja = ja_zh_data("../translation/data/finall/test.ja")
test_zh = ja_zh_data("../translation/data/finall/test.zh")

print("train===")
print(train_ja[:3])
print(train_zh[:3])
print("test====")
print(test_ja[:3])
print(test_zh[:3])

src_en = train_ja + test_ja 
target_zh = train_zh + test_zh 
print("src_len is equal target_len:",len(src_en) == len(target_zh))

train===
['\ufeff す', '興奮', '大佐']
['\ufeff 的', '兴奋', '上校']
test====
['天下 静謐 の ため \u3000 一層 励む よう に と 。 ', '勅命 を 頂い た の じゃ 。 \u3000 戦 の 勅命 を ！ ', '永禄 １３ 年 ４ 月 \u3000 織田 信長 は 諸国 の 兵 を 従え ']
['为了 天下 静谧 要 多加 努力 ', '我 得到 了 敕命 开战 的 敕命 ', '1570 年 4 月 织田信长 统领 诸国 军队 ']
src_len is equal target_len: True


In [6]:
from tensorflow.keras.preprocessing.text import Tokenizer,tokenizer_from_json
from tensorflow.keras.preprocessing.sequence import pad_sequences
import nltk
import json
import torch
import jieba
with open("../translation/data/en-zh/en.txt") as f:
    zimu_src_en = f.read().splitlines()
with open("../translation/data/en-zh/zh.txt") as f:
    zimu_target_zh = f.read().splitlines()
print(zimu_src_en[:3])
print(zimu_target_zh[:3])
zimu_src_en = [" ".join(nltk.word_tokenize(i.lower())) for i in zimu_src_en]
zimu_target_zh = [" ".join(jieba.cut(str(s.replace(" ","")), cut_all=False, HMM=True)) for s in zimu_target_zh]
# print(target_zh[:3])

# train_en = src_en[:4000]
# train_zh = target_zh[:4000]
# test_en = src_en[-1000:]
# test_zh = target_zh[-1000:]
# with open("../tmp/token/words_zh.json") as f:
#     data = json.load(f)
#     tokenizer_zh = tokenizer_from_json(data)

# with open("../tmp/token/words_en.json") as f:
#     data1 = json.load(f)
#     tokenizer_en = tokenizer_from_json(data1)

# id2word =tokenizer_zh.index_word
print("load data finished..")

["there 's hundreds of them . shall we go get them ?", 'yeah . this will be fun .', 'yeah , absolutely .']
['得 有 几百只 吧 我们 去 抓 吧', '是 的 会 很 有趣 的', '肯定 的']
load data finished..


In [14]:
zimu_target_zh[:3]

['得 有 几百只 吧 我们 去 抓 吧', '是 的 会 很 有趣 的', '肯定 的']

In [None]:
len(tokenizer_en.word_index)

In [17]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import json
with open("../translation/data/en-zh/wmt19.en") as f:
    src_en = f.read().splitlines()
with open("../translation/data/en-zh/wmt19.zh") as f:
    target_zh = f.read().splitlines()
# src_en.extend(zimu_src_en)
# target_zh.extend(zimu_target_zh)
print(src_en[:3])
print(target_zh[:3])
# 生成英文的token
num_words = 2 ** 13
oov_token = '<UNK>'
tokenizer_en = Tokenizer(num_words=num_words, oov_token=oov_token)
tokenizer_en.fit_on_texts(src_en)
VOCAB_SIZE_EN = len(tokenizer_en.word_index)
print("en vocab_size is :{}".format(VOCAB_SIZE_EN))
token_en_json = tokenizer_en.to_json()

with open("./token/words_en.json", 'w', encoding='utf-8') as f:
    json.dump(token_en_json, f, ensure_ascii=False)  # 存为json对象
    print("finished save en token")
    


#生成中文的token
num_words = 2 ** 13
oov_token = '<UNK>'
tokenizer_zh = Tokenizer(num_words=num_words, oov_token=oov_token)
tokenizer_zh.fit_on_texts(target_zh)
VOCAB_SIZE_ZH = len(tokenizer_zh.word_index)
print("zh vocab_size is :{}".format(VOCAB_SIZE_ZH))
token_zh_json = tokenizer_zh.to_json()

with open("./token/words_zh.json", 'w', encoding='utf-8') as f:
    json.dump(token_zh_json, f, ensure_ascii=False)  # 存为json对象
    print("finished save zh token")


['1929 or 1989 ?', 'paris – as the economic crisis deepens and widens , the world has been searching for historical analogies to help us understand what has been happening .', 'at the start of the crisis , many people likened it to 1982 or 1973 , which was reassuring , because both dates refer to classical cyclical downturns .']
['1929 年 还 是 1989 年 ?', '巴 黎 - 随 着 经 济 危 机 不 断 加 深 和 蔓 延 ， 整 个 世 界 一 直 在 寻 找 历 史 上 的 类 似 事 件 希 望 有 助 于 我 们 了 解 目 前 正 在 发 生 的 情 况 。', '一 开 始 ， 很 多 人 把 这 次 危 机 比 作 1982 年 或 1973 年 所 发 生 的 情 况 ， 这 样 得 类 比 是 令 人 宽 心 的 ， 因 为 这 两 段 时 期 意 味 着 典 型 的 周 期 性 衰 退 。']
zh vocab_size is :620666
finished save zh token


In [11]:
tokenizer_en.word_index['our']

In [None]:
max_len=40
#英文句子id化
START_TOKEN_EN = len(tokenizer_en.word_index) + 1
END_TOKENN_EN = len(tokenizer_en.word_index) + 2
VOCAB_SIZE_EN = len(tokenizer_en.word_index) + 3
tokenized_inputs = tokenizer_en.texts_to_sequences(train_en)
#中文句子id化
START_TOKEN_ZH = len(tokenizer_zh.word_index) + 1
END_TOKENN_ZH = len(tokenizer_zh.word_index) + 2
VOCAB_SIZE_ZH = len(tokenizer_zh.word_index) + 3
tokenized_outputs = tokenizer_zh.texts_to_sequences(train_zh)

# pad token sentences
tokenized_inputs = [[START_TOKEN_EN] + i + [END_TOKENN_EN] for i in tokenized_inputs]
tokenized_outputs = [[START_TOKEN_ZH] + i + [END_TOKENN_ZH] for i in tokenized_outputs]

tokenized_inputs = pad_sequences(tokenized_inputs, maxlen=max_len, padding="post", truncating="post")
tokenized_outputs = pad_sequences(tokenized_outputs, maxlen=max_len, padding="post", truncating="post")

In [18]:
END_TOKENN_EN

24811

In [15]:
from torch.utils.data import Dataset, DataLoader
class DataProcesser(Dataset):
    def __init__(self, first_seg, sencond_seg):
        super(DataProcesser, self).__init__()
        self.first_seg = first_seg
        self.sencond_seg = sencond_seg

    def __len__(self):
        return len(self.sencond_seg)

    def __getitem__(self, item):
        seg1 = self.first_seg[item]

        seg2 = self.sencond_seg[item]

        return (torch.tensor(seg1, dtype=torch.long),
                torch.tensor(seg2, dtype=torch.long))

dataset = DataProcesser(tokenized_inputs, tokenized_outputs)
dataloader = DataLoader(dataset, shuffle=True, batch_size=20)

In [5]:
d_model = 768
heads = 12
num_layers = 4
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# epochs = 10


    
transformer = Transformer(d_model = d_model, heads = heads, num_layers = num_layers, src_vocab_size=VOCAB_SIZE_EN,target_vocab_size=VOCAB_SIZE_ZH)
transformer = nn.DataParallel(transformer)
transformer = transformer.to(device)
adam_optimizer = torch.optim.Adam(transformer.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9)
transformer_optimizer = AdamWarmup(model_size = d_model, warmup_steps = 4000, optimizer = adam_optimizer)
criterion = LossWithLS(VOCAB_SIZE_ZH, 0.1)
print("init model finished..")


NameError: name 'AdamWarmup' is not defined

In [23]:
def train(train_loader, transformer, criterion, epoch):
    
    transformer.train()
    sum_loss = 0
    count = 0

    for i, (question, reply) in enumerate(train_loader):
        
        samples = question.shape[0]

        # Move to device
        question = question.to(device)
        reply = reply.to(device)

        # Prepare Target Data
        reply_input = reply[:, :-1]
        reply_target = reply[:, 1:]


        # Get the transformer outputs
        out = transformer(question, reply_input, True)
        # print(out.shape)
        # print(out)
        # Compute the loss
        loss = criterion(out, reply_target)
        
        # Backprop
        transformer_optimizer.optimizer.zero_grad()
        loss.backward()
        transformer_optimizer.step()
        
        sum_loss += loss.item() * samples
        count += samples
        
        if i % 100 == 0:
            print("Epoch [{}][{}/{}]\tLoss: {:.3f}".format(epoch, i, len(train_loader), sum_loss/count))

EPOCHS=200
for epoch in range(EPOCHS):
    
    train(dataloader, transformer, criterion, epoch)
    
    state = {'epoch': epoch, 'transformer': transformer.module, 'transformer_optimizer': transformer_optimizer}
    if ((epoch +1) % 10 == 0 and epoch > 0)  or epoch == EPOCHS - 1:
        torch.save(state, './model/model_4layer768_' + str(epoch + 1) + '.pt')

In [38]:
from  tensorflow.keras.preprocessing.text import tokenizer_from_json
import json
import torch
device='cpu'
checkpoint = torch.load('/model_dir/model_6layer768_80.bin',map_location=torch.device('cpu'))
model = checkpoint['transformer']

with open("./token/words_zh.json") as f:
    data = json.load(f)
    tokenizer_zh = tokenizer_from_json(data)

with open("./token/words_en.json") as f:
    data1 = json.load(f)
    tokenizers_en = tokenizer_from_json(data1)

id2word = tokenizer_zh.index_word

In [5]:
# model
END_TOKENN_ZH

NameError: name 'END_TOKENN_ZH' is not defined

In [39]:
device='cpu'
START_TOKEN_EN = len(tokenizers_en.word_index) + 1
END_TOKEN_EN = len(tokenizers_en.word_index) + 2
START_TOKEN_ZH = len(tokenizer_zh.word_index) + 1
END_TOKEN_ZH= len(tokenizer_zh.word_index) + 2
# END_TOKEN_EN=END_TOKENN_EN
# END_TOKEN_ZH=END_TOKENN_ZH
def evaluate(sentence, model, tokenizer, max_len=64):
    model.eval()
    sentence = " ".join(nltk.word_tokenize(sentence.lower()))
    print(sentence)
  
    sentence = [START_TOKEN_EN] + tokenizers_en.texts_to_sequences([sentence])[0] + [END_TOKEN_EN]
    
    sentence = torch.tensor(sentence, dtype=torch.long).unsqueeze(dim=0).to(device)
    # print(sentence)
    output = torch.tensor([START_TOKEN_ZH], dtype=torch.long).unsqueeze(dim=0).to(device)
    # print(output)
    for i in range(max_len):
        size = output.shape[1]
        predictions = model(sentence,output,training=False)
       
        predictions = predictions[:, -1:, :]
        
        pred_id = torch.argmax(predictions, dim=-1)
       
        if pred_id.unsqueeze(0).item() == END_TOKEN_ZH:
            break
        output = torch.cat([output, pred_id], dim=-1)
        
    return output.squeeze(0)

def predict(sentence):
    predictions = evaluate(sentence, model, tokenizer_zh, max_len=40)
    predictions = predictions.cpu().numpy()
    # print(predictions)
    
    predic_senc = [id2word[i] for i in predictions if i <= len(tokenizer_zh.word_index) and i > 0]
    return " ".join(predic_senc)


In [1]:
predict("where are you?")

NameError: name 'predict' is not defined

In [1]:
train_en[10:20]

NameError: name 'train_en' is not defined

In [34]:
train_zh[10:20]

['上 到 山顶 上 之后',
 '风 突然 变大 了',
 '我 在 挪威 美丽 的 西海岸 上',
 '这里 峡湾 遍布 寒冷刺骨',
 '还有 身材 健硕 的 北欧 海盗',
 '我 一直 很 喜欢 这个',
 '伟大 国家 生产 的 海鲜',
 '所以 我 在 十二月份 来到 了 这里',
 '大厨 告诉 我 这个 时候 是 挪威 海鲜',
 '和 其他 美味佳肴 的 巅峰 时期']

In [76]:
predict("it's okay.")

it 's okay .


'好 的'

In [25]:
pred = [predict(i) for i in test_en[:500]]

In [30]:
print(test_en[:3])
print(pred[:3])
print(test_zh[:3])

[" -Oh, no, no, no. I don't think   -It's okay, it's okay.", " -No, really, I don't need a shower.   -It's it's it's okay.", " It's okay."]
['不 没关系', '不 我 需要 一个 时间', '没关系']
['- 哦 不 不 不 我 不 认为 ...- 没关系 没关系', '- 不 真的 我 不 需要 洗澡 -... 它 没关系', '没关系']


In [11]:
from fairseq.models.transformer import TransformerModel
trans = TransformerModel.from_pretrained(
  '../translation/model',
  checkpoint_file='checkpoint_best.pt',
  data_name_or_path='../data-bin/train.ja-zh',
  is_gpu=False
)
inputs = "天下 静謐 の ため 　 一層 励む よう に と 。"
print(trans.translate(inputs))

2022-11-10 19:28:01 | INFO | fairseq.file_utils | loading archive file ../translation/model
2022-11-10 19:28:02 | INFO | fairseq.tasks.translation | [ja] dictionary: 70912 types
2022-11-10 19:28:02 | INFO | fairseq.tasks.translation | [zh] dictionary: 69840 types
2022-11-10 19:28:06 | INFO | fairseq.models.fairseq_model | {'_name': None, 'common': {'_name': None, 'no_progress_bar': False, 'log_interval': 100, 'log_format': None, 'log_file': None, 'aim_repo': None, 'aim_run_hash': None, 'tensorboard_logdir': None, 'wandb_project': None, 'azureml_logging': False, 'seed': 1, 'cpu': False, 'tpu': False, 'bf16': False, 'memory_efficient_bf16': False, 'fp16': False, 'memory_efficient_fp16': False, 'fp16_no_flatten_grads': False, 'fp16_init_scale': 128, 'fp16_scale_window': None, 'fp16_scale_tolerance': 0.0, 'on_cpu_convert_precision': False, 'min_loss_scale': 0.0001, 'threshold_loss_scale': None, 'amp': False, 'amp_batch_retries': 2, 'amp_init_scale': 128, 'amp_scale_window': None, 'user_dir

为了 天下@@ 太平 ， 为了 更加 踏实 。
