In [1]:
import torch
import re
import random

In [2]:
class EncoderGru(torch.nn.Module):
    def __init__(self,word_size,hidden_size):
        super(EncoderGru,self).__init__()
        self.hidden_size=hidden_size
        self.embedding_layer=torch.nn.Embedding(word_size,hidden_size)
        self.gru=torch.nn.GRU(hidden_size,hidden_size)
    
    def forward(self,input_vector,hidden):
        embedded=self.embedding_layer(input_vector)
        embedded=embedded.unsqueeze(1)
        out,hid=self.gru(embedded,hidden)
        return out,hid
    
    def init_hidden(self):
        return torch.zeros(1,1,self.hidden_size)

In [3]:
class AttenDecoder(torch.nn.Module):
    def __init__(self,word_szie,hidden_size):
        super(AttenDecoder,self).__init__()
        self.hidden_size=hidden_size
        self.embedding_layer=torch.nn.Embedding(word_szie,hidden_size)
        self.atten_layer=torch.nn.Linear(hidden_size*2,50)
        self.atten_combine_layer=torch.nn.Linear(hidden_size*2,hidden_size)
        self.gru=torch.nn.GRU(hidden_size,hidden_size)
        self.last_layer=torch.nn.Linear(hidden_size,word_szie)
        
    def forward(self,input_vector,hidden,encoder_output):
        embeded=self.embedding_layer(input_vector)
        contacted=torch.cat((embeded,hidden[0]),dim=1)
        atten=self.atten_layer(contacted)
        atten_apply=torch.mm(atten,encoder_output.view(-1,256))
        
        atten_in=torch.cat((embeded,atten_apply),dim=1)
        
        gru_in=self.atten_combine_layer(atten_in)
        gru_in=gru_in.unsqueeze(0)
        
        out,hid=self.gru(gru_in,hidden)
        
        out=self.last_layer(out[0])
        return out,hid

In [4]:
def read_pairs(path='F:/Github/machine_learn_record/pytorch/data/cmn.txt'):
    file=open(path,encoding='utf-8')
    content=file.read()
    pairs=[]
    for p in content.split('\n'):
        temp=p.split('\t')
        pairs.append(temp)
        
    #调整英语中的符号
    for p in pairs:
        es=p[0].lower().strip()
        es= re.sub(r"([.!?])", r" \1", es)
        es = re.sub(r"[^a-zA-Z.!?]+", r" ", es)
        p[0]=es
    #删除最后一行
    pairs.pop()
    return pairs

In [5]:
pairs=read_pairs()
pairs[1]

['hi .', '你好。']

In [6]:
class Lang:
    def __init__(self,name):
        self.name=name
        self.index2word={}
        self.word2index={0: "SOS", 1: "EOS"}
        self.word2count={}
        self.n_word=2
    
    def add_sentence(self,sentence):
        if self.name=='en':
            for w in sentence.split(' '):
                self.add_word(w)
        else:
            for w in sentence:
                self.add_word(w)
                
    def add_word(self,word):
        if word not in self.word2index:
            self.index2word[self.n_word]=word
            self.word2index[word]=self.n_word
            self.word2count[word]=1
            self.n_word+=1
        else:
            self.word2count[word]+=1

In [7]:
chinese=Lang('cn')
english=Lang('en')
for p in pairs:
    chinese.add_sentence(p[1])
    english.add_sentence(p[0])

In [8]:
def sentence2tensor(sentence,lang,device=torch.device("cuda")):
    idxs=[]
    if lang.name=='en':
        for w in sentence.split(' '):
            idxs.append(lang.word2index[w])
    else:
        for w in sentence:
            idxs.append(lang.word2index[w])
    idxs.append(1)
    tnr=torch.tensor(idxs,dtype=torch.long)
    return tnr.view(-1,1).to(device)

In [9]:
sentence2tensor(pairs[1][1],chinese)

tensor([[4],
        [5],
        [3],
        [1]], device='cuda:0')

In [99]:
def train(encoder,decoder,encoder_optimizer,decoder_optimizer,loss_f,inputs,outs):
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()
    encoder_outs=torch.zeros((50,1,1,256),dtype=torch.float).to(device)
    for i in range(inputs.shape[0]):
        encoder_outs[i],hidden=encoder.forward(inputs[i],encoder.init_hidden().to(device))

    loss=0
    
    decoder_in=torch.tensor([0],dtype=torch.long).to(device)
    for i in range(outs.shape[0]):
        out,hidden=atten_decorder.forward(decoder_in,hidden,encoder_outs)
        topv, topi = out.data.topk(1)
        decoder_in=topi.squeeze(0).detach() if random.random()>0.5 else outs[i]
        temp=loss_f(out,outs[i])
        loss+=temp
    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()
    return loss.item()/outs.shape[0]

In [103]:
def model_train(encoder,atten_decorder):
    encoder_optimizer=torch.optim.Adam(encoder.parameters(),lr=0.0001)
    decoder_optimizer=torch.optim.Adam(atten_decorder.parameters(),lr=0.0001)
    loss_f=torch.nn.CrossEntropyLoss()
    datas=[random.choice(pairs) for i in range(15000)]
    loss1000=0
    for i,p in enumerate(datas):
        inputs= sentence2tensor(p[0],english)
        outs= sentence2tensor(p[1],chinese)
        loss=train(encoder,atten_decorder,encoder_optimizer,decoder_optimizer,loss_f,inputs,outs)
        loss1000+=loss
        if i%100==0:
            print('{} epoc avg loss is {}'.format(i,loss1000/100))
            loss1000=0

In [104]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
encoder=EncoderGru(english.n_word,256)
atten_decorder=AttenDecoder(chinese.n_word,256)
encoder.to(device)
atten_decorder.to(device)

AttenDecoder(
  (embedding_layer): Embedding(3439, 256)
  (atten_layer): Linear(in_features=512, out_features=50, bias=True)
  (atten_combine_layer): Linear(in_features=512, out_features=256, bias=True)
  (gru): GRU(256, 256)
  (last_layer): Linear(in_features=256, out_features=3439, bias=True)
)

In [None]:
for i in range(100):
    model_train(encoder,atten_decorder)

0 epoc avg loss is 0.021716778495094995
100 epoc avg loss is 2.78499424045678
200 epoc avg loss is 2.6096014312828157
300 epoc avg loss is 2.84956206914461
400 epoc avg loss is 2.971385979074071
500 epoc avg loss is 2.689510003678125
600 epoc avg loss is 2.921340236159592
700 epoc avg loss is 2.7845103598009144
800 epoc avg loss is 2.5861078882851594
900 epoc avg loss is 2.8093954508984496
1000 epoc avg loss is 2.74596874925988
1100 epoc avg loss is 2.8888507730963973
1200 epoc avg loss is 2.9403924292130186
1300 epoc avg loss is 2.6315302677431394
1400 epoc avg loss is 2.8970615227239485
1500 epoc avg loss is 2.721724507754051
1600 epoc avg loss is 2.755487748127667
1700 epoc avg loss is 2.805469996816364
1800 epoc avg loss is 2.790406708165844
1900 epoc avg loss is 2.9991978410590203
2000 epoc avg loss is 2.7284819810766137
2100 epoc avg loss is 3.003541123858568
2200 epoc avg loss is 3.008586110763167
2300 epoc avg loss is 2.638329825702061
2400 epoc avg loss is 3.0474701714925225
2

In [182]:
def predict(p,ecncoder,atten_decorder):
    inputs= sentence2tensor(p[0],english)
    outs= sentence2tensor(p[1],chinese)
    encoder_outs=torch.zeros((50,1,1,256),dtype=torch.float).to(device)
    for i in range(inputs.shape[0]):
        encoder_outs[i],hidden=encoder.forward(inputs[i],encoder.init_hidden().to(device))

    decoder_in=torch.tensor([0],dtype=torch.long).to(device)
    rs=[]
    for i in range(50):
        out,hidden=atten_decorder.forward(decoder_in,hidden,encoder_outs)
        topv, topi = out.data.topk(1)
        decoder_in=topi.squeeze(0).detach()
        if topi[0].item()==1:
            break
        rs.append(chinese.index2word[topi[0].item()])
    print("in------>",p[0],"target------>",p[1],"predict--->","".join(rs))

In [191]:
for i in range(100):
    a=random.randint(0,10000)
    predict(pairs[a],encoder,atten_decorder)

in------> turn left at the next corner . target------> 下一个街角左转。 predict---> 把下下一點鐘。
in------> you must clear the table . target------> 你必须把桌子清理干净。 predict---> 你必须清桌桌子。
in------> what s wrong with your dog ? target------> 你的狗怎麼了？ predict---> 你的狗了？
in------> i ll remember you forever . target------> 我會永遠記住你的。 predict---> 我會得記得你的。
in------> i have no knife to cut with . target------> 我沒有刀子可用來切。 predict---> 我有任何上子。
in------> we re both reasonable people . target------> 我们是两个通情达理的人。 predict---> 我们是人人人。人。
in------> we gladly accept your offer . target------> 我们很高兴接受你的提议。 predict---> 我们人接受你的的的意。
in------> whose is it ? target------> 这是谁的？ predict---> 这是谁的？
in------> there is a large supermarket . target------> 有一個大型超市。 predict---> 有一大大。
in------> tom likes it hot . target------> 汤姆喜欢热的。 predict---> 汤姆喜歡熱。
in------> does she like oranges ? target------> 她喜歡柳橙嗎？ predict---> 我喜歡橙橙嗎？
in------> he sat on the bench . target------> 他坐在長凳上。 predict---> 他坐在地上。
in------> when do you need it by ? target

In [184]:
def translate(sentence,ecncoder,atten_decorder):
    inputs= sentence2tensor(sentence,english)
    encoder_outs=torch.zeros((50,1,1,256),dtype=torch.float).to(device)
    for i in range(inputs.shape[0]):
        encoder_outs[i],hidden=encoder.forward(inputs[i],encoder.init_hidden().to(device))

    decoder_in=torch.tensor([0],dtype=torch.long).to(device)
    rs=[]
    for i in range(50):
        out,hidden=atten_decorder.forward(decoder_in,hidden,encoder_outs)
        topv, topi = out.data.topk(1)
        decoder_in=topi.squeeze(0).detach()
        if topi[0].item()==1:
            break
        rs.append(chinese.index2word[topi[0].item()])
    print("in------>",sentence,"predict--->","".join(rs))

In [193]:
translate('i like baby .',encoder,atten_decorder)

in------> i like baby . predict---> 我喜歡了。
