In [2]:
import torch.nn as nn
import torch
import torch.optim as optim
import torch.functional as F
# from torchsummary import summary
from tqdm import tqdm
torch.__version__

'1.7.0'

In [3]:
import torchtext
from torchtext.datasets import TranslationDataset, Multi30k
from torchtext.data import Field, BucketIterator

import spacy
from spacy.lang.en import English
from spacy.lang.de import German
import random
import time
torchtext.__version__

'0.6.0'

# 数据处理

In [4]:
seed = 2022
random.seed(seed)
torch.manual_seed(seed)
torch.backends.cudnn.deteministic=True

In [5]:
spacy_de = German()
spacy_en = English()

In [6]:
# 将字符转换为list
def tokenize_de(text):
    return [tok.text for tok in spacy_de.tokenizer(text)][::-1]
def tokenize_en(text):
    return [tok.text for tok in spacy_en.tokenizer(text)][::-1]

In [7]:
SRC = Field(tokenize=tokenize_de,init_token="<sos>",eos_token="<eos>",lower=True)
TRG = Field(tokenize=tokenize_en,init_token="<sos>",eos_token="<eos>",lower=True)

In [8]:
train_data, valid_data, test_data = Multi30k.splits(exts=('.de','.en'),fields=(SRC,TRG))

In [9]:
print(f"Number of Training examples:{len(train_data.examples)}")
print(f"Number of Validation examples:{len(valid_data.examples)}")
print(f"Number of Testing examples:{len(test_data.examples)}")

Number of Training examples:29000
Number of Validation examples:1014
Number of Testing examples:1000


In [10]:
print(vars(train_data.examples[0]))

{'src': ['.', 'büsche', 'vieler', 'nähe', 'der', 'in', 'freien', 'im', 'sind', 'männer', 'weiße', 'junge', 'zwei'], 'trg': ['.', 'bushes', 'many', 'near', 'outside', 'are', 'males', 'white', ',', 'young', 'two']}


In [11]:
SRC.build_vocab(train_data,min_freq=2)
TRG.build_vocab(train_data,min_freq=2)
print(f'Unique tokens in source(de) vocabulary: {len(SRC.vocab)}')
print(f'Unique tokens in source(en) vocabulary: {len(TRG.vocab)}')

Unique tokens in source(de) vocabulary: 7853
Unique tokens in source(en) vocabulary: 5893


In [12]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# 参数设定

In [13]:
Batch_size = 128
train_iterator, valid_iterator,test_iterator = BucketIterator.splits([train_data,valid_data,test_data], batch_size=Batch_size,device=device)

In [14]:
for i, batch in enumerate(train_iterator):
    src = batch.src.to(device)
    trg = batch.trg.to(device)
print(src.shape,trg.shape)

torch.Size([27, 128]) torch.Size([28, 128])


# 建立模型

In [15]:
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hide_dim, n_layers, dropout):
        super(Encoder, self).__init__()
        self.input_dim = input_dim
        self.emb_dim = emb_dim
        self.n_layers = n_layers
        self.hide_dim = hide_dim
        
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.lstm = nn.LSTM(emb_dim, hide_dim, n_layers, dropout=dropout,bidirectional=False)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, src):
        embedded = self.dropout(self.embedding(src))
        outputs, (hidden, cell) = self.lstm(embedded)   
        return hidden, cell   #[batch, seq, dim]    

In [16]:
class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hide_dim, n_layers, dropout):
        super(Decoder, self).__init__()
        self.output_dim = output_dim
        self.emb_dim = emb_dim
        self.n_layers = n_layers
        self.hide_dim = hide_dim
        
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.lstm = nn.LSTM(emb_dim, hide_dim, n_layers, dropout=dropout,bidirectional=False)
        self.liner = nn.Linear(hide_dim, output_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, inputs, hidden, cell):
        inputs = inputs.unsqueeze(0)
        embedded = self.dropout(self.embedding(inputs))
        outputs, (hidden, cell) = self.lstm(embedded,(hidden, cell))
        predict = self.liner(outputs.squeeze(0))
        
        return predict, hidden, cell  #[batch, output_dim]  

In [17]:
class seq2seq(nn.Module):
    def __init__(self, input_dim, output_dim, enc_emb_dim,dec_emb_dim, hide_dim, n_layers, enc_dropout, dec_dropout, device):
        super(seq2seq, self).__init__()
        self.encoder = Encoder(input_dim, enc_emb_dim, hide_dim, n_layers, enc_dropout)
        self.decoder = Decoder(output_dim, dec_emb_dim, hide_dim, n_layers, dec_dropout)
        self.device = device
        assert self.encoder.hide_dim== self.decoder.hide_dim,"Hidden dimensions of encoder and decoder must be equal!"
        assert self.encoder.n_layers== self.decoder.n_layers,"encoder and decoder must have equal number of layers!"
        
    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        batch_size = trg.shape[1]
        max_len =trg.shape[0]
        trg_vocab_size = self.decoder.output_dim
        # 用于储存
        outputs = torch.zeros([max_len, batch_size,trg_vocab_size]).to(self.device)
        hidden, cell = self.encoder(src)
        # first input to the decoder is the <sos> token
        input = trg[0,:] # [seq_len,batch_size]
        for t in range(1, max_len):
            output, hidden, cell = self.decoder(input, hidden, cell)
            outputs[t] = output 
            # yes or np teacher_forcing
            teacher_force = random.random()
            topk = output.argmax(1)
            input = trg[t] if teacher_force < teacher_forcing_ratio else topk
        return outputs

In [18]:
input_dim, output_dim =len(SRC.vocab),len(TRG.vocab)
encoder_emb_dim, decoder_emb_dim = 256, 256
hidden_dim = 512
n_layers =2
encoder_dropout,decoder_dropout=0.5, 0.5
model = seq2seq(input_dim, output_dim, encoder_emb_dim,decoder_emb_dim, hidden_dim, n_layers, encoder_dropout, decoder_dropout, device).to(device)

In [19]:
# print(model)
# print(summary(model.cuda(),input_size=[(3,32,32),(3,32,32)],batch_size=-1))

In [20]:
#权重初始化
def init_weights(model):
    for name, param in model.named_parameters():
        nn.init.uniform_(param.data, -0.08,0.08)
model.apply(init_weights)

seq2seq(
  (encoder): Encoder(
    (embedding): Embedding(7853, 256)
    (lstm): LSTM(256, 512, num_layers=2, dropout=0.5)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (embedding): Embedding(5893, 256)
    (lstm): LSTM(256, 512, num_layers=2, dropout=0.5)
    (liner): Linear(in_features=512, out_features=5893, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

In [21]:
optimizer_enc = optim.Adam(model.encoder.parameters(),lr=1e-4)
scheduler_enc = optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer_enc, T_0=100, T_mult=2)
optimizer_dec = optim.Adam(model.decoder.parameters(),lr=1e-3)
scheduler_dec = optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer_dec, T_0=100, T_mult=2)

In [22]:
PAD_Index = TRG.vocab.stoi["<pad>"]
criterion = nn.CrossEntropyLoss(ignore_index=PAD_Index)

# 训练过程

In [23]:
def train(model, iterator, optimizer_enc, optimizer_dec,scheduler_enc,scheduler_dec,criterion, clip):
    model.train()
    epoch_loss= 0
    for i, batch in enumerate(iterator):
        src = batch.src.to(device)
        trg = batch.trg.to(device)
        optimizer_enc.zero_grad()
        optimizer_dec.zero_grad()
        output = model(src,trg)
        output = output[1:].view(-1, output.shape[-1])
        trg = trg[1:].view(-1)
        loss = criterion(output, trg)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer_enc.step()
        optimizer_dec.step()
        scheduler_enc.step()
        scheduler_dec.step()
        epoch_loss += loss.item()
    return epoch_loss/len(iterator)
        

# 验证

In [24]:
def evaluate(model, iterator, criterion, device):
    model.eval()
    epoch_loss = 0
    for i, batch in enumerate(iterator):
        with torch.no_grad():
            src = batch.src.to(device)
            trg = batch.trg.to(device)
            output = model(src,trg)
            output = output[1:].view(-1, output.shape[-1])
            trg = trg[1:].view(-1)
            loss = criterion(output, trg)
            epoch_loss += loss.item()   
    return epoch_loss/len(iterator)

In [25]:
epochs = 10
clip = 1
best_valid_loss = float('inf')
for epoch in range(1,epochs+1):
    start_time = time.time()
    train_loss = train(model, train_iterator, optimizer_enc, optimizer_dec,scheduler_enc,scheduler_dec,criterion,clip)
    valid_loss = evaluate(model, valid_iterator, criterion,device)
    end_time = time.time()
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(),"seq2seq.pt")
    print(f"Epoch:{epoch},Time:{end_time-start_time:3f}s")
    print(f"Train loss:{train_loss:.3f}  Valiadtion loss:{valid_loss:.3f}")  

Epoch:1,Time:41.176283s
Train loss:5.133  Valiadtion loss:4.669
Epoch:2,Time:41.264736s
Train loss:4.679  Valiadtion loss:4.425
Epoch:3,Time:41.171484s
Train loss:4.445  Valiadtion loss:4.291
Epoch:4,Time:41.781509s
Train loss:4.396  Valiadtion loss:4.136
Epoch:5,Time:41.169658s
Train loss:4.268  Valiadtion loss:4.145
Epoch:6,Time:41.633498s
Train loss:4.216  Valiadtion loss:4.134
Epoch:7,Time:41.029516s
Train loss:4.198  Valiadtion loss:3.999
Epoch:8,Time:42.728580s
Train loss:4.172  Valiadtion loss:3.973
Epoch:9,Time:41.626429s
Train loss:4.067  Valiadtion loss:4.042
Epoch:10,Time:42.822485s
Train loss:3.987  Valiadtion loss:4.092


In [26]:
model.load_state_dict(torch.load('seq2seq.pt'))
test_loss = evaluate(model, test_iterator, criterion,device)
print(f"Test loss:{train_loss:.3f}") 

Test loss:3.987
