#### 数据集：Tatoeba项目的双语句子对 https://www.manythings.org/anki/

In [1]:
# 导入所需的库
import math
import spacy
import jieba
import random
from opencc import OpenCC

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

import warnings
warnings.filterwarnings("ignore")


# 设置随机种子以确保结果可重复
random.seed(42)
torch.manual_seed(42)

# 定义特殊标记
SPECIAL_TOKENS = ['<pad>', '<bos>', '<eos>', '<unk>']

In [2]:
# 检查是否有可用的GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Using device: cuda


In [3]:
# 数据预处理函数
cc = OpenCC('t2s')  # 繁体转简体

def preprocess_data(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    
    data = []
    for line in lines:
        parts = line.strip().split('\t')
        if len(parts) >= 2:
            en, zh = parts[0], parts[1]
            zh = cc.convert(zh)  # 繁体转简体
            data.append((en, zh))
    
    return data

# 读取并预处理数据
data = preprocess_data('cmn.txt')

# 输出前5条数据
print("前5条数据:")
for item in data[:5]:
    print(item)

前5条数据:
('Hi.', '嗨。')
('Hi.', '你好。')
('Run.', '你用跑的。')
('Stop!', '住手！')
('Wait!', '等等！')


In [4]:
# 分割数据集函数
def split_data(data, test_ratio=0.1, val_ratio=0.1):
    # 先随机打乱整个数据集
    random.shuffle(data)
    
    # 计算分割点
    test_split = int(len(data) * (1 - test_ratio))
    val_split = int(len(data) * (1 - test_ratio - val_ratio))
    
    # 分割数据集
    train_data = data[:val_split]
    val_data = data[val_split:test_split]
    test_data = data[test_split:]
    
    return train_data, val_data, test_data

# 分割数据集
train_data, val_data, test_data = split_data(data)

print(f"训练集大小: {len(train_data)}")
print(f"验证集大小: {len(val_data)}")
print(f"测试集大小: {len(test_data)}")

训练集大小: 23927
验证集大小: 2991
测试集大小: 2991


In [5]:
def build_vocab(sentences, tokenizer):
    vocab = {}
    for token in SPECIAL_TOKENS:
        vocab[token] = len(vocab)
    for sentence in sentences:
        for word in tokenizer(sentence):
            if word not in vocab:
                vocab[word] = len(vocab)
    return vocab

# 加载分词器
jieba.initialize()
spacy_en = spacy.load('en_core_web_sm')

# 构建词典
en_sentences = [pair[0] for pair in train_data]
zh_sentences = [pair[1] for pair in train_data]

en_vocab = build_vocab(en_sentences, lambda x: [token.text.lower() for token in spacy_en.tokenizer(x)])
zh_vocab = build_vocab(zh_sentences, jieba.lcut)

print(f"英文词典大小: {len(en_vocab)}")
print(f"中文词典大小: {len(zh_vocab)}")

# 反向词典，用于将索引转换回单词
inv_en_vocab = {v: k for k, v in en_vocab.items()}
inv_zh_vocab = {v: k for k, v in zh_vocab.items()}

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.309 seconds.
Prefix dict has been built successfully.


英文词典大小: 6521
中文词典大小: 11340


In [6]:
# 遍历中文词典
print("中文词典:")
for i, (k, v) in enumerate(zh_vocab.items()):
    if i >= 15:
        break
    print(f"Index: {v}, Word: {k}")

print("\n**************************")

# 遍历英文词典
print("英文词典:")
for i, (k, v) in enumerate(en_vocab.items()):
    if i >= 15:
        break
    print(f"Index: {v}, Word: {k}")

中文词典:
Index: 0, Word: <pad>
Index: 1, Word: <bos>
Index: 2, Word: <eos>
Index: 3, Word: <unk>
Index: 4, Word: 汤姆
Index: 5, Word: 应当
Index: 6, Word: 被
Index: 7, Word: 责备
Index: 8, Word: 。
Index: 9, Word: 我
Index: 10, Word: 将要
Index: 11, Word: 在
Index: 12, Word: 这里
Index: 13, Word: 待
Index: 14, Word: 几天

**************************
英文词典:
Index: 0, Word: <pad>
Index: 1, Word: <bos>
Index: 2, Word: <eos>
Index: 3, Word: <unk>
Index: 4, Word: tom
Index: 5, Word: deserves
Index: 6, Word: to
Index: 7, Word: be
Index: 8, Word: blamed
Index: 9, Word: .
Index: 10, Word: i
Index: 11, Word: am
Index: 12, Word: going
Index: 13, Word: stay
Index: 14, Word: here


In [7]:
class TranslationDataset(Dataset):
    def __init__(self, data, en_vocab, zh_vocab):
        self.data = data
        self.en_vocab = en_vocab
        self.zh_vocab = zh_vocab

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        en, zh = self.data[idx]
        en_tokens = [self.en_vocab.get(token.text.lower(), self.en_vocab['<unk>']) 
                     for token in spacy_en.tokenizer(en)]
        zh_tokens = [self.zh_vocab.get(token, self.zh_vocab['<unk>']) 
                     for token in jieba.lcut(zh)]
        
        en_tokens = [self.en_vocab['<bos>']] + en_tokens + [self.en_vocab['<eos>']]
        zh_tokens = [self.zh_vocab['<bos>']] + zh_tokens + [self.zh_vocab['<eos>']]
        
        return torch.tensor(en_tokens), torch.tensor(zh_tokens)

def collate_fn(batch):
    en_sequences, zh_sequences = zip(*batch)
    
    en_sequences = nn.utils.rnn.pad_sequence(en_sequences, padding_value=en_vocab['<pad>'], batch_first=True)
    zh_sequences = nn.utils.rnn.pad_sequence(zh_sequences, padding_value=zh_vocab['<pad>'], batch_first=True)
    
    return en_sequences, zh_sequences

# 创建数据集和数据加载器
train_dataset = TranslationDataset(train_data, en_vocab, zh_vocab)
test_dataset = TranslationDataset(test_data, en_vocab, zh_vocab)

BATCH_SIZE = 64  # 根据8GB显存调整

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn, pin_memory=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn, pin_memory=True)

In [8]:
# 遍历dataloader
for src, tgt in train_loader:
    print(src.shape)
    print(tgt.shape)
    print(f"src: {src[0]}")
    print(f"tgt: {tgt[0]}")
    break

torch.Size([64, 19])
torch.Size([64, 14])
src: tensor([   1,   34,   63,   21,  234, 5800,   68, 2918,   70,   34, 1769,  382,
           9,    2,    0,    0,    0,    0,    0])
tgt: tensor([   1,    9,   18,  113,  226, 3110,    9,   18, 7744,    8,    2,    0,
           0,    0])


In [9]:
class TransformerModel(nn.Module):
    def __init__(self, src_vocab_size, tgt_vocab_size, d_model=512, nhead=8, num_encoder_layers=6,
                 num_decoder_layers=6, dim_feedforward=2048, dropout=0.1):
        super(TransformerModel, self).__init__()
        
        self.d_model = d_model
        self.src_embedding = nn.Embedding(src_vocab_size, d_model)
        self.tgt_embedding = nn.Embedding(tgt_vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model, dropout)
        
        self.transformer = nn.Transformer(
            d_model=d_model,
            nhead=nhead,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            batch_first=True
        )
        
        self.fc_out = nn.Linear(d_model, tgt_vocab_size)
    
    def generate_square_subsequent_mask(self, sz):
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask
        
    def forward(self, src, tgt):
        src_padding_mask = (src == en_vocab['<pad>']).to(device)
        tgt_padding_mask = (tgt == zh_vocab['<pad>']).to(device)
        
        src = self.src_embedding(src) * math.sqrt(self.d_model)
        tgt = self.tgt_embedding(tgt) * math.sqrt(self.d_model)
        
        src = self.positional_encoding(src)
        tgt = self.positional_encoding(tgt)
        
        src_mask = None
        tgt_mask = self.generate_square_subsequent_mask(tgt.size(1)).to(tgt.device)
        
        # Convert padding masks to float and apply masking
        if src_padding_mask is not None:
            src_padding_mask = src_padding_mask.float().masked_fill(src_padding_mask, float('-inf'))
        if tgt_padding_mask is not None:
            tgt_padding_mask = tgt_padding_mask.float().masked_fill(tgt_padding_mask, float('-inf'))
        
        output = self.transformer(src, tgt, src_mask, tgt_mask, 
                                  src_key_padding_mask=src_padding_mask,
                                  tgt_key_padding_mask=tgt_padding_mask)
        
        return self.fc_out(output)

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

In [10]:
src_vocab_size = len(en_vocab)
tgt_vocab_size = len(zh_vocab)
d_model = 256
nhead = 8
num_encoder_layers = 3
num_decoder_layers = 3
dim_feedforward = 512
dropout = 0.1

# 初始化模型
model = TransformerModel(
    src_vocab_size,
    tgt_vocab_size,
    d_model=d_model,
    nhead=nhead,
    num_encoder_layers=num_encoder_layers,
    num_decoder_layers=num_decoder_layers,
    dim_feedforward=dim_feedforward,
    dropout=dropout
).to(device)

model

TransformerModel(
  (src_embedding): Embedding(6521, 256)
  (tgt_embedding): Embedding(11340, 256)
  (positional_encoding): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer): Transformer(
    (encoder): TransformerEncoder(
      (layers): ModuleList(
        (0-2): 3 x TransformerEncoderLayer(
          (self_attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True)
          )
          (linear1): Linear(in_features=256, out_features=512, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (linear2): Linear(in_features=512, out_features=256, bias=True)
          (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
          (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
          (dropout1): Dropout(p=0.1, inplace=False)
          (dropout2): Dropout(p=0.1, inplace=False)
        )
      )
      (norm): LayerNorm((256,), eps=1e-05, ele

In [11]:
def train(model, optimizer, criterion, dataloader, device):
    model.train()
    total_loss = 0
    for src, tgt in dataloader:
        src, tgt = src.to(device), tgt.to(device)
        
        optimizer.zero_grad()
        output = model(src, tgt[:, :-1])
        loss = criterion(output.contiguous().view(-1, output.size(-1)), tgt[:, 1:].contiguous().view(-1))
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    return total_loss / len(dataloader)

def evaluate(model, criterion, dataloader, device):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for src, tgt in dataloader:
            src, tgt = src.to(device), tgt.to(device)
            
            output = model(src, tgt[:, :-1])
            loss = criterion(output.contiguous().view(-1, output.size(-1)), tgt[:, 1:].contiguous().view(-1))
            
            total_loss += loss.item()
    
    return total_loss / len(dataloader)

In [12]:
from timeit import default_timer as timer

# 定义优化器和损失函数
optimizer = optim.Adam(model.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)
criterion = nn.CrossEntropyLoss(ignore_index=zh_vocab['<pad>'])

# 训练循环
NUM_EPOCHS = 100
patience = 5  # 连续5个epoch验证损失没有改善就停止
best_val_loss = float('inf')
no_improvement = 0

for epoch in range(1, NUM_EPOCHS+1):
    start_time = timer()
    
    train_loss = train(model, optimizer, criterion, train_loader, device)
    
    end_time = timer()
    
    val_loss = evaluate(model, criterion, test_loader, device)
    
    print(f"Epoch: {epoch}\tTrain loss: {train_loss:.3f}, Val loss: {val_loss:.3f}, "
          f"Epoch time = {(end_time - start_time):.3f}s")
    
    # 检查验证损失是否改善
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        no_improvement = 0
    else:
        no_improvement += 1
    
    # 如果连续多个epoch没有改善，则停止训练
    if no_improvement >= patience:
        print(f"验证损失在连续{patience}个轮次内没有改善。提前停止训练。")
        break

Epoch: 1	Train loss: 5.493, Val loss: 4.995, Epoch time = 5.718s
Epoch: 2	Train loss: 4.485, Val loss: 4.657, Epoch time = 5.554s
Epoch: 3	Train loss: 4.199, Val loss: 4.410, Epoch time = 5.440s
Epoch: 4	Train loss: 3.984, Val loss: 4.320, Epoch time = 5.624s
Epoch: 5	Train loss: 3.806, Val loss: 4.171, Epoch time = 6.000s
Epoch: 6	Train loss: 3.656, Val loss: 4.126, Epoch time = 5.693s
Epoch: 7	Train loss: 3.523, Val loss: 4.019, Epoch time = 5.685s
Epoch: 8	Train loss: 3.406, Val loss: 3.946, Epoch time = 5.635s
Epoch: 9	Train loss: 3.294, Val loss: 3.892, Epoch time = 5.722s
Epoch: 10	Train loss: 3.193, Val loss: 3.819, Epoch time = 5.493s
Epoch: 11	Train loss: 3.097, Val loss: 3.767, Epoch time = 5.732s
Epoch: 12	Train loss: 3.006, Val loss: 3.723, Epoch time = 5.479s
Epoch: 13	Train loss: 2.920, Val loss: 3.676, Epoch time = 5.573s
Epoch: 14	Train loss: 2.838, Val loss: 3.600, Epoch time = 5.709s
Epoch: 15	Train loss: 2.760, Val loss: 3.568, Epoch time = 5.644s
Epoch: 16	Train los

In [13]:
# 保存最终模型及其他信息
torch.save({
    'epoch': NUM_EPOCHS,
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'train_loss': train_loss,
    'val_loss': val_loss,
}, 'transformer_zh_en.pth')

In [14]:
# 加载保存的模型
checkpoint = torch.load('transformer_zh_en.pth')
model.load_state_dict(checkpoint['model_state_dict'])
model.eval()  # 设置为评估模式

TransformerModel(
  (src_embedding): Embedding(6521, 256)
  (tgt_embedding): Embedding(11340, 256)
  (positional_encoding): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer): Transformer(
    (encoder): TransformerEncoder(
      (layers): ModuleList(
        (0-2): 3 x TransformerEncoderLayer(
          (self_attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True)
          )
          (linear1): Linear(in_features=256, out_features=512, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (linear2): Linear(in_features=512, out_features=256, bias=True)
          (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
          (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
          (dropout1): Dropout(p=0.1, inplace=False)
          (dropout2): Dropout(p=0.1, inplace=False)
        )
      )
      (norm): LayerNorm((256,), eps=1e-05, ele

In [15]:
def translate(model, src_sentence, max_length=50):
    model.eval()
    src_tokens = [en_vocab.get(token.text.lower(), en_vocab['<unk>']) 
                  for token in spacy_en.tokenizer(src_sentence)]
    src_tokens = [en_vocab['<bos>']] + src_tokens + [en_vocab['<eos>']]
    src_tensor = torch.LongTensor(src_tokens).unsqueeze(0).to(device)
    
    tgt_tokens = [zh_vocab['<bos>']]
    for i in range(max_length):
        tgt_tensor = torch.LongTensor(tgt_tokens).unsqueeze(0).to(device)
        with torch.no_grad():
            output = model(src_tensor, tgt_tensor)
        
        pred_token = output.argmax(2)[:, -1].item()
        tgt_tokens.append(pred_token)
        
        if pred_token == zh_vocab['<eos>']:
            break
    
    return ' '.join([inv_zh_vocab[token] for token in tgt_tokens[1:-1]])

In [20]:
test_datas = random.sample(val_data, 5)
test_datas

[("It's next to impossible to finish it in a day.", '在一天之内完成它几乎是不可能的。'),
 ("I don't eat oranges.", '我不吃橙子。'),
 ('They have been married two years.', '他们已经结婚两年了。'),
 ('She assumed an air of indifference.', '她假装不在意。'),
 ('Tom is our oldest son.', '汤姆是我们最大的儿子。')]

In [21]:
# 测试翻译
test_sentences = [pair[0] for pair in test_datas]

for sentence in test_sentences:
    translation = translate(model, sentence)
    print(f"英文: {sentence}")
    print(f"中文翻译: {translation}")
    print()

英文: It's next to impossible to finish it in a day.
中文翻译: 下 一次 有 可能 完成 它 。

英文: I don't eat oranges.
中文翻译: 我 不吃 吃 完 了 。

英文: They have been married two years.
中文翻译: 他们 已经 结婚 了 。

英文: She assumed an air of indifference.
中文翻译: 她 怕 了 一个 饥饿 的 我怕 。

英文: Tom is our oldest son.
中文翻译: 汤姆 是 我们 的 儿子 。

