# Encoder-Decoder Model

The **Encoder-Decoder framework** is a fundamental architecture in deep learning for tackling **sequence-to-sequence (Seq2Seq)** tasks. It is widely used in machine translation, text summarization, dialogue generation, speech recognition, and more. The core idea is to encode an input sequence into a fixed-length contextual representation and then decode it into a target sequence.

1. **Core Components**
**Encoder**
Converts the input sequence (e.g., a sentence, audio signal) into a **context vector** that captures high-level semantic information.

Common architectures: RNNs (LSTM, GRU), Transformer Encoder, or CNNs.
Process: Processes the input step-by-step (e.g., word-by-word) and compresses it into a fixed-length vector (often the final hidden state).
**Decoder**
Generates the target sequence (e.g., translated text, summary) based on the context vector from the encoder.

Common architectures: RNNs (LSTM) or Transformer Decoder.
Generation: Operates in an **autoregressive** manner, producing outputs step-by-step while relying on previous predictions.

2. **Workflow**
**Encoding Phase**
The encoder processes the input sequence sequentially, updating its hidden states.
The final hidden state (or aggregated states) becomes the **context vector**.
**Decoding Phase**
The decoder initializes with the context vector and generates the target sequence.
At each step, it produces an output token (e.g., a word) and updates its hidden state until an end-of-sequence token (e.g., <EOS>) is generated or a maximum length is reached.

6. **Pros & Cons**
**Pros**:
* Versatile for diverse Seq2Seq tasks.
* Attention and Transformer variants address long-sequence limitations.

**Cons**:
* Early RNN-based models suffered from vanishing gradients.
* Generation errors (e.g., repetition) may require post-hoc optimization (e.g., beam search).

In [2]:
import random
import os
from string import ascii_lowercase

def generate_word(min_len=3, max_len=12):
    """生成随机单词，包含重复字母"""
    length = random.randint(min_len, max_len)
    return ''.join(random.choices(ascii_lowercase, k=length))

def generate_dataset(num_samples=1000, output_dir="data"):
    """生成训练数据集"""
    os.makedirs(output_dir, exist_ok=True)
    
    seen = set()
    source_path = os.path.join(output_dir, "source_data.txt")
    target_path = os.path.join(output_dir, "target_data.txt")
    
    with open(source_path, 'w') as src_f, open(target_path, 'w') as tgt_f:
        generated = 0
        while generated < num_samples:
            # 生成原始单词（可能包含重复字母）
            word = generate_word()
            
            # 生成排序后的目标
            sorted_word = ''.join(sorted(word))
            
            # 排除重复案例
            if (word, sorted_word) not in seen:
                seen.add((word, sorted_word))
                src_f.write(f"{word}\n")
                tgt_f.write(f"{sorted_word}\n")
                generated += 1
                
                # 生成包含重复字母的变体
                if random.random() < 0.3:
                    dup_word = word + random.choice(word)
                    dup_sorted = ''.join(sorted(dup_word))
                    if (dup_word, dup_sorted) not in seen:
                        seen.add((dup_word, dup_sorted))
                        src_f.write(f"{dup_word}\n")
                        tgt_f.write(f"{dup_sorted}\n")
                        generated += 1

    print(f"Generated {generated * 2} samples in {output_dir}/")

# 使用示例（生成包含10,000个样本的数据集）
generate_dataset(num_samples=10000, output_dir='data')

Generated 20000 samples in data/


In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
import random

# 数据预处理
def build_vocab(data):
    special_tokens = ['<PAD>', '<UNK>', '<GO>', '<EOS>']
    chars = set()
    for line in data:
        chars.update(line)
    chars = sorted(list(chars))
    vocab = special_tokens + chars
    int2vocab = {i: c for i, c in enumerate(vocab)}
    vocab2int = {c: i for i, c in int2vocab.items()}
    return int2vocab, vocab2int

def process_data(source_path, target_path):
    with open(source_path, 'r', encoding='utf-8') as f:
        source_data = f.read().split('\n')
    with open(target_path, 'r', encoding='utf-8') as f:
        target_data = f.read().split('\n')
    
    source_int2vocab, source_vocab2int = build_vocab(source_data)
    target_int2vocab, target_vocab2int = build_vocab(target_data)
    
    # 转换序列并添加EOS
    source_sequences = []
    for line in source_data:
        seq = [source_vocab2int.get(c, source_vocab2int['<UNK>']) for c in line]
        source_sequences.append(seq)
    
    target_sequences = []
    for line in target_data:
        seq = [target_vocab2int.get(c, target_vocab2int['<UNK>']) for c in line]
        seq.append(target_vocab2int['<EOS>'])
        target_sequences.append(seq)
    
    return (source_int2vocab, source_vocab2int, 
            target_int2vocab, target_vocab2int, 
            source_sequences, target_sequences)

# 数据集类
class Seq2SeqDataset(Dataset):
    def __init__(self, source, target):
        self.source = source
        self.target = target
        
    def __len__(self):
        return len(self.source)
    
    def __getitem__(self, idx):
        return self.source[idx], self.target[idx]

# 数据预处理函数
def collate_fn(batch, src_pad, trg_pad):
    sources, targets = zip(*batch)  # sources: [src1, src2, ...] targets: [trg1, trg2, ...]
    src_lens = [len(s) for s in sources]    
    trg_lens = [len(t) for t in targets]
    
    # 按源序列长度排序
    sorted_indices = sorted(range(len(src_lens)), key=lambda i: -src_lens[i])
    sources = [sources[i] for i in sorted_indices]
    targets = [targets[i] for i in sorted_indices]
    src_lens = [src_lens[i] for i in sorted_indices]
    trg_lens = [trg_lens[i] for i in sorted_indices]
    
    # 填充序列
    max_src = max(src_lens)
    max_trg = max(trg_lens)
    
    padded_sources = []
    for s in sources:
        padded = s + [src_pad] * (max_src - len(s))
        padded_sources.append(padded)
    
    padded_targets = []
    for t in targets:
        padded = t + [trg_pad] * (max_trg - len(t))
        padded_targets.append(padded)
    
    return (torch.LongTensor(padded_sources).t().contiguous(),
            torch.LongTensor(padded_targets).t().contiguous(),
            src_lens,
            trg_lens)

# 编码器
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim, padding_idx=0)
        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout=dropout)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, src, src_lens):
        embedded = self.dropout(self.embedding(src))
        packed = nn.utils.rnn.pack_padded_sequence(embedded, src_lens)
        outputs, (hidden, cell) = self.rnn(packed)
        outputs, _ = nn.utils.rnn.pad_packed_sequence(outputs)
        return hidden, cell

# 解码器
class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        self.output_dim = output_dim
        self.embedding = nn.Embedding(output_dim, emb_dim, padding_idx=0)
        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout=dropout)
        self.fc = nn.Linear(hid_dim, output_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, input, hidden, cell):
        input = input.unsqueeze(0)
        embedded = self.dropout(self.embedding(input))
        output, (hidden, cell) = self.rnn(embedded, (hidden, cell))
        prediction = self.fc(output.squeeze(0))
        return prediction, hidden, cell

# Seq2Seq模型
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        
    def forward(self, src, trg, src_lens, teacher_forcing_ratio=0.5):
        batch_size = trg.shape[1]
        trg_len = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim
        
        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size).to(self.device)
        
        hidden, cell = self.encoder(src, src_lens)
        
        input = trg[0,:]
        for t in range(1, trg_len):
            output, hidden, cell = self.decoder(input, hidden, cell)
            outputs[t] = output
            teacher_force = random.random() < teacher_forcing_ratio
            top1 = output.argmax(1)
            input = trg[t] if teacher_force else top1
            
        return outputs

# 训练函数
def train(model, iterator, optimizer, criterion, clip):
    model.train()
    epoch_loss = 0
    for _, (src, trg, src_lens, trg_lens) in enumerate(iterator):
        src, trg = src.to(device), trg.to(device)
        optimizer.zero_grad()
        output = model(src, trg, src_lens)
        output_dim = output.shape[-1]
        output = output[1:].view(-1, output_dim)
        trg = trg[1:].view(-1)
        loss = criterion(output, trg)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        epoch_loss += loss.item()
    return epoch_loss / len(iterator)

# 参数设置
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
BATCH_SIZE = 128
ENC_EMB_DIM = 15
DEC_EMB_DIM = 15
HID_DIM = 50
N_LAYERS = 2
ENC_DROPOUT = 0.1
DEC_DROPOUT = 0.1
N_EPOCHS = 60
CLIP = 5
LEARNING_RATE = 0.001

# 数据加载
(source_int2vocab, source_vocab2int, 
 target_int2vocab, target_vocab2int, 
 source_sequences, target_sequences) = process_data('data/source_data.txt', 'data/target_data.txt')

dataset = Seq2SeqDataset(source_sequences, target_sequences)
data_loader = DataLoader(dataset, batch_size=BATCH_SIZE,
                        collate_fn=lambda x: collate_fn(x,  # [(src1, trg1), (src2, trg2), ..]
                                                       source_vocab2int['<PAD>'], 
                                                       target_vocab2int['<PAD>']))

# 初始化模型
enc = Encoder(len(source_vocab2int), ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
dec = Decoder(len(target_vocab2int), DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT)
model = Seq2Seq(enc, dec, device).to(device)
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
criterion = nn.CrossEntropyLoss(ignore_index=0)

# 训练循环
for epoch in range(N_EPOCHS):
    train_loss = train(model, data_loader, optimizer, criterion, CLIP)
    print(f'Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f}')

# 预测函数
def predict(model, src_sequence, src_vocab2int, trg_vocab2int, trg_int2vocab, max_len=20):
    model.eval()
    src = [src_vocab2int.get(c, src_vocab2int['<UNK>']) for c in src_sequence]
    src_tensor = torch.LongTensor(src).unsqueeze(1).to(device)
    src_len = [len(src)]
    
    with torch.no_grad():
        hidden, cell = model.encoder(src_tensor, src_len)
    
    trg_indexes = [trg_vocab2int['<GO>']]
    for _ in range(max_len):
        trg_tensor = torch.LongTensor([trg_indexes[-1]]).to(device)
        with torch.no_grad():
            output, hidden, cell = model.decoder(trg_tensor, hidden, cell)
        pred_token = output.argmax(1).item()
        trg_indexes.append(pred_token)
        if pred_token == trg_vocab2int['<EOS>']:
            break
    return ''.join([trg_int2vocab[i] for i in trg_indexes[1:-1]])

Epoch: 01 | Train Loss: 2.913
Epoch: 02 | Train Loss: 2.258
Epoch: 03 | Train Loss: 1.964
Epoch: 04 | Train Loss: 1.750
Epoch: 05 | Train Loss: 1.573
Epoch: 06 | Train Loss: 1.425
Epoch: 07 | Train Loss: 1.308
Epoch: 08 | Train Loss: 1.205
Epoch: 09 | Train Loss: 1.114
Epoch: 10 | Train Loss: 1.024
Epoch: 11 | Train Loss: 0.943
Epoch: 12 | Train Loss: 0.879
Epoch: 13 | Train Loss: 0.811
Epoch: 14 | Train Loss: 0.759
Epoch: 15 | Train Loss: 0.706
Epoch: 16 | Train Loss: 0.661
Epoch: 17 | Train Loss: 0.620
Epoch: 18 | Train Loss: 0.577
Epoch: 19 | Train Loss: 0.535
Epoch: 20 | Train Loss: 0.502
Epoch: 21 | Train Loss: 0.474
Epoch: 22 | Train Loss: 0.444
Epoch: 23 | Train Loss: 0.414
Epoch: 24 | Train Loss: 0.392
Epoch: 25 | Train Loss: 0.369
Epoch: 26 | Train Loss: 0.350
Epoch: 27 | Train Loss: 0.334
Epoch: 28 | Train Loss: 0.310
Epoch: 29 | Train Loss: 0.293
Epoch: 30 | Train Loss: 0.279
Epoch: 31 | Train Loss: 0.266
Epoch: 32 | Train Loss: 0.247
Epoch: 33 | Train Loss: 0.239
Epoch: 34 

In [16]:
# 示例预测
input_word = 'whoami'
prediction = predict(model, input_word, source_vocab2int, target_vocab2int, target_int2vocab)
print(f'Input: {input_word} => Prediction: {prediction}')

Input: whoami => Prediction: himow
