In [2]:
from datasets import load_dataset
import torch
from torch.utils.data import DataLoader

# Load the CNN/DailyMail dataset (configuration "3.0.0").
dataset = load_dataset("cnn_dailymail", "3.0.0")

# Keep only a small leading slice of each split so training stays fast.
TRAIN_SUBSET_SIZE = 5000
VAL_SUBSET_SIZE = 500
train_data = dataset["train"].select(range(TRAIN_SUBSET_SIZE))
val_data = dataset["validation"].select(range(VAL_SUBSET_SIZE))

# Build a simple tokenizer (vocabulary derived automatically from the data)
from collections import Counter

def build_vocab(texts, max_size=10000):
    """Build a word -> id vocabulary from lower-cased, whitespace-split texts.

    Ids 0-3 are reserved for the special tokens ``<pad>``, ``<sos>``,
    ``<eos>`` and ``<unk>``. The remaining slots are filled with the most
    frequent words, in descending frequency order.

    Args:
        texts: Iterable of strings to count words from.
        max_size: Upper bound on the vocabulary size, special tokens included.

    Returns:
        dict mapping each token to its integer id.
    """
    vocab = {"<pad>": 0, "<sos>": 1, "<eos>": 2, "<unk>": 3}

    counter = Counter()
    for t in texts:
        counter.update(t.lower().split())

    # A corpus word that is literally "<unk>" (or another special token) must
    # not be re-inserted: doing so would overwrite its reserved id and leave
    # that id without any token mapped to it.
    for special in vocab:
        counter.pop(special, None)

    # Clamp so that a max_size smaller than the number of specials adds nothing.
    n_words = max(max_size - len(vocab), 0)
    for word, _ in counter.most_common(n_words):
        vocab[word] = len(vocab)
    return vocab

# Separate vocabularies for the articles (source side) and the highlights
# (target side), both built from the training subset only.
src_vocab = build_vocab(train_data["article"])
tgt_vocab = build_vocab(train_data["highlights"])
# Reverse lookup (id -> word) for the target vocabulary.
inv_tgt_vocab = {idx: word for word, idx in tgt_vocab.items()}

def encode(text, vocab, max_len=100, add_sos=False, add_eos=True):
    """Convert text into a fixed-length list of token ids.

    The text is lower-cased and split on whitespace. Words missing from
    ``vocab`` map to ``<unk>``. The result is truncated or right-padded with
    ``<pad>`` to exactly ``max_len`` ids.

    When ``add_eos`` is true, one slot is reserved for ``<eos>`` so that it
    survives truncation: a text that fills the whole window still ends with
    ``<eos>`` instead of having it sliced off.

    Args:
        text: Input string.
        vocab: Mapping token -> id containing the four special tokens.
        max_len: Exact length of the returned list.
        add_sos: Prepend ``<sos>`` when true.
        add_eos: Append ``<eos>`` after the (possibly truncated) words when true.

    Returns:
        list of ``max_len`` integer ids.
    """
    unk_id = vocab["<unk>"]
    # Number of positions available to <sos> + words before the optional <eos>.
    budget = max(max_len - (1 if add_eos else 0), 0)

    ids = [vocab["<sos>"]] if add_sos else []
    for w in text.lower().split():
        if len(ids) >= budget:
            break
        ids.append(vocab.get(w, unk_id))
    ids = ids[:budget]  # only matters when budget is 0 and <sos> was added

    if add_eos:
        ids.append(vocab["<eos>"])
    ids = ids[:max_len]
    ids += [vocab["<pad>"]] * (max_len - len(ids))
    return ids


  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`

In [3]:
class CNNDataset(torch.utils.data.Dataset):
    """Map-style dataset of pre-encoded (article, summary) id sequences.

    All texts are encoded once at construction time with ``encode``.
    Summaries are encoded with a leading ``<sos>`` token; articles are not.
    """

    def __init__(self, articles, summaries, src_vocab, tgt_vocab, max_src=100, max_tgt=30):
        # Source side: fixed-length id lists of size max_src.
        self.src = []
        for article in articles:
            self.src.append(encode(article, src_vocab, max_src))

        # Target side: fixed-length id lists of size max_tgt, starting with <sos>.
        self.tgt = []
        for summary in summaries:
            self.tgt.append(encode(summary, tgt_vocab, max_tgt, add_sos=True))

    def __len__(self):
        """Number of (article, summary) pairs."""
        return len(self.src)

    def __getitem__(self, i):
        """Return the i-th pair as two tensors built from the stored id lists."""
        source_ids = torch.tensor(self.src[i])
        target_ids = torch.tensor(self.tgt[i])
        return source_ids, target_ids

# Encode both splits with the vocabularies built from the training subset.
train_dataset = CNNDataset(
    articles=train_data["article"],
    summaries=train_data["highlights"],
    src_vocab=src_vocab,
    tgt_vocab=tgt_vocab,
)
val_dataset = CNNDataset(
    articles=val_data["article"],
    summaries=val_data["highlights"],
    src_vocab=src_vocab,
    tgt_vocab=tgt_vocab,
)

# Only the training batches are shuffled.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=32)
val_loader = DataLoader(val_dataset, batch_size=32)


In [4]:
import torch.nn as nn

class Encoder(nn.Module):
    """Embedding layer followed by a batch-first GRU over the source token ids."""

    def __init__(self, input_dim, emb_dim, hid_dim):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=input_dim, embedding_dim=emb_dim)
        self.rnn = nn.GRU(input_size=emb_dim, hidden_size=hid_dim, batch_first=True)

    def forward(self, src):
        """Embed ``src`` and run the GRU.

        Returns:
            The GRU's ``(outputs, hidden)`` pair: per-step outputs and the
            final hidden state.
        """
        return self.rnn(self.embedding(src))

class Attention(nn.Module):
    """Additive attention: scores each encoder step against the decoder state."""

    def __init__(self, hid_dim):
        super().__init__()
        self.attn = nn.Linear(hid_dim * 2, hid_dim)
        self.v = nn.Linear(hid_dim, 1, bias=False)

    def forward(self, hidden, encoder_outputs):
        """Return softmax-normalised attention weights of shape [batch, src_len].

        ``hidden`` is a 3-D tensor whose first two axes are swapped before
        use; presumably it is the single-layer GRU state [1, batch, hid_dim].
        """
        steps = encoder_outputs.size(1)
        # Swap the leading axes, then tile along the time axis:
        # [batch, src_len, hid_dim] for a one-layer state.
        query = hidden.transpose(0, 1).repeat(1, steps, 1)
        features = torch.cat((query, encoder_outputs), dim=2)
        scores = self.v(torch.tanh(self.attn(features))).squeeze(2)  # [batch, src_len]
        return scores.softmax(dim=1)

class Decoder(nn.Module):
    """Single-step GRU decoder that attends over the encoder outputs."""

    def __init__(self, output_dim, emb_dim, hid_dim, attention):
        super().__init__()
        self.embedding = nn.Embedding(output_dim, emb_dim)
        # The GRU consumes the token embedding concatenated with the context vector.
        self.rnn = nn.GRU(emb_dim + hid_dim, hid_dim, batch_first=True)
        # The output layer sees GRU output, context and embedding side by side.
        self.fc_out = nn.Linear(2 * hid_dim + emb_dim, output_dim)
        self.attention = attention

    def forward(self, input, hidden, encoder_outputs):
        """Decode one step.

        Args:
            input: 1-D tensor of token ids, one per batch element.
            hidden: Current recurrent state passed to the attention and the GRU.
            encoder_outputs: Encoder outputs to attend over.

        Returns:
            ``(prediction, hidden)``: vocabulary scores for this step and the
            updated recurrent state.
        """
        token_emb = self.embedding(input.unsqueeze(1))
        weights = self.attention(hidden, encoder_outputs).unsqueeze(1)
        context = torch.bmm(weights, encoder_outputs)
        step_out, hidden = self.rnn(torch.cat((token_emb, context), dim=2), hidden)
        features = torch.cat(
            (step_out.squeeze(1), context.squeeze(1), token_emb.squeeze(1)),
            dim=1,
        )
        return self.fc_out(features), hidden

class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes step by step with teacher forcing."""

    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, src, tgt, teacher_forcing_ratio=0.5):
        """Produce decoder scores for every target position.

        Position 0 of the result is left as zeros; decoding starts from
        ``tgt[:, 0]`` and fills positions 1..tgt_len-1. At each step the next
        input is the ground-truth token with probability
        ``teacher_forcing_ratio``, otherwise the decoder's own argmax.

        Returns:
            Tensor of shape [batch, tgt_len, vocab] on ``src``'s device.
        """
        batch_size, tgt_len = tgt.shape
        vocab_size = self.decoder.fc_out.out_features
        outputs = torch.zeros(batch_size, tgt_len, vocab_size, device=src.device)

        enc_outputs, hidden = self.encoder(src)
        prev_tok = tgt[:, 0]
        for step in range(1, tgt_len):
            logits, hidden = self.decoder(prev_tok, hidden, enc_outputs)
            outputs[:, step, :] = logits
            # One random draw per step decides between gold token and prediction.
            if torch.rand(1).item() < teacher_forcing_ratio:
                prev_tok = tgt[:, step]
            else:
                prev_tok = logits.argmax(1)
        return outputs


In [5]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Model dimensions: the vocabulary sizes define the embedding tables and the
# output layer; the embedding and hidden sizes are fixed hyperparameters.
INPUT_DIM, OUTPUT_DIM = len(src_vocab), len(tgt_vocab)
EMB_DIM, HID_DIM = 128, 256

encoder = Encoder(INPUT_DIM, EMB_DIM, HID_DIM)
attn = Attention(HID_DIM)
decoder = Decoder(OUTPUT_DIM, EMB_DIM, HID_DIM, attn)
model = Seq2Seq(encoder, decoder).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
# Padding positions in the target do not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=tgt_vocab["<pad>"])

NUM_EPOCHS = 3  # train for only a few epochs; this is a demo
for epoch in range(NUM_EPOCHS):
    model.train()
    total_loss = 0
    for src, tgt in train_loader:
        src = src.to(device)
        tgt = tgt.to(device)

        optimizer.zero_grad()
        logits = model(src, tgt)

        # Position 0 holds no prediction (the decoder starts from tgt[:, 0]),
        # so drop it and flatten batch and time for the loss.
        flat_logits = logits[:, 1:].reshape(-1, logits.shape[-1])
        flat_targets = tgt[:, 1:].reshape(-1)
        loss = criterion(flat_logits, flat_targets)

        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}, loss={total_loss/len(train_loader):.4f}")


Epoch 1, loss=7.0416
Epoch 2, loss=6.5664
Epoch 3, loss=6.1883


In [6]:
def summarize(model, text, max_len=30):
    """Greedy-decode a summary for ``text`` with a trained ``Seq2Seq`` model.

    The article is encoded with ``src_vocab`` (length 100, the same as the
    ``max_src`` default of ``CNNDataset``), then the decoder is unrolled from
    ``<sos>``, feeding back its own argmax, until it emits ``<eos>`` or
    ``max_len`` tokens have been produced.

    Args:
        model: Model exposing ``encoder`` and ``decoder`` submodules.
        text: Raw article string.
        max_len: Maximum number of tokens to generate.

    Returns:
        The generated tokens joined by single spaces.

    Note:
        Puts ``model`` into eval mode and leaves it there.
    """
    model.eval()
    eos_id = tgt_vocab["<eos>"]
    result = []
    # Inference only: without no_grad every decoder step would record an
    # autograd graph that is never used, wasting memory and time.
    with torch.no_grad():
        src = torch.tensor(encode(text, src_vocab, 100)).unsqueeze(0).to(device)
        enc_outputs, hidden = model.encoder(src)
        input_tok = torch.tensor([tgt_vocab["<sos>"]]).to(device)
        for _ in range(max_len):
            output, hidden = model.decoder(input_tok, hidden, enc_outputs)
            top1 = output.argmax(1)
            token_id = top1.item()
            if token_id == eos_id:
                break
            result.append(inv_tgt_vocab.get(token_id, "<unk>"))
            input_tok = top1
    return " ".join(result)

# Summarize the first validation example and show it next to the reference.
sample = val_data[0]
test_article = sample["article"]
print("Article:", test_article[:400], "...")
print("Gold summary:", sample["highlights"])
print("Pred summary:", summarize(model, test_article))


Article: (CNN)Share, and your gift will be multiplied. That may sound like an esoteric adage, but when Zully Broussard selflessly decided to give one of her kidneys to a stranger, her generosity paired up with big data. It resulted in six patients receiving transplants. That surprised and wowed her. "I thought I was going to help this one person who I don't know, but the fact that so many people can have a ...
Gold summary: Zully Broussard decided to give a kidney to a stranger .
A new computer program helped her donation spur transplants for six kidney patients .
Pred summary: <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> . <unk> . <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> . <unk>
