In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from datasets import load_dataset
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
from collections import Counter
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# 1. 加载 IMDb数据集（小版本）
dataset = load_dataset("imdb", split="train[:10%]")  # 只取10%训练数据，加速测试
test_dataset = load_dataset("imdb", split="test[:10%]")

print(f"Train samples: {len(dataset)}")
print(f"Test samples: {len(test_dataset)}")

Train samples: 2500
Test samples: 2500


In [3]:
print(dataset)

Dataset({
    features: ['text', 'label'],
    num_rows: 2500
})


In [4]:
for i in range(3):
    item = dataset[i]
    text = item["text"]
    label = item["label"]
    print(f"Sample {i}:")
    print("Text:", text)
    print("Label:", label)
    print("-" * 30)


Sample 0:
Text: I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and 

In [5]:
# 2. 建立词表
all_text = " ".join(dataset["text"])
words = all_text.lower().split()
word_counts = Counter(words)
vocab = {word: idx+2 for idx, (word, _) in enumerate(word_counts.items())}  # +2为了留0给PAD，1给UNK
vocab["<PAD>"] = 0
vocab["<UNK>"] = 1

print(f"Vocab size: {len(vocab)}")

Vocab size: 55780


In [6]:
# 3. 定义编码函数
def encode(text):
    return [vocab.get(word, vocab["<UNK>"]) for word in text.lower().split()]

In [7]:
# 4. 准备数据Loader
def collate_fn(batch):
    texts, labels = zip(*batch)
    encoded = [torch.tensor(encode(text)) for text in texts]
    padded = pad_sequence(encoded, batch_first=True, padding_value=vocab["<PAD>"])
    labels = torch.tensor(labels)
    return padded, labels

train_data = list(zip(dataset["text"], dataset["label"]))
test_data  = list(zip(test_dataset["text"], test_dataset["label"]))

train_loader = DataLoader(train_data, batch_size=64, shuffle=True, collate_fn=collate_fn)
test_loader  = DataLoader(test_data, batch_size=64, shuffle=False, collate_fn=collate_fn)

In [8]:
# 5. 定义模型
class SentimentRNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size, output_size):
        super(SentimentRNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        x = self.embedding(x)
        out, h_n = self.rnn(x)
        out = h_n[-1]  # 取最后一个隐藏状态
        out = self.fc(out)
        return out

model = SentimentRNN(vocab_size=len(vocab), embedding_dim=128, hidden_size=128, output_size=2)
print(model)

SentimentRNN(
  (embedding): Embedding(55780, 128)
  (rnn): RNN(128, 128, batch_first=True)
  (fc): Linear(in_features=128, out_features=2, bias=True)
)


In [9]:
# 6. 损失函数 & 优化器
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [10]:
# 7. 训练模型
epochs = 5
for epoch in range(epochs):
    model.train()
    running_loss = 0.0
    for inputs, labels in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
        outputs = model(inputs)
        loss = criterion(outputs, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    print(f"Epoch [{epoch+1}/{epochs}], Loss: {running_loss/len(train_loader):.4f}")

print("Training complete!")

Epoch 1: 100%|██████████| 40/40 [00:09<00:00,  4.31it/s]


Epoch [1/5], Loss: 0.0413


Epoch 2: 100%|██████████| 40/40 [00:08<00:00,  4.49it/s]


Epoch [2/5], Loss: 0.0009


Epoch 3: 100%|██████████| 40/40 [00:08<00:00,  4.77it/s]


Epoch [3/5], Loss: 0.0005


Epoch 4: 100%|██████████| 40/40 [00:09<00:00,  4.06it/s]


Epoch [4/5], Loss: 0.0003


Epoch 5: 100%|██████████| 40/40 [00:10<00:00,  3.69it/s]

Epoch [5/5], Loss: 0.0003
Training complete!





In [11]:
# 8. 测试模型
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for inputs, labels in test_loader:
        outputs = model(inputs)
        preds = torch.argmax(outputs, dim=1)
        correct += (preds == labels).sum().item()
        total += labels.size(0)

print(f"Test Accuracy: {100 * correct / total:.2f}%")

Test Accuracy: 100.00%


## 参数量计算：RNN隐藏层

对于一层标准的 `nn.RNN(input_size, hidden_size)`，它包含以下可训练参数：

| 参数 | 尺寸 | 说明 |
|:---|:---|:---|
| $W_{ih}$ | (hidden_size, input_size) | 输入到隐藏层的权重 |
| $ W_{hh} $ | (hidden_size, hidden_size) | 隐藏层到隐藏层的权重 |
| $ b_{ih} $ | (hidden_size,) | 输入到隐藏层的偏置 |
| $ b_{hh} $ | (hidden_size,) | 隐藏到隐藏层的偏置 |

所以参数总量计算公式为：

$
\text{参数总量} = (hidden\_size \times input\_size) + (hidden\_size \times hidden\_size) + (hidden\_size) + (hidden\_size)
$

---

### 在本项目中：

- `input_size = 128`
- `hidden_size = 128`

代入公式：

$
\text{参数总量} = (128 \times 128) + (128 \times 128) + 128 + 128 = 33,024
$

---

### 小结

- $ W_{ih} $：128 × 128 = 16,384个参数
- $ W_{hh} $：128 × 128 = 16,384个参数
- $ b_{ih} $：128个参数
- $ b_{hh} $：128个参数

**总计：33,024个可训练参数。**

