# 1. Numpy RNN Cell step

In [1]:
import numpy as np

In [5]:
def rnncell_step(x_t, h_prev, Wx, Wh, bh):
    """
    input:
    x_t: [B, D_in]
    h_prev: [B, H]
    Wx: [D_in, H]
    Wh: [H, H]
    bh: [H]
    output:
    h_t: [B, H]
    """
    h_t = np.tanh(x_t @ Wx + h_prev @ Wh + bh)
    return h_t

In [6]:
B, T, D_in, H = 2, 5, 4, 8
np.random.seed(42)
X = np.random.randn(B, T, D_in).astype(np.float32)
Wx = np.random.randn(D_in, H).astype(np.float32)
Wh = np.random.randn(H, H).astype(np.float32)
bh = np.zeros(H, dtype=np.float32)

In [7]:
h = np.zeros((B, H), dtype=np.float32)
Hs = []
for t in range(T):
    h = rnncell_step(X[:, t, :], h, Wx, Wh, bh) #time 
    Hs.append(h)
H_all = np.stack(Hs, axis=1) # [B, T, H]
print(f"H_all shape {H_all.shape}")

H_all shape (2, 5, 8)


# 2. PyTorch

## 2.1 char level LM

In [8]:
import torch, torch.nn as nn, torch.nn.utils as nn_utils

In [9]:
text = "hello world! " * 200
vocab = sorted(set(text))

In [12]:
vocab

[' ', '!', 'd', 'e', 'h', 'l', 'o', 'r', 'w']

In [14]:
stoi = {c:i for i, c in enumerate(vocab)}
itos = {i:c for c, i in stoi.items()}
ids = torch.tensor([stoi[c] for c in text], dtype=torch.long)

In [16]:
ids[:30]

tensor([4, 3, 5, 5, 6, 0, 8, 6, 7, 5, 2, 1, 0, 4, 3, 5, 5, 6, 0, 8, 6, 7, 5, 2,
        1, 0, 4, 3, 5, 5])

In [17]:
B, T = 16, 25

In [23]:
def get_batch():
    ix = torch.randint(0, len(ids) - T - 1, (B,)) #randint(low, high, size)
    x = torch.stack([ids[i:i+T] for i in ix])
    y = torch.stack([ids[i+1:i+T+1] for i in ix])
    return x, y

In [20]:
class CharRNNCellLM(nn.Module):
    def __init__(self, vocab_size, d_emb=32, hidden=64):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, d_emb)
        self.cell = nn.RNNCell(d_emb, hidden, nonlinearity="tanh")
        self.proj = nn.Linear(hidden, vocab_size)

    def forward(self, x): #x: [B,T]
        B, T = x.shape
        h = torch.zeros(B, self.cell.hidden_size, device=x.device)
        logits_all = []
        for t in range(T):
            e_t = self.emb(x[:, t]) # [B, d_emb]
            h = self.cell(e_t, h) #[B, H]
            logits_all.append(self.proj(h)) #[B,V]
        return torch.stack(logits_all, dim=1) #[B,T,V]

In [21]:
device = torch.device("mps")
model = CharRNNCellLM(len(vocab)).to(device)
opt = torch.optim.Adam(model.parameters(), lr=3e-3)
loss_fn = nn.CrossEntropyLoss()

In [25]:
for step in range(201):
    x, y = get_batch()
    x, y = x.to(device), y.to(device)
    logits = model(x)
    loss = loss_fn(logits.reshape(-1, logits.size(-1)), y.reshape(-1))
    opt.zero_grad()
    loss.backward()
    nn_utils.clip_grad_norm_(model.parameters(), max_norm=1.0)  #gradient clipping
    opt.step()
    if step % 50 == 0:
        print(f"[step {step}] loss={loss.item():.3f}")

[step 0] loss=2.278
[step 50] loss=0.038
[step 100] loss=0.027
[step 150] loss=0.016
[step 200] loss=0.024


In [26]:
#check after training

import torch.nn.functional as F

In [31]:
def generate_char_lm_torch(model, stoi, itos, prompt="hel", max_new_tokens=100, temperature=1.0, top_k=None, device="mps"):
    """
    model: the trained CharRNNCellLM
    stoi/itos: string<->id 
    prompt: starting chars
    temperature: temperature sampling，>1 更发散，<1 更确定
    top_k: 只在概率最高的 k 个字符里采样（None 关闭）
    """
    model.eval()

    #1) prompt preheat h
    h = torch.zeros(1, model.cell.hidden_size, device=device)
    for ch in prompt[:-1]:
        idx = torch.tensor([stoi[ch]], device=device)
        e = model.emb(idx)
        h = model.cell(e, h)

    #2) starting from last char of prompt and generating
    last_idx = torch.tensor([stoi[prompt[-1]]], device=device)
    generated = list(prompt)

    for _ in range(max_new_tokens):
        e = model.emb(last_idx)
        h = model.cell(e, h)
        logits = model.proj(h)
        logits = logits / max(temperature, 1e-8)

        if top_k is not None:
            vals, inds = torch.topk(logits, k=top_k, dim=-1)
            probs = F.softmax(vals, dim=-1)
            nxt = inds[0, torch.multinomial(probs[0], 1)]
        else:
            probs = F.softmax(logits, dim=-1)
            nxt = torch.multinomial(probs[0], 1)[0]

        generated.append(itos[int(nxt)])
        last_idx = nxt.view(1)

    return "".join(generated)

In [32]:
sample = generate_char_lm_torch(
    model, stoi, itos, 
    prompt="hel", max_new_tokens=120, 
    temperature=0.8, top_k=5, device=device
)
print("\n[PyTorch sample]\n", sample)


[PyTorch sample]
 hello world! hello world! hello world! hello world! hello world! hello world! hello world! hello world! hello world! hello 


## 2.2 Embedding -> RNN -> Last state -> Linear for sequence classification

In [33]:
corpus = [
    "this movie is great and excellent",
    "fantastic film wonderful direction",
    "good plot amazing soundtrack",
    "touching story strong performances",
    "brilliant engaging narrative",
    "bad pacing awful movie",
    "boring film dull characters",
    "terrible editing horrible dialogue",
    "predictable script poor scenes",
    "unwatchable messy scenes weak plot",
]
labels = torch.tensor([1,1,1,1,1, 0,0,0,0,0], dtype=torch.long)

In [34]:
import re

In [35]:
def tok(s):
    return re.findall(r"[a-z]+", s.lower())

In [36]:
vocab = sorted({w for s in corpus for w in tok(s)})
stoi = {w:i+1 for i, w in enumerate(vocab)}
pad = 0

In [37]:
def encode(s, T=10):
    ids = [stoi.get(w, 0) for w in tok(s)][:T]
    return torch.tensor(ids + [pad] * (T - len(ids)), dtype=torch.long)

In [38]:
X = torch.stack([encode(s) for s in corpus]) #[N, T]

In [41]:
class SentRNN(nn.Module):
    def __init__(self, V, d_emb=64, hidden=64, num_classes=2):
        super().__init__()
        self.emb = nn.Embedding(V + 1, d_emb, padding_idx=pad)
        self.rnn = nn.RNN(d_emb, hidden, batch_first=True)
        self.fc = nn.Linear(hidden, num_classes)

    def forward(self, x):
        e = self.emb(x) # [B,T,d]
        out, hT = self.rnn(e) # hT: [1, B, H]
        return self.fc(hT.squeeze(0))

In [42]:
model = SentRNN(len(vocab)).to(device)
opt = torch.optim.Adam(model.parameters(), lr=3e-3)
ce = nn.CrossEntropyLoss()

In [46]:
X = X.to(device)
labels = labels.to(device)

In [47]:
for ep in range(50):
    logits = model(X)
    loss = ce(logits, labels)
    opt.zero_grad()
    loss.backward()
    nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    opt.step()
    if ep % 10 == 0:
        acc = (logits.argmax(1) == labels).float().mean().item()
        print(f"[ep {ep}] loss={loss.item():.3f} acc={acc:.2f}")

[ep 0] loss=0.705 acc=0.50
[ep 10] loss=0.026 acc=1.00
[ep 20] loss=0.001 acc=1.00
[ep 30] loss=0.000 acc=1.00
[ep 40] loss=0.000 acc=1.00


In [49]:
model.eval()
with torch.no_grad():
    logits_tr = model(X)                # [N, num_classes]
    y_pred_tr = logits_tr.argmax(1)
    train_acc  = (y_pred_tr == labels).float().mean().item()
print(f"[Eval] Train accuracy = {train_acc:.3f}")

[Eval] Train accuracy = 1.000


In [59]:
label_names = {0: "NEGATIVE", 1: "POSITIVE"}  # 按你labels的定义来
def predict(texts, T=10, return_proba=True):
    model.eval()
    device = next(model.parameters()).device
    X_new = torch.stack([encode(s, T=T) for s in texts]).to(device)
    with torch.no_grad():
        logits = model(X_new)                         # [B, 2]
        probs  = torch.softmax(logits, dim=1).cpu()   # [B, 2]
        pred   = probs.argmax(1).numpy()
    if return_proba:
        return pred, probs.numpy()
    return pred

samples = [
    "this film is wonderful and touching",
    "awful boring movie with dull characters"
]
pred, prob = predict(samples, T=10, return_proba=True)
for s, p, pr in zip(samples, pred, prob):
    print(f"[Predict] {s!r} -> {label_names[int(p)]}  probs={pr}")

[Predict] 'this film is wonderful and touching' -> POSITIVE  probs=[0.27460364 0.72539634]
[Predict] 'awful boring movie with dull characters' -> POSITIVE  probs=[7.4766442e-04 9.9925226e-01]
