## RNN

In this part, you will train a vanilla RNN language model on《论语》and evaluate its perplexity.

### 0. Import Necessary Libraries

In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

prepare data 

In [6]:
input_file = '../data/lunyu_20chapters.txt'

from util import CorpusReader
corpus = CorpusReader(inputFileName=input_file, min_count=1)

Total vocabulary: 1352


In [7]:
### START YOUR CODE ###
# Modify word2id to make 0 as the padding token '[PAD]', and increase the index of all other words by 1
# Modify the id2word list to make the first word '[PAD]' as well
# Hint: Both word2id and id2word in utils.CorpusReader are dict objects
word2id = {}
word2id['[PAD]'] = 0
for word, id in corpus.word2id.items():
    word2id[word] = id + 1

id2word = {}
id2word[0] = '[PAD]'
for id, word in corpus.id2word.items():
    id2word[id + 1] = word
### END YOUR CODE ###


In [8]:

# Test result
print('id2word:', sorted(list(id2word.items()), key=lambda x: x[0])[:5])
print('word2id:', sorted(list(word2id.items()), key=lambda x: x[1])[:5])

# You should expect to see:
# id2word: [(0, '[PAD]'), (1, '，'), (2, '子'), (3, '。'), (4, '：')]
# word2id: [('[PAD]', 0), ('，', 1), ('子', 2), ('。', 3), ('：', 4)]


id2word: [(0, '[PAD]'), (1, '，'), (2, '子'), (3, '。'), (4, '：')]
word2id: [('[PAD]', 0), ('，', 1), ('子', 2), ('。', 3), ('：', 4)]


In [9]:
with open(input_file, 'r', encoding='utf-8') as f:
    lines = f.readlines()
    max_len = max([len(line.strip()) for line in lines])
line_words = [list(line.strip()) for line in lines]
seq_ids = [torch.tensor([word2id.get(word, 0) for word in words]) for words in line_words]
seq_lens = torch.tensor([len(ids) for ids in seq_ids])
seq_ids_padded = nn.utils.rnn.pad_sequence(seq_ids, batch_first=True)
seq_ids_padded.size()

torch.Size([512, 393])

In [10]:
embedding_lunyu = nn.Embedding(len(word2id), 50) # vocab_size, embedding_dim
rnn_lunyu = nn.RNN(50, 100, batch_first=True)
seq_embs = embedding_lunyu(seq_ids_padded)
seq_embs_packed = nn.utils.rnn.pack_padded_sequence(seq_embs, seq_lens, batch_first=True, enforce_sorted=False)
out_packed, _ = rnn_lunyu(seq_embs_packed)
out_unpacked, _ = nn.utils.rnn.pad_packed_sequence(out_packed, batch_first=True)

In [11]:

# Test result
print('max length: ', max_len)
print('seq_ids_padded:', seq_ids_padded.size())
print('seq_embs:', seq_embs.size())
print('out_unpacked:', out_unpacked.size())

# You should expect to see:
# seq_ids_padded: torch.Size([512, 393])
# seq_embs: torch.Size([512, 393, 50])
# out_unpacked: torch.Size([512, 393, 100])

max length:  393
seq_ids_padded: torch.Size([512, 393])
seq_embs: torch.Size([512, 393, 50])
out_unpacked: torch.Size([512, 393, 100])


prepare target label

In [12]:
seq_ids_padded[0][:50]

tensor([  2,   5,   4,  47,   9, 225, 545,   6,   1,   7,  66, 131,  20,  10,
         15, 267, 132, 106, 179, 246,   1,   7,  66,  64,  20,  10,  12,   7,
         30,   9,   7, 546,   1,   7,  66,  19,   2,  20,  10,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0])

In [13]:
targets_padded = torch.zeros_like(seq_ids_padded)
padding_id = 0

for i in range(seq_ids_padded.size(0)):
    targets_padded[i, :-1] = seq_ids_padded[i, 1:] # Shift the sequence to the left by 1
    targets_padded[i, -1] = padding_id # Set the last token to be the padding token


In [14]:
# Test result
print('targets_padded:', targets_padded.size())
print('last column of targets_padded:', targets_padded[:, -1][:10])

print('seq_ids_padded[0][:50]:', seq_ids_padded[0][:50])
print('targets_padded[0][:50]:', targets_padded[0][:50])

# You should expect to see:
# targets_padded: torch.Size([512, 393])
# last column of targets_padded: tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])


targets_padded: torch.Size([512, 393])
last column of targets_padded: tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
seq_ids_padded[0][:50]: tensor([  2,   5,   4,  47,   9, 225, 545,   6,   1,   7,  66, 131,  20,  10,
         15, 267, 132, 106, 179, 246,   1,   7,  66,  64,  20,  10,  12,   7,
         30,   9,   7, 546,   1,   7,  66,  19,   2,  20,  10,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0])
targets_padded[0][:50]: tensor([  5,   4,  47,   9, 225, 545,   6,   1,   7,  66, 131,  20,  10,  15,
        267, 132, 106, 179, 246,   1,   7,  66,  64,  20,  10,  12,   7,  30,
          9,   7, 546,   1,   7,  66,  19,   2,  20,  10,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0])


### Training Data

In [15]:
train_seq_ids = seq_ids
train_seq_lens = seq_lens

### START YOUR CODE ###
targets_padded = torch.zeros_like(seq_ids_padded)
padding_id = 0

for i in range(seq_ids_padded.size(0)):
    targets_padded[i, :-1] = seq_ids_padded[i, 1:] # Shift the sequence to the left by 1
    targets_padded[i, -1] = padding_id # Set the last token to be the padding token

### END YOUR CODE ###

# Test result
print('targets_padded:', targets_padded.size())
print('last column of targets_padded[:20]:', targets_padded[:, -1][:20])

print('seq_ids_padded[0][:50]:', seq_ids_padded[0][:50])
print('targets_padded[0][:50]:', targets_padded[0][:50])
# You should expect to see:
# targets_padded: torch.Size([16, 85])
# last column of targets_padded: tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

targets_padded: torch.Size([512, 393])
last column of targets_padded[:20]: tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
seq_ids_padded[0][:50]: tensor([  2,   5,   4,  47,   9, 225, 545,   6,   1,   7,  66, 131,  20,  10,
         15, 267, 132, 106, 179, 246,   1,   7,  66,  64,  20,  10,  12,   7,
         30,   9,   7, 546,   1,   7,  66,  19,   2,  20,  10,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0])
targets_padded[0][:50]: tensor([  5,   4,  47,   9, 225, 545,   6,   1,   7,  66, 131,  20,  10,  15,
        267, 132, 106, 179, 246,   1,   7,  66,  64,  20,  10,  12,   7,  30,
          9,   7, 546,   1,   7,  66,  19,   2,  20,  10,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0])


### 2. Build the Model

In [16]:
embedding_rand = nn.Embedding(len(word2id), 50, padding_idx=0)
embedding_rand.weight.data.size()

torch.Size([1353, 50])

Model Architecture

<img src="../images/rnn_lm.png" alt="RNN LM" width="1000">

In [17]:
class RNNLM(nn.Module):
    def __init__(self, embedding: nn.Embedding):
        super(RNNLM, self).__init__()
        self.embedding = embedding
        self.rnn = nn.RNN(embedding.embedding_dim, hidden_size=100, batch_first=True)
        self.fc = nn.Linear(100, len(word2id))
        

    def forward(self, seq, seq_lens): # pass in raw word ids and sequence lengths
        padded_seqs = nn.utils.rnn.pad_sequence(seq, batch_first=True)
        padded_embs = self.embedding(padded_seqs)
        packed_embs = nn.utils.rnn.pack_padded_sequence(padded_embs, seq_lens.cpu(), batch_first=True, enforce_sorted=False)
        out_packed, _ = self.rnn(packed_embs)
        out_unpacked, _ = nn.utils.rnn.pad_packed_sequence(out_packed, batch_first=True)
        # print(out_unpacked.size()) # ([512, 393, 100])
        logits = self.fc(out_unpacked)
        log_probs = F.log_softmax(logits, dim=-1)
        return log_probs

### 3. Train and Evaluate

In [18]:
from torch import optim

In [19]:
# 初始化模型
model_rand = RNNLM(embedding_rand)
learning_rate = 0.03

In [20]:
loss_fn = nn.NLLLoss(ignore_index=0, reduction='none')
optimizer = optim.Adam(model_rand.parameters(), lr=learning_rate)

  from .autonotebook import tqdm as notebook_tqdm


Model Architecture

<img src="../images/perplexity.png" alt="perplexity" width="1000">

In [21]:
def train(model: RNNLM, seq, seq_len, targets_padded, loss_fn, optimizer, n_epochs=10):
    for epoch in range(n_epochs):
        model.train()

        optimizer.zero_grad()
        log_probs = model.forward(seq, seq_len)

        loss = loss_fn(log_probs.view(-1, len(word2id)), targets_padded.view(-1))
        loss = loss.mean()
        loss.backward()
        perplexity = torch.exp(loss)        
        optimizer.step()
        print(f'Epoch {epoch + 1}/{n_epochs}, Loss: {loss.item()}, Perplexity: {perplexity.item()}')


In [22]:
def evaluate(model: RNNLM, seq, seq_len, targets_padded, loss_fn):
    model.eval()
    with torch.no_grad():
        log_probs = model.forward(seq, seq_len)
        loss = loss_fn(log_probs.view(-1, len(word2id)), targets_padded.view(-1))
        loss = loss.mean()
        perplexity = torch.exp(loss)
        print(f'Evaluation Loss: {loss.item()}')
        print(f'Perplexity: {perplexity.item()}')
        

In [23]:
train(model_rand, seq_ids, seq_lens, targets_padded, loss_fn, optimizer, n_epochs=25)

Epoch 1/25, Loss: 0.6947746872901917, Perplexity: 2.0032577514648438
Epoch 2/25, Loss: 0.6268472075462341, Perplexity: 1.8717001676559448
Epoch 3/25, Loss: 0.5019248127937317, Perplexity: 1.6518977880477905
Epoch 4/25, Loss: 0.4673384130001068, Perplexity: 1.5957412719726562
Epoch 5/25, Loss: 0.46292901039123535, Perplexity: 1.5887205600738525
Epoch 6/25, Loss: 0.4716038107872009, Perplexity: 1.6025623083114624
Epoch 7/25, Loss: 0.4523650109767914, Perplexity: 1.5720256567001343
Epoch 8/25, Loss: 0.4347092807292938, Perplexity: 1.5445139408111572
Epoch 9/25, Loss: 0.4207722842693329, Perplexity: 1.5231374502182007
Epoch 10/25, Loss: 0.40994134545326233, Perplexity: 1.5067293643951416
Epoch 11/25, Loss: 0.3994125723838806, Perplexity: 1.4909486770629883
Epoch 12/25, Loss: 0.389690101146698, Perplexity: 1.4765231609344482
Epoch 13/25, Loss: 0.3803129196166992, Perplexity: 1.4627422094345093
Epoch 14/25, Loss: 0.37196075916290283, Perplexity: 1.4505760669708252
Epoch 15/25, Loss: 0.363483

### 4. Experiments

### Compute Perplexity (on training data)

Finally, compute the perplexity by exponentiating the average loss per sequence.

See the documentation here: https://pytorch.org/docs/stable/generated/torch.nn.NLLLoss.html


In [24]:
# random embedding
evaluate(model_rand, seq_ids, seq_lens, targets_padded, loss_fn)

Evaluation Loss: 0.28542959690093994
Perplexity: 1.3303333520889282


### Generate some sentences

In [25]:
def get_sentence(model, seq, max_length=20):
    model.eval()
    with torch.no_grad():
        current_tokens = seq
        for _ in range(max_length):
            current_tokens_tensor = torch.tensor([[word2id[word] for word in current_tokens]])
            seq_lens = torch.tensor([len(current_tokens)])
            # 调用模型，获取下一个单词的概率分布
            log_probs = model(current_tokens_tensor, seq_lens)
            # 从概率分布中采样下一个单词的索引
            next_word_index = torch.argmax(log_probs[:, -1, :], dim=-1).item()
            next_word = id2word[next_word_index]
            current_tokens.append(next_word)
            if next_word == '。':
                break
        return ''.join(current_tokens)

In [26]:
seq = ['天','下']
max_length = 20
get_sentence(model_rand, seq, max_length)

'天下之，曰：不知也。'

In [27]:
seq = ['子','曰']
max_length = 20
get_sentence(model_rand, seq, max_length)

'子曰：君子有三戒，不亦可谓也。'