<a href="https://colab.research.google.com/github/nedokormysh/Stepik_Ai_edu_RNN/blob/week_5_char_RNN/AiEdu_CharRNN_Hometask.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Домашнее задание

В этом домашнем задании вы обучите рекуррентную сеть для генерации текстов в стиле Шекспира.

In [1]:
import numpy as np
import torch
import torch.nn.functional as F
from torch import nn

## Загрузим данные

Загрузим текстовый файл с пьесами Шекспира.

In [2]:
!wget https://raw.githubusercontent.com/aiedu-courses/rnn_bootcamp/main/shakespeare.txt

--2024-01-15 10:50:42--  https://raw.githubusercontent.com/aiedu-courses/rnn_bootcamp/main/shakespeare.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘shakespeare.txt’


2024-01-15 10:50:42 (57.3 MB/s) - ‘shakespeare.txt’ saved [1115394/1115394]



*Эта задача аналогична задаче, разобранной на вебинаре, поэтому код мы вам не предоставляем, а предлагаем или написать с нуля, или воспользоваться кодом с вебинара.*

In [3]:
# open text file and read in data as `text`
with open("shakespeare.txt", "r") as f:
    text = f.read()

In [4]:
text[:100]

'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou'

### Токенизация

In [5]:
chars = tuple(set(text))

int2char = dict(enumerate(chars))
char2int = {ch: ii for ii, ch in int2char.items()}

# encode the text
encoded = np.array([char2int[ch] for ch in text])

In [6]:
int2char.items()

dict_items([(0, 'V'), (1, '3'), (2, 'H'), (3, 'S'), (4, '?'), (5, 'J'), (6, 's'), (7, 'r'), (8, '.'), (9, 'Z'), (10, 'z'), (11, 'G'), (12, '$'), (13, 'B'), (14, 'Y'), (15, 'A'), (16, 'I'), (17, 'm'), (18, 'b'), (19, '\n'), (20, 'R'), (21, '&'), (22, 'y'), (23, 'P'), (24, '!'), (25, 'D'), (26, 'q'), (27, ':'), (28, 'd'), (29, 'p'), (30, "'"), (31, 'O'), (32, 'a'), (33, 'X'), (34, 'n'), (35, ' '), (36, 'o'), (37, 'c'), (38, 'M'), (39, 'Q'), (40, 'L'), (41, 'v'), (42, 'l'), (43, 'W'), (44, 'j'), (45, 'i'), (46, 'h'), (47, 'C'), (48, 'w'), (49, 'T'), (50, 'x'), (51, 'E'), (52, ';'), (53, 'f'), (54, ','), (55, 'g'), (56, '-'), (57, 'K'), (58, 'e'), (59, 'U'), (60, 'k'), (61, 'F'), (62, 't'), (63, 'N'), (64, 'u')])

In [7]:
# int2char

In [8]:
for ii, ch in int2char.items():
  print(ii, ch)

0 V
1 3
2 H
3 S
4 ?
5 J
6 s
7 r
8 .
9 Z
10 z
11 G
12 $
13 B
14 Y
15 A
16 I
17 m
18 b
19 

20 R
21 &
22 y
23 P
24 !
25 D
26 q
27 :
28 d
29 p
30 '
31 O
32 a
33 X
34 n
35  
36 o
37 c
38 M
39 Q
40 L
41 v
42 l
43 W
44 j
45 i
46 h
47 C
48 w
49 T
50 x
51 E
52 ;
53 f
54 ,
55 g
56 -
57 K
58 e
59 U
60 k
61 F
62 t
63 N
64 u


## Предобработка данных

In [9]:
def one_hot_encode(arr, n_labels):

    # Initialize the the encoded array
    one_hot = np.zeros((arr.size, n_labels), dtype=np.float32)

    # Fill the appropriate elements with ones
    one_hot[np.arange(one_hot.shape[0]), arr.flatten()] = 1.0
    # print(arr)
    # print(arr.flatten())
    # print(np.arange(one_hot.shape[0]))
    # print(one_hot[1])


    # Finally reshape it to get back to the original array
    one_hot = one_hot.reshape((*arr.shape, n_labels))

    return one_hot

## Создаем мини-батчи (mini-batchs)


In [10]:
def get_batches(int_words, batch_size, seq_length):
    # 1. Truncate text, so there are only full batches
    window_size = seq_length + 1
    batch_size_total = batch_size * window_size
    n_batches = len(int_words) // batch_size_total
    int_words = int_words[: n_batches * batch_size_total]

    # 2. Reshape into batch_size rows
    int_words = int_words.reshape((batch_size, -1))

    # 3. Iterate through the text matrix
    for position in range(0, int_words.shape[1], window_size):
        x = int_words[:, position : position + window_size - 1]
        y = int_words[:, position + 1 : position + window_size]
        yield x, y

## Зададим архитектуру

In [11]:
# check if GPU is available
train_on_gpu = torch.cuda.is_available()

if train_on_gpu:
    print("Training on GPU!")
else:
    print("No GPU available, training on CPU; consider making n_epochs very small.")

Training on GPU!


In [12]:
class CharRNN(nn.Module):
    def __init__(self, tokens, n_hidden=256, n_layers=2, drop_prob=0.5, lr=0.001):
        super().__init__()
        self.drop_prob = drop_prob
        self.n_layers = n_layers
        self.n_hidden = n_hidden
        self.lr = lr

        # creating character dictionaries
        self.chars = tokens
        self.int2char = dict(enumerate(self.chars))
        self.char2int = {ch: ii for ii, ch in self.int2char.items()}

        ## TODO: define the LSTM
        self.lstm = nn.LSTM(
            len(self.chars), n_hidden, n_layers, dropout=drop_prob, batch_first=True
        )

        ## TODO: define a dropout layer
        self.dropout = nn.Dropout(drop_prob)

        ## TODO: define the final, fully-connected output layer
        self.fc = nn.Linear(n_hidden, len(self.chars))

    def forward(self, x, hidden):
        """Forward pass through the network.
        These inputs are x, and the hidden/cell state `hidden`."""

        ## TODO: Get the outputs and the new hidden state from the lstm
        r_output, hidden = self.lstm(x, hidden)

        ## TODO: pass through a dropout layer
        out = self.dropout(r_output)

        # Stack up LSTM outputs using view
        # you may need to use contiguous to reshape the output
        out = out.contiguous().view(-1, self.n_hidden)

        ## TODO: put x through the fully-connected layer
        out = self.fc(out)

        # return the final output and the hidden state
        return out, hidden

    def init_hidden(self, batch_size):
        """Initializes hidden state"""
        # Create two new tensors with sizes n_layers x batch_size x n_hidden,
        # initialized to zero, for hidden state and cell state of LSTM
        weight = next(self.parameters()).data

        if train_on_gpu:
            hidden = (
                weight.new(self.n_layers, batch_size, self.n_hidden).zero_().cuda(),
                weight.new(self.n_layers, batch_size, self.n_hidden).zero_().cuda(),
            )
        else:
            hidden = (
                weight.new(self.n_layers, batch_size, self.n_hidden).zero_(),
                weight.new(self.n_layers, batch_size, self.n_hidden).zero_(),
            )

        return hidden

## Обучим модель

In [13]:
def train(
    net,
    data,
    epochs=10,
    batch_size=10,
    seq_length=50,
    lr=0.001,
    clip=5,
    val_frac=0.1,
    print_every=10,
):
    net.train()

    opt = torch.optim.Adam(net.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()

    # create training and validation data
    val_idx = int(len(data) * (1 - val_frac))
    data, val_data = data[:val_idx], data[val_idx:]

    if train_on_gpu:
        net.cuda()

    counter = 0
    n_chars = len(net.chars)
    for e in range(epochs):
        # initialize hidden state
        h = net.init_hidden(batch_size)

        for x, y in get_batches(data, batch_size, seq_length):
            counter += 1

            # One-hot encode our data and make them Torch tensors
            x = one_hot_encode(x, n_chars)
            inputs, targets = torch.from_numpy(x), torch.from_numpy(y)

            if train_on_gpu:
                inputs, targets = inputs.cuda(), targets.cuda()

            # Creating new variables for the hidden state, otherwise
            # we'd backprop through the entire training history
            h = tuple([each.data for each in h])

            # zero accumulated gradients
            net.zero_grad()

            # get the output from the model
            output, h = net(inputs, h)

            # calculate the loss and perform backprop
            loss = criterion(output, targets.view(batch_size * seq_length).long())
            loss.backward()
            # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
            nn.utils.clip_grad_norm_(net.parameters(), clip)
            opt.step()

            # loss stats
            if counter % print_every == 0:
                # Get validation loss
                val_h = net.init_hidden(batch_size)
                val_losses = []
                net.eval()
                for x, y in get_batches(val_data, batch_size, seq_length):
                    # One-hot encode our data and make them Torch tensors
                    x = one_hot_encode(x, n_chars)
                    x, y = torch.from_numpy(x), torch.from_numpy(y)

                    # Creating new variables for the hidden state, otherwise
                    # we'd backprop through the entire training history
                    val_h = tuple([each.data for each in val_h])

                    inputs, targets = x, y
                    if train_on_gpu:
                        inputs, targets = inputs.cuda(), targets.cuda()

                    output, val_h = net(inputs, val_h)
                    val_loss = criterion(
                        output, targets.view(batch_size * seq_length).long()
                    )

                    val_losses.append(val_loss.item())

                net.train()  # reset to train mode after iterationg through validation data

                print(
                    "Epoch: {}/{}...".format(e + 1, epochs),
                    "Step: {}...".format(counter),
                    "Loss: {:.4f}...".format(loss.item()),
                    "Val Loss: {:.4f}".format(np.mean(val_losses)),
                )

## Определим модель

In [14]:
# define and print the net
n_hidden = 512
n_layers = 2

net = CharRNN(chars, n_hidden, n_layers)
print(net)

CharRNN(
  (lstm): LSTM(65, 512, num_layers=2, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.5, inplace=False)
  (fc): Linear(in_features=512, out_features=65, bias=True)
)


### Установим гиперпараметры

In [15]:
batch_size = 128
seq_length = 100
n_epochs = 20  # start smaller if you are just testing initial behavior

# train the model
train(
    net,
    encoded,
    epochs=n_epochs,
    batch_size=batch_size,
    seq_length=seq_length,
    lr=0.001,
    print_every=10,
)

Epoch: 1/20... Step: 10... Loss: 3.3681... Val Loss: 3.4055
Epoch: 1/20... Step: 20... Loss: 3.3303... Val Loss: 3.3551
Epoch: 1/20... Step: 30... Loss: 3.3127... Val Loss: 3.3461
Epoch: 1/20... Step: 40... Loss: 3.3440... Val Loss: 3.3412
Epoch: 1/20... Step: 50... Loss: 3.3337... Val Loss: 3.3358
Epoch: 1/20... Step: 60... Loss: 3.2899... Val Loss: 3.3335
Epoch: 1/20... Step: 70... Loss: 3.3137... Val Loss: 3.3239
Epoch: 2/20... Step: 80... Loss: 3.2742... Val Loss: 3.3098
Epoch: 2/20... Step: 90... Loss: 3.2585... Val Loss: 3.2709
Epoch: 2/20... Step: 100... Loss: 3.2303... Val Loss: 3.2131
Epoch: 2/20... Step: 110... Loss: 3.1656... Val Loss: 3.1471
Epoch: 2/20... Step: 120... Loss: 3.1099... Val Loss: 3.0748
Epoch: 2/20... Step: 130... Loss: 3.0044... Val Loss: 2.9673
Epoch: 2/20... Step: 140... Loss: 2.9214... Val Loss: 2.8677
Epoch: 2/20... Step: 150... Loss: 2.8452... Val Loss: 2.8029
Epoch: 3/20... Step: 160... Loss: 2.7543... Val Loss: 2.7146
Epoch: 3/20... Step: 170... Loss:

# Задание 1

Выведите общее число параметров (весов) сети, генерирующей тексты в стиле Шекспира. Сеть задавайте такую же, как и в ноутбуке на вебинаре, с теми же гиперпараметрами.

Подсказка: число параметров на каждом слое сети `model` можно посмотреть так:

for layer in net.parameters():

    ....

In [16]:
sum = 0
for layer in net.parameters():
  # print(layer.numel())
  sum += layer.numel()

sum

3320385

In [17]:
from prettytable import PrettyTable

def count_parameters(model):
    table = PrettyTable(["Modules", "Parameters"])
    total_params = 0
    for name, parameter in model.named_parameters():
        if not parameter.requires_grad:
            continue
        params = parameter.numel()
        table.add_row([name, params])
        total_params += params
    print(table)
    print(f"Total Trainable Params: {total_params}")
    return total_params

count_parameters(net)

+-------------------+------------+
|      Modules      | Parameters |
+-------------------+------------+
| lstm.weight_ih_l0 |   133120   |
| lstm.weight_hh_l0 |  1048576   |
|  lstm.bias_ih_l0  |    2048    |
|  lstm.bias_hh_l0  |    2048    |
| lstm.weight_ih_l1 |  1048576   |
| lstm.weight_hh_l1 |  1048576   |
|  lstm.bias_ih_l1  |    2048    |
|  lstm.bias_hh_l1  |    2048    |
|     fc.weight     |   33280    |
|      fc.bias      |     65     |
+-------------------+------------+
Total Trainable Params: 3320385


3320385

In [18]:
# only trainable parameters
pytorch_total_params = np.array([p.numel() for p in net.parameters() if p.requires_grad]).sum()
pytorch_total_params

3320385

# Задание 2

Будет ли сеть обучаться, если задать learning rate равным 1?

In [19]:
# define and print the net
n_hidden = 512
n_layers = 2

net_1 = CharRNN(chars, n_hidden, n_layers)
print(net)

CharRNN(
  (lstm): LSTM(65, 512, num_layers=2, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.5, inplace=False)
  (fc): Linear(in_features=512, out_features=65, bias=True)
)


In [20]:
batch_size = 128
seq_length = 100
n_epochs = 20  # start smaller if you are just testing initial behavior

# train the model
train(
    net_1 ,
    encoded,
    epochs=n_epochs,
    batch_size=batch_size,
    seq_length=seq_length,
    lr=1,
    print_every=10,
)

Epoch: 1/20... Step: 10... Loss: 833.3704... Val Loss: 903.6915
Epoch: 1/20... Step: 20... Loss: 2166.0168... Val Loss: 2268.6631
Epoch: 1/20... Step: 30... Loss: 2847.2141... Val Loss: 2877.4757
Epoch: 1/20... Step: 40... Loss: 2427.1572... Val Loss: 2278.6945
Epoch: 1/20... Step: 50... Loss: 1935.0138... Val Loss: 1795.6775
Epoch: 1/20... Step: 60... Loss: 979.0643... Val Loss: 967.7872
Epoch: 1/20... Step: 70... Loss: 673.7847... Val Loss: 682.7332
Epoch: 2/20... Step: 80... Loss: 598.1299... Val Loss: 600.8639
Epoch: 2/20... Step: 90... Loss: 497.3957... Val Loss: 411.3968
Epoch: 2/20... Step: 100... Loss: 391.0258... Val Loss: 368.8602
Epoch: 2/20... Step: 110... Loss: 320.0061... Val Loss: 357.0964
Epoch: 2/20... Step: 120... Loss: 221.5636... Val Loss: 242.6220
Epoch: 2/20... Step: 130... Loss: 281.3942... Val Loss: 255.5488
Epoch: 2/20... Step: 140... Loss: 282.5867... Val Loss: 276.1578
Epoch: 2/20... Step: 150... Loss: 362.7218... Val Loss: 369.9529
Epoch: 3/20... Step: 160..

# Задание 3

В комментарии напишите кусочек текста в стиле Шекспира, сгененированного вашей моделью. Выберите кусочек, который вам больше всего понравился!

## Делаем предсказания

In [21]:
def predict(net, char, h=None, top_k=None):
    """Given a character, predict the next character.
    Returns the predicted character and the hidden state.
    """

    # tensor inputs
    x = np.array([[net.char2int[char]]])
    x = one_hot_encode(x, len(net.chars))
    inputs = torch.from_numpy(x)

    if train_on_gpu:
        inputs = inputs.cuda()

    # detach hidden state from history
    h = tuple([each.data for each in h])
    # get the output of the model
    out, h = net(inputs, h)

    # get the character probabilities
    p = F.softmax(out, dim=1).data
    if train_on_gpu:
        p = p.cpu()  # move to cpu

    # get top characters
    if top_k is None:
        top_ch = np.arange(len(net.chars))
    else:
        p, top_ch = p.topk(top_k)
        top_ch = top_ch.numpy().squeeze()

    # select the likely next character with some element of randomness
    p = p.numpy().squeeze()
    char = np.random.choice(top_ch, p=p / p.sum())

    # return the encoded value of the predicted char and the hidden state
    return net.int2char[char], h

## Priming

In [22]:
def sample(net, size, prime="The", top_k=None):

    if train_on_gpu:
        net.cuda()
    else:
        net.cpu()

    net.eval()  # eval mode

    # First off, run through the prime characters
    chars = [ch for ch in prime]
    h = net.init_hidden(1)
    for ch in prime:
        char, h = predict(net, ch, h, top_k=top_k)

    chars.append(char)

    # Now pass in the previous character and get a new one
    for ii in range(size):
        char, h = predict(net, chars[-1], h, top_k=top_k)
        chars.append(char)

    return "".join(chars)

In [23]:
print(sample(net, 500, prime="The King Lear", top_k=5))

The King Learin him; but well
To be the hard war that the will or see,
And have her she there in your gone, the corts
And bosts, that with his should we have my sendel.

KING RICHARD III:
What, to her son, and shill's this than
they, a many were art and the son?

BUCKINGHAM:
What! why had, then, which thy ship of honour stards
The childis of the servisity of his life.

CAPULET:
And as the prince, when I shall shall his hand.
Therefore, with me, the world with me, with his
sear that thou art of thou sport. Tho
