<a href="https://colab.research.google.com/github/nedokormysh/Stepik_Ai_edu_RNN/blob/week_5_char_RNN/AiEdu_CharRNN_Hometask.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Домашнее задание

В этом домашнем задании вы обучите рекуррентную сеть для генерации текстов в стиле Шекспира.

In [1]:
import numpy as np
import torch
import torch.nn.functional as F
from torch import nn

## Загрузим данные

Загрузим текстовый файл с пьесами Шекспира.

In [2]:
!wget https://raw.githubusercontent.com/aiedu-courses/rnn_bootcamp/main/shakespeare.txt

--2024-01-15 10:16:38--  https://raw.githubusercontent.com/aiedu-courses/rnn_bootcamp/main/shakespeare.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘shakespeare.txt’


2024-01-15 10:16:39 (20.0 MB/s) - ‘shakespeare.txt’ saved [1115394/1115394]



*Эта задача аналогична задаче, разобранной на вебинаре, поэтому код мы вам не предоставляем, а предлагаем или написать с нуля, или воспользоваться кодом с вебинара.*

In [3]:
# open text file and read in data as `text`
with open("shakespeare.txt", "r") as f:
    text = f.read()

In [4]:
text[:100]

'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou'

### Токенизация

In [5]:
chars = tuple(set(text))

int2char = dict(enumerate(chars))
char2int = {ch: ii for ii, ch in int2char.items()}

# encode the text
encoded = np.array([char2int[ch] for ch in text])

In [6]:
int2char.items()

dict_items([(0, 'v'), (1, 'P'), (2, 'g'), (3, 'b'), (4, 'Y'), (5, 'q'), (6, 'X'), (7, '?'), (8, 'j'), (9, "'"), (10, 'c'), (11, 'F'), (12, 's'), (13, 'S'), (14, 'E'), (15, 'y'), (16, '!'), (17, 'r'), (18, 'O'), (19, 'e'), (20, 'W'), (21, '$'), (22, '.'), (23, 'w'), (24, 'R'), (25, 'B'), (26, 'h'), (27, 'x'), (28, 'U'), (29, 'T'), (30, 'u'), (31, 'a'), (32, 'V'), (33, 'G'), (34, 'N'), (35, '&'), (36, 'K'), (37, 'M'), (38, 'l'), (39, ','), (40, 'Z'), (41, 'p'), (42, '-'), (43, 't'), (44, 'd'), (45, 'z'), (46, 'f'), (47, 'A'), (48, 'n'), (49, 'k'), (50, 'C'), (51, 'D'), (52, 'J'), (53, '\n'), (54, 'L'), (55, ' '), (56, ':'), (57, ';'), (58, 'o'), (59, 'Q'), (60, 'i'), (61, 'I'), (62, 'm'), (63, '3'), (64, 'H')])

In [7]:
# int2char

In [8]:
for ii, ch in int2char.items():
  print(ii, ch)

0 v
1 P
2 g
3 b
4 Y
5 q
6 X
7 ?
8 j
9 '
10 c
11 F
12 s
13 S
14 E
15 y
16 !
17 r
18 O
19 e
20 W
21 $
22 .
23 w
24 R
25 B
26 h
27 x
28 U
29 T
30 u
31 a
32 V
33 G
34 N
35 &
36 K
37 M
38 l
39 ,
40 Z
41 p
42 -
43 t
44 d
45 z
46 f
47 A
48 n
49 k
50 C
51 D
52 J
53 

54 L
55  
56 :
57 ;
58 o
59 Q
60 i
61 I
62 m
63 3
64 H


## Предобработка данных

In [9]:
def one_hot_encode(arr, n_labels):

    # Initialize the the encoded array
    one_hot = np.zeros((arr.size, n_labels), dtype=np.float32)

    # Fill the appropriate elements with ones
    one_hot[np.arange(one_hot.shape[0]), arr.flatten()] = 1.0
    # print(arr)
    # print(arr.flatten())
    # print(np.arange(one_hot.shape[0]))
    # print(one_hot[1])


    # Finally reshape it to get back to the original array
    one_hot = one_hot.reshape((*arr.shape, n_labels))

    return one_hot

## Создаем мини-батчи (mini-batchs)


In [10]:
def get_batches(int_words, batch_size, seq_length):
    # 1. Truncate text, so there are only full batches
    window_size = seq_length + 1
    batch_size_total = batch_size * window_size
    n_batches = len(int_words) // batch_size_total
    int_words = int_words[: n_batches * batch_size_total]

    # 2. Reshape into batch_size rows
    int_words = int_words.reshape((batch_size, -1))

    # 3. Iterate through the text matrix
    for position in range(0, int_words.shape[1], window_size):
        x = int_words[:, position : position + window_size - 1]
        y = int_words[:, position + 1 : position + window_size]
        yield x, y

## Зададим архитектуру

In [11]:
# check if GPU is available
train_on_gpu = torch.cuda.is_available()

if train_on_gpu:
    print("Training on GPU!")
else:
    print("No GPU available, training on CPU; consider making n_epochs very small.")

Training on GPU!


In [12]:
class CharRNN(nn.Module):
    def __init__(self, tokens, n_hidden=256, n_layers=2, drop_prob=0.5, lr=0.001):
        super().__init__()
        self.drop_prob = drop_prob
        self.n_layers = n_layers
        self.n_hidden = n_hidden
        self.lr = lr

        # creating character dictionaries
        self.chars = tokens
        self.int2char = dict(enumerate(self.chars))
        self.char2int = {ch: ii for ii, ch in self.int2char.items()}

        ## TODO: define the LSTM
        self.lstm = nn.LSTM(
            len(self.chars), n_hidden, n_layers, dropout=drop_prob, batch_first=True
        )

        ## TODO: define a dropout layer
        self.dropout = nn.Dropout(drop_prob)

        ## TODO: define the final, fully-connected output layer
        self.fc = nn.Linear(n_hidden, len(self.chars))

    def forward(self, x, hidden):
        """Forward pass through the network.
        These inputs are x, and the hidden/cell state `hidden`."""

        ## TODO: Get the outputs and the new hidden state from the lstm
        r_output, hidden = self.lstm(x, hidden)

        ## TODO: pass through a dropout layer
        out = self.dropout(r_output)

        # Stack up LSTM outputs using view
        # you may need to use contiguous to reshape the output
        out = out.contiguous().view(-1, self.n_hidden)

        ## TODO: put x through the fully-connected layer
        out = self.fc(out)

        # return the final output and the hidden state
        return out, hidden

    def init_hidden(self, batch_size):
        """Initializes hidden state"""
        # Create two new tensors with sizes n_layers x batch_size x n_hidden,
        # initialized to zero, for hidden state and cell state of LSTM
        weight = next(self.parameters()).data

        if train_on_gpu:
            hidden = (
                weight.new(self.n_layers, batch_size, self.n_hidden).zero_().cuda(),
                weight.new(self.n_layers, batch_size, self.n_hidden).zero_().cuda(),
            )
        else:
            hidden = (
                weight.new(self.n_layers, batch_size, self.n_hidden).zero_(),
                weight.new(self.n_layers, batch_size, self.n_hidden).zero_(),
            )

        return hidden

## Обучим модель

In [13]:
def train(
    net,
    data,
    epochs=10,
    batch_size=10,
    seq_length=50,
    lr=0.001,
    clip=5,
    val_frac=0.1,
    print_every=10,
):
    net.train()

    opt = torch.optim.Adam(net.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()

    # create training and validation data
    val_idx = int(len(data) * (1 - val_frac))
    data, val_data = data[:val_idx], data[val_idx:]

    if train_on_gpu:
        net.cuda()

    counter = 0
    n_chars = len(net.chars)
    for e in range(epochs):
        # initialize hidden state
        h = net.init_hidden(batch_size)

        for x, y in get_batches(data, batch_size, seq_length):
            counter += 1

            # One-hot encode our data and make them Torch tensors
            x = one_hot_encode(x, n_chars)
            inputs, targets = torch.from_numpy(x), torch.from_numpy(y)

            if train_on_gpu:
                inputs, targets = inputs.cuda(), targets.cuda()

            # Creating new variables for the hidden state, otherwise
            # we'd backprop through the entire training history
            h = tuple([each.data for each in h])

            # zero accumulated gradients
            net.zero_grad()

            # get the output from the model
            output, h = net(inputs, h)

            # calculate the loss and perform backprop
            loss = criterion(output, targets.view(batch_size * seq_length).long())
            loss.backward()
            # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
            nn.utils.clip_grad_norm_(net.parameters(), clip)
            opt.step()

            # loss stats
            if counter % print_every == 0:
                # Get validation loss
                val_h = net.init_hidden(batch_size)
                val_losses = []
                net.eval()
                for x, y in get_batches(val_data, batch_size, seq_length):
                    # One-hot encode our data and make them Torch tensors
                    x = one_hot_encode(x, n_chars)
                    x, y = torch.from_numpy(x), torch.from_numpy(y)

                    # Creating new variables for the hidden state, otherwise
                    # we'd backprop through the entire training history
                    val_h = tuple([each.data for each in val_h])

                    inputs, targets = x, y
                    if train_on_gpu:
                        inputs, targets = inputs.cuda(), targets.cuda()

                    output, val_h = net(inputs, val_h)
                    val_loss = criterion(
                        output, targets.view(batch_size * seq_length).long()
                    )

                    val_losses.append(val_loss.item())

                net.train()  # reset to train mode after iterationg through validation data

                print(
                    "Epoch: {}/{}...".format(e + 1, epochs),
                    "Step: {}...".format(counter),
                    "Loss: {:.4f}...".format(loss.item()),
                    "Val Loss: {:.4f}".format(np.mean(val_losses)),
                )

## Определим модель

In [14]:
# define and print the net
n_hidden = 512
n_layers = 2

net = CharRNN(chars, n_hidden, n_layers)
print(net)

CharRNN(
  (lstm): LSTM(65, 512, num_layers=2, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.5, inplace=False)
  (fc): Linear(in_features=512, out_features=65, bias=True)
)


### Установим гиперпараметры

In [15]:
batch_size = 128
seq_length = 100
n_epochs = 20  # start smaller if you are just testing initial behavior

# train the model
train(
    net,
    encoded,
    epochs=n_epochs,
    batch_size=batch_size,
    seq_length=seq_length,
    lr=0.001,
    print_every=10,
)

Epoch: 1/20... Step: 10... Loss: 3.3613... Val Loss: 3.3818
Epoch: 1/20... Step: 20... Loss: 3.3313... Val Loss: 3.3508
Epoch: 1/20... Step: 30... Loss: 3.3208... Val Loss: 3.3470
Epoch: 1/20... Step: 40... Loss: 3.3502... Val Loss: 3.3420
Epoch: 1/20... Step: 50... Loss: 3.3359... Val Loss: 3.3388
Epoch: 1/20... Step: 60... Loss: 3.2920... Val Loss: 3.3235
Epoch: 1/20... Step: 70... Loss: 3.3182... Val Loss: 3.3301
Epoch: 2/20... Step: 80... Loss: 3.2840... Val Loss: 3.3174
Epoch: 2/20... Step: 90... Loss: 3.2755... Val Loss: 3.2900
Epoch: 2/20... Step: 100... Loss: 3.2321... Val Loss: 3.2253
Epoch: 2/20... Step: 110... Loss: 3.1436... Val Loss: 3.1180
Epoch: 2/20... Step: 120... Loss: 3.0570... Val Loss: 3.0178
Epoch: 2/20... Step: 130... Loss: 2.9434... Val Loss: 3.0847
Epoch: 2/20... Step: 140... Loss: 2.9272... Val Loss: 2.8712
Epoch: 2/20... Step: 150... Loss: 2.8016... Val Loss: 2.7509
Epoch: 3/20... Step: 160... Loss: 2.7138... Val Loss: 2.6619
Epoch: 3/20... Step: 170... Loss:

# Задание 1

Выведите общее число параметров (весов) сети, генерирующей тексты в стиле Шекспира. Сеть задавайте такую же, как и в ноутбуке на вебинаре, с теми же гиперпараметрами.

Подсказка: число параметров на каждом слое сети `model` можно посмотреть так:

for layer in net.parameters():

    ....

In [16]:
sum = 0
for layer in net.parameters():
  # print(layer.numel())
  sum += layer.numel()

sum

3320385

In [17]:
from prettytable import PrettyTable

def count_parameters(model):
    table = PrettyTable(["Modules", "Parameters"])
    total_params = 0
    for name, parameter in model.named_parameters():
        if not parameter.requires_grad:
            continue
        params = parameter.numel()
        table.add_row([name, params])
        total_params += params
    print(table)
    print(f"Total Trainable Params: {total_params}")
    return total_params

count_parameters(net)

+-------------------+------------+
|      Modules      | Parameters |
+-------------------+------------+
| lstm.weight_ih_l0 |   133120   |
| lstm.weight_hh_l0 |  1048576   |
|  lstm.bias_ih_l0  |    2048    |
|  lstm.bias_hh_l0  |    2048    |
| lstm.weight_ih_l1 |  1048576   |
| lstm.weight_hh_l1 |  1048576   |
|  lstm.bias_ih_l1  |    2048    |
|  lstm.bias_hh_l1  |    2048    |
|     fc.weight     |   33280    |
|      fc.bias      |     65     |
+-------------------+------------+
Total Trainable Params: 3320385


3320385

In [18]:
# only trainable parameters
pytorch_total_params = np.array([p.numel() for p in net.parameters() if p.requires_grad]).sum()
pytorch_total_params

3320385

# Задание 2

Будет ли сеть обучаться, если задать learning rate равным 1?

In [None]:
# batch_size = 128
# seq_length = 100
# n_epochs = 20  # start smaller if you are just testing initial behavior

# # train the model
# train(
#     net,
#     encoded,
#     epochs=n_epochs,
#     batch_size=batch_size,
#     seq_length=seq_length,
#     lr=1,
#     print_every=10,
# )

Epoch: 1/20... Step: 10... Loss: 136.0693... Val Loss: 127.9466
Epoch: 1/20... Step: 20... Loss: 146.7548... Val Loss: 139.7564
Epoch: 1/20... Step: 30... Loss: 105.7887... Val Loss: 90.8751
Epoch: 1/20... Step: 40... Loss: 89.1725... Val Loss: 81.9634
Epoch: 1/20... Step: 50... Loss: 77.8061... Val Loss: 66.6790
Epoch: 1/20... Step: 60... Loss: 59.7220... Val Loss: 57.7956
Epoch: 1/20... Step: 70... Loss: 66.3815... Val Loss: 67.5402
Epoch: 2/20... Step: 80... Loss: 70.9837... Val Loss: 53.4431
Epoch: 2/20... Step: 90... Loss: 53.7087... Val Loss: 34.0933
Epoch: 2/20... Step: 100... Loss: 31.6670... Val Loss: 19.4136
Epoch: 2/20... Step: 110... Loss: 19.7305... Val Loss: 14.2371
Epoch: 2/20... Step: 120... Loss: 22.0205... Val Loss: 22.6981
Epoch: 2/20... Step: 130... Loss: 39.4527... Val Loss: 38.1751
Epoch: 2/20... Step: 140... Loss: 55.6302... Val Loss: 53.0616
Epoch: 2/20... Step: 150... Loss: 66.6802... Val Loss: 59.6406
Epoch: 3/20... Step: 160... Loss: 49.9490... Val Loss: 43.3

# Задание 3

В комментарии напишите кусочек текста в стиле Шекспира, сгененированного вашей моделью. Выберите кусочек, который вам больше всего понравился!

## Делаем предсказания

In [19]:
def predict(net, char, h=None, top_k=None):
    """Given a character, predict the next character.
    Returns the predicted character and the hidden state.
    """

    # tensor inputs
    x = np.array([[net.char2int[char]]])
    x = one_hot_encode(x, len(net.chars))
    inputs = torch.from_numpy(x)

    if train_on_gpu:
        inputs = inputs.cuda()

    # detach hidden state from history
    h = tuple([each.data for each in h])
    # get the output of the model
    out, h = net(inputs, h)

    # get the character probabilities
    p = F.softmax(out, dim=1).data
    if train_on_gpu:
        p = p.cpu()  # move to cpu

    # get top characters
    if top_k is None:
        top_ch = np.arange(len(net.chars))
    else:
        p, top_ch = p.topk(top_k)
        top_ch = top_ch.numpy().squeeze()

    # select the likely next character with some element of randomness
    p = p.numpy().squeeze()
    char = np.random.choice(top_ch, p=p / p.sum())

    # return the encoded value of the predicted char and the hidden state
    return net.int2char[char], h

## Priming

In [20]:
def sample(net, size, prime="The", top_k=None):

    if train_on_gpu:
        net.cuda()
    else:
        net.cpu()

    net.eval()  # eval mode

    # First off, run through the prime characters
    chars = [ch for ch in prime]
    h = net.init_hidden(1)
    for ch in prime:
        char, h = predict(net, ch, h, top_k=top_k)

    chars.append(char)

    # Now pass in the previous character and get a new one
    for ii in range(size):
        char, h = predict(net, chars[-1], h, top_k=top_k)
        chars.append(char)

    return "".join(chars)

In [21]:
print(sample(net, 500, prime="The King Lear", top_k=5))

The King Learing,
The well that is me to the sent to speak;
A though thou senseling as I dead to mischarged,
Take thised the weary that with him, and some
This to be thou to take his livery: to make you,
Treat those offectors is the welcome, well,
Who shall been the present too that stay.

KING RICHARD III:
Thas, the may life, we can thy lons.

CORIOLANUS:
He may not send to stor to him, that whils
Ore wearthy whict to make yet still be now,
That some wind with that was with a thate was as it,
As is my shore 
