# Задание
- Сгенерировать последовательности, которые бы состояли из цифр (от 0 до 9) и задавались следующим образом: 
  - x - последовательность цифр
  - y(1) = x(1); y(i) = x(i) + x(1) при i > 1. Если y(i) >= 10, то y(i) = y(i) - 10
- научить модель предсказывать y(i) по x(i)
- попробовать RNN, LSTM, GRU

In [1]:
import torch
from torch import nn
import numpy as np
import re
import random
import tqdm
import time

In [2]:
# Prefer the GPU when CUDA is usable; otherwise keep everything on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

Создание датасета по алгоритму: 

    y(1) = x(1); y(i) = x(i) + x(1) при i > 1. Если y(i) >= 10, то y(i) = y(i) - 10

In [4]:
def generate_xy(sequence_len=100, batch_size=1, torch_type = torch.long):
    """Generate a random batch of digit sequences and their targets.

    Every row of ``X`` holds digits drawn uniformly from 0..9.  The matching
    row of ``y`` keeps the first digit unchanged and stores
    ``(x[i] + x[0]) % 10`` at every later position.

    Args:
        sequence_len: Length of each sequence.  When ``None``, one random
            length in ``[10, 100)`` is drawn and shared by the whole batch.
        batch_size: Number of sequences to generate; must be at least 1.
        torch_type: dtype of the returned tensors.

    Returns:
        Tuple ``(X, y)`` of tensors shaped ``(batch_size, sequence_len)``,
        both created on the module-level ``device``.

    Raises:
        AssertionError: If ``batch_size`` is smaller than 1.
    """
    assert batch_size >= 1, 'Wrong batch size'
    if sequence_len is None:
        sequence_len = np.random.randint(10, 100)

    # np.random.randint excludes its upper bound, so 10 is needed for the
    # digit 9 to be sampled at all.
    X_np = np.random.randint(0, 10, size=(batch_size, sequence_len))

    # Broadcasting the first column adds x[0] to every element of its row.
    # Two digits sum to at most 18, so "% 10" is the same as subtracting 10
    # whenever the sum reaches 10.
    y_np = (X_np + X_np[:, :1]) % 10
    # The first target is the first input itself, not x[0] + x[0].
    # Slicing (instead of indexing column 0) also tolerates empty sequences.
    y_np[:, :1] = X_np[:, :1]

    X = torch.tensor(X_np, dtype=torch_type, device=device)
    y = torch.tensor(y_np, dtype=torch_type, device=device)
    return X, y

In [5]:
# Sample a small demo batch: 10 sequences of 10 digits each.
X, y = generate_xy(batch_size=10, sequence_len=10, torch_type=torch.long)

In [6]:
X

tensor([[8, 1, 7, 0, 1, 0, 3, 7, 4, 1],
        [0, 4, 7, 3, 3, 4, 7, 4, 5, 6],
        [1, 0, 5, 3, 8, 2, 1, 0, 1, 4],
        [0, 4, 5, 2, 0, 2, 2, 4, 3, 3],
        [0, 0, 2, 2, 6, 5, 5, 5, 7, 2],
        [1, 8, 0, 4, 8, 3, 4, 3, 7, 0],
        [6, 6, 3, 1, 7, 3, 5, 7, 7, 1],
        [3, 5, 8, 6, 4, 2, 3, 5, 5, 5],
        [0, 1, 8, 3, 7, 3, 6, 3, 8, 3],
        [3, 0, 0, 2, 0, 8, 6, 2, 6, 3]])

In [7]:
y

tensor([[8, 9, 5, 8, 9, 8, 1, 5, 2, 9],
        [0, 4, 7, 3, 3, 4, 7, 4, 5, 6],
        [1, 1, 6, 4, 9, 3, 2, 1, 2, 5],
        [0, 4, 5, 2, 0, 2, 2, 4, 3, 3],
        [0, 0, 2, 2, 6, 5, 5, 5, 7, 2],
        [1, 9, 1, 5, 9, 4, 5, 4, 8, 1],
        [6, 2, 9, 7, 3, 9, 1, 3, 3, 7],
        [3, 8, 1, 9, 7, 5, 6, 8, 8, 8],
        [0, 1, 8, 3, 7, 3, 6, 3, 8, 3],
        [3, 3, 3, 5, 3, 1, 9, 5, 9, 6]])

Сеть

In [8]:
class NeuralNetwork(nn.Module):
    """Embedding -> recurrent layer -> linear classifier on the final state.

    The network reads a batch of digit prefixes and emits one vector of class
    scores per prefix, computed from the recurrent layer's final hidden state.

    Args:
        rnnClass: Recurrent layer class to instantiate (e.g. ``nn.RNN``,
            ``nn.GRU`` or ``nn.LSTM``).  It is called as
            ``rnnClass(embedding_size, num_hiddens, batch_first=True)``.
        input_size: Number of distinct input tokens for the embedding table.
        embedding_size: Size of each embedding vector.
        num_hiddens: Hidden-state size of the recurrent layer.
        num_classes: Number of output classes.
        debug: When true, ``forward`` prints the shapes it works with.
    """

    def __init__(self, rnnClass, input_size, embedding_size, num_hiddens, num_classes, debug=False):
        super().__init__()
        self.num_hiddens = num_hiddens
        self.debug = debug
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.hidden = rnnClass(embedding_size, num_hiddens, batch_first=True)
        self.output = nn.Linear(num_hiddens, num_classes)

    def _print(self, *text):
        """Print ``text`` only when the model was created with ``debug=True``."""
        if self.debug:
            print(*text)

    def forward(self, X):
        """Return class scores for the last position of every sequence in ``X``.

        Args:
            X: Integer tensor of token ids; with ``batch_first=True`` the
                recurrent layer treats it as ``(batch, sequence)``.

        Returns:
            Tensor of shape ``(batch, num_classes)`` with unnormalised scores.
        """
        out = self.embedding(X)
        _, state = self.hidden(out)
        # LSTM layers return the final state as a (hidden, cell) pair, while
        # RNN/GRU return the hidden state alone.  Checking the returned value
        # rather than the exact layer type also covers LSTM subclasses.
        if isinstance(state, tuple):
            state = state[0]
        self._print(f'model.forward| state : {state.shape}, state[0] : {state[0].shape}')
        # Index 0 selects the first layer's final hidden state; the recurrent
        # layer is built without a layer-count argument, so it is the only one.
        predictions = self.output(state[0])
        self._print(f'model.forward| predictions : {predictions.shape}' )
        return predictions

In [9]:
# Untrained LSTM model with debug printing enabled, used for the shape checks below.
model = NeuralNetwork(
    rnnClass=nn.LSTM,
    input_size=10,
    embedding_size=64,
    num_hiddens=64,
    num_classes=10,
    debug=True,
)

# Move the parameters to the GPU when one is available, otherwise to the CPU.
if torch.cuda.is_available():
    model = model.cuda()
else:
    model = model.cpu()

In [10]:
# argmax returns the index of the largest element (here the value 4 at index 3).
torch.argmax(torch.tensor([1, 2, 3, 4, 3, 2, 1]))

tensor(3)

In [11]:
# Smoke test: one forward pass of the untrained model on a single sequence.
X, y = generate_xy(sequence_len=10, batch_size=1)
print(X.shape, X)
print(y.shape, y)
# Call the module itself rather than .forward() so registered hooks run.
answers = model(X)
# Tensor.argmax works on any device, whereas converting to NumPy first
# requires the tensor to live on the CPU.
print('answers : ', answers.shape, answers.argmax().item())

torch.Size([1, 10]) tensor([[2, 0, 6, 2, 7, 0, 2, 5, 6, 5]])
torch.Size([1, 10]) tensor([[2, 2, 8, 4, 9, 2, 4, 7, 8, 7]])
model.forward| state : torch.Size([1, 1, 64]), state[0] : torch.Size([1, 64])
model.forward| predictions : torch.Size([1, 10])
answers :  torch.Size([1, 10]) 9


In [12]:
def generate_sequence(model, sequence_len=10, debug=False):
    """Predict one random sequence digit by digit and print target vs. prediction.

    A single ``(X, y)`` pair is drawn with ``generate_xy``.  For every position
    ``i`` the model receives the prefix ``X[:, :i + 1]`` and the arg-max of its
    output is stored as the prediction for that position.  Finally the target
    tensor and the predicted tensor are printed side by side.

    Args:
        model: Callable taking a ``(1, prefix_len)`` tensor and returning
            class scores for the last position of the prefix.
        sequence_len: Length of the generated sequence; forwarded to
            ``generate_xy`` (``None`` lets it pick a random length).
        debug: When true, print the inputs and every intermediate prediction.

    Returns:
        None.  The result is only printed.
    """
    X, y = generate_xy(sequence_len=sequence_len, batch_size=1)
    if debug:
        print('X: ', X)
        print('y: ', y)

    # Take the length from the generated tensor so a random length
    # (sequence_len=None) works as well.
    seq_len = X.shape[1]
    preds = torch.zeros((1, seq_len), dtype=torch.long, device=device)

    # Pure inference: no gradients are needed, so skip autograd bookkeeping.
    with torch.no_grad():
        for i in range(seq_len):
            prefix = X[:, :(i + 1)]
            next_digit = model(prefix)
            preds[0, i] = next_digit.argmax()
            if debug:
                print(f'======= i = {i}')
                print('X[:, :(i + 1)] : ', prefix)
                print('next_digit : ', next_digit, next_digit.argmax())
                print('next_digit softmax: ', torch.softmax(next_digit, 1), torch.softmax(next_digit, 1).argmax())
                print('pred : ', preds)

    print(y, preds)

In [13]:
# Run the untrained model over one random sequence (quiet mode of generate_sequence).
generate_sequence(model=model, debug=False)

model.forward| state : torch.Size([1, 1, 64]), state[0] : torch.Size([1, 64])
model.forward| predictions : torch.Size([1, 10])
model.forward| state : torch.Size([1, 1, 64]), state[0] : torch.Size([1, 64])
model.forward| predictions : torch.Size([1, 10])
model.forward| state : torch.Size([1, 1, 64]), state[0] : torch.Size([1, 64])
model.forward| predictions : torch.Size([1, 10])
model.forward| state : torch.Size([1, 1, 64]), state[0] : torch.Size([1, 64])
model.forward| predictions : torch.Size([1, 10])
model.forward| state : torch.Size([1, 1, 64]), state[0] : torch.Size([1, 64])
model.forward| predictions : torch.Size([1, 10])
model.forward| state : torch.Size([1, 1, 64]), state[0] : torch.Size([1, 64])
model.forward| predictions : torch.Size([1, 10])
model.forward| state : torch.Size([1, 1, 64]), state[0] : torch.Size([1, 64])
model.forward| predictions : torch.Size([1, 10])
model.forward| state : torch.Size([1, 1, 64]), state[0] : torch.Size([1, 64])
model.forward| predictions : torc

Тестовое обучение

In [14]:
# Show how one sequence unrolls into (prefix, target) training pairs.
X, y = generate_xy(batch_size=1, sequence_len=5)
print(X, y)
for end in range(1, 6):
    # X[:, :end] is the model input; y[:, end - 1] is the label for its last digit.
    print(X[:, :end], y[:, end - 1])

tensor([[2, 4, 5, 5, 6]]) tensor([[2, 6, 7, 7, 8]])
tensor([[2]]) tensor([2])
tensor([[2, 4]]) tensor([6])
tensor([[2, 4, 5]]) tensor([7])
tensor([[2, 4, 5, 5]]) tensor([7])
tensor([[2, 4, 5, 5, 6]]) tensor([8])


In [15]:
def train(model, criterion, optimizer, epochs=300, sequence_len=100, batch_size=100):
    """Train ``model`` on freshly generated batches, one prefix at a time.

    Each epoch draws a new batch with ``generate_xy`` and performs one
    optimizer step per position ``i``: the model sees the prefix
    ``X[:, :i + 1]`` and is trained to predict ``y[:, i]``.  Every 50th epoch
    (except epoch 0) the mean loss and elapsed time are printed and a sample
    prediction is shown with ``generate_sequence``.

    Args:
        model: Module mapping a ``(batch, prefix_len)`` tensor to class scores.
        criterion: Loss called as ``criterion(scores, targets)``.
        optimizer: Optimizer that updates the model's parameters.
        epochs: Index of the last epoch; the loop runs ``epochs + 1`` times so
            that the final epoch is also reported.
        sequence_len: Length of the generated sequences, forwarded to
            ``generate_xy``.
        batch_size: Number of sequences per generated batch.

    Returns:
        None.
    """
    for ep in range(epochs + 1):
        start = time.time()
        train_loss = 0.
        train_passed = 0

        # Re-enable training mode every epoch: the periodic report below
        # switches the model to eval mode.
        model.train()
        X, y = generate_xy(sequence_len=sequence_len, batch_size=batch_size)
        # Take the length from the batch so sequence_len=None (a random
        # length chosen by generate_xy) works too.
        for i in range(X.shape[1]):
            # Clears the gradients of every parameter the optimizer updates.
            optimizer.zero_grad()
            # Call the module itself rather than .forward() so hooks run.
            answers = model(X[:, :(i + 1)])
            loss = criterion(answers, y[:, i])
            train_loss += loss.item()

            loss.backward()
            optimizer.step()
            train_passed += 1

        if ep % 50 == 0 and ep != 0:
            # max(..., 1) avoids dividing by zero if no step was taken.
            mean_loss = train_loss / max(train_passed, 1)
            print("Epoch {}. Time: {:.3f}, Train loss: {:.3f}".format(ep, time.time() - start, mean_loss))
            model.eval()
            generate_sequence(model)

Обучение

- RNN

In [16]:
# Vanilla RNN run.
model = NeuralNetwork(
    rnnClass=nn.RNN,
    input_size=10,
    embedding_size=64,
    num_hiddens=64,
    num_classes=10,
    debug=False,
)
# Move the parameters to the GPU when one is available, otherwise to the CPU.
if torch.cuda.is_available():
    model = model.cuda()
else:
    model = model.cpu()

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

train(model, criterion, optimizer, epochs=1000, sequence_len=10, batch_size=10)

Epoch 50. Time: 0.021, Train loss: 2.204
tensor([[5, 2, 9, 8, 6, 9, 7, 0, 6, 6]]) tensor([[5, 7, 4, 3, 1, 4, 2, 5, 3, 3]])
Epoch 100. Time: 0.022, Train loss: 1.964
tensor([[7, 5, 0, 2, 0, 8, 9, 8, 0, 5]]) tensor([[7, 8, 3, 9, 3, 9, 3, 3, 1, 9]])
Epoch 150. Time: 0.022, Train loss: 1.844
tensor([[6, 9, 1, 1, 0, 6, 2, 6, 4, 3]]) tensor([[6, 9, 9, 2, 9, 2, 3, 2, 0, 2]])
Epoch 200. Time: 0.022, Train loss: 1.581
tensor([[6, 4, 6, 8, 2, 0, 2, 4, 1, 1]]) tensor([[6, 8, 6, 6, 8, 1, 7, 5, 8, 8]])
Epoch 250. Time: 0.027, Train loss: 1.188
tensor([[3, 4, 1, 7, 4, 5, 5, 7, 1, 9]]) tensor([[3, 1, 5, 1, 3, 7, 4, 6, 3, 3]])
Epoch 300. Time: 0.022, Train loss: 1.437
tensor([[2, 8, 5, 3, 9, 7, 7, 0, 6, 6]]) tensor([[2, 8, 9, 3, 9, 7, 7, 6, 0, 0]])
Epoch 350. Time: 0.021, Train loss: 1.263
tensor([[3, 3, 4, 0, 0, 4, 0, 7, 3, 6]]) tensor([[3, 3, 4, 8, 8, 8, 8, 9, 1, 8]])
Epoch 400. Time: 0.022, Train loss: 0.842
tensor([[4, 9, 5, 5, 9, 4, 4, 2, 4, 6]]) tensor([[4, 9, 5, 5, 2, 1, 7, 5, 3, 9]])
Epoch 450

- GRU

In [17]:
# GRU run.
model = NeuralNetwork(
    rnnClass=nn.GRU,
    input_size=10,
    embedding_size=64,
    num_hiddens=64,
    num_classes=10,
)
# Move the parameters to the GPU when one is available, otherwise to the CPU.
if torch.cuda.is_available():
    model = model.cuda()
else:
    model = model.cpu()

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

train(model, criterion, optimizer, epochs=1000, sequence_len=10, batch_size=10)

Epoch 50. Time: 0.033, Train loss: 2.182
tensor([[3, 8, 6, 9, 0, 6, 0, 7, 6, 5]]) tensor([[3, 5, 3, 6, 7, 3, 7, 4, 3, 9]])
Epoch 100. Time: 0.032, Train loss: 1.884
tensor([[1, 9, 4, 1, 2, 4, 8, 2, 6, 4]]) tensor([[1, 8, 3, 3, 1, 3, 3, 1, 8, 8]])
Epoch 150. Time: 0.033, Train loss: 0.962
tensor([[6, 8, 9, 0, 8, 9, 0, 1, 7, 3]]) tensor([[6, 6, 9, 6, 6, 9, 6, 7, 7, 9]])
Epoch 200. Time: 0.034, Train loss: 0.392
tensor([[0, 8, 2, 3, 0, 6, 1, 3, 4, 2]]) tensor([[0, 8, 2, 3, 0, 6, 1, 3, 4, 2]])
Epoch 250. Time: 0.034, Train loss: 0.188
tensor([[6, 3, 0, 9, 0, 0, 8, 2, 6, 3]]) tensor([[6, 3, 0, 9, 0, 0, 8, 2, 6, 3]])
Epoch 300. Time: 0.032, Train loss: 0.089
tensor([[7, 9, 1, 5, 9, 2, 3, 2, 7, 9]]) tensor([[7, 9, 1, 5, 9, 2, 3, 2, 7, 9]])
Epoch 350. Time: 0.033, Train loss: 0.059
tensor([[1, 9, 6, 4, 3, 6, 2, 2, 4, 1]]) tensor([[1, 9, 6, 4, 3, 6, 2, 2, 4, 1]])
Epoch 400. Time: 0.033, Train loss: 0.047
tensor([[4, 9, 0, 8, 9, 2, 0, 5, 4, 8]]) tensor([[4, 9, 0, 8, 9, 2, 0, 5, 4, 8]])
Epoch 450

- LSTM

In [18]:
# LSTM run.
model = NeuralNetwork(
    rnnClass=nn.LSTM,
    input_size=10,
    embedding_size=64,
    num_hiddens=64,
    num_classes=10,
)
# Move the parameters to the GPU when one is available, otherwise to the CPU.
if torch.cuda.is_available():
    model = model.cuda()
else:
    model = model.cpu()

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

train(model, criterion, optimizer, epochs=1000, sequence_len=10, batch_size=10)

Epoch 50. Time: 0.033, Train loss: 2.079
tensor([[5, 5, 8, 6, 5, 7, 3, 2, 0, 2]]) tensor([[5, 7, 5, 5, 7, 5, 8, 0, 5, 7]])
Epoch 100. Time: 0.033, Train loss: 1.122
tensor([[2, 2, 2, 5, 8, 7, 5, 9, 5, 5]]) tensor([[2, 2, 2, 5, 8, 7, 5, 9, 5, 5]])
Epoch 150. Time: 0.034, Train loss: 0.281
tensor([[3, 0, 1, 1, 8, 0, 4, 4, 1, 4]]) tensor([[3, 0, 1, 1, 8, 0, 4, 4, 1, 4]])
Epoch 200. Time: 0.032, Train loss: 0.106
tensor([[3, 8, 7, 3, 8, 5, 6, 8, 1, 0]]) tensor([[3, 8, 7, 3, 8, 5, 6, 8, 1, 0]])
Epoch 250. Time: 0.031, Train loss: 0.051
tensor([[6, 7, 3, 4, 3, 7, 2, 9, 9, 0]]) tensor([[6, 7, 3, 4, 3, 7, 2, 9, 9, 0]])
Epoch 300. Time: 0.032, Train loss: 0.027
tensor([[3, 9, 1, 5, 6, 6, 3, 0, 4, 3]]) tensor([[3, 9, 1, 5, 6, 6, 3, 0, 4, 3]])
Epoch 350. Time: 0.032, Train loss: 0.018
tensor([[0, 3, 1, 6, 8, 8, 1, 6, 1, 0]]) tensor([[0, 3, 1, 6, 8, 8, 1, 6, 1, 0]])
Epoch 400. Time: 0.032, Train loss: 0.010
tensor([[2, 2, 4, 6, 8, 8, 9, 8, 3, 4]]) tensor([[2, 2, 4, 6, 8, 8, 9, 8, 3, 4]])
Epoch 450

По результатам экспериментов GRU и LSTM показывают сопоставимое качество и скорость сходимости, тогда как ванильная RNN существенно проигрывает им в качестве, выигрывая лишь во времени одной эпохи обучения (≈0.022 с против ≈0.033 с).