# Рекуррентные нейросети

Построим простейшую нейросеть для посимвольной генерации текста

In [1]:
import pandas as pd  # для работы с данными
import time  # для оценки времени
import torch  # для написания нейросети

from torch import nn

import gc

## Загрузка данных

Будем работать с датасетом реплик из Симпсонов. Нам нужно извлечь предобработанные тексты и закодировать их числами

In [3]:
df = pd.read_csv('../simpsons_script_lines.csv')
df.head()

  df = pd.read_csv('../simpsons_script_lines.csv')


Unnamed: 0,id,episode_id,number,raw_text,timestamp_in_ms,speaking_line,character_id,location_id,raw_character_text,raw_location_text,spoken_words,normalized_text,word_count
0,9549,32,209,"Miss Hoover: No, actually, it was a little of ...",848000,True,464.0,3.0,Miss Hoover,Springfield Elementary School,"No, actually, it was a little of both. Sometim...",no actually it was a little of both sometimes ...,31
1,9550,32,210,Lisa Simpson: (NEAR TEARS) Where's Mr. Bergstrom?,856000,True,9.0,3.0,Lisa Simpson,Springfield Elementary School,Where's Mr. Bergstrom?,wheres mr bergstrom,3
2,9551,32,211,Miss Hoover: I don't know. Although I'd sure l...,856000,True,464.0,3.0,Miss Hoover,Springfield Elementary School,I don't know. Although I'd sure like to talk t...,i dont know although id sure like to talk to h...,22
3,9552,32,212,Lisa Simpson: That life is worth living.,864000,True,9.0,3.0,Lisa Simpson,Springfield Elementary School,That life is worth living.,that life is worth living,5
4,9553,32,213,Edna Krabappel-Flanders: The polls will be ope...,864000,True,40.0,3.0,Edna Krabappel-Flanders,Springfield Elementary School,The polls will be open from now until the end ...,the polls will be open from now until the end ...,33


In [4]:
phrases = df['normalized_text'].tolist()  # колонка с предобработанными текстами
phrases[:10]

['no actually it was a little of both sometimes when a disease is in all the magazines and all the news shows its only natural that you think you have it',
 'wheres mr bergstrom',
 'i dont know although id sure like to talk to him he didnt touch my lesson plan what did he teach you',
 'that life is worth living',
 'the polls will be open from now until the end of recess now just in case any of you have decided to put any thought into this well have our final statements martin',
 'i dont think theres anything left to say',
 'bart',
 'victory party under the slide',
 nan,
 'mr bergstrom mr bergstrom']

In [5]:
text = [[c for c in ph] for ph in phrases if type(ph) is str]

## Создаём массив с данными

Нужно

1. Разбить данные на токены (у нас символы)
2. Закодировать числами
3. Превратить в эмбеддинги

In [6]:
CHARS = set('abcdefghijklmnopqrstuvwxyz ')  # все символы, которые мы хотим использовать для кодировки = наш словарь
INDEX_TO_CHAR = ['none'] + [w for w in CHARS]  # все неизвестные символы будут получать тег none
CHAR_TO_INDEX = {w: i for i, w in enumerate(INDEX_TO_CHAR)}  # словарь токен-индекс

In [7]:
len(INDEX_TO_CHAR)

28

In [8]:
print(INDEX_TO_CHAR)

['none', 'z', 'd', 'a', 'k', 'g', 'f', 'j', 'y', 'v', 'b', ' ', 't', 'w', 'u', 'h', 'q', 'o', 'i', 'r', 's', 'n', 'l', 'm', 'x', 'c', 'p', 'e']


In [9]:
print(CHAR_TO_INDEX)

{'none': 0, 'z': 1, 'd': 2, 'a': 3, 'k': 4, 'g': 5, 'f': 6, 'j': 7, 'y': 8, 'v': 9, 'b': 10, ' ': 11, 't': 12, 'w': 13, 'u': 14, 'h': 15, 'q': 16, 'o': 17, 'i': 18, 'r': 19, 's': 20, 'n': 21, 'l': 22, 'm': 23, 'x': 24, 'c': 25, 'p': 26, 'e': 27}


In [13]:
IND2CHAR =  { v:k for k,v in CHAR_TO_INDEX.items()}
print(IND2CHAR)

{0: 'none', 1: 'e', 2: 't', 3: 'n', 4: 'h', 5: 'q', 6: 'f', 7: 'w', 8: 'g', 9: ' ', 10: 'o', 11: 's', 12: 'j', 13: 'm', 14: 'c', 15: 'd', 16: 'l', 17: 'r', 18: 'a', 19: 'k', 20: 'v', 21: 'i', 22: 'p', 23: 'u', 24: 'b', 25: 'z', 26: 'y', 27: 'x'}


In [8]:
MAX_LEN = 50  # мы хотим ограничить максимальную длину ввода
X = torch.zeros((len(text), MAX_LEN), dtype=int)  # создаём пустой вектор для текста, чтобы класть в него индексы токенов
for i in range(len(text)):  # для каждого предложения
    for j, w in enumerate(text[i]):  # для каждого токена
        if j >= MAX_LEN:
            break
        X[i, j] = CHAR_TO_INDEX.get(w, CHAR_TO_INDEX['none'])

In [9]:
print(len(X))

132087


In [10]:
batch = X[0:5]
print(batch, batch.shape)
# print(btch[:,:-1], btch[:,:-1].shape)
# print(btch[:,1:], btch[:,1:].shape)

tensor([[ 3, 10,  9, 18, 14,  2, 23, 18, 16, 16, 26,  9, 21,  2,  9,  7, 18, 11,
          9, 18,  9, 16, 21,  2,  2, 16,  1,  9, 10,  6,  9, 24, 10,  2,  4,  9,
         11, 10, 13,  1,  2, 21, 13,  1, 11,  9,  7,  4,  1,  3],
        [ 7,  4,  1, 17,  1, 11,  9, 13, 17,  9, 24,  1, 17,  8, 11,  2, 17, 10,
         13,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [21,  9, 15, 10,  3,  2,  9, 19,  3, 10,  7,  9, 18, 16,  2,  4, 10, 23,
          8,  4,  9, 21, 15,  9, 11, 23, 17,  1,  9, 16, 21, 19,  1,  9,  2, 10,
          9,  2, 18, 16, 19,  9,  2, 10,  9,  4, 21, 13,  9,  4],
        [ 2,  4, 18,  2,  9, 16, 21,  6,  1,  9, 21, 11,  9,  7, 10, 17,  2,  4,
          9, 16, 21, 20, 21,  3,  8,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 2,  4,  1,  9, 22, 10, 16, 16, 11,  9,  7, 21, 16, 16,  9, 24,  1,  9,
       

In [14]:
import numpy as np
# arr = np.zeros((len(btch),50))
arr = []
for i in range(len(batch[:, 1:])):
    row = []
    for j in range(len(batch[:, 1:][0])):
        if IND2CHAR[batch[:, 1:][i,j].item()] == 'none':
            break
        else:
            row.append(IND2CHAR[batch[:, 1:][i,j].item()])
#         arr[i,j] = IND2CHAR[btch[i,j].item()]
    arr.append(row)
print(arr)

[['o', ' ', 'a', 'c', 't', 'u', 'a', 'l', 'l', 'y', ' ', 'i', 't', ' ', 'w', 'a', 's', ' ', 'a', ' ', 'l', 'i', 't', 't', 'l', 'e', ' ', 'o', 'f', ' ', 'b', 'o', 't', 'h', ' ', 's', 'o', 'm', 'e', 't', 'i', 'm', 'e', 's', ' ', 'w', 'h', 'e', 'n'], ['h', 'e', 'r', 'e', 's', ' ', 'm', 'r', ' ', 'b', 'e', 'r', 'g', 's', 't', 'r', 'o', 'm'], [' ', 'd', 'o', 'n', 't', ' ', 'k', 'n', 'o', 'w', ' ', 'a', 'l', 't', 'h', 'o', 'u', 'g', 'h', ' ', 'i', 'd', ' ', 's', 'u', 'r', 'e', ' ', 'l', 'i', 'k', 'e', ' ', 't', 'o', ' ', 't', 'a', 'l', 'k', ' ', 't', 'o', ' ', 'h', 'i', 'm', ' ', 'h'], ['h', 'a', 't', ' ', 'l', 'i', 'f', 'e', ' ', 'i', 's', ' ', 'w', 'o', 'r', 't', 'h', ' ', 'l', 'i', 'v', 'i', 'n', 'g'], ['h', 'e', ' ', 'p', 'o', 'l', 'l', 's', ' ', 'w', 'i', 'l', 'l', ' ', 'b', 'e', ' ', 'o', 'p', 'e', 'n', ' ', 'f', 'r', 'o', 'm', ' ', 'n', 'o', 'w', ' ', 'u', 'n', 't', 'i', 'l', ' ', 't', 'h', 'e', ' ', 'e', 'n', 'd', ' ', 'o', 'f', ' ', 'r']]


## Embedding и RNN ячейки

Каждому токену мы хотим сопоставить не просто число, но вектор. Поэтому вектор текста нам нужно умножить на матрицу эмбеддингов, которая тоже будет учиться в процессе обучения нейросети. Для создания такой матрицы нам нужен слой `nn.Embedding`

In [15]:
X[0:5].shape

torch.Size([5, 50])

In [16]:
embeddings = torch.nn.Embedding(len(INDEX_TO_CHAR), 28)  # размер словаря * размер вектора для кодировки каждого слова
t = embeddings(X[0:5])
t.shape

torch.Size([5, 50, 28])

In [17]:
t.shape, X[0:5].shape

(torch.Size([5, 50, 28]), torch.Size([5, 50]))

In [18]:
rnn = torch.nn.RNN(28, 128, batch_first=True)  # на вход - размер эмбеддинга, размер скрытого состояния и порядок размерностей
o, s = rnn(t)
# вектора для слов: батч * число токенов * размер скрытого состояния
# вектор скрытого состояния: число вектров (один) * батч * размер скрытого состояния
o.shape, s.shape

(torch.Size([5, 50, 128]), torch.Size([1, 5, 128]))

Можно применять несколько рекуррентных ячеек подряд

In [19]:
o, s2 = rnn(t, s)
o.shape, s2.shape

(torch.Size([5, 50, 128]), torch.Size([1, 5, 128]))

## Реализация сети с RNN
3 слоя:
1. Embeding (30)
2. RNN (hidden_dim=128)
3. Полносвязный слой для предсказания буквы (28, то есть размер словаря)

In [20]:
class Network(torch.nn.Module):
    def __init__(self):
        super(Network, self).__init__()
        self.embedding = torch.nn.Embedding(28, 30)
        self.rnn = torch.nn.RNN(30, 128)
        self.out = torch.nn.Linear(128, 28)

    def forward(self, sentences, state=None):
        x = self.embedding(sentences)
        x, s = self.rnn(x) # берём выход с последнего слоя для всех токенов, а не скрытое состояние
        return self.out(x)

In [21]:
model = Network()

In [22]:
criterion = torch.nn.CrossEntropyLoss()  # типичный лосс многоклассовой классификации
optimizer = torch.optim.SGD(model.parameters(), lr=.05)

In [23]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [24]:
torch.cuda.empty_cache()
model.to(device)

Network(
  (embedding): Embedding(28, 30)
  (rnn): RNN(30, 128)
  (out): Linear(in_features=128, out_features=28, bias=True)
)

In [25]:
with torch.no_grad():
    print(X[:5].shape)
    print(X[:5][:, :-1].shape)
    print(X[:5][:, 1:].shape)
    print(model(X[:5][:, :-1].to(device)).shape)
    print(X[:5][:, 1:].flatten().shape)
    print(model(X[:5][:, :-1].to(device)).view(-1, len(INDEX_TO_CHAR)).shape)

torch.Size([5, 50])
torch.Size([5, 49])
torch.Size([5, 49])
torch.Size([5, 49, 28])
torch.Size([245])
torch.Size([245, 28])


Обучение:

In [26]:
batch_size = 200

In [29]:
for ep in range(20):
    start = time.time()
    train_loss = 0.
    train_passed = 0
    
    for i in range(int(len(X) / batch_size)):
#     for i in range(int(len(X) / 100)):
        # берём батч в 100 элементов
        batch = X[i * batch_size:(i + 1) * batch_size]
#         batch = X[i * 100:(i + 1) * 100]
        batch  =  batch.to(device)
        X_batch = batch[:, :-1]
        Y_batch = batch[:, 1:].flatten()

        optimizer.zero_grad()
        answers = model.forward(X_batch)
        answers = answers.view(-1, len(INDEX_TO_CHAR))
        loss = criterion(answers, Y_batch)
        train_loss += loss.item()

        loss.backward()
        optimizer.step()
        train_passed += 1
        
        torch.cuda.empty_cache()
        gc.collect()

    print("Epoch {}. Time: {:.3f}, Train loss: {:.3f}".format(ep+1, time.time() - start, train_loss / train_passed))

Epoch 1. Time: 88.843, Train loss: 1.902
Epoch 2. Time: 90.615, Train loss: 1.758
Epoch 3. Time: 89.584, Train loss: 1.730
Epoch 4. Time: 91.146, Train loss: 1.718
Epoch 5. Time: 89.689, Train loss: 1.710
Epoch 6. Time: 90.019, Train loss: 1.705
Epoch 7. Time: 90.106, Train loss: 1.701
Epoch 8. Time: 92.126, Train loss: 1.697
Epoch 9. Time: 92.939, Train loss: 1.693
Epoch 10. Time: 94.437, Train loss: 1.688
Epoch 11. Time: 92.348, Train loss: 1.685
Epoch 12. Time: 103.423, Train loss: 1.683
Epoch 13. Time: 95.715, Train loss: 1.681
Epoch 14. Time: 99.818, Train loss: 1.679
Epoch 15. Time: 96.081, Train loss: 1.677
Epoch 16. Time: 94.032, Train loss: 1.676
Epoch 17. Time: 91.382, Train loss: 1.674
Epoch 18. Time: 91.785, Train loss: 1.672
Epoch 19. Time: 91.231, Train loss: 1.671
Epoch 20. Time: 89.379, Train loss: 1.670


In [30]:
torch.cuda.empty_cache()
gc.collect()

0

In [44]:
mode.eval()
print(model(torch.tensor([CHAR_TO_INDEX[ch] for ch in 'dog'],device='cuda')).topk(1))

torch.return_types.topk(
values=tensor([[3.7670],
        [2.5037],
        [3.9435]], device='cuda:0', grad_fn=<TopkBackward0>),
indices=tensor([[19],
        [12],
        [ 5]], device='cuda:0'))


## Генерация


- Сначала отправлем в модель буквы из предложения (прогревая состояние)
- Затем берём самую вероятную букву и добавляем её в предложение
- Повторяем пока не получим none (0)

In [45]:
CHAR_TO_INDEX['none']

0

In [52]:
def generate_sentence(word):
    sentence = list(word)
    sentence = [CHAR_TO_INDEX.get(s, 0) for s in sentence]
    answers = model.forward(torch.tensor(sentence, device=device))
    probas, indices = answers.topk(1)
    return ''.join([INDEX_TO_CHAR[ind.item()] for ind in indices.flatten()])

In [53]:
generate_sentence('dog')

' uo'

In [54]:
generate_sentence('It is')

'none tn '