In [2]:
import torch
import torch.nn as nn
from torch.optim import SGD 
import numpy as np

# Упражнение, для реализации "Ванильной" RNN
* Попробуем обучить сеть восстанавливать слово hello по первой букве. т.е. построим charecter-level модель

In [3]:
a = torch.ones((3,3))*3
b = torch.ones((3,3))*5

In [4]:
a @ b

tensor([[45., 45., 45.],
        [45., 45., 45.],
        [45., 45., 45.]])

In [5]:
a * b

tensor([[15., 15., 15.],
        [15., 15., 15.],
        [15., 15., 15.]])

In [6]:
# word = 'ololoasdasddqweqw123456789'
word = 'hello'

## Датасет. 
Позволяет:
* Закодировать символ при помощи one-hot
* Делать итератор по слову, которыей возвращает текущий символ и следующий как таргет

In [7]:
class WordDataSet:
    
    def __init__(self, word):
        self.chars2idx = {}
        self.indexs  = []
        for c in word: 
            if c not in self.chars2idx:
                self.chars2idx[c] = len(self.chars2idx)
                
            self.indexs.append(self.chars2idx[c])
            
        self.vec_size = len(self.chars2idx)
        self.seq_len  = len(word)
        
    def get_one_hot(self, idx):
        x = torch.zeros(self.vec_size)
        x[idx] = 1
        return x
    
    def __iter__(self):
        return zip(self.indexs[:-1], self.indexs[1:])
    
    def __len__(self):
        return self.seq_len
    
    def get_char_by_id(self, id):
        for c, i in self.chars2idx.items():
            if id == i: return c
        return None

## Реализация базовой RNN
<br/>
Скрытый элемент
$$ h_t= tanh⁡ (W_{ℎℎ} h_{t−1}+W_{xh} x_t) $$
Выход сети

$$ y_t = W_{hy} h_t $$

In [8]:
class VanillaRNN(nn.Module):
    
    def __init__(self, in_size=5, hidden_size=3, out_size=5):
        super(VanillaRNN, self).__init__()        
        self.x2hidden    = nn.Linear(in_features=in_size, out_features=hidden_size)
        self.hidden      = nn.Linear(in_features=hidden_size, out_features=hidden_size)
        self.activation  = nn.Tanh()
        self.outweight   = nn.Linear(in_features=hidden_size, out_features=out_size)
    
    def forward(self, x, prev_hidden):
        hidden = self.activation(self.x2hidden(x) + self.hidden(prev_hidden))
#         Версия без активации - может происходить gradient exploding
#         hidden = self.x2hidden(x) + self.hidden(prev_hidden)
        output = self.outweight(hidden)
        return output, hidden

## Инициализация переменных 

In [9]:
ds = WordDataSet(word=word)
rnn = VanillaRNN(in_size=ds.vec_size, hidden_size=3, out_size=ds.vec_size)
criterion = nn.CrossEntropyLoss()
e_cnt     = 100
optim     = SGD(rnn.parameters(), lr = 0.1, momentum=0.9)

# Обучение

In [10]:
CLIP_GRAD = True

for epoch in range(e_cnt):
    hh = torch.zeros(rnn.hidden.in_features)
    loss = 0
    optim.zero_grad()
    for sample, next_sample in ds:
        x = ds.get_one_hot(sample).unsqueeze(0)
        target =  torch.LongTensor([next_sample])

        y, hh = rnn(x, hh)
        
        loss += criterion(y, target)
     

    loss.backward()
    
    if epoch % 10 == 0:
        print (loss.data.item())
        if CLIP_GRAD: print("Clip gradient : ", torch.nn.utils.clip_grad_norm_(rnn.parameters(), max_norm=5))
    else: 
        if CLIP_GRAD: torch.nn.utils.clip_grad_norm_(rnn.parameters(), max_norm=1)
    optim.step()

5.747655868530273
Clip gradient :  2.8633652854182223
2.580684185028076
Clip gradient :  1.1390055667942147
1.6786515712738037
Clip gradient :  0.7824824650247073
1.4024085998535156
Clip gradient :  0.23069135309706662
1.2504534721374512
Clip gradient :  0.6627486907084564
0.46610116958618164
Clip gradient :  3.3151692164936426
3.035186529159546
Clip gradient :  3.612239102185101
2.026512384414673
Clip gradient :  1.5675161791218615
1.8561747074127197
Clip gradient :  3.25035965167037
1.6936414241790771
Clip gradient :  0.8820896676860384


# Тестирование

In [11]:
rnn.eval()
hh = torch.zeros(rnn.hidden.in_features)
id = 0
softmax  = nn.Softmax(dim=1)
predword = ds.get_char_by_id(id)
for c in enumerate(word[:-1]):
    x = ds.get_one_hot(id).unsqueeze(0)
    y, hh = rnn(x, hh)
    y = softmax(y)
    m, id = torch.max(y, 1)
    id = id.data[0]
    predword += ds.get_char_by_id(id)
print ('Prediction:\t' , predword)
print("Original:\t", word)
assert(predword == word)

Prediction:	 heell
Original:	 hello


AssertionError: 

# ДЗ
Реализовать LSTM и GRU модули, обучить их предсказывать тестовое слово
Сохранить ноутбук с предсказанием и пройденным assert и прислать на почту a.murashev@corp.mail.ru
c темой:


[МФТИ\_2019\_1] ДЗ №8 ФИО

In [12]:
#тестовое слово
word = 'ololoasdasddqweqw123456789'

## Реализовать LSTM

In [23]:
#Написать реализацию LSTM и обучить предсказывать слово
class LSTM(nn.Module):
    def __init__(self, in_size, hidden_size, out_size):
        super(LSTM, self).__init__()
    
        self.n_a = hidden_size
        self.n_x = in_size
        self.n_y = out_size
        
        self.forget_gate = nn.Sequential(nn.Linear(in_size + hidden_size, hidden_size), nn.Sigmoid())
        self.update_gate = nn.Sequential(nn.Linear(in_size + hidden_size, hidden_size), nn.Sigmoid())
        self.output_gate = nn.Sequential(nn.Linear(in_size + hidden_size, hidden_size), nn.Sigmoid())
        self.cand_cell  = nn.Sequential(nn.Linear(in_size + hidden_size, hidden_size), nn.Tanh())
        self.out_weight = nn.Linear(hidden_size, out_size)
        
        self.hidden_activation = nn.Tanh()
        
    def forward(self, x, prev_hidden):
        concat = torch.cat((prev_hidden.squeeze(0), x.squeeze(0)))
        
        ft = self.forget_gate(concat)
        it = self.update_gate(concat)
        ot = self.output_gate(concat)
        
        cct = self.cand_cell(concat)
        c_next = cct * it + prev_hidden * ft
        a_next = ot * self.hidden_activation(c_next)
        
        output = self.out_weight(a_next).unsqueeze(0)
        return output, a_next
    

## Инициализация переменных 

In [24]:
ds = WordDataSet(word=word)
lstm = LSTM(in_size=ds.vec_size, hidden_size=100, out_size=ds.vec_size)
criterion = nn.CrossEntropyLoss()
e_cnt     = 100
optim     = SGD(lstm.parameters(), lr = 0.1, momentum=0.9)

# Обучение

In [25]:
CLIP_GRAD = True

for epoch in range(e_cnt):
    hh = torch.zeros(lstm.n_a)
    loss = 0
    optim.zero_grad()
    for sample, next_sample in ds:
        x = ds.get_one_hot(sample).unsqueeze(0)
        target =  torch.LongTensor([next_sample])

        y, hh = lstm(x, hh)
        
        loss += criterion(y, target)
     

    loss.backward()
    
    if epoch % 10 == 0:
        print (loss.data.item())
        if CLIP_GRAD: print("Clip gradient : ", torch.nn.utils.clip_grad_norm_(lstm.parameters(), max_norm=5))
    else: 
        if CLIP_GRAD: torch.nn.utils.clip_grad_norm_(lstm.parameters(), max_norm=1)
    
    optim.step()

70.78539276123047
Clip gradient :  2.8101644130156904
64.78363037109375
Clip gradient :  2.6121712946733946
38.71027755737305
Clip gradient :  4.894155420342544
11.478543281555176
Clip gradient :  6.055805060897508
3.4089574813842773
Clip gradient :  4.454494222721922
1.274489402770996
Clip gradient :  3.3726281853669455
0.25881481170654297
Clip gradient :  0.6689003434111417
0.07451248168945312
Clip gradient :  0.14222118346577128
0.03605842590332031
Clip gradient :  0.05526603356976821
0.020552635192871094
Clip gradient :  0.03007577938131145


# Тестирование

In [29]:
lstm.eval()
hh = torch.zeros(lstm.n_a)
id = 0
softmax  = nn.Softmax(dim=1)
predword = ds.get_char_by_id(id)
for c in enumerate(word[:-1]):
    x = ds.get_one_hot(id).unsqueeze(0)
    y, hh = lstm(x, hh)
    y = softmax(y)
    m, id = torch.max(y, 1)
    id = id.data[0]
    predword += ds.get_char_by_id(id)
print ('Prediction:\t' , predword)
print("Original:\t", word)
assert(predword == word)

Prediction:	 ololoasdasddqweqw123456789
Original:	 ololoasdasddqweqw123456789


## Реализовать GRU

In [34]:
#Написать реализацию GRU и обучить предсказывать слово
class GRU(nn.Module):
    def __init__(self, in_size, hidden_size, out_size):
        super(GRU, self).__init__()
    
        self.n_a = hidden_size
        self.n_x = in_size
        self.n_y = out_size
        
        self.update_gate = nn.Sequential(nn.Linear(in_size + hidden_size, hidden_size), nn.Sigmoid())
        self.relevance_gate = nn.Sequential(nn.Linear(in_size + hidden_size, hidden_size), nn.Sigmoid())
        self.candidate_cell  = nn.Sequential(nn.Linear(in_size + hidden_size, hidden_size), nn.Tanh())
        self.out_weight = nn.Linear(hidden_size, out_size)
        
        self.hidden_activation = nn.Tanh()
        self.candiate_activation = nn.Tanh()
        
    def forward(self, x, prev_hidden):
        a_prev = prev_hidden.squeeze(0)
        concat = torch.cat((a_prev, x.squeeze(0)))
        
        update_gate = self.update_gate(concat)
        relevance_gate = self.relevance_gate(concat)
        rel_candidate = relevance_gate * a_prev
        
        concat = torch.cat((rel_candidate, x.squeeze(0)))
        candidate = self.candidate_cell(concat)
        
        c_next = candidate * (1 - update_gate) + update_gate * a_prev
        output = self.out_weight(c_next).unsqueeze(0)
        return output, c_next

## Инициализация переменных 

In [35]:
ds = WordDataSet(word=word)
gru = GRU(in_size=ds.vec_size, hidden_size=100, out_size=ds.vec_size)
criterion = nn.CrossEntropyLoss()
e_cnt     = 100
optim     = SGD(gru.parameters(), lr = 0.1, momentum=0.9)

# Обучение

In [36]:
CLIP_GRAD = True

for epoch in range(e_cnt):
    hh = torch.zeros(gru.n_a)
    loss = 0
    optim.zero_grad()
    for sample, next_sample in ds:
        x = ds.get_one_hot(sample).unsqueeze(0)
        target =  torch.LongTensor([next_sample])

        y, hh = gru(x, hh)
        
        loss += criterion(y, target)
     

    loss.backward()
    
    if epoch % 10 == 0:
        print (loss.data.item())
        if CLIP_GRAD: print("Clip gradient : ", torch.nn.utils.clip_grad_norm_(gru.parameters(), max_norm=5))
    else: 
        if CLIP_GRAD: torch.nn.utils.clip_grad_norm_(gru.parameters(), max_norm=1)
 
    optim.step()

70.8595199584961
Clip gradient :  4.2992556486134434
50.59659957885742
Clip gradient :  8.415338562987163
22.12432289123535
Clip gradient :  18.955736461376528
21.041790008544922
Clip gradient :  24.87605190186161
15.885137557983398
Clip gradient :  26.549641262734173
7.04307222366333
Clip gradient :  12.520580408605676
5.053359508514404
Clip gradient :  9.87925342843521
4.061925411224365
Clip gradient :  7.368254379531738
0.5850315093994141
Clip gradient :  1.533113124286375
0.0706939697265625
Clip gradient :  0.20630092655106183


# Тестирование

In [37]:
gru.eval()
hh = torch.zeros(gru.n_a)
id = 0
softmax  = nn.Softmax(dim=1)
predword = ds.get_char_by_id(id)
for c in enumerate(word[:-1]):
    x = ds.get_one_hot(id).unsqueeze(0)
    y, hh = gru(x, hh)
    y = softmax(y)
    m, id = torch.max(y, 1)
    id = id.data[0]
    predword += ds.get_char_by_id(id)
print ('Prediction:\t' , predword)
print("Original:\t", word)
assert(predword == word)

Prediction:	 ololoasdasddqweqw123456789
Original:	 ololoasdasddqweqw123456789
