In [22]:
import torch
import torch.nn as nn
from torch.optim import SGD 
import numpy as np

# Упражнение, для реализации "Ванильной" RNN
* Попробуем обучить сеть восстанавливать слово hello по первой букве. т.е. построим charecter-level модель

In [23]:
a = torch.ones((3,3))*3
b = torch.ones((3,3))*5

In [24]:
a @ b

tensor([[45., 45., 45.],
        [45., 45., 45.],
        [45., 45., 45.]])

In [25]:
a * b

tensor([[15., 15., 15.],
        [15., 15., 15.],
        [15., 15., 15.]])

In [26]:
# word = 'ololoasdasddqweqw123456789'
word = 'hello'

## Датасет. 
Позволяет:
* Закодировать символ при помощи one-hot
* Делать итератор по слову, которыей возвращает текущий символ и следующий как таргет

In [27]:
class WordDataSet:
    
    def __init__(self, word):
        self.chars2idx = {}
        self.indexs  = []
        for c in word: 
            if c not in self.chars2idx:
                self.chars2idx[c] = len(self.chars2idx)
                
            self.indexs.append(self.chars2idx[c])
            
        self.vec_size = len(self.chars2idx)
        self.seq_len  = len(word)
        
    def get_one_hot(self, idx):
        x = torch.zeros(self.vec_size)
        x[idx] = 1
        return x
    
    def __iter__(self):
        return zip(self.indexs[:-1], self.indexs[1:])
    
    def __len__(self):
        return self.seq_len
    
    def get_char_by_id(self, id):
        for c, i in self.chars2idx.items():
            if id == i: return c
        return None

## Реализация базовой RNN
<br/>
Скрытый элемент
$$ h_t= tanh⁡ (W_{ℎℎ} h_{t−1}+W_{xh} x_t) $$
Выход сети

$$ y_t = W_{hy} h_t $$

In [28]:
class VanillaRNN(nn.Module):
    
    def __init__(self, in_size=5, hidden_size=3, out_size=5):
        super(VanillaRNN, self).__init__()        
        self.x2hidden    = nn.Linear(in_features=in_size, out_features=hidden_size)
        self.hidden      = nn.Linear(in_features=hidden_size, out_features=hidden_size)
        self.activation  = nn.Tanh()
        self.outweight   = nn.Linear(in_features=hidden_size, out_features=out_size)
    
    def forward(self, x, prev_hidden):
        hidden = self.activation(self.x2hidden(x) + self.hidden(prev_hidden))
#         Версия без активации - может происходить gradient exploding
#         hidden = self.x2hidden(x) + self.hidden(prev_hidden)
        output = self.outweight(hidden)
        return output, hidden

## Инициализация переменных 

In [29]:
ds = WordDataSet(word=word)
rnn = VanillaRNN(in_size=ds.vec_size, hidden_size=3, out_size=ds.vec_size)
criterion = nn.CrossEntropyLoss()
e_cnt     = 100
optim     = SGD(rnn.parameters(), lr = 0.1, momentum=0.9)

# Обучение

In [30]:
CLIP_GRAD = True

for epoch in range(e_cnt):
    hh = torch.zeros(rnn.hidden.in_features)
    loss = 0
    optim.zero_grad()
    for sample, next_sample in ds:
        x = ds.get_one_hot(sample).unsqueeze(0)
        target =  torch.LongTensor([next_sample])

        y, hh = rnn(x, hh)
        
        loss += criterion(y, target)
     

    loss.backward()
    
    if epoch % 10 == 0:
        print (loss.data.item())
        if CLIP_GRAD: print("Clip gradient : ", torch.nn.utils.clip_grad_norm_(rnn.parameters(), max_norm=5))
    else: 
        if CLIP_GRAD: torch.nn.utils.clip_grad_norm_(rnn.parameters(), max_norm=1)
            
#     print("Params : ")
#     num_params = 0
#     for item in rnn.parameters():
#         num_params += 1
#         print(item.grad)
#     print("NumParams :", num_params)
#     print("Optimize")
    
    optim.step()

6.241093635559082
Clip gradient :  2.9952871038773936
1.9175242185592651
Clip gradient :  1.0000687721702373
0.053429603576660156
Clip gradient :  0.21417662657747444
0.006561756134033203
Clip gradient :  0.026923841330445467
0.003204822540283203
Clip gradient :  0.013663283545834303
0.002155303955078125
Clip gradient :  0.004036101400384051
0.0018548965454101562
Clip gradient :  0.00474431845079919
0.0016655921936035156
Clip gradient :  0.0030348992287075266
0.0015568733215332031
Clip gradient :  0.002928868256854937
0.0014696121215820312
Clip gradient :  0.0026265569733631773


# Тестирование

In [31]:
rnn.eval()
hh = torch.zeros(rnn.hidden.in_features)
id = 0
softmax  = nn.Softmax(dim=1)
predword = ds.get_char_by_id(id)
for c in enumerate(word[:-1]):
    x = ds.get_one_hot(id).unsqueeze(0)
    y, hh = rnn(x, hh)
    y = softmax(y)
    m, id = torch.max(y, 1)
    id = id.data[0]
    predword += ds.get_char_by_id(id)
print ('Prediction:\t' , predword)
print("Original:\t", word)
assert(predword == word)

Prediction:	 hello
Original:	 hello


# ДЗ
Реализовать LSTM и GRU модули, обучить их предсказывать тестовое слово
Сохранить ноутбук с предсказанием и пройденным assert и прислать на почту a.murashev@corp.mail.ru
c темой:


[МФТИ\_2019\_1] ДЗ №8 ФИО

In [33]:
#тестовое слово
word = 'ololoasdasddqweqw123456789'

## Реализовать LSTM

In [34]:
class LSTM(nn.Module):
    
    def __init__(self, in_size=5, hidden_size=3, out_size=5):
        super(LSTM, self).__init__()
        
        self.Wxc = nn.Linear(in_features=in_size, out_features=hidden_size)
        self.Whc = nn.Linear(in_features=hidden_size, out_features=hidden_size)
        
        self.Wxi = nn.Linear(in_features=in_size, out_features=hidden_size)
        self.Whi = nn.Linear(in_features=hidden_size, out_features=hidden_size)
        
        self.Wxf = nn.Linear(in_features=in_size, out_features=hidden_size)
        self.Whf = nn.Linear(in_features=hidden_size, out_features=hidden_size)
        
        self.Wxo = nn.Linear(in_features=in_size, out_features=hidden_size)
        self.Who = nn.Linear(in_features=hidden_size, out_features=hidden_size)
        
        self.activation_t  = nn.Tanh()
        self.activation_si = nn.Sigmoid()
        self.outweight   = nn.Linear(in_features=hidden_size, out_features=out_size)
    
    def forward(self, x, h_prev, ct_prev):
        ct_ = self.activation_t(self.Wxc(x) + self.Whc(h_prev))
        it = self.activation_si(self.Wxi(x) + self.Whi(h_prev))
        ft = self.activation_si(self.Wxf(x) + self.Whf(h_prev))
        ot = self.activation_si(self.Wxo(x) + self.Who(h_prev))

        ct = ft * ct_prev + it * ct_
        ht = ot * self.activation_t(ct)
        
        output = self.outweight(ht)
        return output, ht, ct

In [38]:
ds = WordDataSet(word=word)
lstm = LSTM(in_size=ds.vec_size, hidden_size=9, out_size=ds.vec_size)
criterion = nn.CrossEntropyLoss()
e_cnt     = 100
optim     = SGD(lstm.parameters(), lr = 0.1, momentum=0.9)

In [39]:
CLIP_GRAD = True

for epoch in range(e_cnt):
    hh = torch.zeros(lstm.Whc.in_features)
    ct = torch.zeros(lstm.Whc.in_features)
    loss = 0
    optim.zero_grad()
    for sample, next_sample in ds:
        x = ds.get_one_hot(sample).unsqueeze(0)
        target =  torch.LongTensor([next_sample])

        y, hh, ct = lstm(x, hh, ct)
        
        loss += criterion(y, target)
     

    loss.backward()
    
    if epoch % 10 == 0:
        print (loss.data.item())
        if CLIP_GRAD: print("Clip gradient : ", torch.nn.utils.clip_grad_norm_(lstm.parameters(), max_norm=5))
    else: 
        if CLIP_GRAD: torch.nn.utils.clip_grad_norm_(lstm.parameters(), max_norm=1)
            
    optim.step()

71.27651977539062
Clip gradient :  3.2333055592902125
64.65594482421875
Clip gradient :  4.671078924658394
44.599403381347656
Clip gradient :  8.193819036041655
33.583961486816406
Clip gradient :  13.216542360622869
22.000526428222656
Clip gradient :  7.019817953584795
12.808025360107422
Clip gradient :  7.344742214919685
9.923479080200195
Clip gradient :  7.831807558328731
6.379441261291504
Clip gradient :  6.17292959923518
3.757295608520508
Clip gradient :  5.158148500315033
1.7752985954284668
Clip gradient :  2.370055276139136


In [43]:
lstm.eval()
hh = torch.zeros(lstm.Whc.in_features)
ct = torch.zeros(lstm.Whc.in_features)
id = 0
softmax  = nn.Softmax(dim=1)
predword = ds.get_char_by_id(id)
for c in enumerate(word[:-1]):
    x = ds.get_one_hot(id).unsqueeze(0)
    y, hh, ct = lstm(x, hh, ct)
    y = softmax(y)
    m, id = torch.max(y, 1)
    id = id.data[0]
    predword += ds.get_char_by_id(id)
print ('Prediction:\t' , predword)
print("Original:\t", word)
assert(predword == word)

Prediction:	 ololoasdasddqweqw123456789
Original:	 ololoasdasddqweqw123456789


## Реализовать GRU

In [129]:
class GRU(nn.Module):
    
    def __init__(self, in_size=5, hidden_size=3, out_size=5):
        super(GRU, self).__init__()
        
        self.Wxu = nn.Linear(in_features=in_size, out_features=hidden_size)
        self.Whu = nn.Linear(in_features=hidden_size, out_features=hidden_size)
        
        self.Wxr = nn.Linear(in_features=in_size, out_features=hidden_size)
        self.Whr = nn.Linear(in_features=hidden_size, out_features=hidden_size)
        
        self.Wxh_ = nn.Linear(in_features=in_size, out_features=hidden_size)
        self.Whh_ = nn.Linear(in_features=hidden_size, out_features=hidden_size)
        
        self.activation_t  = nn.Tanh()
        self.activation_si = nn.Sigmoid()
        self.outweight   = nn.Linear(in_features=hidden_size, out_features=out_size)
    
    def forward(self, x, h_prev):
        ut = self.activation_t(self.Wxu(x) + self.Whu(h_prev))
        rt = self.activation_si(self.Wxr(x) + self.Whr(h_prev))
        h_t = self.activation_t(self.Wxh_(x) + self.Whh_(rt * h_prev))
        
        ht = (torch.ones(ut.size()) - ut) * h_t + ut * h_prev
        
        output = self.outweight(ht)
        return output, ht

In [136]:
ds = WordDataSet(word=word)
gru = GRU(in_size=ds.vec_size, hidden_size=9, out_size=ds.vec_size)
criterion = nn.CrossEntropyLoss()
e_cnt     = 500
lr        = 0.01
optim     = SGD(gru.parameters(), lr = lr, momentum=0.9)

In [137]:
CLIP_GRAD = True


for epoch in range(e_cnt):
    hh = torch.zeros(gru.Whu.in_features)
    loss = 0
    optim.zero_grad()
    for sample, next_sample in ds:
        x = ds.get_one_hot(sample).unsqueeze(0)
        target =  torch.LongTensor([next_sample])

        y, hh = gru(x, hh)
        
        loss += criterion(y, target)
        
    loss.backward()
    
    if epoch % 150 == 0 and epoch != 0:
        lr = lr / 2
        optim = SGD(gru.parameters(), lr = lr, momentum=0.9)
        print('Clipped lr')

    if epoch % 10 == 0:
        print (loss.data.item())
        if CLIP_GRAD: print("Clip gradient : ", torch.nn.utils.clip_grad_norm_(gru.parameters(), max_norm=5))
    else: 
        if CLIP_GRAD: torch.nn.utils.clip_grad_norm_(gru.parameters(), max_norm=1)
            
    optim.step()

71.15213775634766
Clip gradient :  5.694520194498311
67.86914825439453
Clip gradient :  4.728119251044438
62.29629898071289
Clip gradient :  6.898049172334389
53.960670471191406
Clip gradient :  11.60842783438284
42.48074722290039
Clip gradient :  17.097925122630027
31.662235260009766
Clip gradient :  24.93448841701567
24.05257797241211
Clip gradient :  37.16940087202575
18.845861434936523
Clip gradient :  72.69995957023778
16.932872772216797
Clip gradient :  43.58018625849628
14.781505584716797
Clip gradient :  106.4866426979809
13.825221061706543
Clip gradient :  23.87333194149107
9.120635986328125
Clip gradient :  80.56539492775272
11.601009368896484
Clip gradient :  132.87296204974623
7.797486305236816
Clip gradient :  37.10470078667578
8.004602432250977
Clip gradient :  50.45964464025645
Clipped lr
7.927195072174072
Clip gradient :  95.60890427686239
8.688392639160156
Clip gradient :  76.7301522032651
5.0349578857421875
Clip gradient :  25.857105062380043
4.002920627593994
Clip gr

In [138]:
gru.eval()
hh = torch.zeros(gru.Whu.in_features)
id = 0
softmax  = nn.Softmax(dim=1)
predword = ds.get_char_by_id(id)
for c in enumerate(word[:-1]):
    x = ds.get_one_hot(id).unsqueeze(0)
    y, hh = gru(x, hh)
    y = softmax(y)
    m, id = torch.max(y, 1)
    id = id.data[0]
    predword += ds.get_char_by_id(id)
print ('Prediction:\t' , predword)
print("Original:\t", word)
assert(predword == word)

Prediction:	 ololoasdasddqweqw123456789
Original:	 ololoasdasddqweqw123456789
