[View in Colaboratory](https://colab.research.google.com/gist/naviarh/582873f77f47ff0669b7d979495d36aa/fastai_rnn3.ipynb)

In [0]:
!pip install --upgrade pip
!pip3 install fastai

### Настройка платформы

In [37]:
# Put these at the top of every notebook, to get automatic reloading and inline plotting
%reload_ext autoreload
%autoreload 2
%matplotlib inline
!pwd

/content


In [0]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

In [39]:
import subprocess, os
os.uname()

posix.uname_result(sysname='Linux', nodename='debdfa8970b1', release='4.14.33+', version='#1 SMP Wed Jun 20 01:15:52 PDT 2018', machine='x86_64')

In [40]:
from fastai.io import *
from fastai.structured import *
from fastai.column_data import *
from fastai.conv_learner import *
!pip3 show fastai torch | grep Name -A 1

Name: fastai
Version: 0.7.0
--
Name: torch
Version: 0.3.1


In [0]:
torch.cuda.set_device(0)
torch.cuda.get_device_name(0)

In [0]:
#from fastai.imports import *
#from fastai.transforms import *
#from fastai.conv_learner import *
#from fastai.model import *
#from fastai.dataset import *
#from fastai.sgdr import *
#from fastai.plots import *
#!pip3 show fastai torch | grep Name -A 1

In [0]:
!pip3 install spacy

In [0]:
from torchtext import vocab, data
from fastai.nlp import *
from fastai.lm_rnn import *

### Начало

In [41]:
# Организуем файловую структуру
PATH='data/nietzsche/'
TRN_PATH = 'trn/'
VAL_PATH = 'val/'
TRN = f'{PATH}{TRN_PATH}'
VAL = f'{PATH}{VAL_PATH}'
os.makedirs(TRN, exist_ok=True)
os.makedirs(VAL, exist_ok=True)
os.makedirs(f'{PATH}models', exist_ok=True)
%ls {PATH}

[0m[01;34mmodels[0m/  nietzsche.txt  [01;34mtrn[0m/  [01;34mval[0m/


In [51]:
# Download the corpus
get_data("https://s3.amazonaws.com/text-datasets/nietzsche.txt", f'{PATH}nietzsche.txt')
# Read it in; the context manager closes the file handle deterministically
with open(f'{PATH}nietzsche.txt') as f:
    text = f.read()
# Split roughly 80% / 20%. Unary minus binds tighter than //, so the slice bound is
# floor(-len/5): the validation tail is len/5 rounded up.
t80 = text[:-len(text)//5]
t20 = text[len(t80):]

len(text), len(t80), len(t20)

(600893, 480714, 120179)

In [56]:
# Создадим тренировочный и валидационный тексты
open(f'{TRN}trn.txt','w').write(t80)
open(f'{VAL}val.txt','w').write(t20)
%ls {TRN} {VAL}

data/nietzsche/trn/:
trn.txt

data/nietzsche/val/:
val.txt


In [70]:
# Data preprocessor from the torchtext module:
# lowercases the text and tokenizes it character by character (the list("string") function).
# tokenize=list means the minibatches are made up of characters
TEXT = data.Field(lower=True, tokenize=list)
TEXT

<torchtext.data.field.Field at 0x7f52aff8ad68>

In [0]:
bs=64 # minibatch size
bptt=8 # recursion length (how many steps back-propagation through time covers)
n_fac=42 # embedding size
n_hidden=256 # hidden layer size

In [69]:
# Sub-folders of the training, validation and test sets
# (the test set reuses the validation folder)
FILES = dict(train=TRN_PATH, validation=VAL_PATH, test=VAL_PATH)
FILES

{'test': 'val/', 'train': 'trn/', 'validation': 'val/'}

### Объект данных

In [71]:
# Model data object (min_freq=3: ignore characters that occur fewer than 3 times)
md = LanguageModelData.from_text_files(PATH, TEXT, **FILES, bs=bs, bptt=bptt, min_freq=3)
md

<fastai.nlp.LanguageModelData at 0x7f52affd9f60>

In [105]:
# The vocabulary maps characters to their indices
TEXT.vocab.stoi['e']

3

In [106]:
# ...and maps indices back to characters
TEXT.vocab.itos[3:10]

['e', 't', 'i', 'a', 'o', 'n', 's']

In [73]:
# Number of minibatches; should equal (number of tokens)/bs/bptt
# If the number of tokens is not a multiple of bptt*bs, the last minibatch is shorter
# PyTorch keeps bptt=8, but in 5% of cases changes it slightly while preserving the mean
# NOTE(review): the original note attributes this to PyTorch; the length jitter may
# actually come from the data loader behind md.trn_dl - confirm.
len(md.trn_dl)

922

In [75]:
# Number of unique tokens (characters)
md.nt

55

In [111]:
# Number of training datasets and the length of the first one's text
len(md.trn_ds), len(md.trn_ds[0].text)

(1, 472943)

### Модель RNN

In [0]:
# Model definition
class CharSeqStatefulRnn(nn.Module):
    """Stateful character-level language model built on ``nn.RNN``.

    The hidden state is kept on the instance (``self.h``) between calls, so
    consecutive minibatches continue from where the previous one stopped.

    Args:
        vocab_size: number of distinct tokens; size of the embedding table and
            of the output layer.
        n_fac: embedding dimension.
        bs: batch size used to allocate the initial hidden state.

    The hidden size is taken from the notebook-level ``n_hidden`` once, at
    construction, and stored as ``self.n_hidden``. Re-assigning the global later
    therefore cannot desynchronise ``init_hidden`` from the layers already built.
    """
    def __init__(self, vocab_size, n_fac, bs):
        super().__init__()
        self.vocab_size = vocab_size
        # Snapshot the global so later `n_hidden = ...` cells do not affect this model
        self.n_hidden = n_hidden
        self.e = nn.Embedding(vocab_size, n_fac)
        self.rnn = nn.RNN(n_fac, self.n_hidden)
        self.l_out = nn.Linear(self.n_hidden, vocab_size)
        # The hidden state is created here, in the constructor
        self.init_hidden(bs)

    def forward(self, cs):
        """Return log-probabilities flattened to ``(-1, vocab_size)``.

        ``cs`` is indexed as ``cs[0].size(0)`` to obtain the batch size, i.e. it is
        expected sequence-first (the default layout of ``nn.RNN``).
        """
        bs = cs[0].size(0)
        # At the end of an epoch the minibatch may be incomplete;
        # when the batch size changed, start from a fresh hidden state.
        if self.h.size(1) != bs: self.init_hidden(bs)
        outp,h = self.rnn(self.e(cs), self.h)
        # Keep the activations but not their gradient history (stops backpropagation);
        # this is what truncates back-propagation through time (BPTT).
        self.h = repackage_var(h)
        # Shape the predictions for the loss function:
        # dim=-1 - log_softmax is computed over the last axis;
        # .view() - flattens the l_out result to bs*bptt rows.
        # Predictions are reshaped here, the target is put into the
        # matching format by torchtext.
        return F.log_softmax(self.l_out(outp), dim=-1).view(-1, self.vocab_size)

    def init_hidden(self, bs):
        """Reset the hidden state to zeros of shape ``(1, bs, n_hidden)``."""
        self.h = V(torch.zeros(1, bs, self.n_hidden))

In [114]:
# Model and optimizer
# The 512 passed as `bs` only sizes the initial hidden state; forward() rebuilds it
# whenever the incoming batch size differs.
m = CharSeqStatefulRnn(md.nt, n_fac, 512).cuda()
opt = optim.Adam(m.parameters(), 1e-3)
m

CharSeqStatefulRnn(
  (e): Embedding(55, 42)
  (rnn): RNN(42, 256)
  (l_out): Linear(in_features=256, out_features=55, bias=True)
)

Тренируем модель

In [116]:
fit(m, md, 4, opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=4), HTML(value='')))

epoch      trn_loss   val_loss   
    0      1.880135   1.868499  
 24%|██▍       | 225/922 [00:06<00:18, 36.71it/s, loss=1.82]

    1      1.702985   1.723049  


    2      1.619925   1.645939  
    3      1.569367   1.611201  



[array([1.6112])]

In [0]:
set_lrs(opt, 1e-4)

In [119]:
fit(m, md, 5, opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=5), HTML(value='')))

epoch      trn_loss   val_loss   
    0      1.491326   1.566226  
 24%|██▍       | 223/922 [00:05<00:18, 38.04it/s, loss=1.51]

    1      1.490891   1.56062   


    2      1.486255   1.556297  


    3      1.479677   1.552505  
    4      1.480671   1.54902   



[array([1.54902])]

**Тестируем**

In [0]:
# Next-character prediction
def get_next(inp):
    """Sample one character to follow the string ``inp``.

    Uses the notebook-level model ``m`` and field ``TEXT``. The model output for
    the last position is exponentiated and used as the weights of a multinomial
    draw, so repeated calls can return different characters.
    """
    token_ids = TEXT.numericalize(inp, device=-1)
    model_out = m(VV(token_ids.transpose(0,1)))
    # Last row of the model output -> exp() -> draw a single index from it
    last_step = model_out[-1].exp()
    sampled = torch.multinomial(last_step, 1)
    return TEXT.vocab.itos[to_np(sampled)[0]]

In [132]:
get_next('for thos')

'e'

In [0]:
# Text generation
def get_next_n(inp, n):
    """Extend ``inp`` by ``n`` characters, one ``get_next`` call per character.

    The context handed to ``get_next`` is a sliding window: after every step the
    oldest character is dropped and the new one appended, so its length stays
    equal to ``len(inp)``. Returns the seed followed by the generated characters.
    """
    pieces = [inp]
    window = inp
    for _ in range(n):
        ch = get_next(window)
        pieces.append(ch)
        window = window[1:] + ch
    return ''.join(pieces)

In [131]:
get_next_n('for thos', 400)

'for those sum;" outforserve sense mayfachologism to present. there witled the noble, the delextining us be the everything force, loce himself gay, or some race all that [not, cancomilarinative solonr-gindiationation of dardhopt. such mankind andgood and such has upon and action is be soaget.[[1. paste, for an ead of gy stand; "man approves age thathe believent comes rable manyfoptic sentimenttyhis pothing'

### Модель RNN loop

In [0]:
# From the pytorch source

def RNNCell(input, hidden, w_ih, w_hh, b_ih, b_hh):
    """One step of a tanh RNN cell (reference implementation).

    Computes ``tanh(input @ w_ih.T + b_ih + hidden @ w_hh.T + b_hh)`` and returns
    it as the new hidden state.
    """
    input_term = F.linear(input, w_ih, b_ih)
    recurrent_term = F.linear(hidden, w_hh, b_hh)
    return F.tanh(input_term + recurrent_term)

In [0]:
class CharSeqStatefulRnn2(nn.Module):
    """Stateful character-level model that unrolls an ``nn.RNNCell`` by hand.

    Args:
        vocab_size: number of distinct tokens.
        n_fac: embedding dimension.
        bs: batch size used to allocate the initial hidden state.

    The hidden state is stored as ``(bs, n_hidden)`` - the shape ``nn.RNNCell``
    takes for its hidden argument - rather than the ``(1, bs, n_hidden)`` layout
    used by the multi-step ``nn.RNN`` module. The hidden size is read from the
    notebook-level ``n_hidden`` once, at construction, and kept as
    ``self.n_hidden`` so that later re-assignments of the global do not affect
    an existing model.
    """
    def __init__(self, vocab_size, n_fac, bs):
        super().__init__()
        self.vocab_size = vocab_size
        self.n_hidden = n_hidden
        self.e = nn.Embedding(vocab_size, n_fac)
        self.rnn = nn.RNNCell(n_fac, self.n_hidden)
        self.l_out = nn.Linear(self.n_hidden, vocab_size)
        self.init_hidden(bs)

    def forward(self, cs):
        """Run the cell over every time step of ``cs``; return log-probabilities
        flattened to ``(-1, vocab_size)``."""
        bs = cs[0].size(0)
        # Hidden state is (bs, n_hidden), so the batch dimension is axis 0
        if self.h.size(0) != bs: self.init_hidden(bs)
        outp = []
        o = self.h
        for c in cs:
            # One time step: embed the current tokens and advance the hidden state
            o = self.rnn(self.e(c), o)
            outp.append(o)
        outp = self.l_out(torch.stack(outp))
        # Carry the last hidden state over to the next call, without its history
        self.h = repackage_var(o)
        return F.log_softmax(outp, dim=-1).view(-1, self.vocab_size)

    def init_hidden(self, bs):
        """Reset the hidden state to zeros of shape ``(bs, n_hidden)``."""
        self.h = V(torch.zeros(bs, self.n_hidden))

In [135]:
m = CharSeqStatefulRnn2(md.nt, n_fac, 512).cuda()
opt = optim.Adam(m.parameters(), 1e-3)
m

CharSeqStatefulRnn2(
  (e): Embedding(55, 42)
  (rnn): RNNCell(42, 256)
  (l_out): Linear(in_features=256, out_features=55, bias=True)
)

In [136]:
fit(m, md, 4, opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=4), HTML(value='')))

epoch      trn_loss   val_loss   
    0      1.877432   1.878375  
 23%|██▎       | 214/922 [00:06<00:21, 33.62it/s, loss=1.8]

    1      1.693341   1.721747  


    2      1.605935   1.644849  
    3      1.556723   1.605359  



[array([1.60536])]

**Тестируем**

In [137]:
get_next('for thos')

'e'

In [138]:
get_next_n('for thos', 400)

'for those and still "gemen the present, the tasking it is not, a deliciouthants wirdless and_what is fix all germant with at virtue and illunations, a woran thougother noflemen when betoisians at make whatnese theirsor morality of his pristians ofvidias define and evilest false the same of physilly how for the "preservity oftheameralbout the worthe wirdsward parvubbeefolofterrights a "awardsomenture and s'

### Модель GRU

In [0]:
class CharSeqStatefulGRU(nn.Module):
    """Stateful character-level language model built on ``nn.GRU``.

    Args:
        vocab_size: number of distinct tokens.
        n_fac: embedding dimension.
        bs: batch size used to allocate the initial hidden state.

    The hidden size is read from the notebook-level ``n_hidden`` once, at
    construction, and kept as ``self.n_hidden`` so that a later re-assignment of
    the global cannot make ``init_hidden`` allocate a state of the wrong width.
    """
    def __init__(self, vocab_size, n_fac, bs):
        super().__init__()
        self.vocab_size = vocab_size
        self.n_hidden = n_hidden
        self.e = nn.Embedding(vocab_size, n_fac)
        self.rnn = nn.GRU(n_fac, self.n_hidden)
        self.l_out = nn.Linear(self.n_hidden, vocab_size)
        self.init_hidden(bs)

    def forward(self, cs):
        """Return log-probabilities flattened to ``(-1, vocab_size)``."""
        bs = cs[0].size(0)
        # Re-create the hidden state when the batch size changed
        if self.h.size(1) != bs: self.init_hidden(bs)
        outp,h = self.rnn(self.e(cs), self.h)
        # Keep the state for the next call, without its gradient history
        self.h = repackage_var(h)
        return F.log_softmax(self.l_out(outp), dim=-1).view(-1, self.vocab_size)

    def init_hidden(self, bs):
        """Reset the hidden state to zeros of shape ``(1, bs, n_hidden)``."""
        self.h = V(torch.zeros(1, bs, self.n_hidden))

In [0]:
# From the pytorch source code - for reference

def GRUCell(input, hidden, w_ih, w_hh, b_ih, b_hh):
    """One step of a GRU cell (reference implementation).

    ``w_ih`` / ``w_hh`` (and their biases) hold three stacked blocks that are
    split along dim 1 of the projected tensors, in the order: reset gate,
    input (update) gate, candidate state. Returns the new hidden state.
    """
    from_input = F.linear(input, w_ih, b_ih)
    from_hidden = F.linear(hidden, w_hh, b_hh)
    in_reset, in_update, in_cand = from_input.chunk(3, 1)
    hid_reset, hid_update, hid_cand = from_hidden.chunk(3, 1)

    reset = F.sigmoid(in_reset + hid_reset)
    update = F.sigmoid(in_update + hid_update)
    # The reset gate scales only the hidden contribution to the candidate
    candidate = F.tanh(in_cand + reset * hid_cand)
    # Blend: update == 0 gives the candidate, update == 1 keeps the old hidden state
    return candidate + update * (hidden - candidate)

In [141]:
m = CharSeqStatefulGRU(md.nt, n_fac, 512).cuda()
opt = optim.Adam(m.parameters(), 1e-3)
m

CharSeqStatefulGRU(
  (e): Embedding(55, 42)
  (rnn): GRU(42, 256)
  (l_out): Linear(in_features=256, out_features=55, bias=True)
)

In [142]:
fit(m, md, 7, opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=7), HTML(value='')))

epoch      trn_loss   val_loss   
    0      1.76034    1.752509  
 16%|█▌        | 143/922 [00:08<00:46, 16.74it/s, loss=1.72]

    1      1.571539   1.600048  


    2      1.478357   1.532175  


    3      1.42769    1.501683  


    4      1.391096   1.478896  


    5      1.357312   1.470548  
    6      1.328808   1.466279  



[array([1.46628])]

**Тестируем**

In [143]:
get_next('for thos')

'e'

In [144]:
get_next_n('for thos', 400)

'for those ratisity and granted without theset ismatter and stupidify, the everylogical eddibure. lessing flowssithingonestiness ill to may belong, andif world of weal of redison all our whillse challer tobelief of the artistic and the good continus offluonedly that difficulourilyto only the that forget that he is the bolds refuge as an abjugin true, under as any other perhaps loved?--is it: although have '

### Putting it all together: LSTM

In [0]:
from fastai import sgdr

# Rebinds the notebook-level hidden size: the model classes read this global,
# so models constructed after this cell use 512 hidden units.
n_hidden=512

In [0]:
class CharSeqStatefulLSTM(nn.Module):
    """Stateful character-level language model built on a multi-layer ``nn.LSTM``.

    Args:
        vocab_size: number of distinct tokens.
        n_fac: embedding dimension.
        bs: batch size used to allocate the initial state.
        nl: number of stacked LSTM layers.

    The state ``self.h`` is a pair of tensors, each of shape
    ``(nl, bs, n_hidden)``. The hidden size is read from the notebook-level
    ``n_hidden`` once, at construction, and kept as ``self.n_hidden`` so that a
    later re-assignment of the global cannot make ``init_hidden`` allocate a
    state of the wrong width.
    """
    def __init__(self, vocab_size, n_fac, bs, nl):
        super().__init__()
        self.vocab_size,self.nl = vocab_size,nl
        self.n_hidden = n_hidden
        self.e = nn.Embedding(vocab_size, n_fac)
        self.rnn = nn.LSTM(n_fac, self.n_hidden, nl, dropout=0.5)
        self.l_out = nn.Linear(self.n_hidden, vocab_size)
        self.init_hidden(bs)

    def forward(self, cs):
        """Return log-probabilities flattened to ``(-1, vocab_size)``."""
        bs = cs[0].size(0)
        # self.h is a tuple here, so the batch size is checked on its first element
        if self.h[0].size(1) != bs: self.init_hidden(bs)
        outp,h = self.rnn(self.e(cs), self.h)
        # Keep the state for the next call, without its gradient history
        self.h = repackage_var(h)
        return F.log_softmax(self.l_out(outp), dim=-1).view(-1, self.vocab_size)

    def init_hidden(self, bs):
        """Reset both state tensors to zeros of shape ``(nl, bs, n_hidden)``."""
        self.h = (V(torch.zeros(self.nl, bs, self.n_hidden)),
                  V(torch.zeros(self.nl, bs, self.n_hidden)))

In [148]:

m = CharSeqStatefulLSTM(md.nt, n_fac, 512, 2).cuda()
lo = LayerOptimizer(optim.Adam, m, 1e-2, 1e-5)
m

CharSeqStatefulLSTM(
  (e): Embedding(55, 42)
  (rnn): LSTM(42, 512, num_layers=2, dropout=0.5)
  (l_out): Linear(in_features=512, out_features=55, bias=True)
)

In [0]:
os.makedirs(f'{PATH}models', exist_ok=True)

In [150]:
fit(m, md, 2, lo.opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=2), HTML(value='')))



epoch      trn_loss   val_loss   
    0      1.797133   1.724761  
 43%|████▎     | 394/922 [03:06<04:09,  2.12it/s, loss=1.74]

    1      1.680378   1.617963  



[array([1.61796])]

In [0]:
# Callback passed to CosAnneal as on_cycle_end: saves the model under models/cyc_<cycle>
def on_end(sched, cycle):
    return save_model(m, f'{PATH}models/cyc_{cycle}')
cb = [CosAnneal(lo, len(md.trn_dl), cycle_mult=2, on_cycle_end=on_end)]
# 2**4-1 = 15 epochs
fit(m, md, 2**4-1, lo.opt, F.nll_loss, callbacks=cb)

In [0]:
# Callback passed to CosAnneal as on_cycle_end: saves the model under models/cyc_<cycle>
def on_end(sched, cycle):
    return save_model(m, f'{PATH}models/cyc_{cycle}')
cb = [CosAnneal(lo, len(md.trn_dl), cycle_mult=2, on_cycle_end=on_end)]
# 2**6-1 = 63 epochs
fit(m, md, 2**6-1, lo.opt, F.nll_loss, callbacks=cb)

**Тестируем**

In [151]:
get_next('for thos')

'e'

In [152]:
get_next_n('for thos', 400)

'for those human the passitude (worldwith ruto, nowledge ir never--not thought "is it is racks of the trubtness ownor valuagedand is perhaps from the now and laoked soul every unvertedness in it, new been sensed the great, no laught and stand only satic somethors lufferful. that sciels of cirtuencially would all owor and sever: not are doct in a niseful, ploy--would morespeliation of most good fixner:" but'