In [1]:
from torchtext import vocab, data

from fastai.nlp import *
from fastai.lm_rnn import *

# Root folder of the Nietzsche corpus.
# NOTE(review): machine-specific absolute path; adjust before running elsewhere.
PATH = 'D:/FAST.Ai/DL1/Data/nietzsche/'

# Sub-folders (relative to PATH) holding the training and validation text.
TRN_PATH, VAL_PATH = 'trn/', 'val/'

# PATH already ends with '/', so plain concatenation yields the full folders.
TRN = PATH + TRN_PATH
VAL = PATH + VAL_PATH

In [2]:
# List the training folder to confirm the corpus file is where TRN points.
os.listdir(TRN)

['nietzsche.txt']

In [3]:
# Character-level field: text is lower-cased and tokenized with `list`, which
# splits a string into single characters (e.g. list("abc") -> ['a', 'b', 'c']).
TEXT = data.Field(lower=True , tokenize=list) 
# bs: batch size; bptt: sequence length per batch; n_fac: embedding width;
# n_hidden: hidden-state width read by the model classes below.
bs = 64 ; bptt = 8 ;n_fac = 42 ; n_hidden = 256 
# Per the original author's note: the sequence length is not always exactly
# bptt = 8 -- it varies somewhat, which acts as a kind of data augmentation --
# but consecutive batches do not overlap.

In [4]:
# Sub-folder for each split, relative to PATH; the validation folder is also
# used as the test split.
FILES = {'train': TRN_PATH, 'validation': VAL_PATH, 'test': VAL_PATH}

# Build the language-model data object from the text files.
md = LanguageModelData.from_text_files(
    PATH,
    TEXT,
    **FILES,
    bptt=bptt,
    bs=bs,
    min_freq=3,
)

In [5]:
# How many batches md.trn_dl (presumably the training data loader) contains.
print("num of batches:" , len(md.trn_dl))

num of batches: 937


In [6]:
md.nt # number of distinct tokens in the vocabulary (characters here); passed to the models as vocab_size

55

In [7]:
TEXT.vocab.stoi # string -> index mapping; TEXT.vocab.itos is the reverse lookup (index -> string)

defaultdict(<function torchtext.vocab._default_unk_index()>,
            {'<unk>': 0,
             '<pad>': 1,
             ' ': 2,
             'e': 3,
             't': 4,
             'i': 5,
             'a': 6,
             'o': 7,
             'n': 8,
             's': 9,
             'r': 10,
             'h': 11,
             'l': 12,
             'd': 13,
             'c': 14,
             'u': 15,
             'f': 16,
             'm': 17,
             'p': 18,
             'g': 19,
             ',': 20,
             'y': 21,
             'w': 22,
             'b': 23,
             'v': 24,
             '-': 25,
             '.': 26,
             '"': 27,
             'k': 28,
             'x': 29,
             ';': 30,
             ':': 31,
             'q': 32,
             'j': 33,
             '!': 34,
             '?': 35,
             '(': 36,
             ')': 37,
             "'": 38,
             'z': 39,
             '1': 40,
             '2': 41,
             '=':

In [8]:
class CharSeqStatefulRnn(nn.Module):
    """Stateful character-level language model built on ``nn.RNN``.

    The hidden state is stored on the module (``self.h``) and carried over
    from one ``forward`` call to the next.

    Args:
        vocab_size: number of distinct tokens; sizes the embedding table and
            the output layer.
        n_fac: embedding width.
        bs: batch size used to allocate the initial hidden state.

    The hidden width is taken from the module-level ``n_hidden`` at
    construction time and kept on the instance as ``self.n_hidden``.
    """

    def __init__(self, vocab_size, n_fac, bs):
        super().__init__()
        self.vocab_size = vocab_size
        # Snapshot the global now: init_hidden() is called again from
        # forward(), possibly after the notebook has rebound n_hidden, and the
        # hidden state must keep matching the width the RNN was built with.
        self.n_hidden = n_hidden
        self.e = nn.Embedding(vocab_size, n_fac)
        self.rnn = nn.RNN(n_fac, self.n_hidden)
        self.l_out = nn.Linear(self.n_hidden, vocab_size)
        self.init_hidden(bs)

    def forward(self, cs):
        """Return log-probabilities for every position, flattened to 2-D.

        Args:
            cs: tensor of token indices; assumed (seq_len, batch), since the
                batch size is read from ``cs[0].size(0)``.

        Returns:
            Tensor of shape (-1, vocab_size) holding log-softmax outputs.
        """
        bs = cs[0].size(0)
        # Re-allocate the state when the incoming batch size differs.
        if self.h.size(1) != bs:
            self.init_hidden(bs)
        outp, h = self.rnn(self.e(cs), self.h)
        # repackage_var presumably detaches the state from the graph so that
        # gradients do not flow into earlier batches -- confirm in fastai.
        self.h = repackage_var(h)
        # Flattened to one row per prediction; per the original note,
        # torchtext delivers the targets already flattened.
        return F.log_softmax(self.l_out(outp), dim=-1).view(-1, self.vocab_size)

    def init_hidden(self, bs):
        """Reset the hidden state to zeros for a batch of size ``bs``."""
        self.h = V(torch.zeros(1, bs, self.n_hidden))

In [9]:
# Build the stateful RNN. NOTE(review): the hidden state is pre-allocated for
# a batch of 512 rather than the configured bs; forward() re-allocates it
# whenever the incoming batch size differs.
m = CharSeqStatefulRnn(md.nt, n_fac, 512)
# Adam over all model parameters with learning rate 1e-3.
opt = optim.Adam(m.parameters(), 1e-3)

In [31]:
# Train for 4 epochs; nll_loss pairs with the log-softmax output of the model.
fit(m, md, 4, opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=4, style=ProgressStyle(description_width='initial…

epoch      trn_loss   val_loss                                                                                         
    0      1.870818   1.855992  
    1      1.698993   1.71122                                                                                          
    2      1.611705   1.643448                                                                                         
    3      1.562987   1.606964                                                                                         



[array([1.60696])]

In [32]:
# Continue training for 2 more epochs at a smaller learning rate
# (set_lrs presumably sets the optimizer's learning rate to 1e-4 -- confirm).
set_lrs(opt, 1e-4)

fit(m, md, 2, opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=2, style=ProgressStyle(description_width='initial…

epoch      trn_loss   val_loss                                                                                         
    0      1.485332   1.561225  
    1      1.480107   1.555912                                                                                         



[array([1.55591])]

# From the pytorch source

### We can still run into exploding gradients with this

In [33]:
class CharSeqStatefulRnn2(nn.Module):
    """Stateful character-level RNN that unrolls ``nn.RNNCell`` by hand.

    Same interface as the ``nn.RNN``-based model, but the time loop is written
    out explicitly, one cell step per input position.

    Args:
        vocab_size: number of distinct tokens; sizes the embedding table and
            the output layer.
        n_fac: embedding width.
        bs: batch size used to allocate the initial hidden state.

    The hidden width is taken from the module-level ``n_hidden`` at
    construction time and kept on the instance as ``self.n_hidden``.
    """

    def __init__(self, vocab_size, n_fac, bs):
        super().__init__()
        self.vocab_size = vocab_size
        # Snapshot the global so later rebinding of n_hidden cannot desync
        # init_hidden() from the cell's actual hidden width.
        self.n_hidden = n_hidden
        self.e = nn.Embedding(vocab_size, n_fac)
        self.rnn = nn.RNNCell(n_fac, self.n_hidden)
        self.l_out = nn.Linear(self.n_hidden, vocab_size)
        self.init_hidden(bs)

    def forward(self, cs):
        """Return log-probabilities for every position, flattened to 2-D.

        Args:
            cs: tensor of token indices; assumed (seq_len, batch), since the
                batch size is read from ``cs[0].size(0)`` and the loop steps
                over the first dimension.

        Returns:
            Tensor of shape (-1, vocab_size) holding log-softmax outputs.
        """
        bs = cs[0].size(0)
        # RNNCell state is 2-D (batch, hidden), so the batch axis is dim 0.
        if self.h.size(0) != bs:
            self.init_hidden(bs)
        outp = []
        o = self.h
        for c in cs:
            o = self.rnn(self.e(c), o)
            outp.append(o)
        outp = self.l_out(torch.stack(outp))
        # repackage_var presumably detaches the state from the graph so that
        # gradients do not flow into earlier batches -- confirm in fastai.
        self.h = repackage_var(o)
        return F.log_softmax(outp, dim=-1).view(-1, self.vocab_size)

    def init_hidden(self, bs):
        """Reset the hidden state to zeros for a batch of size ``bs``.

        The state is (bs, n_hidden): ``nn.RNNCell`` has no leading
        num_layers dimension, unlike ``nn.RNN``.
        """
        self.h = V(torch.zeros(bs, self.n_hidden))

# GRU

In [10]:
class CharSeqStatefulGRU(nn.Module):
    """Stateful character-level language model built on ``nn.GRU``.

    The hidden state is stored on the module (``self.h``) and carried over
    from one ``forward`` call to the next.

    Args:
        vocab_size: number of distinct tokens; sizes the embedding table and
            the output layer.
        n_fac: embedding width.
        bs: batch size used to allocate the initial hidden state.

    The hidden width is taken from the module-level ``n_hidden`` at
    construction time and kept on the instance as ``self.n_hidden``.
    """

    def __init__(self, vocab_size, n_fac, bs):
        super().__init__()
        self.vocab_size = vocab_size
        # Snapshot the global now: init_hidden() is called again from
        # forward(), possibly after the notebook has rebound n_hidden, and the
        # hidden state must keep matching the width the GRU was built with.
        self.n_hidden = n_hidden
        self.e = nn.Embedding(vocab_size, n_fac)
        self.rnn = nn.GRU(n_fac, self.n_hidden)
        self.l_out = nn.Linear(self.n_hidden, vocab_size)
        self.init_hidden(bs)

    def forward(self, cs):
        """Return log-probabilities for every position, flattened to 2-D.

        Args:
            cs: tensor of token indices; assumed (seq_len, batch), since the
                batch size is read from ``cs[0].size(0)``.

        Returns:
            Tensor of shape (-1, vocab_size) holding log-softmax outputs.
        """
        bs = cs[0].size(0)
        # Re-allocate the state when the incoming batch size differs.
        if self.h.size(1) != bs:
            self.init_hidden(bs)
        outp, h = self.rnn(self.e(cs), self.h)
        # repackage_var presumably detaches the state from the graph so that
        # gradients do not flow into earlier batches -- confirm in fastai.
        self.h = repackage_var(h)
        return F.log_softmax(self.l_out(outp), dim=-1).view(-1, self.vocab_size)

    def init_hidden(self, bs):
        """Reset the hidden state to zeros for a batch of size ``bs``."""
        self.h = V(torch.zeros(1, bs, self.n_hidden))

In [12]:
# Build the GRU variant. NOTE(review): 512 is the batch size used for the
# initial hidden-state allocation, not the configured bs; forward()
# re-allocates the state when the incoming batch size differs.
m = CharSeqStatefulGRU(md.nt, n_fac, 512)

# Fresh Adam optimizer for the new model's parameters, learning rate 1e-3.
opt = optim.Adam(m.parameters(), 1e-3)

In [13]:
# Train the GRU model for 6 epochs with negative log-likelihood loss.
fit(m, md, 6, opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=6, style=ProgressStyle(description_width='initial…

epoch      trn_loss   val_loss                                                                                         
    0      1.749409   1.73483   
    1      1.565722   1.583839                                                                                         
    2      1.475071   1.524932                                                                                         
    3      1.432614   1.495572                                                                                         
    4      1.38773    1.477598                                                                                         
    5      1.361968   1.465187                                                                                         



[array([1.46519])]

# LSTM

In [14]:
from fastai import sgdr

# Rebinds the module-level n_hidden (previously 256): models constructed after
# this cell use 512 hidden units.
n_hidden=512

In [15]:
class CharSeqStatefulLSTM(nn.Module):
    """Stateful character-level language model built on a stacked ``nn.LSTM``.

    The state is a ``(hidden, cell)`` tuple stored on the module (``self.h``)
    and carried over from one ``forward`` call to the next.

    Args:
        vocab_size: number of distinct tokens; sizes the embedding table and
            the output layer.
        n_fac: embedding width.
        bs: batch size used to allocate the initial state.
        nl: number of stacked LSTM layers.

    The hidden width is taken from the module-level ``n_hidden`` at
    construction time and kept on the instance as ``self.n_hidden``.
    """

    def __init__(self, vocab_size, n_fac, bs, nl):
        super().__init__()
        self.vocab_size, self.nl = vocab_size, nl
        # Snapshot the global now: init_hidden() is called again from
        # forward(), possibly after the notebook has rebound n_hidden, and the
        # state must keep matching the width the LSTM was built with.
        self.n_hidden = n_hidden
        self.e = nn.Embedding(vocab_size, n_fac)
        self.rnn = nn.LSTM(n_fac, self.n_hidden, nl, dropout=0.5)
        self.l_out = nn.Linear(self.n_hidden, vocab_size)
        self.init_hidden(bs)

    def forward(self, cs):
        """Return log-probabilities for every position, flattened to 2-D.

        Args:
            cs: tensor of token indices; assumed (seq_len, batch), since the
                batch size is read from ``cs[0].size(0)``.

        Returns:
            Tensor of shape (-1, vocab_size) holding log-softmax outputs.
        """
        bs = cs[0].size(0)
        # self.h is a (hidden, cell) tuple; both share the same batch axis.
        if self.h[0].size(1) != bs:
            self.init_hidden(bs)
        outp, h = self.rnn(self.e(cs), self.h)
        # repackage_var presumably detaches the state from the graph so that
        # gradients do not flow into earlier batches -- confirm in fastai.
        self.h = repackage_var(h)
        return F.log_softmax(self.l_out(outp), dim=-1).view(-1, self.vocab_size)

    def init_hidden(self, bs):
        """Reset hidden and cell state to zeros for a batch of size ``bs``."""
        self.h = (V(torch.zeros(self.nl, bs, self.n_hidden)),
                  V(torch.zeros(self.nl, bs, self.n_hidden)))

In [16]:
# Two-layer LSTM (nl=2). NOTE(review): 512 is the batch size used for the
# initial hidden-state allocation, not the configured bs; forward()
# re-allocates the state when the incoming batch size differs.
m = CharSeqStatefulLSTM(md.nt, n_fac, 512, 2)


# LayerOptimizer is used in place of calling optim.Adam() directly so that
# fastai features such as SGDR and callbacks can be used even though this
# notebook never creates a learner.
# Presumably 1e-2 is the learning rate and 1e-5 the weight decay -- confirm
# against LayerOptimizer's signature.
lo = LayerOptimizer(optim.Adam, m, 1e-2, 1e-5) 

In [18]:
# Make sure the checkpoint folder exists; exist_ok=True keeps re-runs from failing.
os.makedirs(f'{PATH}models', exist_ok=True)

In [19]:
def on_end(sched, cycle):
    """Save the global model `m` to a file tagged with the cycle number."""
    return save_model(m, f'{PATH}models/cyc_{cycle}')


# Cosine-annealing callback driven by the layer optimizer; on_end is invoked
# through on_cycle_end.
cos_anneal = CosAnneal(lo, len(md.trn_dl), cycle_mult=2, on_cycle_end=on_end)
cb = [cos_anneal]

# Train for one epoch with the wrapped optimizer and the callback attached.
fit(m, md, 1, lo.opt, F.nll_loss, callbacks=cb)

HBox(children=(IntProgress(value=0, description='Epoch', max=1, style=ProgressStyle(description_width='initial…

epoch      trn_loss   val_loss                                                                                         
    0      1.57023    1.531775  


[array([1.53177])]

In [22]:
def get_next(inp):
    """Sample one character to follow the seed string `inp`.

    Uses the notebook-level model `m` and field `TEXT`. The model returns
    log-softmax outputs, so they are exponentiated before sampling.
    """
    token_ids = TEXT.numericalize(inp)
    log_probs = m(VV(token_ids.transpose(0, 1)))
    # Only the last output row is used (assumed to be the prediction that
    # follows the final seed character); draw a single index from it.
    sampled = torch.multinomial(log_probs[-1].exp(), 1)
    next_idx = to_np(sampled)[0]
    return TEXT.vocab.itos[next_idx]

In [23]:
# Smoke test: sample one character following the seed text.
# NOTE(review): the recorded output of this cell is an NVIDIA-driver
# AssertionError, so sampling did not run successfully in the saved notebook.
get_next('for thos')

AssertionError: 
The NVIDIA driver on your system is too old (found version 8000).
Please update your GPU driver by downloading and installing a new
version from the URL: http://www.nvidia.com/Download/index.aspx
Alternatively, go to: http://pytorch.org to install
a PyTorch version that has been compiled with your version
of the CUDA driver.

In [None]:
def get_next_n(inp, n):
    """Extend the seed string `inp` by `n` characters.

    Each new character comes from `get_next`, which is fed a sliding window:
    after every step the oldest character is dropped and the new one appended,
    so the window keeps the seed's length.

    Returns the seed followed by the `n` generated characters.
    """
    window = inp
    generated = []
    for _ in range(n):
        nxt = get_next(window)
        generated.append(nxt)
        window = window[1:] + nxt
    return inp + ''.join(generated)

In [None]:
# Generate 400 characters continuing the seed 'for thos' and print the result.
print(get_next_n('for thos', 400))