# Языковые модели 
### из эпохи до того момента, когда нейронные сети вдруг стали диплёрнингом


In [60]:
# import warnings
# warnings.filterwarnings('ignore')

# import urllib
# import urllib.request
# import nltk
# from nltk.tokenize import word_tokenize

# from bs4 import BeautifulSoup

# print("Downloading")

# txt = urllib.request.urlopen("https://www.gutenberg.org/files/2600/2600-h/2600-h.htm").read()
# txt2 = urllib.request.urlopen("http://www.gutenberg.org/files/36028/36028-h/36028-h.htm").read()

# txt = str(txt) + " " + str(txt2)

# print("Parsing")
# soup = BeautifulSoup(txt)

# print("Cleaning")
# txt = soup.find('body').get_text()

# print(txt[:100])

In [19]:
txt = open("file1.txt", encoding='utf-8').read() + " " + open("tomasman.txt", encoding='utf-8').read()

In [10]:
import re
from collections import Counter

BOS = "BOS"
EOS = "EOS"
UNK = "UNK"

def prepare_sentences(txt, word_threshold=2, stage_train=True):

    # вычищаем переносы
    whitespaces = re.compile("\s+", re.U)
    txt = re.sub("\s+", " ", txt).lower()

    # убираем всё, кроме "слов", разбив на предложения
    sentences = re.split("[!\?\.]+", txt.replace("\n", " "))
    
    # оставляем только alphanumeric
    clean_sentences = [re.split("\W+", s) for s in sentences]
    
    # заменяем числа на NUM
    clean_sentences = [[w.replace("\d+", "NUM") for w in s if w] for s in clean_sentences]
    
    # вводим тег UNKNOWN: UNK
    if stage_train:

        counter = Counter()

        for s in clean_sentences:
            for w in s:
                counter[w] += 1
    
        print("Filtered out word types :", len([w for w in counter if counter[w] <= word_threshold]))
        print("Filtered out words count:", sum([counter[w] for w in counter if counter[w] <= word_threshold]))
    
        # выкидываем редкие, и заменяем их на специальный тег
        clean_sentences = [[w if counter[w] > word_threshold else UNK for w in s] for s in clean_sentences]            
    
    word2index = { BOS: 0, EOS: 1, UNK: 2}
    index2word = { 0: BOS, 1: EOS, 2: UNK}
    
    counter = max(word2index.values()) + 1

    for s in clean_sentences:
        for w in s:
            if not w in word2index:
                word2index[w] = counter
                index2word[counter] = w
                counter += 1
                
    return word2index, index2word, clean_sentences

In [20]:
word2index, index2word, clean_sentences = prepare_sentences(txt)

print("Total number of sentences :\t", len(clean_sentences))
print("Total number of words     :\t", sum([len(sent) for sent in clean_sentences]))
print("Total number of word types:\t", len(set([w for sent in clean_sentences for w in sent])))

Filtered out word types : 5530
Filtered out words count: 7039
Total number of sentences :	 5706
Total number of words     :	 111698
Total number of word types:	 3708


In [21]:
for s in map(lambda x: " ".join(x), clean_sentences[100:1010]):
    print(s)

alone or with her husband
inquired matvey
stepan arkadyevitch could not answer as the barber was at work on his upper lip and he raised one finger
matvey nodded at the looking glass
alone
is the room to be got ready upstairs
inform darya alexandrovna where she orders
darya alexandrovna
matvey repeated as though in doubt
yes inform her
here take the telegram give it to her and then do what she UNK you
you want to try it on matvey understood but he only said yes sir
stepan arkadyevitch was already UNK and UNK and ready to be dressed when matvey stepping deliberately in his UNK boots came back into the room with the telegram in his hand
the barber had gone
darya alexandrovna told me to inform you that she is going away
let him do that is you as he likes he said laughing only with his eyes and putting his hands in his pockets he watched his master with his head on one side
stepan arkadyevitch was silent a minute
then a good humored and rather pitiful smile showed itself on his handsome fac

In [22]:
def augment(sentence, context_size):
    """
        Добиваем символы начала и конца строки к каждому предложению
    """
    return [BOS] * context_size + sentence + [EOS] * context_size

def enumerate_sentences(clean_sentences, context_size, word2index):
    """
        Добиваем символами начала и конца и конвертируем слова в индексы
    """

    contexts = []
    targets = []
    UNK_id = word2index[UNK]

    for sentence in clean_sentences:

        aligned_sentence =  augment(sentence, context_size) 

        for i in range(context_size, len(sentence) - context_size, 1):
            
            # берём предшествующий контекст
            context = aligned_sentence[i - context_size:i]
            context = [word2index[c] if c in word2index else UNK_id for c in context]
            target = word2index[aligned_sentence[i]] if aligned_sentence[i] in word2index else UNK_id
            
            contexts.append(context)
            targets.append(target)
    
    return contexts, targets


Как бить на батчи заданного размера

In [23]:
def chunks(l0, l1, n):
    
    assert len(l0) == len(l1)
    coll0, coll1 = [], []
    
    for i in range(0, len(l0), n):
        coll0.append(l0[i:i + n])
        coll1.append(l1[i:i + n])
        
    return coll0, coll1

## Модель

In [24]:
from collections import defaultdict
from tqdm import tqdm_notebook
from functools import lru_cache

class NGramFreqsLanguageModeler(object):
    
    def __init__(self, vocab_size, context_size):
        super(NGramFreqsLanguageModeler, self).__init__()
    
        self.vocab_size = vocab_size
        self.context_size = context_size
        self.ngram_dict = defaultdict(lambda: defaultdict(lambda: 0))        
        self.n_1_gram_dict = defaultdict(lambda: defaultdict(lambda: 0))
        self.contexts_counts = defaultdict(lambda: 0)
        self.eps = 1.0
    
    def fit(self, contexts, targets):
        
        self.contexts_counts = defaultdict(lambda: 0)
        
        for c, t in zip(contexts, targets):
            c = tuple(c)
            self.ngram_dict[c][t] += 1
            self.contexts_counts[c] += 1
            
            # намёк!
            # self.n_1_gram_dict[c[1:]][t] += 1

            
        print("Total n-1 grams", len(self.ngram_dict), list(self.ngram_dict)[:10])
        
        # нормализуем частоты
        for c in tqdm_notebook(self.ngram_dict.keys()):
            for t in self.ngram_dict[c]:
                self.ngram_dict[c][t] = (self.ngram_dict[c][t] +  self.eps) / \
                                            (self.contexts_counts[c] + self.vocab_size * self.eps)
        
    @lru_cache(1000000)
    def prob_dist(self, input_context):
        """
            Takes ngram as a tuple
        """
        
        probs = np.zeros(self.vocab_size) + \
                    self.eps / (self.vocab_size * self.eps + self.contexts_counts[input_context])
        
        counts = self.ngram_dict[input_context]
        
        # если есть хоть какие-то счётчики
        if counts:
            
            # проставим осмысленные частоты
            for target, freq in counts.items():
                probs[target] = freq
                
        return probs

In [25]:
CONTEXT_SIZE = 10
BATCH_SIZE = 1024

In [26]:
from tqdm import tqdm_notebook
import numpy as np

# строим контексты и цели
contexts, targets = enumerate_sentences(clean_sentences, CONTEXT_SIZE, word2index)

batches = list(zip(contexts, targets))

# simple_model = NGramFreqsLanguageModeler(context_size=CONTEXT_SIZE, vocab_size=len(word2index))
# simple_model.fit(contexts, targets)

## Оценка качества

здесь мы готовим тестовую выборку -- тексты, которых наша модель никогда не видела

In [27]:
import urllib.request

# test_txt = urllib.request.urlopen("http://www.gutenberg.org/files/1399/1399-0.txt")
# test_txt = test_txt.read().decode("utf-8")
test_txt = open("file2.txt").read()

_, _, test_clean_sentences = prepare_sentences(test_txt, stage_train=False)

print("Total number of sentences :\t", len(test_clean_sentences))
print("Total number of words     :\t", sum([len(sent) for sent in test_clean_sentences]))
print("Total number of word types:\t", len(set([w for sent in test_clean_sentences for w in sent])))

# строим контексты и цели
test_contexts, test_targets = enumerate_sentences(test_clean_sentences, CONTEXT_SIZE, word2index)

# test_data = list(zip(test_contexts, test_targets))

test_batched_contexts, test_batched_targets = chunks(test_contexts, test_targets, BATCH_SIZE)
test_batches = list(zip(test_batched_contexts, test_batched_targets))

len(test_contexts), len(test_targets), len(test_batches), len(word2index)

FileNotFoundError: [Errno 2] No such file or directory: 'file2.txt'

### Перплексия 
То, насколько хорошо наша модель приближает законы реального мира: какую вероятность порождения тестового текста, нормализованную числом слов, покажет наша модель.

Имеет трактовку с точки зрения теории информации: два в степени, равной приближению кросс-энтропии последовательности событий-слов. То есть такая оценка энтропии текста как последовательности.

#### это два в степени равной нашей невязке

$$PP(X) = \sqrt[N]{\frac{1}{P(x_1,...,x_N)}} = 2^{-\frac{1}{N}\sum_{i=1}^{N}\log{P(x_i|...)}} $$

In [15]:
import torch
import tqdm
from tqdm import tqdm_notebook

def compute_ppl_count_model(model, test_batches, nllloss):
    
    total_loss = 0
    count = 0

    for context_batch, target_batch in tqdm_notebook(test_batches):
        
        log_probs = []
        
        for context, target in zip(context_batch, target_batch):
            
            # применяем модель
            log_probs.append(np.log2(model.prob_dist(tuple(context))))
            
        log_probs = np.array(log_probs)
        
        # вычисляем невязку
        loss = nllloss(torch.tensor(log_probs, dtype=torch.float).cuda(), 
                       torch.tensor(target_batch, dtype=torch.long).cuda())
        
        # получаем число
        total_loss += loss.item()
        count += 1
        
        if count % (len(test_batches) // 5) == 0:
            print(count, "\tnll", total_loss)
            
            print("TRUE:", " ".join([index2word[i] for i in context_batch[0]]), "->", 
                  index2word[target_batch[0]])
            print("PREDICTED:", 
                  " ".join([index2word[i] for i in context_batch[0]]), "->",
                  "/".join([index2word[i] for i in (-log_probs[0]).argsort()[:3]])
                 )
    
    return 2 ** (total_loss / count)        

In [27]:
# from torch.nn import NLLLoss

# "Perplexity of freq-based NGram model on test set", compute_ppl_count_model(model=simple_model, 
#                                                                             nllloss=NLLLoss(),
#                                                                             test_batches=test_batches)

## Пусть наша модель погенерирует что-нибудь

In [45]:
# test = "BOS"
# prepared_text = augment(prepare_sentences(test, stage_train=False)[2][0], CONTEXT_SIZE)[-CONTEXT_SIZE:]

# for i in range(CONTEXT_SIZE, 10 + CONTEXT_SIZE):
    
#     idx = [word2index[w] for w in prepared_text[:i]]    
    
#     predict = simple_model.prob_dist(tuple(idx[-CONTEXT_SIZE:])) 
    
# #     predict = predict - predict.min()  
# #     predict /= sum(predict)
    
#     selected_word = np.random.choice(a=list(range(len(word2index))), p=predict)    
#     prepared_text.append(index2word[selected_word])
    
#     print("Генерация:", " ".join(prepared_text[CONTEXT_SIZE - 1:]))

# Нейронная вероятностная модель Bengio et al. 2003

In [77]:
import torch.nn
from torch import nn
from torch.nn import functional as F


class NGramMLPLanguageModeler(nn.Module):

    def __init__(self, vocab_size, embedding_dim, context_size, hidden_dim):
        super(NGramMLPLanguageModeler, self).__init__()
        
        self.embeddings = nn.Embedding(num_embeddings=vocab_size, 
                                       embedding_dim=embedding_dim)
        
        self.dropout = nn.Dropout(p=0.8)
        self.linear1 = nn.Linear(context_size * embedding_dim, hidden_dim)
        self.activation = nn.Tanh() 
        self.linear2 = nn.Linear(hidden_dim, vocab_size)
        self.log_softmax = nn.LogSoftmax(dim=1)       

    def forward(self, inputs):
        # по сути конкатенация эмбеддингов
        embeds = self.embeddings(inputs)        
        embeds = embeds.view((-1, self.linear1.in_features))
        
        # выбрасываем часть значений
        embeds = self.dropout(embeds)
        
        # линейная трансформация
        transformed_embeds = self.linear1(embeds)
        
        # применяем нелинейность к слою
        activated_transformed_embeds = self.activation(transformed_embeds)
        
        # применяем второй линейный слой
        out = self.linear2(activated_transformed_embeds)
        
        # применяем софтмакс
        log_probs = self.log_softmax(out)
        
        return log_probs

In [78]:
import torch
from torch import optim

losses = []
loss_function = nn.NLLLoss()
vocab_size = len(word2index)

EMBEDDING_DIM = 45
HIDDEN_DIM = 300

mlp_model = NGramMLPLanguageModeler(vocab_size, EMBEDDING_DIM, CONTEXT_SIZE, HIDDEN_DIM).cuda()
optimizer = optim.Adam(mlp_model.parameters(), lr=0.0001)

mlp_model, optimizer

(NGramMLPLanguageModeler(
   (embeddings): Embedding(3710, 45)
   (dropout): Dropout(p=0.8)
   (linear1): Linear(in_features=450, out_features=300, bias=True)
   (activation): Tanh()
   (linear2): Linear(in_features=300, out_features=3710, bias=True)
   (log_softmax): LogSoftmax()
 ), Adam (
 Parameter Group 0
     amsgrad: False
     betas: (0.9, 0.999)
     eps: 1e-08
     lr: 0.0001
     weight_decay: 0
 ))

## Обучение

In [92]:
from tqdm import tqdm_notebook
import numpy as np

EPOCHS = 100

# строим контексты и цели
contexts, targets = enumerate_sentences(clean_sentences, CONTEXT_SIZE, word2index)
batched_contexts, batched_targets = chunks(contexts, targets, BATCH_SIZE)

# смысла нет, просто иначе не получается отслеживать прогресс
batches = list(zip(batched_contexts, batched_targets))

def train_routine(model, loss_function, batches, epochs=30):

    for epoch in range(EPOCHS):

        total_loss = 0
        count = 0

        for context_batch, target_batch in tqdm_notebook(batches):

            context_idxs = torch.tensor(context_batch, dtype=torch.long).cuda()

            # градиенты надо сбрасывать, если не хотим аккумулировать
            model.zero_grad()

            # применяем модель
            log_probs = model(context_idxs)

            # вычисляем невязку
            loss = loss_function(log_probs, torch.tensor(target_batch, dtype=torch.long).cuda())

            # обратный проход, обновление градиента
            loss.backward()
            optimizer.step()

            # получаем число
            total_loss += loss.item()
            count += 1

        print("E", epoch + 1, "\tNLL\t", total_loss / count)

        losses.append(total_loss)
        
    return model, losses

In [93]:
mlp_model, losses = train_routine(mlp_model, loss_function, batches, EPOCHS)

E 1 	NLL	 6.193201131290859
E 2 	NLL	 6.167322410477532
E 3 	NLL	 6.143521308898926
E 4 	NLL	 6.132787691222297
E 5 	NLL	 6.118848204612732
E 6 	NLL	 6.09454411930508
E 7 	NLL	 6.074065181944105
E 8 	NLL	 6.078673548168606
E 9 	NLL	 6.057192458046807
E 10 	NLL	 6.049572507540385
E 11 	NLL	 6.039563907517327
E 12 	NLL	 6.021483990881178
E 13 	NLL	 6.008075912793477
E 14 	NLL	 5.987767563925849
E 15 	NLL	 5.980815158949958
E 16 	NLL	 5.966244300206502
E 17 	NLL	 5.944536487261455
E 18 	NLL	 5.945093830426534
E 19 	NLL	 5.932637731234233
E 20 	NLL	 5.91390675968594
E 21 	NLL	 5.902575837241279
E 22 	NLL	 5.889794482125176
E 23 	NLL	 5.882018791304694
E 24 	NLL	 5.867645806736416
E 25 	NLL	 5.855064445071751
E 26 	NLL	 5.862589081128438
E 27 	NLL	 5.848925153414409
E 28 	NLL	 5.829553259743585
E 29 	NLL	 5.824772146013048
E 30 	NLL	 5.807513210508558
E 31 	NLL	 5.7935891018973456
E 32 	NLL	 5.790502958827549
E 33 	NLL	 5.7887709273232355
E 34 	NLL	 5.768862883249919
E 35 	NLL	 5.7620275815

### Вычисление перплексии для нейронной модели на pytorch


In [94]:
def compute_ppl(model, test_batches, nllloss):
    
    total_loss = 0
    count = 0

    for context_batch, target_batch in tqdm_notebook(test_batches):
        
        context_idxs = torch.tensor(context_batch, dtype=torch.long).cuda()

        # применяем модель
        log_probs = model(context_idxs)

        # вычисляем невязку
        loss = nllloss(log_probs, torch.tensor(target_batch, dtype=torch.long).cuda())

        # получаем число
        total_loss += loss.item()
        count += 1
    
    return 2 ** (total_loss / count)        

In [97]:
"Perplexity on train set", compute_ppl(mlp_model, batches, loss_function)

('Perplexity on train set', 40.6134744159786)

In [98]:
"Perplexity on test set", compute_ppl(mlp_model, test_batches, loss_function)

('Perplexity on test set', 44.82330986074508)

### Погенерируем следующее слово

In [100]:
test = "truth is a woman love war peace lady"

prepared_text = augment(prepare_sentences(test, stage_train=False)[2][0], CONTEXT_SIZE)

for i in range(CONTEXT_SIZE, len(prepared_text) - CONTEXT_SIZE):
    print("-------------------------------")
    idx = [word2index[w] for w in prepared_text[:i]]
    
    predict = mlp_model(torch.tensor(idx[-CONTEXT_SIZE:]).cuda()).detach().cpu().numpy()
    
    print("argmax", predict.argmax(), "-> [", index2word[predict.argmax()], "] logprob", predict.max())
    print(" ".join(prepared_text[:i]), "=>", index2word[predict.argmax()])

-------------------------------
argmax 162 -> [ he ] logprob -2.247387
BOS BOS BOS BOS BOS BOS BOS BOS BOS BOS => he
-------------------------------
argmax 3 -> [ the ] logprob -2.2071738
BOS BOS BOS BOS BOS BOS BOS BOS BOS BOS truth => the
-------------------------------
argmax 68 -> [ was ] logprob -2.2548432
BOS BOS BOS BOS BOS BOS BOS BOS BOS BOS truth is => was
-------------------------------
argmax 3 -> [ the ] logprob -2.046276
BOS BOS BOS BOS BOS BOS BOS BOS BOS BOS truth is a => the
-------------------------------
argmax 3 -> [ the ] logprob -2.5053058
BOS BOS BOS BOS BOS BOS BOS BOS BOS BOS truth is a woman => the
-------------------------------
argmax 3 -> [ the ] logprob -2.1274953
BOS BOS BOS BOS BOS BOS BOS BOS BOS BOS truth is a woman love => the
-------------------------------
argmax 3 -> [ the ] logprob -2.8458571
BOS BOS BOS BOS BOS BOS BOS BOS BOS BOS truth is a woman love war => the
-------------------------------
argmax 75 -> [ that ] logprob -2.823176
BOS BOS BOS 

## "Голубое сало" 
#### Предложим модели погаллюцинировать, рипс нимада табень

In [103]:
test = "why"
prepared_text = augment(prepare_sentences(test, stage_train=False)[2][0], CONTEXT_SIZE)[-CONTEXT_SIZE:]

for i in range(CONTEXT_SIZE, 25):
    
    idx = [word2index[w] for w in prepared_text[:i]]    
    predict = 2.718281828 ** (mlp_model(torch.tensor(idx[-CONTEXT_SIZE:]).cuda()).detach().cpu().numpy()[0])
    #  predict /= sum(predict)
    
    selected_word = np.random.choice(a=list(range(len(word2index))), p=predict)    
    prepared_text.append(index2word[selected_word])
#     prepared_text.append(index2word[predict.argmax()])
    
    print("Генерация:", " ".join(prepared_text[CONTEXT_SIZE - 1:]))

Генерация: EOS meanwhile
Генерация: EOS meanwhile park
Генерация: EOS meanwhile park snow
Генерация: EOS meanwhile park snow UNK
Генерация: EOS meanwhile park snow UNK railway
Генерация: EOS meanwhile park snow UNK railway them
Генерация: EOS meanwhile park snow UNK railway them its
Генерация: EOS meanwhile park snow UNK railway them its full
Генерация: EOS meanwhile park snow UNK railway them its full for
Генерация: EOS meanwhile park snow UNK railway them its full for countless
Генерация: EOS meanwhile park snow UNK railway them its full for countless agricultural
Генерация: EOS meanwhile park snow UNK railway them its full for countless agricultural only
Генерация: EOS meanwhile park snow UNK railway them its full for countless agricultural only entrance
Генерация: EOS meanwhile park snow UNK railway them its full for countless agricultural only entrance features
Генерация: EOS meanwhile park snow UNK railway them its full for countless agricultural only entrance features with


In [104]:
embeddings_values = next(mlp_model.embeddings.parameters())
embeddings_values = embeddings_values.cpu().detach().numpy()

In [105]:
from scipy.spatial.distance import cdist

def dict_k_closest(M, term_dict, inverse_term_dict, k=5):
    """
        :param M -- матрица векторых представлений
        :param term_dict -- слово2индекс
        :param inverse_term_dict -- индекс2слово
        :param k -- число ближайших соседей для выдачи
    """
    
    print("Computing all distances... (takes some time)")    
    distances = cdist(M, M, "cosine")
    sorted_by_dist_k = np.argsort(distances, axis=1)[:, :k]
    
    results = {}
    
    for term in term_dict:
        row_id = term_dict[term]
        similar = [inverse_term_dict[i] for i in sorted_by_dist_k[row_id, :]]
        results[term] = similar   
        
    return results

Находим для каждого слова по 5 ближайших соседей

In [106]:
results = dict_k_closest(embeddings_values, word2index, index2word, k=5)

Computing all distances... (takes some time)


In [107]:
results["love"], results["death"], results["peace"], results["world"]

(['love', 'loose', 'put', 'counter', 'growth'],
 ['death', 'nordston', 'away', 'discussion', 'growth'],
 ['peace', 'fully', 'meadows', 'spectators', 'discuss'],
 ['world', 'circle', 'motor', 'opposite', 'wandered'])

## Пустим в ход рекуррентные нейронные сети 

In [108]:
class NGramRNNLanguageModeler(nn.Module):

    def __init__(self, vocab_size, embedding_dim, context_size, hidden_dim):
        super(NGramRNNLanguageModeler, self).__init__()
        
        self.embeddings = nn.Embedding(num_embeddings=vocab_size, 
                                       embedding_dim=embedding_dim)
        
        # todo: consider changing out dims
        self.rnn = nn.LSTM(batch_first=True,
                           input_size=embedding_dim, 
                           hidden_size=hidden_dim)      
        
        self.dropout = nn.Dropout(p=0.8)
        self.linear1 = nn.Linear(hidden_dim, hidden_dim)
        self.activation = nn.Tanh() 
        self.linear2 = nn.Linear(hidden_dim, vocab_size)
        self.log_softmax = nn.LogSoftmax(dim=1)       

    def forward(self, inputs):
        # по сути конкатенация эмбеддингов
        embeds = self.embeddings(inputs)   
        
        _, (rnn_state_t, rnn_state_c) = self.rnn(embeds)
        
        rnn_state_t = torch.squeeze(rnn_state_t)
        
        # выбрасываем часть значений
        embeds = self.dropout(rnn_state_t)
        
        # линейная трансформация
        out = self.linear1(embeds)
        
        # применяем нелинейность к слою
        out = self.activation(out)
        
        # применяем второй линейный слой
        out = self.linear2(out)
        
        # применяем софтмакс
        log_probs = self.log_softmax(out)
        
        return log_probs

In [109]:
import torch
from torch import optim

losses = []
loss_function = nn.NLLLoss()
vocab_size = len(word2index)

EMBEDDING_DIM = 45
HIDDEN_DIM = 300
CONTEXT_SIZE = 10

rnn_model = NGramRNNLanguageModeler(vocab_size, EMBEDDING_DIM, CONTEXT_SIZE, HIDDEN_DIM).cuda()
optimizer = optim.Adam(rnn_model.parameters(), lr=0.01)

rnn_model, optimizer

(NGramRNNLanguageModeler(
   (embeddings): Embedding(3710, 45)
   (rnn): LSTM(45, 300, batch_first=True)
   (dropout): Dropout(p=0.8)
   (linear1): Linear(in_features=300, out_features=300, bias=True)
   (activation): Tanh()
   (linear2): Linear(in_features=300, out_features=3710, bias=True)
   (log_softmax): LogSoftmax()
 ), Adam (
 Parameter Group 0
     amsgrad: False
     betas: (0.9, 0.999)
     eps: 1e-08
     lr: 0.01
     weight_decay: 0
 ))

In [110]:
len(batches), len(batches[0]), len(batches[0][0]), len(batches[0][0][0])

(36, 2, 1024, 10)

In [114]:
rnn_model, losses = train_routine(loss_function=loss_function, batches=batches, 
                                  epochs=EPOCHS, model=rnn_model)

In [115]:
"Perplexity of RNN model on train set", compute_ppl(model=rnn_model, nllloss=loss_function,test_batches=batches)

('Perplexity of RNN model on train set', 9.625220291291766)

In [116]:
"Perplexity of RNN model on test set", compute_ppl(model=rnn_model, nllloss=loss_function,test_batches=test_batches)

('Perplexity of RNN model on test set', 112.4898039641727)

## Задание: улучшить результат работы нейронных моделей
----
Можно добавлять слои, выкручивать регуляризацию, ...

## Задание*: инициализировать embedding-слой google-овскими word2vec-ами

...и снова проделать всё то же.

# Большая задача: kaggle

1. самостоятельно подготовить данные для обучения
2. обучить модель по train.tsv
3. сравнить два текста; какой из них естественнее, тот и молодец