## Project 2: N-gram, Neural N-gram, and LSTM Language Models

In this project, we implement three types of language models: a statistical n-gram model, a neural n-gram model, and a recurrent (LSTM) model. We work with the WikiText2 dataset.

In [None]:
from collections import defaultdict, Counter
import numpy as np
import math
import tqdm
import random
import pdb
import torch
from torch import nn
import torch.nn.functional as F
import torchtext.legacy as torchtext

# download and load the data
# A default Field is used to process the corpus and to own the vocabulary.
text_field = torchtext.data.Field()
train_dataset, validation_dataset, test_dataset = torchtext.datasets.WikiText2.splits(root='.', text_field=text_field)
# The vocabulary is built over all three splits, so tokens from the validation
# and test splits are part of the vocabulary as well.
text_field.build_vocab(train_dataset, validation_dataset, test_dataset)
# Each split's text is read from its first example; the models below treat it
# as one long sequence of tokens (they take len() and slices of it).
train_text = train_dataset.examples[0].text
validation_text = validation_dataset.examples[0].text

# Module-level vocabulary objects referenced by the models defined below.
vocab = text_field.vocab
vocab_size = len(vocab)

We implement a function to decode the outputs of a language model into text, one token at a time.

In [None]:
def generate_text(model, n=20, prefix=('<eos>', '<eos>')):
    """Sample a continuation of ``prefix`` from a language model.

    model: object exposing ``next_word_probabilities(tokens)``, which returns
        one weight per entry of ``vocab.itos``
    n: number of tokens to sample
    prefix: initial tokens; they are included in the returned text
    return: the prefix followed by the sampled tokens, joined by single spaces
    """
    tokens = [*prefix]
    for _step in range(n):
        # The model is re-queried with everything generated so far and the
        # next token is drawn from the distribution it returns.
        weights = model.next_word_probabilities(tokens)
        (sampled,) = random.choices(vocab.itos, weights)
        tokens.append(sampled)
    return ' '.join(tokens)

### Backoff N-gram Model with Kneser-Ney unigrams

Our goal is to implement a back-off n-gram model that uses a Kneser-Ney model in place of a statistical unigram model.

We first implement the base n-gram model with alpha-smoothing, according to the conditional distribution: $P(w_2|w_1)=\frac{C(w_1,w_2)+\alpha}{C(w_1)+\alpha |V|}$, where $|V|$ is the size of the vocabulary and $C(w_1,\ldots,w_n)$ is the count of the n-gram $(w_1,\ldots,w_n)$ in the training text.

In [None]:
class NGramModel:
    """Count-based n-gram language model with additive (alpha) smoothing.

    For ``n > 1`` the conditional probability of the last token is
    ``(C(w_1..w_n) + alpha) / (C(w_1..w_{n-1}) + alpha * |V|)``.
    For ``n == 1`` the model is the unsmoothed relative frequency
    ``C(w) / N``, where ``N`` is the length of the training text.

    n-grams are stored under the key ``' '.join(tokens)``.
    """

    def __init__(self, train_text, n=2, alpha=3e-3):
        """Count the n-grams of ``train_text``.

        train_text: list of strings (tokens)
        n: order of the model
        alpha: additive smoothing constant (only used when n > 1)
        """
        self.n = n
        self.smoothing = alpha
        self.vocab_size = vocab_size
        self.length = len(train_text)
        self.counts_denom = defaultdict(int)
        self.counts_numer = defaultdict(int)

        if self.n > 1:
            # Slide a window of n tokens over the text; every window counts
            # once as an n-gram and once (minus its last token) as a context.
            for start in range(self.length - self.n + 1):
                window = train_text[start:start + self.n]
                self.counts_numer[' '.join(window)] += 1
                self.counts_denom[' '.join(window[:-1])] += 1
        else:
            self.counts_numer = Counter(train_text)

    def n_gram_probability(self, n_gram):
        """Return the probability of the last word in an n-gram for decoding.

        n_gram: a list of strings (tokens) of length ``self.n``
        return: conditional probability of the last token
        """
        assert len(n_gram) == self.n

        # ``.get`` is used instead of indexing so that querying an unseen
        # n-gram does not insert a zero entry into the count tables.
        if self.n > 1:
            numer = self.counts_numer.get(' '.join(n_gram), 0) + self.smoothing
            denom = (self.counts_denom.get(' '.join(n_gram[:-1]), 0)
                     + self.smoothing * self.vocab_size)
        else:
            # Unigram case: plain relative frequency, no smoothing.
            numer = self.counts_numer.get(n_gram[0], 0)
            denom = self.length

        return numer / denom

    def next_word_probabilities(self, text_prefix):
        """Return a list of probabilities for each word in the vocabulary based on context.

        text_prefix: a sequence of strings (tokens)
        return: list of float probabilities for each word in the vocabulary
        """
        if self.n == 1:
            return [self.n_gram_probability([word]) for word in vocab.itos]

        context_size = self.n - 1
        if len(text_prefix) >= context_size:
            context = list(text_prefix[-context_size:])
            return [self.n_gram_probability(context + [word]) for word in vocab.itos]

        # special case if the prefix doesn't have at least n-1 words:
        # outputs a uniform distribution over the vocab
        return [1 / self.vocab_size] * self.vocab_size

    def perplexity(self, full_text):
        """Compute the perplexity of the model on ``full_text``.

        full_text: list of strings (tokens)
        return: perplexity as a float

        A token with probability 0 makes ``math.log`` raise ``ValueError``;
        an empty text raises ``ZeroDivisionError``.
        """
        minus_log_prob = 0
        for i in range(len(full_text)):
            if i < self.n - 1:
                # uniform probability over the vocab during the first n-1 words
                minus_log_prob += math.log(self.vocab_size)
            else:
                n_gram = full_text[i - self.n + 1:i + 1]
                minus_log_prob -= math.log(self.n_gram_probability(n_gram))

        return math.exp(minus_log_prob / len(full_text))

We now implement the back-off mechanism, along with a discount $\delta$ for smoothing. The probability distributions are given by:
$$
\begin{align}
P\left(w_i|w_{i-n+1}^{i-1}\right)&=\frac{\max\left\{C(w_{i-n+1}^i)-\delta,0\right\}}{\sum_{w_i} C\left(w_{i-n+1}^i\right)} + \alpha\left(w_{i-n+1}^{i-1}\right) P\left(w_i|w_{i-n+2}^{i-1}\right), \\
\alpha\left(w_{i-n+1}^{i-1}\right)&=\frac{\delta N_{1+}\left(w_{i-n+1}^{i-1}\right)}{{\sum_{w_i} C\left(w_{i-n+1}^i\right)}},
\end{align}
$$
where $N_{1+}\left(w_{i-n+1}^{i-1}\right)$ is the number of distinct words observed after the context $w_{i-n+1}^{i-1}$, i.e. the number of distinct n-grams that begin with those $n-1$ words. If the context was never observed, i.e. $\sum_{w_i} C(w_{i-n+1}^i)=0$, then the model backs off entirely to the lower-order model.

In [None]:
class DiscountBackoffModel(NGramModel):
    """Absolute-discounting n-gram model that interpolates with a lower-order model.

    For a context that was seen in training:
        P(w | ctx) = (max(C(ctx, w) - delta, 0) + delta * N1+(ctx) * P_lower(w | ctx')) / C(ctx)
    where N1+(ctx) is the number of distinct n-grams starting with ctx and
    ctx' is ctx without its first token. For an unseen context the
    lower-order probability is returned unchanged.
    """

    def __init__(self, train_text, lower_order_model, n=2, delta=0.9):
        """Count n-grams, contexts and N1+ over ``train_text``.

        train_text: list of strings (tokens)
        lower_order_model: model of order n-1 exposing ``n_gram_probability``
        n: order of this model
        delta: absolute discount subtracted from every observed n-gram count
        """
        # The base initialiser sets n, vocab_size and length.
        super().__init__(train_text, n=n)
        self.lower_order_model = lower_order_model
        self.discount = delta

        # dictionaries for the numerator of the first term, the N1+ function, and the denominator
        self.counts_denom = defaultdict(int)
        self.counts_numer = defaultdict(int)
        self.N1 = defaultdict(int)

        for start in range(self.length - self.n + 1):
            window = train_text[start:start + self.n]
            gram = ' '.join(window)
            gram_minus = ' '.join(window[:-1])

            # increment counters for the given n-gram and the n-1-gram before the last word
            self.counts_numer[gram] += 1
            self.counts_denom[gram_minus] += 1

            # the first occurrence of an n-gram adds one distinct continuation to its context
            if self.counts_numer[gram] == 1:
                self.N1[gram_minus] += 1

    def n_gram_probability(self, n_gram):
        """Return the discounted, interpolated probability of the last token.

        n_gram: a list of strings (tokens) of length ``self.n``
        return: conditional probability of the last token
        """
        assert len(n_gram) == self.n

        gram = ' '.join(n_gram)
        gram_minus = ' '.join(n_gram[:-1])
        # ``.get`` keeps lookups read-only: indexing the defaultdicts would add
        # a zero entry for every unseen n-gram that is queried.
        denom = self.counts_denom.get(gram_minus, 0)
        lower = self.lower_order_model.n_gram_probability(n_gram[1:])

        # back off: the context never occurred in training
        if denom == 0:
            return lower

        numer1 = max(self.counts_numer.get(gram, 0) - self.discount, 0)
        numer2 = self.discount * self.N1.get(gram_minus, 0) * lower
        return (numer1 + numer2) / denom



We can consider the performance of the bigram and trigram models before implementing Kneser-Ney:

In [None]:
# Base case of the back-off chain: the relative-frequency unigram model.
# It has to be built here because nothing earlier in the notebook defines it.
unigram_model = NGramModel(train_text, n=1)
# Each higher-order model backs off to the model one order below it.
bigram_backoff_model = DiscountBackoffModel(train_text, unigram_model, 2)
trigram_backoff_model = DiscountBackoffModel(train_text, bigram_backoff_model, 3)
print('bigram backoff validation perplexity:', bigram_backoff_model.perplexity(validation_text))
print('trigram backoff validation perplexity:', trigram_backoff_model.perplexity(validation_text))

bigram backoff validation perplexity: 303.6883652598457
trigram backoff validation perplexity: 271.1216732931965


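Finally, we implement the Kneser-Ney continuation distribution to replace the statistical unigram model. It takes novelty into account by scoring a word by the number of distinct words it follows rather than by its raw frequency: $P(w)\propto |\{w':c(w',w) > 0\}|$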

In [None]:
class KneserNeyBaseModel(NGramModel):
    """Kneser-Ney continuation unigram model.

    A word's probability is proportional to the number of distinct bigrams
    in which it is the second token:
        P(w) = |{w' : C(w', w) > 0}| / (number of distinct bigrams).
    """

    def __init__(self, train_text):
        """Count, for every word, the distinct bigrams that end with it.

        train_text: list of strings (tokens)
        """
        super().__init__(train_text, n=1)
        self.length = len(train_text)
        # cont_count maps a word to the number of distinct bigrams for which
        # the word is the second term; contexts records the bigrams seen so far.
        self.cont_count = defaultdict(int)
        self.contexts = {}

        for start in range(self.length - 1):
            bigram = ' '.join(train_text[start:start + 2])
            if bigram not in self.contexts:
                self.contexts[bigram] = 1
                self.cont_count[train_text[start + 1]] += 1

        # The normaliser must be computed after counting; computing it on the
        # still-empty table would make it 0 and every query divide by zero.
        self.denom = sum(self.cont_count.values())

    def n_gram_probability(self, n_gram):
        """Return the continuation probability of a single token.

        n_gram: a list holding exactly one token
        return: continuation probability (0 for a word that never appears as
            the second token of a training bigram)

        Raises ZeroDivisionError if the training text had no bigrams.
        """
        assert len(n_gram) == 1

        # ``.get`` avoids inserting zero entries for unseen words.
        return self.cont_count.get(n_gram[0], 0) / self.denom

We see that the Kneser-Ney unigram model improves on the statistical unigram model:

In [None]:
# Same back-off chain as before, with the Kneser-Ney model as its base case.
kn_base = KneserNeyBaseModel(train_text)
bigram_kn_backoff_model = DiscountBackoffModel(train_text, kn_base, 2)
# The trigram model must be built before its perplexity is printed below.
trigram_kn_backoff_model = DiscountBackoffModel(train_text, bigram_kn_backoff_model, 3)
print('bigram Kneser-Ney backoff validation perplexity:', bigram_kn_backoff_model.perplexity(validation_text))
print('trigram Kneser-Ney backoff validation perplexity:', trigram_kn_backoff_model.perplexity(validation_text))

bigram Kneser-Ney backoff validation perplexity: 289.05832933714487
trigram Kneser-Ney backoff validation perplexity: 256.61334337297114


### Neural N-gram Model

We now implement a neural n-gram model, using an embedding layer, 2 fully connected layers, and an output layer with weights tied to the embedding layer. We use standard relu activation and dropout with $p=0.1$ after the fully connected layers.

In [None]:
def ids(tokens):
    """Map each token to its index in ``vocab.stoi``, preserving order."""
    token_ids = []
    for token in tokens:
        token_ids.append(vocab.stoi[token])
    return token_ids

We construct a windowed dataset in which each example pairs the $n-1$ preceding tokens (the context) with the token to be predicted.

In [None]:
class NeuralNgramDataset(torch.utils.data.Dataset):
    """Sliding-window dataset of (context ids, target id) pairs.

    Item ``i`` is the ``n-1`` token ids preceding position ``i`` together
    with the id at position ``i``. Positions with fewer than ``n-1``
    predecessors are left-padded with the id of ``'<eos>'``.
    """

    def __init__(self, text_token_ids, n):
        self.n = n
        self.text_token_ids = text_token_ids

    def __len__(self):
        # one example per token of the text
        return len(self.text_token_ids)

    def __getitem__(self, i):
        context_size = self.n - 1

        if i >= context_size:
            prev_token_ids = self.text_token_ids[i - context_size:i]
        else:
            # not enough history: fill the missing positions on the left
            padding = [vocab.stoi['<eos>']] * (context_size - i)
            prev_token_ids = padding + self.text_token_ids[:i]

        assert len(prev_token_ids) == context_size

        context = torch.tensor(prev_token_ids)
        target = torch.tensor(self.text_token_ids[i])
        return context, target

In [None]:
class NeuralNGramNetwork(nn.Module):
    """Feed-forward n-gram network with input/output weight tying.

    The weight matrix of the output layer doubles as the embedding table.
    The concatenated context embeddings pass through ``linear1`` and
    ``linear2`` (ReLU and dropout are applied only after ``linear2``; there
    is no activation between the two), then ``linear3`` projects back to the
    embedding size and ``out`` produces the logits.
    """

    def __init__(self, n):
        super().__init__()
        self.n = n
        self.vocab_size = vocab_size
        # Layers are created in this order so that parameter initialisation
        # draws random numbers in the same sequence.
        self.linear1 = nn.Linear((self.n-1)*128, 1024)
        self.linear2 = nn.Linear(1024, 1024)
        self.linear3 = nn.Linear(1024, 128)
        self.out = nn.Linear(128, self.vocab_size)

    def forward(self, x):
        """Return logits for the next token.

        x: tensor of token ids with shape (batch, n-1)
        return: tensor of logits with shape (batch, vocab_size)
        """
        # Look the context ids up in the output layer's weight matrix.
        embedded = F.embedding(x, weight=self.out.weight)
        hidden = self.linear1(embedded.flatten(1, -1))
        hidden = self.linear2(hidden)
        hidden = F.dropout(F.relu(hidden), p=0.1, training=self.training)
        return self.out(self.linear3(hidden))


class NeuralNGramModel:
    """Wrapper around NeuralNGramNetwork that handles training and evaluation."""

    def __init__(self, n):
        """Build an untrained network of order ``n``.

        The network is placed on the GPU when one is available and on the
        CPU otherwise.
        """
        self.n = n
        self.vocab_size = vocab_size
        self.network = NeuralNGramNetwork(n).to('cuda' if torch.cuda.is_available() else 'cpu')

    def _device(self):
        # Device on which the network's parameters live.
        return next(self.network.parameters()).device

    def train(self):
        """Train for 10 epochs and restore the best checkpoint.

        After every epoch the validation perplexity is printed; the weights
        with the lowest validation perplexity are saved to 'network.pt' and
        loaded back at the end.

        return: the trained network
        """
        device = self._device()
        dataset = NeuralNgramDataset(ids(train_text), self.n)
        train_loader = torch.utils.data.DataLoader(dataset, batch_size=128, shuffle=True)
        # iterating over train_loader has two outputs:
        # 1) previous token ids with size (batch, n-1),
        # 2) current token id with size (batch, )

        optimizer = torch.optim.Adam(self.network.parameters())

        best_perplexity = None
        for epoch in range(10):
            print('Epoch', epoch+1)
            for last_token, current_token in tqdm.notebook.tqdm(train_loader, leave=True):
                optimizer.zero_grad()
                last_token, current_token = last_token.to(device), current_token.to(device)
                loss = F.cross_entropy(self.network(last_token), current_token)
                loss.backward()
                optimizer.step()

            perplexity = self.perplexity(validation_text)
            print('Perplexity:', perplexity)

            # keep the checkpoint with the best validation perplexity
            # (the first epoch is always saved)
            if best_perplexity is None or perplexity < best_perplexity:
                best_perplexity = perplexity
                torch.save(self.network.state_dict(), 'network.pt')

        self.network.load_state_dict(torch.load('network.pt', map_location=device))

        return self.network

    def next_word_probabilities(self, text_prefix):
        """Return a list of probabilities for each word in the vocabulary.

        text_prefix: a sequence of strings (tokens)
        return: list of float probabilities, one per vocabulary entry; a
            uniform distribution when the prefix has fewer than n-1 tokens

        The network is left in training mode afterwards.
        """
        context_size = self.n - 1
        self.network.eval()
        try:
            if len(text_prefix) < context_size:
                return [1/self.vocab_size]*self.vocab_size

            # Only the last n-1 tokens are needed, so only those are converted.
            context = list(text_prefix)[len(text_prefix) - context_size:]
            # no_grad: inference must not record an autograd graph
            with torch.no_grad():
                cut_prefix = torch.tensor(ids(context), dtype=torch.long,
                                          device=self._device()).unsqueeze(0)
                prob = F.softmax(self.network(cut_prefix), dim=1).squeeze(0)
            # A plain list is returned in both branches.
            return prob.tolist()
        finally:
            self.network.train()

    def perplexity(self, text):
        """Compute the perplexity of the model on ``text``.

        text: list of strings (tokens)
        return: perplexity as a float

        The first n-1 tokens are scored with a uniform distribution over the
        vocabulary. The network is left in training mode afterwards.
        """
        self.network.eval()
        try:
            # special case of insufficient text size: uniform distribution over vocabulary
            if len(text) < self.n:
                minus_log_prob = [math.log(self.vocab_size)]*len(text)
            else:
                too_few_prefixes = self.n-1
                minus_log_prob = [math.log(self.vocab_size)]*too_few_prefixes
                text_dataset = NeuralNgramDataset(ids(text), self.n)
                text_loader = torch.utils.data.DataLoader(text_dataset, batch_size=128, shuffle=False)

                device = self._device()
                token_nll = []
                with torch.no_grad():
                    for last_token, current_token in tqdm.notebook.tqdm(text_loader, leave=False):
                        last_token, current_token = last_token.to(device), current_token.to(device)
                        log_prob = F.log_softmax(self.network(last_token), dim=1)
                        # pick the log-probability of each target in one
                        # batched gather instead of one .item() per token
                        picked = log_prob.gather(1, current_token.unsqueeze(1)).squeeze(1)
                        token_nll.extend((-picked).tolist())

                # the first n-1 positions have too few prefix tokens and were
                # already scored with the uniform distribution above
                minus_log_prob.extend(token_nll[too_few_prefixes:])

            return math.exp(np.mean(minus_log_prob))
        finally:
            self.network.train()


In [None]:
# The model has to be constructed before it can be trained; n=3 gives the
# trigram model reported below.
neural_trigram_model = NeuralNGramModel(3)
neural_trigram_model.train()
print('neural trigram validation perplexity:', neural_trigram_model.perplexity(validation_text))

Epoch 1


  0%|          | 0/16318 [00:00<?, ?it/s]

  0%|          | 0/1701 [00:00<?, ?it/s]

Perplexity: 250.58473994052122
Epoch 2


  0%|          | 0/16318 [00:00<?, ?it/s]

  0%|          | 0/1701 [00:00<?, ?it/s]

Perplexity: 228.15833242317862
Epoch 3


  0%|          | 0/16318 [00:00<?, ?it/s]

  0%|          | 0/1701 [00:00<?, ?it/s]

Perplexity: 224.61044309042225
Epoch 4


  0%|          | 0/16318 [00:00<?, ?it/s]

  0%|          | 0/1701 [00:00<?, ?it/s]

Perplexity: 223.40162892280938
Epoch 5


  0%|          | 0/16318 [00:00<?, ?it/s]

  0%|          | 0/1701 [00:00<?, ?it/s]

Perplexity: 227.50341709204972
Epoch 6


  0%|          | 0/16318 [00:00<?, ?it/s]

  0%|          | 0/1701 [00:00<?, ?it/s]

Perplexity: 229.34594261809286
Epoch 7


  0%|          | 0/16318 [00:00<?, ?it/s]

  0%|          | 0/1701 [00:00<?, ?it/s]

Perplexity: 230.95627659710885
Epoch 8


  0%|          | 0/16318 [00:00<?, ?it/s]

  0%|          | 0/1701 [00:00<?, ?it/s]

Perplexity: 234.80468058070423
Epoch 9


  0%|          | 0/16318 [00:00<?, ?it/s]

  0%|          | 0/1701 [00:00<?, ?it/s]

Perplexity: 239.21947274498058
Epoch 10


  0%|          | 0/16318 [00:00<?, ?it/s]

  0%|          | 0/1701 [00:00<?, ?it/s]

Perplexity: 244.36282568099875


  0%|          | 0/1701 [00:00<?, ?it/s]

neural trigram validation perplexity: 223.40162892280938


### Recurrent LSTM Model

Finally, we implement a recurrent LSTM language model. It consists of an embedding layer, 3 stacked LSTM layers with 1024 units each, dropout with $p=0.5$ after every LSTM layer (including the final one), a fully connected layer projecting down to 128 dimensions, and the output layer. The embedding layer's weights are tied to those of the output layer.

We also use a `ReduceLROnPlateau` learning-rate scheduler driven by the validation perplexity, with a patience of 3 epochs and a (relative) improvement threshold of 1e-3.

In [None]:
def ids(tokens):
    """Map each token to its index in ``vocab.stoi``, preserving order."""
    token_ids = []
    for token in tokens:
        token_ids.append(vocab.stoi[token])
    return token_ids

class LSTMNetwork(nn.Module):
    """Multi-layer LSTM language model with tied embedding/output weights."""

    def __init__(self, num_tokens=None, embed_dim=128, hidden_dim=1024,
                 num_layers=3, dropout=0.5):
        """Build the network.

        num_tokens: vocabulary size; defaults to the module-level ``vocab_size``
        embed_dim: size of the token embeddings (and of the projection that
            feeds the output layer)
        hidden_dim: number of units in each LSTM layer
        num_layers: number of stacked LSTM layers
        dropout: dropout probability between LSTM layers and after the last one
        """
        super().__init__()

        self.vocab_size = vocab_size if num_tokens is None else num_tokens

        self.embed = nn.Embedding(self.vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers=num_layers, dropout=dropout)
        self.drop = nn.Dropout(dropout)
        self.linear = nn.Linear(hidden_dim, embed_dim)
        self.out = nn.Linear(embed_dim, self.vocab_size)

        # Tie the embedding table to the output weights once, at construction.
        # Doing it here (rather than on every forward call) means the shared
        # parameter is the only one registered, so an optimizer created before
        # the first forward pass never receives an unused embedding matrix.
        self.embed.weight = self.out.weight

    def forward(self, x, state=None):
        """Run the network over a batch of sequences.

        x: tensor of token ids with shape (seq_len, batch)
        state: tuple (h, c) of LSTM states, or None to start from zeros
        return: tuple (logits with shape (seq_len, batch, vocab_size), new state)
        """
        embeds = self.embed(x)
        lstm_out, new_state = self.lstm(embeds, state)
        # dropout after the last LSTM layer, then project down to the embedding size
        output = self.out(self.linear(self.drop(lstm_out)))
        return (output, new_state)


class LSTMModel:
    """Wrapper around LSTMNetwork that handles training and evaluation."""

    def __init__(self, load=False, model=None):
        """Build the network, optionally restoring saved weights.

        load: when true, ``model`` is loaded into the network
        model: a state dict compatible with LSTMNetwork

        The network is placed on the GPU when one is available and on the
        CPU otherwise.
        """
        self.network = LSTMNetwork().to('cuda' if torch.cuda.is_available() else 'cpu')

        if load:
            self.network.load_state_dict(model)

    def _device(self):
        # Device on which the network's parameters live.
        return next(self.network.parameters()).device

    def _zero_state(self, batch_size):
        # All-zero (h, c) pair sized from the network's own LSTM settings.
        lstm = self.network.lstm
        shape = (lstm.num_layers, batch_size, lstm.hidden_size)
        device = self._device()
        return (torch.zeros(shape, device=device), torch.zeros(shape, device=device))

    def train(self, epochs=20):
        """Train with truncated back-propagation through time.

        epochs: number of passes over the training set

        After every epoch the validation perplexity is printed and passed to
        the learning-rate scheduler; the weights with the lowest validation
        perplexity are saved to 'network.pt' and loaded back at the end.

        return: the trained network
        """
        batch_size = 64
        device = self._device()
        train_iterator = torchtext.data.BPTTIterator(train_dataset, batch_size=batch_size,
                                                     bptt_len=32, device=device)

        optimizer = torch.optim.Adam(self.network.parameters())
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=3, threshold = 1e-3)

        # initialize the hidden states to 0s; the state is carried from one
        # batch to the next (and across epochs)
        state = self._zero_state(batch_size)
        best_score = None
        for epoch in range(epochs):
            print('Epoch', epoch+1)
            for batch in tqdm.notebook.tqdm(train_iterator, leave=True):
                optimizer.zero_grad()
                text, target = batch.text.to(device), batch.target.to(device)
                output, state = self.network(text, state)
                # cross_entropy expects (batch, classes, seq) logits and (batch, seq) targets
                loss = F.cross_entropy(output.permute(1,2,0), target.permute(1,0))
                loss.backward()
                optimizer.step()
                # detach the state so gradients do not propagate across batches
                state = (state[0].detach(), state[1].detach())

            perplexity = self.dataset_perplexity(validation_dataset)
            print('Perplexity:', perplexity)
            # NOTE: the value printed here is exp(loss) of the last training
            # batch of the epoch, i.e. a perplexity rather than a raw loss.
            print('Training loss:', math.exp(loss.item()))

            # keep the checkpoint with the best validation perplexity
            # (the first epoch is always saved)
            if best_score is None or perplexity < best_score:
                best_score = perplexity
                torch.save(self.network.state_dict(), 'network.pt')

            scheduler.step(perplexity)

        self.network.load_state_dict(torch.load('network.pt', map_location=device))

        return self.network

    def next_word_probabilities(self, text_prefix):
        """Return a list of probabilities for each word in the vocabulary.

        text_prefix: a sequence of strings (tokens); the whole prefix is fed
            through the network starting from a zero state
        return: list of float probabilities, one per vocabulary entry

        The network is left in training mode afterwards.
        """
        prefix_token_tensor = torch.tensor(ids(text_prefix), device=self._device()).view(-1, 1)

        self.network.eval()

        # no_grad: inference must not record an autograd graph
        with torch.no_grad():
            output = self.network(prefix_token_tensor, self._zero_state(1))[0]
            # distribution predicted after the last prefix token
            prob = F.softmax(output[-1].squeeze(0), dim=-1).tolist()

        self.network.train()

        return prob

    def dataset_perplexity(self, torchtext_dataset):
        """Return the perplexity of the model on a torchtext dataset as a float.

        The network is left in training mode afterwards.
        """
        batch_size = 64
        device = self._device()
        iterator = torchtext.data.BPTTIterator(torchtext_dataset, batch_size=batch_size,
                                               bptt_len=32, device=device)

        self.network.eval()

        with torch.no_grad():
            log_probs = []

            state = self._zero_state(batch_size)
            for batch in tqdm.notebook.tqdm(iterator, leave=False):
                text, target = batch.text.to(device), batch.target.to(device)
                output, state = self.network(text, state)
                prob = F.log_softmax(output, dim=-1)
                # log-probability assigned to each target token
                log_probs += prob.gather(-1, target.unsqueeze(-1)).flatten().tolist()

            perp = math.exp(-np.mean(log_probs))

        self.network.train()

        return perp

In [None]:
# Train a fresh LSTM model for 40 epochs. train() returns the network, and
# since the call is the last expression of the cell its repr is shown below.
lstm_model = LSTMModel()
lstm_model.train(40)

Epoch 1


  0%|          | 0/1020 [00:00<?, ?it/s]

  0%|          | 0/107 [00:00<?, ?it/s]

Perplexity: 358.0128510925963
Training loss: 454.17196960765517
Epoch 2


  0%|          | 0/1020 [00:00<?, ?it/s]

  0%|          | 0/107 [00:00<?, ?it/s]

Perplexity: 228.77902270657353
Training loss: 242.053003045056
Epoch 3


  0%|          | 0/1020 [00:00<?, ?it/s]

  0%|          | 0/107 [00:00<?, ?it/s]

Perplexity: 185.60935631171316
Training loss: 169.30703930993525
Epoch 4


  0%|          | 0/1020 [00:00<?, ?it/s]

  0%|          | 0/107 [00:00<?, ?it/s]

Perplexity: 218.42305949223964
Training loss: 142.33036416101191
Epoch 5


  0%|          | 0/1020 [00:00<?, ?it/s]

  0%|          | 0/107 [00:00<?, ?it/s]

Perplexity: 147.74017680688468
Training loss: 117.38269686174883
Epoch 6


  0%|          | 0/1020 [00:00<?, ?it/s]

  0%|          | 0/107 [00:00<?, ?it/s]

Perplexity: 142.61106017159167
Training loss: 105.73458765148047
Epoch 7


  0%|          | 0/1020 [00:00<?, ?it/s]

  0%|          | 0/107 [00:00<?, ?it/s]

Perplexity: 134.72457633775392
Training loss: 93.77187251430855
Epoch 8


  0%|          | 0/1020 [00:00<?, ?it/s]

  0%|          | 0/107 [00:00<?, ?it/s]

Perplexity: 130.15093369026752
Training loss: 83.5751678673498
Epoch 9


  0%|          | 0/1020 [00:00<?, ?it/s]

  0%|          | 0/107 [00:00<?, ?it/s]

Perplexity: 126.92086240489795
Training loss: 76.94750498973619
Epoch 10


  0%|          | 0/1020 [00:00<?, ?it/s]

  0%|          | 0/107 [00:00<?, ?it/s]

Perplexity: 126.25255592018821
Training loss: 70.56007547491296
Epoch 11


  0%|          | 0/1020 [00:00<?, ?it/s]

  0%|          | 0/107 [00:00<?, ?it/s]

Perplexity: 125.20122262125166
Training loss: 65.18493935584166
Epoch 12


  0%|          | 0/1020 [00:00<?, ?it/s]

  0%|          | 0/107 [00:00<?, ?it/s]

Perplexity: 129.9280548548506
Training loss: 63.34021716504811
Epoch 13


  0%|          | 0/1020 [00:00<?, ?it/s]

  0%|          | 0/107 [00:00<?, ?it/s]

Perplexity: 144.37561644389217
Training loss: 57.92307787011203
Epoch 14


  0%|          | 0/1020 [00:00<?, ?it/s]

  0%|          | 0/107 [00:00<?, ?it/s]

Perplexity: 148.028276775523
Training loss: 56.51886683343916
Epoch 15


  0%|          | 0/1020 [00:00<?, ?it/s]

  0%|          | 0/107 [00:00<?, ?it/s]

Perplexity: 205.90778252836134
Training loss: 54.39486211093479
Epoch 16


  0%|          | 0/1020 [00:00<?, ?it/s]

  0%|          | 0/107 [00:00<?, ?it/s]

Perplexity: 200.08201363998342
Training loss: 50.82045597449899
Epoch 17


  0%|          | 0/1020 [00:00<?, ?it/s]

  0%|          | 0/107 [00:00<?, ?it/s]

Perplexity: 116.08534591597848
Training loss: 48.102707327474924
Epoch 18


  0%|          | 0/1020 [00:00<?, ?it/s]

  0%|          | 0/107 [00:00<?, ?it/s]

Perplexity: 138.61929851305447
Training loss: 47.05450811566344
Epoch 19


  0%|          | 0/1020 [00:00<?, ?it/s]

  0%|          | 0/107 [00:00<?, ?it/s]

Perplexity: 189.53265140120925
Training loss: 46.189035940646384
Epoch 20


  0%|          | 0/1020 [00:00<?, ?it/s]

  0%|          | 0/107 [00:00<?, ?it/s]

Perplexity: 116.52625148709875
Training loss: 46.34986396639424
Epoch 21


  0%|          | 0/1020 [00:00<?, ?it/s]

  0%|          | 0/107 [00:00<?, ?it/s]

Perplexity: 116.91101907419541
Training loss: 45.23788719293303
Epoch 22


  0%|          | 0/1020 [00:00<?, ?it/s]

  0%|          | 0/107 [00:00<?, ?it/s]

Perplexity: 114.83819940955175
Training loss: 48.014216984770236
Epoch 23


  0%|          | 0/1020 [00:00<?, ?it/s]

  0%|          | 0/107 [00:00<?, ?it/s]

Perplexity: 113.99156984235589
Training loss: 47.06698492941526
Epoch 24


  0%|          | 0/1020 [00:00<?, ?it/s]

  0%|          | 0/107 [00:00<?, ?it/s]

Perplexity: 113.96005443120836
Training loss: 48.15998107553035
Epoch 25


  0%|          | 0/1020 [00:00<?, ?it/s]

  0%|          | 0/107 [00:00<?, ?it/s]

Perplexity: 114.04585062524495
Training loss: 46.43658223078127
Epoch 26


  0%|          | 0/1020 [00:00<?, ?it/s]

  0%|          | 0/107 [00:00<?, ?it/s]

Perplexity: 114.01156949401826
Training loss: 44.923421565866285
Epoch 27


  0%|          | 0/1020 [00:00<?, ?it/s]

  0%|          | 0/107 [00:00<?, ?it/s]

Perplexity: 114.06059613100012
Training loss: 47.00377101222833
Epoch 28


  0%|          | 0/1020 [00:00<?, ?it/s]

  0%|          | 0/107 [00:00<?, ?it/s]

Perplexity: 113.94121085335235
Training loss: 46.6935479302573
Epoch 29


  0%|          | 0/1020 [00:00<?, ?it/s]

  0%|          | 0/107 [00:00<?, ?it/s]

Perplexity: 113.84335686364274
Training loss: 48.129849640448484
Epoch 30


  0%|          | 0/1020 [00:00<?, ?it/s]

  0%|          | 0/107 [00:00<?, ?it/s]

Perplexity: 113.7740630395164
Training loss: 46.18326584314371
Epoch 31


  0%|          | 0/1020 [00:00<?, ?it/s]

  0%|          | 0/107 [00:00<?, ?it/s]

Perplexity: 113.75452505503999
Training loss: 46.385792557719945
Epoch 32


  0%|          | 0/1020 [00:00<?, ?it/s]

  0%|          | 0/107 [00:00<?, ?it/s]

Perplexity: 113.72375431303928
Training loss: 47.72481887030745
Epoch 33


  0%|          | 0/1020 [00:00<?, ?it/s]

  0%|          | 0/107 [00:00<?, ?it/s]

Perplexity: 113.68332240942752
Training loss: 47.173396447635604
Epoch 34


  0%|          | 0/1020 [00:00<?, ?it/s]

  0%|          | 0/107 [00:00<?, ?it/s]

Perplexity: 113.65248664960605
Training loss: 47.49900262878888
Epoch 35


  0%|          | 0/1020 [00:00<?, ?it/s]

  0%|          | 0/107 [00:00<?, ?it/s]

Perplexity: 113.68552104426509
Training loss: 47.688671745698585
Epoch 36


  0%|          | 0/1020 [00:00<?, ?it/s]

  0%|          | 0/107 [00:00<?, ?it/s]

Perplexity: 113.63360130724362
Training loss: 47.40564352946273
Epoch 37


  0%|          | 0/1020 [00:00<?, ?it/s]

  0%|          | 0/107 [00:00<?, ?it/s]

Perplexity: 113.62951118431185
Training loss: 49.20722388276476
Epoch 38


  0%|          | 0/1020 [00:00<?, ?it/s]

  0%|          | 0/107 [00:00<?, ?it/s]

Perplexity: 113.62740248280845
Training loss: 47.57670788407537
Epoch 39


  0%|          | 0/1020 [00:00<?, ?it/s]

  0%|          | 0/107 [00:00<?, ?it/s]

Perplexity: 113.62335271048926
Training loss: 47.14344391507245
Epoch 40


  0%|          | 0/1020 [00:00<?, ?it/s]

  0%|          | 0/107 [00:00<?, ?it/s]

Perplexity: 113.61598673970859
Training loss: 46.10051571120237


LSTMNetwork(
  (embed): Embedding(33279, 128)
  (lstm): LSTM(128, 1024, num_layers=3, dropout=0.5)
  (drop): Dropout(p=0.5, inplace=False)
  (linear): Linear(in_features=1024, out_features=128, bias=True)
  (out): Linear(in_features=128, out_features=33279, bias=True)
)