Code adapted from main.py and generate.py at https://github.com/pytorch/examples/blob/master/word_language_model/.

In [0]:
# coding: utf-8
import argparse
import time
import math
import os
import torch
import torch.nn as nn
# import torch.onnx
import torch.nn.functional as F

path = "./drive/My Drive/wikitext-2"

emsize = 200 # embedding dimension
nhid = 200 # the dimension of the feedforward network model in nn.TransformerEncoder
nlayers = 2 # the number of nn.TransformerEncoderLayer in nn.TransformerEncoder
nhead = 2 # the number of heads in the multiheadattention models
dropout = 0.2 # the dropout value

# Set the random seed manually for reproducibility.
torch.manual_seed(42)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [0]:
###############################################################################
# Load data
###############################################################################

import os
from io import open

class Dictionary(object):
    def __init__(self):
        self.word2idx = {}
        self.idx2word = []

    def add_word(self, word):
        if word not in self.word2idx:
            self.idx2word.append(word)
            self.word2idx[word] = len(self.idx2word) - 1
        return self.word2idx[word]

    def __len__(self):
        return len(self.idx2word)


class Corpus(object):
    def __init__(self, path):
        self.dictionary = Dictionary()
        self.train = self.tokenize(os.path.join(path, 'wiki.train.tokens'))
        self.valid = self.tokenize(os.path.join(path, 'wiki.valid.tokens'))
        self.test = self.tokenize(os.path.join(path, 'wiki.test.tokens'))

    def tokenize(self, path):
        """Tokenizes a text file."""
        assert os.path.exists(path)
        # Add words to the dictionary
        with open(path, 'r', encoding="utf8") as f:
            for line in f:
                words = line.split() + ['<eos>']
                for word in words:
                    self.dictionary.add_word(word)

        # Tokenize file content
        with open(path, 'r', encoding="utf8") as f:
            idss = []
            for line in f:
                words = line.split() + ['<eos>']
                ids = []
                for word in words:
                    ids.append(self.dictionary.word2idx[word])
                idss.append(torch.tensor(ids).type(torch.int64))
            ids = torch.cat(idss)

        return ids

corpus = Corpus(path)

def batchify(data, bsz):
    # Work out how cleanly we can divide the dataset into bsz parts.
    nbatch = data.size(0) // bsz
    # Trim off any extra elements that wouldn't cleanly fit (remainders).
    data = data.narrow(0, 0, nbatch * bsz)
    # Evenly divide the data across the bsz batches.
    data = data.view(bsz, -1).t().contiguous()
    return data.to(device)

batch_size = 20
eval_batch_size = 10
train_data = batchify(corpus.train, batch_size)
val_data = batchify(corpus.valid, eval_batch_size)
test_data = batchify(corpus.test, eval_batch_size)

In [0]:
###############################################################################
# Build the model
###############################################################################
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

class TransformerModel(nn.Module):
    def __init__(self, ntoken, ninp, nhead, nhid, nlayers, dropout=0.5):
        super(TransformerModel, self).__init__()
        from torch.nn import TransformerEncoder, TransformerEncoderLayer
        self.model_type = 'Transformer'
        self.src_mask = None
        self.pos_encoder = PositionalEncoding(ninp, dropout)
        encoder_layers = TransformerEncoderLayer(ninp, nhead, nhid, dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
        self.encoder = nn.Embedding(ntoken, ninp)
        self.ninp = ninp
        self.decoder = nn.Linear(ninp, ntoken)

        self.init_weights()

    def _generate_square_subsequent_mask(self, sz):
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask

    def init_weights(self):
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, src, has_mask=True):
        if has_mask:
            device = src.device
            if self.src_mask is None or self.src_mask.size(0) != len(src):
                mask = self._generate_square_subsequent_mask(len(src)).to(device)
                self.src_mask = mask
        else:
            self.src_mask = None

        src = self.encoder(src) * math.sqrt(self.ninp)
        src = self.pos_encoder(src)
        output = self.transformer_encoder(src, self.src_mask)
        output = self.decoder(output)
        return F.log_softmax(output, dim=-1)

ntokens = len(corpus.dictionary)
model = TransformerModel(ntokens, emsize, nhead, nhid, nlayers, dropout).to(device)
criterion = nn.CrossEntropyLoss()

In [0]:
###############################################################################
# Training code
###############################################################################

bptt = 35
log_interval = 200

def get_batch(source, i):
    seq_len = min(bptt, len(source) - 1 - i)
    data = source[i:i+seq_len]
    target = source[i+1:i+1+seq_len].view(-1)
    return data, target


def evaluate(data_source):
    # Turn on evaluation mode which disables dropout.
    model.eval()
    total_loss = 0.
    ntokens = len(corpus.dictionary)
    with torch.no_grad():
        for i in range(0, data_source.size(0) - 1, bptt):
            data, targets = get_batch(data_source, i)
            # if args.model == 'Transformer':
            output = model(data)
            output = output.view(-1, ntokens)
            total_loss += len(data) * criterion(output, targets).item()
    return total_loss / (len(data_source) - 1)


def train():
    # Turn on training mode which enables dropout.
    model.train()
    total_loss = 0.
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    for batch, i in enumerate(range(0, train_data.size(0) - 1, bptt)):
        data, targets = get_batch(train_data, i)
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        model.zero_grad()
        output = model(data)
        output = output.view(-1, ntokens)
        loss = criterion(output, targets)
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        for p in model.parameters():
            p.data.add_(-lr, p.grad.data)

        total_loss += loss.item()

        if batch % log_interval == 0 and batch > 0:
            cur_loss = total_loss / log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                    'loss {:5.2f} | ppl {:8.2f}'.format(
                epoch, batch, len(train_data) // bptt, lr,
                elapsed * 1000 / log_interval, cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()

In [14]:
# Loop over epochs.
lr = 5.0
best_val_loss = None
epochs = 6

try:
    for epoch in range(1, epochs+1):
        epoch_start_time = time.time()
        train()
        val_loss = evaluate(val_data)
        print('-' * 89)
        print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
                'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                           val_loss, math.exp(val_loss)))
        print('-' * 89)
        # Save the model if the validation loss is the best we've seen so far.
        if not best_val_loss or val_loss < best_val_loss:
            with open("wiki_transformer.pt", 'wb') as f:
                torch.save(model, f)
            best_val_loss = val_loss
        else:
            # Anneal the learning rate if no improvement has been seen in the validation dataset.
            lr /= 4.0
except KeyboardInterrupt:
    print('-' * 89)
    print('Exiting from training early')

# Load the best saved model.
with open("wiki_transformer.pt", 'rb') as f:
    model = torch.load(f)

# Run on test data.
test_loss = evaluate(test_data)
print('=' * 89)
print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(
    test_loss, math.exp(test_loss)))
print('=' * 89)

| epoch   1 |   200/ 2983 batches | lr 5.00 | ms/batch 741.02 | loss  8.14 | ppl  3426.31
| epoch   1 |   400/ 2983 batches | lr 5.00 | ms/batch 736.08 | loss  6.95 | ppl  1044.96
| epoch   1 |   600/ 2983 batches | lr 5.00 | ms/batch 735.11 | loss  6.53 | ppl   685.61
| epoch   1 |   800/ 2983 batches | lr 5.00 | ms/batch 737.20 | loss  6.36 | ppl   578.16
| epoch   1 |  1000/ 2983 batches | lr 5.00 | ms/batch 736.21 | loss  6.26 | ppl   525.74
| epoch   1 |  1200/ 2983 batches | lr 5.00 | ms/batch 736.23 | loss  6.23 | ppl   506.96
| epoch   1 |  1400/ 2983 batches | lr 5.00 | ms/batch 736.70 | loss  6.18 | ppl   480.89
| epoch   1 |  1600/ 2983 batches | lr 5.00 | ms/batch 736.53 | loss  6.19 | ppl   488.23
| epoch   1 |  1800/ 2983 batches | lr 5.00 | ms/batch 737.69 | loss  6.06 | ppl   429.80
| epoch   1 |  2000/ 2983 batches | lr 5.00 | ms/batch 742.18 | loss  6.07 | ppl   432.24
| epoch   1 |  2200/ 2983 batches | lr 5.00 | ms/batch 742.72 | loss  5.97 | ppl   391.43
| epoch   

  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


| epoch   2 |   200/ 2983 batches | lr 5.00 | ms/batch 774.89 | loss  5.90 | ppl   365.30
| epoch   2 |   400/ 2983 batches | lr 5.00 | ms/batch 793.57 | loss  5.86 | ppl   351.73
| epoch   2 |   600/ 2983 batches | lr 5.00 | ms/batch 778.24 | loss  5.69 | ppl   297.23
| epoch   2 |   800/ 2983 batches | lr 5.00 | ms/batch 782.14 | loss  5.71 | ppl   301.53
| epoch   2 |  1000/ 2983 batches | lr 5.00 | ms/batch 782.44 | loss  5.68 | ppl   294.12
| epoch   2 |  1200/ 2983 batches | lr 5.00 | ms/batch 790.57 | loss  5.71 | ppl   301.49
| epoch   2 |  1400/ 2983 batches | lr 5.00 | ms/batch 792.09 | loss  5.73 | ppl   306.47
| epoch   2 |  1600/ 2983 batches | lr 5.00 | ms/batch 791.42 | loss  5.76 | ppl   315.93
| epoch   2 |  1800/ 2983 batches | lr 5.00 | ms/batch 795.38 | loss  5.66 | ppl   286.11
| epoch   2 |  2000/ 2983 batches | lr 5.00 | ms/batch 794.51 | loss  5.70 | ppl   298.41
| epoch   2 |  2200/ 2983 batches | lr 5.00 | ms/batch 792.91 | loss  5.60 | ppl   270.88
| epoch   

In [18]:
###############################################################################
# Generate Text
###############################################################################
temperature = 1.0
nwords = 1000

def generate_text(model, temperature, nwords, out):
    model.eval()
    input = torch.randint(ntokens, (1, 1), dtype=torch.long).to(device)
    
    with open(out, 'w') as outf:
        if nwords >= 1026583: outf.write(train_small + "\n")
        with torch.no_grad():  # no tracking history
            for i in range(nwords):
                # if is_transformer_model:
                output = model(input, False)
                word_weights = output[-1].squeeze().div(temperature).exp().cpu()
                word_idx = torch.multinomial(word_weights, 1)[0]
                word_tensor = torch.Tensor([[word_idx]]).long().to(device)
                input = torch.cat([input, word_tensor], 0)

                word = corpus.dictionary.idx2word[word_idx]

                outf.write(word + ('\n' if i % 20 == 19 else ' '))

                if i % log_interval == 0:
                    print('| Generated {}/{} words'.format(i, nwords))

generate_text(model, temperature, nwords, "generated_whole.txt")

| Generated 0/1000 words
| Generated 200/1000 words
| Generated 400/1000 words
| Generated 600/1000 words
| Generated 800/1000 words


Experiment: Augment Data with Generated Text
-------------------------------------

Train a model using only the first half of the original training set (300 articles, 1 million words), and select the best model as before. Use the best model to generate half the training set's worth of data, and run a model on a training set of half the original and half generated text.

In [0]:
train_str = ""
with open("./drive/My Drive/wikitext-2/wiki.train.tokens", encoding="utf-8", errors='ignore') as f:
    for line in f:
        train_str += line

val_str = ""
with open("./drive/My Drive/wikitext-2/wiki.valid.tokens", encoding="utf-8", errors='ignore') as f:
    for line in f:
        val_str += line
        
test_str = ""
with open("./drive/My Drive/wikitext-2/wiki.test.tokens", encoding="utf-8", errors='ignore') as f:
    for line in f:
        test_str += line

Check that the halved dataset contains about half the articles, words, and chars. The same applies for val and test, though those numbers are not shown.

In [23]:
train_small = train_str[:int(len(train_str) / 2) - 38]
val_small = val_str[:int(len(val_str) / 2) + 1422]
test_small = test_str[:int(len(test_str) / 2) + 869]

with open("./drive/My Drive/wikitext-2/half_wiki.train.tokens", "a") as f:
    f.write(train_small)
with open("./drive/My Drive/wikitext-2/half_wiki.val.tokens", "a") as f:
    f.write(val_small)
with open("./drive/My Drive/wikitext-2/half_wiki.test.tokens", "a") as f:
    f.write(test_small)

print("Number of article-like units in training: ", len(train_str.split("\n \n \n")))
print("Number of article-like units in halved training: ", len(train_small.split("\n \n \n")))

print("Number of word-like units in training: ", len(train_str.split()))
print("Number of word-like units in halved training: ", len(train_small.split()))

print("Number of chars units in training: ", len(train_str))
print("Number of chars units in halved training: ", len(train_small))

Number of article-like units in training:  1191
Number of article-like units in halved training:  616
Number of word-like units in training:  2051910
Number of word-like units in halved training:  1026583
Number of chars units in training:  10780437
Number of chars units in halved training:  5390180


Train model with half training data.

In [27]:
class Corpus(object):
    def __init__(self, path, train_name, val_name, test_name):
        self.dictionary = Dictionary()
        self.train = self.tokenize(path + train_name)
        self.valid = self.tokenize(path + val_name)
        self.test = self.tokenize(path + test_name)

    def tokenize(self, path):
        """Tokenizes a text file."""
        assert os.path.exists(path)
        # Add words to the dictionary
        with open(path, 'r', encoding="utf8") as f:
            for line in f:
                words = line.split() + ['<eos>']
                for word in words:
                    self.dictionary.add_word(word)

        # Tokenize file content
        with open(path, 'r', encoding="utf8") as f:
            idss = []
            for line in f:
                words = line.split() + ['<eos>']
                ids = []
                for word in words:
                    ids.append(self.dictionary.word2idx[word])
                idss.append(torch.tensor(ids).type(torch.int64))
            ids = torch.cat(idss)

        return ids
        
corpus = Corpus(path, "half_wiki.train.tokens", "half_wiki.val.tokens", "half_wiki.test.tokens")

train_data = batchify(corpus.train, batch_size)
val_data = batchify(corpus.valid, eval_batch_size)
test_data = batchify(corpus.test, eval_batch_size)

ntokens = len(corpus.dictionary)
model = TransformerModel(ntokens, emsize, nhead, nhid, nlayers, dropout).to(device)
lr = 1.25
best_val_loss = None

try:
    for epoch in range(1, epochs+1):
        epoch_start_time = time.time()
        train()
        val_loss = evaluate(val_data)
        print('-' * 89)
        print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
                'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                           val_loss, math.exp(val_loss)))
        print('-' * 89)
        # Save the model if the validation loss is the best we've seen so far.
        if not best_val_loss or val_loss < best_val_loss:
            with open("wiki_transformer_half.pt", 'wb') as f:
                torch.save(model, f)
            best_val_loss = val_loss
        else:
            # Anneal the learning rate if no improvement has been seen in the validation dataset.
            lr /= 4.0
except KeyboardInterrupt:
    print('-' * 89)
    print('Exiting from training early')

# Load the best saved model.
with open("wiki_transformer_half.pt", 'rb') as f:
    model = torch.load(f)

# Run on test data.
test_loss = evaluate(test_data)
print('=' * 89)
print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(
    test_loss, math.exp(test_loss)))
print('=' * 89)

./drive/My Drive/wikitext-2/half_wiki.train.tokens
./drive/My Drive/wikitext-2/half_wiki.val.tokens
./drive/My Drive/wikitext-2/half_wiki.test.tokens
| epoch   1 |   200/ 1492 batches | lr 1.25 | ms/batch 652.82 | loss  7.52 | ppl  1847.52
| epoch   1 |   400/ 1492 batches | lr 1.25 | ms/batch 651.70 | loss  6.82 | ppl   913.93
| epoch   1 |   600/ 1492 batches | lr 1.25 | ms/batch 668.99 | loss  6.58 | ppl   722.11
| epoch   1 |   800/ 1492 batches | lr 1.25 | ms/batch 648.97 | loss  6.49 | ppl   659.00
| epoch   1 |  1000/ 1492 batches | lr 1.25 | ms/batch 647.90 | loss  6.41 | ppl   610.48
| epoch   1 |  1200/ 1492 batches | lr 1.25 | ms/batch 647.53 | loss  6.34 | ppl   567.52
| epoch   1 |  1400/ 1492 batches | lr 1.25 | ms/batch 651.65 | loss  6.24 | ppl   511.51
-----------------------------------------------------------------------------------------
| end of epoch   1 | time: 1011.22s | valid loss  6.06 | valid ppl   429.34
------------------------------------------------------

  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


| epoch   2 |   200/ 1492 batches | lr 1.25 | ms/batch 655.15 | loss  6.15 | ppl   469.77
| epoch   2 |   400/ 1492 batches | lr 1.25 | ms/batch 650.35 | loss  6.04 | ppl   421.61
| epoch   2 |   600/ 1492 batches | lr 1.25 | ms/batch 651.47 | loss  5.95 | ppl   382.84
| epoch   2 |   800/ 1492 batches | lr 1.25 | ms/batch 651.32 | loss  5.93 | ppl   375.08
| epoch   2 |  1000/ 1492 batches | lr 1.25 | ms/batch 649.93 | loss  5.91 | ppl   368.27
| epoch   2 |  1200/ 1492 batches | lr 1.25 | ms/batch 650.74 | loss  5.93 | ppl   374.48
| epoch   2 |  1400/ 1492 batches | lr 1.25 | ms/batch 650.03 | loss  5.85 | ppl   345.65
-----------------------------------------------------------------------------------------
| end of epoch   2 | time: 1008.40s | valid loss  5.81 | valid ppl   335.27
-----------------------------------------------------------------------------------------
| epoch   3 |   200/ 1492 batches | lr 1.25 | ms/batch 652.59 | loss  5.80 | ppl   329.12
| epoch   3 |   400/ 149

In [40]:
nwords = 1026583
log_interval = 10000

def generate_text(model, temperature, nwords, out):
    model.eval()
    input = torch.randint(ntokens, (1, 1), dtype=torch.long).to(device)
    
    text = ""
    with torch.no_grad():  # no tracking history
        for i in range(nwords):
            output = model(input, False)
            word_weights = output[-1].squeeze().div(temperature).exp().cpu()
            word_idx = torch.multinomial(word_weights, 1)[0]
            word_tensor = torch.Tensor([[word_idx]]).long().to(device)
            input = torch.cat([input, word_tensor], 0)

            word = corpus.dictionary.idx2word[word_idx]
            text += word + ' '

            if i % log_interval == 0:
                print('| Generated {}/{} words'.format(i, nwords))
    
    with open(out, 'w') as f:
        if nwords >= 1026583: f.write(train_small + "\n")
        f.write(text)

generate_text(model, temperature, nwords, "./drive/My Drive/wikitext-2/generated_half_train.txt")

| Generated 0/1026583 words


KeyboardInterrupt: ignored

After 3 hours, I stopped this cell. Unfortunately, I don't know how to speed up the text generation, so I was unable to test my data augmentation idea.

Run another model with half original dataset, half generated text.

In [0]:
corpus = Corpus(path, "generated_half_train.txt", "wiki.valid.tokens", "wiki.test.tokens")

train_data = batchify(corpus.train, batch_size)
val_data = batchify(corpus.valid, eval_batch_size)
test_data = batchify(corpus.test, eval_batch_size)

ntokens = len(corpus.dictionary)
model = TransformerModel(ntokens, emsize, nhead, nhid, nlayers, dropout).to(device)
lr = 1.25
best_val_loss = None

try:
    for epoch in range(1, epochs+1):
        epoch_start_time = time.time()
        train()
        val_loss = evaluate(val_data)
        print('-' * 89)
        print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
                'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                           val_loss, math.exp(val_loss)))
        print('-' * 89)
        # Save the model if the validation loss is the best we've seen so far.
        if not best_val_loss or val_loss < best_val_loss:
            with open("wiki_transformer_half.pt", 'wb') as f:
                torch.save(model, f)
            best_val_loss = val_loss
        else:
            # Anneal the learning rate if no improvement has been seen in the validation dataset.
            lr /= 4.0
except KeyboardInterrupt:
    print('-' * 89)
    print('Exiting from training early')

# Load the best saved model.
with open("wiki_transformer_half.pt", 'rb') as f:
    model = torch.load(f)

# Run on test data.
test_loss = evaluate(test_data)
print('=' * 89)
print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(
    test_loss, math.exp(test_loss)))
print('=' * 89)

In [0]:
nwords = 1000
log_interval = 200
generate_text(model, temperature, nwords, "./drive/My Drive/wikitext-2/gen_half.txt")

Since the model trained on only half the training data generated similar results to the model utilizing the full dataset, we run the experiment again with even less training data.

This time, we try using only the first 5% of the original training set (30 articles, 100,000 words -- about twice the length of *The Great Gatsby*). Using the first fifth does not eliminate the possibility that maybe the first half of the training set is just particularly informative by chance and the success is not due to the transformer architecture, but it cuts that probability in half. If this model performs poorly, we may guess that the model did not have enough training data to succeed. By keeping to the first half, it we avoid confounding our conclusion with the possibility that the second half of the training data was uninformative.

In [0]:
train_small = train_str[:int(len(test_str) / 20) - 229]
val_small = val_str[:int(len(train_str)/20)+71]
test_small = test_str[:int(len(val_str)/20)+2575]

with open("./drive/My Drive/wikitext-2/five_wiki.train.tokens", "a") as f:
    f.write(train_small)
with open("./drive/My Drive/wikitext-2/five_wiki.val.tokens", "a") as f:
    f.write(val_small)
with open("./drive/My Drive/wikitext-2/five_wiki.test.tokens", "a") as f:
    f.write(test_small)

In [14]:
corpus = Corpus(path, "five_wiki.train.tokens", "five_wiki.val.tokens", "five_wiki.test.tokens")

train_data = batchify(corpus.train, batch_size)
val_data = batchify(corpus.valid, eval_batch_size)
test_data = batchify(corpus.test, eval_batch_size)

ntokens = len(corpus.dictionary)
model = TransformerModel(ntokens, emsize, nhead, nhid, nlayers, dropout).to(device)
lr = 2
best_val_loss = None
epochs = 10

try:
    for epoch in range(1, epochs+1):
        epoch_start_time = time.time()
        train()
        val_loss = evaluate(val_data)
        print('-' * 89)
        print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
                'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                           val_loss, math.exp(val_loss)))
        print('-' * 89)
        # Save the model if the validation loss is the best we've seen so far.
        if not best_val_loss or val_loss < best_val_loss:
            with open("wiki_transformer_half.pt", 'wb') as f:
                torch.save(model, f)
            best_val_loss = val_loss
        else:
            # Anneal the learning rate if no improvement has been seen in the validation dataset.
            lr /= 4.0
except KeyboardInterrupt:
    print('-' * 89)
    print('Exiting from training early')

# Load the best saved model.
with open("wiki_transformer_half.pt", 'rb') as f:
    model = torch.load(f)

# Run on test data.
test_loss = evaluate(test_data)
print('=' * 89)
print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(
    test_loss, math.exp(test_loss)))
print('=' * 89)

-----------------------------------------------------------------------------------------
| end of epoch   1 | time: 23.76s | valid loss  7.85 | valid ppl  2573.33
-----------------------------------------------------------------------------------------


  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


-----------------------------------------------------------------------------------------
| end of epoch   2 | time: 23.72s | valid loss  7.12 | valid ppl  1240.80
-----------------------------------------------------------------------------------------
-----------------------------------------------------------------------------------------
| end of epoch   3 | time: 23.73s | valid loss  7.20 | valid ppl  1339.31
-----------------------------------------------------------------------------------------
-----------------------------------------------------------------------------------------
| end of epoch   4 | time: 23.95s | valid loss  6.81 | valid ppl   905.74
-----------------------------------------------------------------------------------------
-----------------------------------------------------------------------------------------
| end of epoch   5 | time: 23.85s | valid loss  6.80 | valid ppl   896.33
--------------------------------------------------------------------------

In [16]:
temperature = 1.0
nwords = 1000
log_interval = 100
def generate_text(model, temperature, nwords, out):
    model.eval()
    input = torch.randint(ntokens, (1, 1), dtype=torch.long).to(device)
    
    with open(out, 'w') as outf:
        if nwords >= 1026583: outf.write(train_small + "\n")
        with torch.no_grad():  # no tracking history
            for i in range(nwords):
                # if is_transformer_model:
                output = model(input, False)
                word_weights = output[-1].squeeze().div(temperature).exp().cpu()
                word_idx = torch.multinomial(word_weights, 1)[0]
                word_tensor = torch.Tensor([[word_idx]]).long().to(device)
                input = torch.cat([input, word_tensor], 0)

                word = corpus.dictionary.idx2word[word_idx]

                outf.write(word + ('\n' if i % 20 == 19 else ' '))

                if i % log_interval == 0:
                    print('| Generated {}/{} words'.format(i, nwords))
generate_text(model, temperature, nwords, "./drive/My Drive/wikitext-2/generated_five_train.txt")

| Generated 0/1000 words
| Generated 100/1000 words
| Generated 200/1000 words
| Generated 300/1000 words
| Generated 400/1000 words
| Generated 500/1000 words
| Generated 600/1000 words
| Generated 700/1000 words
| Generated 800/1000 words
| Generated 900/1000 words


Run model trained on 5% original training data, 95% generated text:

In [0]:
corpus = Corpus(path, "generated_five_train.txt", "wiki.valid.tokens", "wiki.test.tokens")

train_data = batchify(corpus.train, batch_size)
val_data = batchify(corpus.valid, eval_batch_size)
test_data = batchify(corpus.test, eval_batch_size)

ntokens = len(corpus.dictionary)
model = TransformerModel(ntokens, emsize, nhead, nhid, nlayers, dropout).to(device)
lr = 1.25
best_val_loss = None

try:
    for epoch in range(1, epochs+1):
        epoch_start_time = time.time()
        train()
        val_loss = evaluate(val_data)
        print('-' * 89)
        print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
                'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                           val_loss, math.exp(val_loss)))
        print('-' * 89)
        # Save the model if the validation loss is the best we've seen so far.
        if not best_val_loss or val_loss < best_val_loss:
            with open("wiki_transformer_half.pt", 'wb') as f:
                torch.save(model, f)
            best_val_loss = val_loss
        else:
            # Anneal the learning rate if no improvement has been seen in the validation dataset.
            lr /= 4.0
except KeyboardInterrupt:
    print('-' * 89)
    print('Exiting from training early')

# Load the best saved model.
with open("wiki_transformer_half.pt", 'rb') as f:
    model = torch.load(f)

# Run on test data.
test_loss = evaluate(test_data)
print('=' * 89)
print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(
    test_loss, math.exp(test_loss)))
print('=' * 89)

In [0]:
nwords = 1000
log_interval = 200
generate_text(model, temperature, nwords, "./drive/My Drive/wikitext-2/gen_five.txt")