In [1]:
import torchtext
from torchtext.vocab import Vectors
import torch
import numpy as np
import random

# Device switch. Set to torch.cuda.is_available() to use a GPU when present.
USE_CUDA = False

# Single seed shared by every RNG the notebook touches, so a
# Restart-and-Run-All reproduces the same numbers.
SEED = 53113
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if USE_CUDA:
    torch.cuda.manual_seed(SEED)

BATCH_SIZE = 32         # batch_size handed to the BPTT iterators
EMBEDDING_SIZE = 100    # embed_size of the model
HIDDEN_SIZE = 100       # hidden_size of the model
MAX_VOCAB_SIZE = 50000  # max_size handed to build_vocab

In [2]:
# lower=True asks the field to lower-case the text it processes.
TEXT = torchtext.data.Field(lower=True)

# The three splits live under ./text8 and all share the TEXT field.
split_files = dict(
    train='text8.train.txt',
    validation='text8.dev.txt',
    test='text8.test.txt',
)
train, val, test = torchtext.datasets.LanguageModelingDataset.splits(
    path='./text8', text_field=TEXT, **split_files)

In [3]:
# Build the vocabulary from the training split, capped at MAX_VOCAB_SIZE words.
# The special tokens '<unk>' and '<pad>' come on top of that cap, which is why
# len(TEXT.vocab) below is 50002 rather than 50000.
TEXT.build_vocab(train, max_size=MAX_VOCAB_SIZE)

In [4]:
# Vocabulary size, special tokens included.
len(TEXT.vocab)

50002

In [5]:
# itos is the index -> token list; show its first ten entries.
TEXT.vocab.itos[:10]

['<unk>', '<pad>', 'the', 'of', 'and', 'one', 'in', 'a', 'to', 'zero']

In [6]:
# stoi is the token -> index lookup; '<unk>' sits at index 0.
TEXT.vocab.stoi['<unk>']

0

In [7]:
# Resolve the compute device from the USE_CUDA switch.
device_name = 'cuda' if USE_CUDA else 'cpu'
device = torch.device(device_name)

In [8]:
# One BPTT iterator per split. Each batch carries .text and .target tensors of
# shape bptt_len x BATCH_SIZE (50 x 32 in the batch repr further down).
train_iter, val_iter, test_iter = torchtext.data.BPTTIterator.splits(
    (train, val, test),
    batch_size=BATCH_SIZE,
    device=device,
    bptt_len=50,
    repeat=False,
    shuffle=True,
)

In [9]:
# Pull a single batch from the training iterator to inspect its layout.
it = iter(train_iter)
batch = next(it)

In [10]:
# Show the batch summary (its .text and .target tensors).
batch


[torchtext.data.batch.Batch of size 32]
	[.text]:[torch.LongTensor of size 50x32]
	[.target]:[torch.LongTensor of size 50x32]

In [11]:
# Raw token indices of the batch, laid out as (seq_len, batch_size).
batch.text

tensor([[4815,   50,    6,  ..., 9116,   33,    7],
        [3143, 2748,  495,  ...,  893,  277,  317],
        [  13,    8,  850,  ...,  664,  824, 1602],
        ...,
        [   8,   34,  522,  ..., 5237,    3,   12],
        [3628, 1266,  968,  ...,    3,    2,    6],
        [   2,   54,   78,  ...,   12,  185, 3027]])

In [12]:
" ".join(TEXT.vocab.itos[i] for i in batch.text[:, 0].data.cpu())

'anarchism originated as a term of abuse first used against early working class radicals including the diggers of the english revolution and the sans <unk> of the french revolution whilst the term is still used in a pejorative way to describe any act that used violent means to destroy the'

In [13]:
" ".join(TEXT.vocab.itos[i] for i in batch.target[:, 0].data.cpu())

'originated as a term of abuse first used against early working class radicals including the diggers of the english revolution and the sans <unk> of the french revolution whilst the term is still used in a pejorative way to describe any act that used violent means to destroy the organization'

In [14]:
import torch.nn as nn

class RNNModel(nn.Module):
    """Word-level language model: embedding -> single-layer LSTM -> linear decoder.

    Args:
        vocab_size: number of embedding rows and width of the output logits.
        embed_size: width of the embeddings fed to the LSTM.
        hidden_size: width of the LSTM hidden and cell states.
    """

    def __init__(self, vocab_size, embed_size, hidden_size):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size)
        self.decoder = nn.Linear(hidden_size, vocab_size)
        self.hidden_size = hidden_size

    def forward(self, text, hidden):
        """Score the next token at every position.

        Args:
            text: LongTensor of token indices, seq_length x batch_size.
            hidden: (h, c) LSTM state, each 1 x batch_size x hidden_size.

        Returns:
            (logits, hidden): logits are seq_length x batch_size x vocab_size,
            hidden is the LSTM state after the last position.
        """
        emb = self.embed(text)  # seq_length x batch_size x embed_size
        output, hidden = self.lstm(emb, hidden)
        # Flatten time and batch so the decoder sees one row per token,
        # then restore the (seq_length, batch_size, vocab_size) layout.
        seq_len, bsz, hidden_dim = output.shape
        logits = self.decoder(output.view(seq_len * bsz, hidden_dim))
        return logits.view(seq_len, bsz, -1), hidden

    def init_hidden(self, bsz, requires_grad=True):
        """Return a zeroed (h, c) state for a batch of `bsz` sequences.

        The tensors are created from an existing parameter so they inherit
        the model's dtype and device. `requires_grad` is passed through to
        both tensors (it used to be ignored and always treated as True).
        """
        ref = next(self.parameters())
        shape = (1, bsz, self.hidden_size)
        return (ref.new_zeros(shape, requires_grad=requires_grad),
                ref.new_zeros(shape, requires_grad=requires_grad))

In [15]:
# Build the language model from the notebook-wide sizes.
model = RNNModel(len(TEXT.vocab), EMBEDDING_SIZE, HIDDEN_SIZE)
if USE_CUDA:
    model = model.to(device)
model  # last expression: display the module summary

RNNModel(
  (embed): Embedding(50002, 100)
  (lstm): LSTM(100, 100)
  (decoder): Linear(in_features=100, out_features=50002, bias=True)
)

In [16]:
def repackage_hidden(h):
    """Detach a hidden state from the graph that produced it.

    A tensor is returned detached; any other iterable is rebuilt as a tuple
    with each element repackaged recursively.
    """
    if not isinstance(h, torch.Tensor):
        return tuple(map(repackage_hidden, h))
    return h.detach()

In [17]:
# Cross-entropy between the vocabulary logits and the target token indices.
loss_fn = nn.CrossEntropyLoss()

learning_rate = 0.001
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
# Every scheduler.step() multiplies the learning rate by gamma.
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.5)

In [18]:
# Cache the vocabulary size; it is used to flatten the logits before the loss
# and as the upper bound when drawing a random seed token for sampling.
VOCAB_SIZE = len(TEXT.vocab)

In [19]:
def evaluate(model, data):
    """Return the mean per-token loss of `model` over every batch in `data`.

    Args:
        model: module exposing `init_hidden(bsz, requires_grad)` and returning
            `(logits, hidden)` from its forward pass.
        data: iterable of batches with `.text` and `.target` tensors.

    Returns:
        Token-weighted average of `loss_fn` over all batches, as a float.

    The model is put in eval mode for the pass and switched back to train
    mode before returning.
    """
    model.eval()
    total_loss = 0.
    total_count = 0.
    with torch.no_grad():
        hidden = model.init_hidden(BATCH_SIZE, requires_grad=False)
        for batch in data:
            text, target = batch.text, batch.target
            hidden = repackage_hidden(hidden)
            output, hidden = model(text, hidden)
            # One row of logits per token, against one target index per token.
            loss = loss_fn(output.view(-1, output.size(-1)), target.view(-1))
            # loss_fn averages over the batch's tokens, so weight it by the
            # token count and ACCUMULATE; plain assignment here would report
            # only the final batch.
            n_tokens = text.numel()
            total_loss += loss.item() * n_tokens
            total_count += n_tokens
    model.train()
    return total_loss / total_count

In [20]:
NUM_EPOCHS = 2
GRAD_CLIP = 5.0   # max gradient norm handed to clip_grad_norm_
REPORT_EVERY = 3  # iterations between loss prints / validation passes
# NOTE(review): a full validation pass every 3 iterations is very costly;
# confirm this interval is intended.

val_losses = []
for epoch in range(NUM_EPOCHS):
    model.train()
    hidden = model.init_hidden(BATCH_SIZE)
    for i, batch in enumerate(train_iter):
        data, target = batch.text, batch.target
        # Detach the carried-over state so backprop stops at the batch boundary.
        hidden = repackage_hidden(hidden)
        output, hidden = model(data, hidden)

        # One row of logits per token, against one target index per token.
        loss = loss_fn(output.view(-1, VOCAB_SIZE), target.view(-1))
        optimizer.zero_grad()
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), GRAD_CLIP)
        optimizer.step()

        if i % REPORT_EVERY != 0:
            continue

        print('loss', loss.item())
        val_loss = evaluate(model, val_iter)
        is_best = not val_losses or val_loss < min(val_losses)
        if is_best:
            torch.save(model.state_dict(), 'lm.pth')
            print('best model saved')
        else:
            # No improvement on validation: decay the learning rate.
            scheduler.step()
        val_losses.append(val_loss)

loss 10.810354232788086


KeyboardInterrupt: 

In [26]:
# Rebuild the architecture, then restore the best checkpoint written by training.
best_model = RNNModel(vocab_size=len(TEXT.vocab),
                      embed_size=EMBEDDING_SIZE,
                      hidden_size=HIDDEN_SIZE)
if USE_CUDA:
    best_model = best_model.to(device)
# map_location remaps the saved tensors onto `device`, so a checkpoint written
# on a GPU machine still loads when USE_CUDA is False.
best_model.load_state_dict(torch.load('lm.pth', map_location=device))

<All keys matched successfully>

In [27]:
SAMPLE_LENGTH = 20  # number of words to generate

best_model.eval()
words = []
# Sampling needs no gradients; without no_grad() every step would extend the
# autograd graph through the carried hidden state.
with torch.no_grad():
    hidden = best_model.init_hidden(1)
    # Start from a uniformly random token; named `token` so the builtin
    # `input` is not shadowed for the rest of the notebook.
    token = torch.randint(VOCAB_SIZE, (1, 1), dtype=torch.long).to(device)
    for _ in range(SAMPLE_LENGTH):
        output, hidden = best_model(token, hidden)
        # exp() of the logits gives unnormalised weights, which is all
        # torch.multinomial needs.
        word_weights = output.squeeze().exp().cpu()
        word_idx = torch.multinomial(word_weights, 1).item()
        token.fill_(word_idx)  # feed the sampled word back in as the next input
        words.append(TEXT.vocab.itos[word_idx])
print(" ".join(words))

irreverent oppositions reproducible bon review and arrival for had field mass we identity a are enacts shaw cobalt two every
