In [1]:
import os

import numpy as np
import torch
import torch.nn as nn
import torchtext

In [2]:
USE_CUDA = torch.cuda.is_available()
BATCH_SIZE = 32
EMBEDDING_SIZE = 650
MAX_VOCAB_SIZE = 50000
device = torch.device("cuda" if USE_CUDA else "cpu")

In [3]:
TEXT = torchtext.data.Field(lower=True)
text8 = torchtext.datasets.LanguageModelingDataset.splits(
    path="./data",
    train="text8.train.txt",
    validation="text8.dev.txt",
    test="text8.test.txt",
    text_field=TEXT,
)



In [5]:
TEXT.build_vocab(train, max_size=MAX_VOCAB_SIZE)
print("vocabulary size: {}".format(len(TEXT.vocab)))

vocabulary size: 50002


In [6]:
train, val, test = text8

VOCAB_SIZE = len(TEXT.vocab)
train_iter, val_iter, test_iter = torchtext.data.BPTTIterator.splits(
    (train, val, test), batch_size=BATCH_SIZE, device=device, bptt_len=32, repeat=False
)



In [7]:
# show some data
it = iter(train_iter)
for i in range(2):
    batch = next(it)
    print(" ".join([TEXT.vocab.itos[i] for i in batch.text[:, 2].data]))
    print(" ".join([TEXT.vocab.itos[i] for i in batch.target[:, 2].data]))

five two heaven s in your backyard a film mort liddy wrote the score using a bastardized version of halley s fourth concerto it is mentioned in section one six one john
two heaven s in your backyard a film mort liddy wrote the score using a bastardized version of halley s fourth concerto it is mentioned in section one six one john galt
galt legends since everyone across the country is asking who is john galt it is not surprising that some people have come up with answers a number of john galt legends are
legends since everyone across the country is asking who is john galt it is not surprising that some people have come up with answers a number of john galt legends are told




In [34]:
class RNNModel(nn.Module):
    def __init__(self, rnn_type, nvocab, ninp, nhid, nlayers, dropout=0.5):
        super().__init__()
        self.rnn_type = rnn_type
        self.nhid = nhid
        self.nlayers = nlayers

        self.drop = nn.Dropout(dropout)
        self.encoder = nn.Embedding(nvocab, ninp)

        if rnn_type == "LSTM":
            self.rnn = nn.LSTM(ninp, nhid, nlayers, dropout=dropout)
        elif rnn_type == "GRU":
            self.rnn = nn.GRU(ninp, nhid, nlayers, dropout=dropout)
        elif rnn_type in ["RNN_TANH", "RNN_RELU"]:
            nonlinearity = {"RNN_TANH": "tanh", "RNN_RELU": "relu"}[rnn_type]
            self.rnn = nn.RNN(
                ninp, nhid, nlayers, nonlinearity=nonlinearity, dropout=dropout
            )
        else:
            raise ValueError(
                "rnn_type should be in ['LSTM', 'GRU', 'RNN_TANH', RNN_RELU']"
            )

        self.decoder = nn.Linear(nhid, nvocab)

        self.init_weight()

    def init_weight(self):
        init_range = 0.1
        self.encoder.weight.data.uniform_(-init_range, init_range)
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-init_range, init_range)

    def init_hidden(self, bsz, requires_grad=True):
        weight = next(self.parameters())
        if self.rnn_type == "LSTM":
            return (
                weight.new_zeros(
                    (self.nlayers, bsz, self.nhid), requires_grad=requires_grad
                ),
                weight.new_zeros(
                    (self.nlayers, bsz, self.nhid), requires_grad=requires_grad
                ),
            )
        else:
            return weight.new_zeros(
                (self.nlayers, bsz, self.nhid), requires_grad=requires_grad
            )

    def forward(self, x, hidden):
        # encoder
        emb = self.drop(self.encoder(x))
        # rnn
        output, hidden = self.rnn(emb, hidden)
        # decoder
        decoded = self.decoder(output.view(-1, output.shape[2]))

        return decoded.view(output.shape[0], output.shape[1], decoded.shape[1]), hidden

In [35]:
def hidden_detach(h):
    if isinstance(h, torch.Tensor):
        return h.detach()
    else:
        return [hidden_detach(v) for v in h]

In [36]:
def evaluate(model, data, batch_size, loss_fn):
    model.eval()
    total_loss = 0
    total_cnt = 0
    it = iter(data)
    with torch.no_grad():
        hidden = model.init_hidden(batch_size, requires_grad=False)
        for i, batch in enumerate(it):
            text, target = batch.text.to(device), batch.target.to(device)

            output, hidden = model(text, hidden)

            loss = loss_fn(output.view(-1, VOCAB_SIZE), target.view(-1))
            total_cnt += np.multiply(*text.size())
            total_loss += loss.item() * np.multiply(*text.size())

    loss = total_loss / total_cnt
    model.train()
    return loss

In [37]:
RNN_TYPE = "GRU"

In [38]:
model = RNNModel(RNN_TYPE, VOCAB_SIZE, EMBEDDING_SIZE, EMBEDDING_SIZE, 2, dropout=0.5)
model = model.to(device)

In [39]:
loss_fn = nn.CrossEntropyLoss()
learning_rate = 0.001
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, 0.5)

In [43]:
PATH = f"./saved_model/{RNN_TYPE}_net.pth"
if os.path.exists(PATH):
    model.load_state_dict(torch.load(PATH))

GRAD_CLIP = 1.0
EPOCH = 2

mini_loss = float("inf")

for epoch in range(EPOCH):
    model.train()
    it = iter(train_iter)

    for i, batch in enumerate(it):
        text, target = batch.text.to(device), batch.target.to(device)
        hidden = model.init_hidden(BATCH_SIZE)
        hidden = hidden_detach(hidden)

        model.zero_grad()
        output, hidden = model(text, hidden)
        loss = loss_fn(output.view(-1, VOCAB_SIZE), target.view(-1))
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), GRAD_CLIP)
        optimizer.step()

        if i % 100 == 0:
            print("epoch", epoch, "iter", i, "loss", loss.item())

        if i % 500 == 0:
            val_loss = evaluate(model, val_iter, BATCH_SIZE, loss_fn)

            if val_loss < mini_loss:
                mini_loss = val_loss
                print("best model, val loss: ", val_loss)
                torch.save(model.state_dict(), PATH)
            else:
                scheduler.step()
                print("lr decay to", optimizer.param_groups[0]["lr"])

    val_loss = evaluate(model, val_iter, BATCH_SIZE, loss_fn)
    if val_loss < mini_loss:
        mini_loss = val_loss
        print("best model, val loss: ", val_loss)
        torch.save(model.state_dict(), PATH)

epoch 0 iter 0 loss 4.553329944610596
best model, val loss:  5.86520219747661
epoch 0 iter 100 loss 4.991318702697754
epoch 0 iter 200 loss 4.799288272857666
epoch 0 iter 300 loss 4.4984965324401855
epoch 0 iter 400 loss 4.74827241897583
epoch 0 iter 500 loss 5.167263031005859
lr decay to 3.125e-05
epoch 0 iter 600 loss 5.431086540222168
epoch 0 iter 700 loss 5.226358413696289
epoch 0 iter 800 loss 4.971348285675049
epoch 0 iter 900 loss 5.008023738861084
best model, val loss:  5.832428959033384
epoch 1 iter 0 loss 4.47780704498291
lr decay to 1.5625e-05
epoch 1 iter 100 loss 5.025078296661377
epoch 1 iter 200 loss 4.740190029144287
epoch 1 iter 300 loss 4.463899612426758
epoch 1 iter 400 loss 4.672721862792969
epoch 1 iter 500 loss 5.118111610412598
best model, val loss:  5.826461640470935
epoch 1 iter 600 loss 5.341027736663818
epoch 1 iter 700 loss 5.158143520355225
epoch 1 iter 800 loss 4.933821201324463
epoch 1 iter 900 loss 5.026307582855225
best model, val loss:  5.8231578887512

In [41]:
model.hidden

ModuleAttributeError: 'RNNModel' object has no attribute 'hidden'

In [44]:
best_model = RNNModel(
    RNN_TYPE, VOCAB_SIZE, EMBEDDING_SIZE, EMBEDDING_SIZE, 2, dropout=0.5
)
best_model = best_model.to(device)
best_model.load_state_dict(torch.load(PATH))

<All keys matched successfully>

In [48]:
test_loss = evaluate(best_model, test_iter, BATCH_SIZE, loss_fn)
print("perplexity: ", np.exp(test_loss))

perplexity:  338.03785584443256


In [49]:
hidden = best_model.init_hidden(1)
gene = torch.randint(VOCAB_SIZE, (1, 1), dtype=torch.long).to(device)
words = []
for i in range(100):
    output, hidden = best_model(gene, hidden)
    word_weights = output.squeeze().exp().cpu()
    word_idx = torch.multinomial(word_weights, 1)[0]
    gene.fill_(word_idx)
    word = TEXT.vocab.itos[word_idx]
    words.append(word)
print(" ".join(words))

agave <unk> agave laticincta agave harmless agave upscale hurteri mm trel australis lava com which epirus transferred the region upper sea into a small health town to georgia the station it many of the plant lift persians bermuda mt plants actress into central area latitudes or wide basins a chain pronounced <unk> is called lock the ailanthus locomotive while ash is the fruit <unk> antiprotons species north of angola as many uses of permanent paleozoic civilizations and it is not the old name for anagram that see baja with almonds for understanding patting retrieved two five two five zero two


In [75]:
gene

tensor([[22972]], device='cuda:0')