In [1]:
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import numpy as np
%matplotlib inline

torch.manual_seed(1)

<torch._C.Generator at 0x5cba810>

In [2]:
## prepare training data

import glob

# special token
UNKNOWN_WORD = "<unknown>"
START_OF_SENTENCE = "<SOS>"
END_OF_SENTENCE = "<EOS>"

input_files = glob.glob("data/input/*.txt")

# view one source file as a sentence
def get_one_sentence(file):
    with open(file, "r", encoding="utf-8") as fr:
        sentence = [line[:-1] for line in fr.readlines()]
    return sentence

# training data (set of sentences)
train_data = [[START_OF_SENTENCE] + get_one_sentence(input_file) for input_file in input_files]

# target values (next words)
train_target = [sentence[1:] + [END_OF_SENTENCE] for sentence in train_data]

def get_word_to_idx_dict(sentences):
    word_to_idx = {}
    for sentence in sentences:
        for word in sentence:
            if word not in word_to_idx:
                word_to_idx[word] = len(word_to_idx)
    return word_to_idx

# get word to index dictionary
word_to_idx = get_word_to_idx_dict(train_data)            
target_to_idx = get_word_to_idx_dict(train_target)    

# prepare for unknown word input
word_to_idx[UNKNOWN_WORD] = len(word_to_idx)

# target word list (idx -> word)
target_word_list = [None for i in range(len(target_to_idx))]
for word, idx in target_to_idx.items():
    target_word_list[idx] = word

In [3]:
# train & target example
list(zip(train_data[1], train_target[1]))

[('<SOS>', 'tf.variable_scope'),
 ('tf.variable_scope', 'tf.get_variable_scope'),
 ('tf.get_variable_scope', 'tf.get_variable_scope'),
 ('tf.get_variable_scope', 'tf.nn.relu'),
 ('tf.nn.relu', 'tf.layers.batch_normalization'),
 ('tf.layers.batch_normalization', 'tf.layers.conv2d'),
 ('tf.layers.conv2d', 'tf.nn.relu'),
 ('tf.nn.relu', 'tf.layers.batch_normalization'),
 ('tf.layers.batch_normalization', 'tf.layers.conv2d'),
 ('tf.layers.conv2d', 'tf.nn.relu'),
 ('tf.nn.relu', 'tf.layers.batch_normalization'),
 ('tf.layers.batch_normalization', 'tf.layers.conv2d'),
 ('tf.layers.conv2d', 'tf.nn.relu'),
 ('tf.nn.relu', 'tf.layers.batch_normalization'),
 ('tf.layers.batch_normalization', 'tf.layers.conv2d'),
 ('tf.layers.conv2d', 'tf.layers.dense'),
 ('tf.layers.dense', 'tf.reshape'),
 ('tf.reshape', 'tf.nn.sigmoi'),
 ('tf.nn.sigmoi', '<EOS>')]

In [7]:
def prepare_sequence(seq, idx_dict):
    idxs = [idx_dict[w] if w in idx_dict else idx_dict[UNKNOWN_WORD]  for w in seq]
    tensor = torch.LongTensor(idxs)
    return autograd.Variable(tensor)

def predict_topN(score, N):
    return [target_word_list[i] for i in np.argsort(score)[::-1][:N]]

In [8]:
# reference: http://pytorch.org/tutorials/beginner/nlp/sequence_models_tutorial.html

class LSTMModel(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, target_size):
        super(LSTMModel, self).__init__()
        self.hidden_dim = hidden_dim

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)

        # The linear layer that maps from hidden state space to target space
        self.decoder = nn.Linear(hidden_dim, target_size)
        self.hidden = self.init_hidden()

    def init_hidden(self):
        # Before we've done anything, we dont have any hidden state.
        # Refer to the Pytorch documentation to see exactly
        # why they have this dimensionality.
        # The axes semantics are (num_layers, minibatch_size, hidden_dim)
        return (autograd.Variable(torch.zeros(1, 1, self.hidden_dim)),
                autograd.Variable(torch.zeros(1, 1, self.hidden_dim)))

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        lstm_out, self.hidden = self.lstm(
            embeds.view(len(sentence), 1, -1), self.hidden)
        target_space = self.decoder(lstm_out.view(len(sentence), -1))
        scores = F.log_softmax(target_space)
        return scores
    

In [9]:
## initialize LSTM model

# parameters for LSTM model
EMBEDDING_DIM = 32
HIDDEN_DIM = 16

# parameters for training
EPOCHS = 10

model = LSTMModel(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_idx), len(target_to_idx))
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

In [12]:
# See what the scores are before training
# Note that element i,j of the output is the score for tag j for word i.
inputs = prepare_sequence(train_data[1], word_to_idx)
scores = model(inputs)
print(scores)

# top N predictions for each word (before training)
for i,s in enumerate(scores.data.numpy()):
    print((train_data[1][i], predict_topN(s, 3)))

Variable containing:
 -5.2796 -10.9009  -6.5051  ...  -12.7272 -12.8117 -13.0472
 -5.3267 -10.1179 -12.1195  ...  -12.8338 -12.8356 -13.5973
 -4.7667  -9.1524 -12.4223  ...  -12.5350 -12.6170 -13.0003
           ...               ⋱              ...            
 -6.1656 -12.2445  -3.5532  ...  -11.4564 -11.7513 -11.6450
 -6.6516 -12.0659 -13.5046  ...  -12.6161 -12.6258 -13.2613
-10.7545 -15.5327  -0.0075  ...  -17.5458 -17.6319 -17.3537
[torch.FloatTensor of size 19x2750]

('<SOS>', ['tf.reduce_mean', 'tf.layers.dense', 'tf.variable_scope'])
('tf.variable_scope', ['tf.reduce_mean', 'tf.variable_scope', 'tf.layers.dense'])
('tf.get_variable_scope', ['tf.reduce_mean', 'tf.variable_scope', 'tf.reduce_sum'])
('tf.get_variable_scope', ['tf.variable_scope', 'tf.reduce_mean', 'tf.placeholder'])
('tf.nn.relu', ['tf.variable_scope', 'tf.nn.relu', 'tf.layers.dense'])
('tf.layers.batch_normalization', ['tf.variable_scope', 'tf.nn.relu', 'tf.nn.dropout'])
('tf.layers.conv2d', ['tf.layers.dense', '

In [86]:
%%time
import time
st = time.time()

count = 0
for epoch in range(EPOCHS): 
    for sentence, next_word in zip(train_data, train_target):
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Also, we need to clear out the hidden state of the LSTM,
        # detaching it from its history on the last instance.
        model.hidden = model.init_hidden()

        # Step 2. Get our inputs ready for the network, that is, turn them into
        # Variables of word indices.
        sentence_in = prepare_sequence(sentence, word_to_idx)
        targets = prepare_sequence(next_word, target_to_idx)

        # Step 3. Run our forward pass.
        scores = model(sentence_in)

        # Step 4. Compute the loss, gradients, and update the parameters by
        #  calling optimizer.step()
        loss = loss_function(scores, targets)
        loss.backward()
        optimizer.step()
        
        #if count % 100 == 0:
        #   print(loss.data[0])
        
        count += 1
        
    print("%d th epoch done. %f sec" % (epoch, time.time() - st))

0 th epoch done. 238.510642 sec
1 th epoch done. 483.055629 sec
2 th epoch done. 728.519669 sec
3 th epoch done. 970.186491 sec
4 th epoch done. 1213.355400 sec
5 th epoch done. 1455.156230 sec
6 th epoch done. 1693.961889 sec
7 th epoch done. 1928.861325 sec
8 th epoch done. 2163.480744 sec
9 th epoch done. 2391.612792 sec
Wall time: 39min 51s


In [87]:
# serialize model
torch.save(model.state_dict(), 'model/model_lstm.pth')

In [10]:
# load trained model ()

param = torch.load('model/model_lstm.pth')
model = LSTMModel(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_idx), len(target_to_idx)) 
model.load_state_dict(param)

In [11]:
sID = 1

inputs = prepare_sequence(train_data[sID], word_to_idx)
scores = model(inputs)

# top N predictions for each word (after training)
for i,s in enumerate(scores.data.numpy()):
    print((train_data[sID][i], predict_topN(s, 3)))

('<SOS>', ['tf.Session', 'tf.placeholder', 'tf.variable_scope'])
('tf.variable_scope', ['tf.variable_scope', 'tf.placeholder', 'tf.get_variable'])
('tf.get_variable_scope', ['tf.variable_scope', 'tf.placeholder', 'tf.reduce_mean'])
('tf.get_variable_scope', ['tf.variable_scope', 'tf.layers.dense', 'tf.placeholder'])
('tf.nn.relu', ['tf.variable_scope', 'tf.layers.dense', 'tf.nn.relu'])
('tf.layers.batch_normalization', ['tf.variable_scope', 'tf.layers.dense', 'tf.nn.relu'])
('tf.layers.conv2d', ['tf.layers.dense', 'tf.variable_scope', 'tf.layers.batch_normalization'])
('tf.nn.relu', ['tf.variable_scope', 'tf.nn.relu', 'tf.matmul'])
('tf.layers.batch_normalization', ['tf.variable_scope', 'tf.nn.dropout', 'tf.nn.relu'])
('tf.layers.conv2d', ['tf.layers.dense', 'tf.layers.batch_normalization', 'tf.layers.conv2d'])
('tf.nn.relu', ['tf.variable_scope', 'tf.nn.relu', 'tf.matmul'])
('tf.layers.batch_normalization', ['tf.variable_scope', 'tf.nn.dropout', 'tf.nn.relu'])
('tf.layers.conv2d', ['t