In [1]:
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import numpy as np
%matplotlib inline

torch.manual_seed(1)

<torch._C.Generator at 0x10df25c18>

In [2]:
## prepare training data

import glob

# special token
UNKNOWN_WORD = "<unknown>"
START_OF_SENTENCE = "<SOS>"
END_OF_SENTENCE = "<EOS>"

input_files = glob.glob("data/input/*.txt")

# view one source file as a sentence
def get_one_sentence(file):
    with open(file, "r", encoding="utf-8") as fr:
        sentence = [line[:-1] for line in fr.readlines()]
    return sentence

# training data (set of sentences)
train_data = [[START_OF_SENTENCE] + get_one_sentence(input_file) for input_file in input_files]

# target values (next words)
train_target = [sentence[1:] + [END_OF_SENTENCE] for sentence in train_data]

def get_word_to_idx_dict(sentences):
    word_to_idx = {}
    for sentence in sentences:
        for word in sentence:
            if word not in word_to_idx:
                word_to_idx[word] = len(word_to_idx)
    return word_to_idx

# get word to index dictionary
word_to_idx = get_word_to_idx_dict(train_data)            
target_to_idx = get_word_to_idx_dict(train_target)    

# prepare for unknown word input
word_to_idx[UNKNOWN_WORD] = len(word_to_idx)

# target word list (idx -> word)
target_word_list = [None for i in range(len(target_to_idx))]
for word, idx in target_to_idx.items():
    target_word_list[idx] = word

In [3]:
# train & target example
list(zip(train_data[1], train_target[1]))

[('<SOS>', 'tf.expand_dims'),
 ('tf.expand_dims', 'tf.expand_dims'),
 ('tf.expand_dims', 'tf.tile'),
 ('tf.tile', 'tf.tile'),
 ('tf.tile', 'tf.zeros_like'),
 ('tf.zeros_like', 'tf.wher'),
 ('tf.wher', '<EOS>')]

In [4]:
def prepare_sequence(seq, idx_dict):
    idxs = [idx_dict[w] if w in idx_dict else idx_dict[UNKNOWN_WORD]  for w in seq]
    tensor = torch.LongTensor(idxs)
    return autograd.Variable(tensor)

def predict_topN(score, N):
    return [target_word_list[i] for i in np.argsort(score)[::-1][:N]]

In [5]:
# reference: http://pytorch.org/tutorials/beginner/nlp/sequence_models_tutorial.html

class LSTMModel(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, target_size):
        super(LSTMModel, self).__init__()
        self.hidden_dim = hidden_dim

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)

        # The linear layer that maps from hidden state space to target space
        self.decoder = nn.Linear(hidden_dim, target_size)
        self.hidden = self.init_hidden()

    def init_hidden(self):
        # Before we've done anything, we dont have any hidden state.
        # Refer to the Pytorch documentation to see exactly
        # why they have this dimensionality.
        # The axes semantics are (num_layers, minibatch_size, hidden_dim)
        return (autograd.Variable(torch.zeros(1, 1, self.hidden_dim)),
                autograd.Variable(torch.zeros(1, 1, self.hidden_dim)))

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        lstm_out, self.hidden = self.lstm(
            embeds.view(len(sentence), 1, -1), self.hidden)
        target_space = self.decoder(lstm_out.view(len(sentence), -1))
        scores = F.log_softmax(target_space)
        return scores
    

In [6]:
## initialize LSTM model

# parameters for LSTM model
EMBEDDING_DIM = 32
HIDDEN_DIM = 16

# parameters for training
EPOCHS = 10

model = LSTMModel(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_idx), len(target_to_idx))
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

In [7]:
# See what the scores are before training
# Note that element i,j of the output is the score for tag j for word i.
inputs = prepare_sequence(train_data[1], word_to_idx)
scores = model(inputs)
print(scores)

# top N predictions for each word (before training)
for i,s in enumerate(scores.data.numpy()):
    print((train_data[1][i], predict_topN(s, 3)))

Variable containing:
-7.8149 -8.0807 -7.7972  ...  -7.9264 -7.8794 -7.7138
-7.7853 -7.9886 -7.9658  ...  -7.8700 -8.1554 -7.8417
-7.8079 -7.9975 -8.0047  ...  -7.8249 -8.2380 -7.8530
          ...             ⋱             ...          
-7.8615 -8.1176 -7.8774  ...  -7.9472 -7.9563 -7.8143
-7.9212 -7.9621 -7.8530  ...  -7.9641 -7.8709 -7.8023
-7.8679 -8.0431 -7.7991  ...  -8.0582 -7.9087 -7.7586
[torch.FloatTensor of size 7x2901]

('<SOS>', ['tf.contrib.rnn.LSTMStateTuple', 'tf.baselines.ppo', 'tf.one_ho'])
('tf.expand_dims', ['tf.contrib.legacy_seq2seq.model_with_buckets', 'tf.RegisterShape', 'tf.contrib.rnn.LSTMStateTuple'])
('tf.expand_dims', ['tf.contrib.distribution', 'tf.contrib.legacy_seq2seq.model_with_buckets', 'tf.RegisterShape'])
('tf.tile', ['tf.contrib.distribution', 'tf.loggin', 'tf.nn.atrous_conv2d'])
('tf.tile', ['tf.nn.atrous_conv2d', 'tf.loggin', 'tf.assign_ad'])
('tf.zeros_like', ['tf.nn.atrous_conv2d', 'tf.sys.stdi', 'tf.truncated_normal_initialize'])
('tf.wher', ['

In [8]:
%%time
import time
st = time.time()

count = 0
for epoch in range(EPOCHS): 
    for sentence, next_word in zip(train_data, train_target):
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Also, we need to clear out the hidden state of the LSTM,
        # detaching it from its history on the last instance.
        model.hidden = model.init_hidden()

        # Step 2. Get our inputs ready for the network, that is, turn them into
        # Variables of word indices.
        sentence_in = prepare_sequence(sentence, word_to_idx)
        targets = prepare_sequence(next_word, target_to_idx)

        # Step 3. Run our forward pass.
        scores = model(sentence_in)

        # Step 4. Compute the loss, gradients, and update the parameters by
        #  calling optimizer.step()
        loss = loss_function(scores, targets)
        loss.backward()
        optimizer.step()
        
        #if count % 100 == 0:
        #   print(loss.data[0])
        
        count += 1
        
    print("%d th epoch done. %f sec" % (epoch, time.time() - st))

0 th epoch done. 528.268389 sec
1 th epoch done. 1075.930897 sec
2 th epoch done. 1682.027620 sec
3 th epoch done. 2199.040442 sec
4 th epoch done. 2699.891764 sec
5 th epoch done. 3374.694988 sec
6 th epoch done. 3997.665820 sec
7 th epoch done. 4652.084209 sec
8 th epoch done. 5221.620618 sec
9 th epoch done. 5713.431751 sec
CPU times: user 2h 35min 23s, sys: 5min 52s, total: 2h 41min 16s
Wall time: 1h 35min 13s


In [14]:
# serialize model
torch.save(model.state_dict(), 'model/model_lstm.pth')

np.save('model/word_to_idx.npy', word_to_idx)
np.save('model/target_to_idx.npy', target_to_idx)

In [15]:
# load trained model ()

word_to_idx = np.load('model/word_to_idx.npy').item()
target_to_idx = np.load('model/target_to_idx.npy').item()

param = torch.load('model/model_lstm.pth')
model = LSTMModel(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_idx), len(target_to_idx)) 
model.load_state_dict(param)

In [16]:
sID = 1

inputs = prepare_sequence(train_data[sID], word_to_idx)
scores = model(inputs)

# top N predictions for each word (after training)
for i,s in enumerate(scores.data.numpy()):
    print((train_data[sID][i], predict_topN(s, 3)))

('<SOS>', ['tf.test.TestCase', 'tf.contrib.slim', 'tf.placeholder'])
('tf.expand_dims', ['tf.constant', 'tf.placeholder', 'tf.expand_dims'])
('tf.expand_dims', ['tf.constant', 'tf.expand_dims', 'tf.reshape'])
('tf.tile', ['tf.cast', 'tf.constant', 'tf.expand_dims'])
('tf.tile', ['tf.cast', 'tf.reshape', 'tf.shape'])
('tf.zeros_like', ['tf.reduce_mean', 'tf.shape', 'tf.reduce_sum'])
('tf.wher', ['<EOS>', 'tf.reduce_mean', 'tf.reduce_sum'])
