In [1]:
-- Importing libraries

include('init.lua')
include('WordIndexer.lua')
include('BatchIterator.lua')

In [2]:
-- Loading input data
local textfile = 'data/tiny_shakespeare.txt'
local indexfile = 'data/tiny_shakespeare.indexer.th'
local datafile = 'data/tiny_shakespeare.data.th'

local train_split = 0.8
local valid_split = 0.1

seq_len = 50

if paths.filep(indexfile) and paths.filep(datafile) then
    print('Loading indexer and data ...')
    indexer = torch.load(indexfile)
    data = torch.load(datafile)
else
    -- Building vocab
    print('Building vocab...')
    indexer = WordIndexer()
    local data_len = 0
    local f = assert(io.open(textfile, "r"))
    while true do
        local line = f:read()
        if not line then break end
        for c in line:gmatch('.') do
            indexer:add(c)
        end
        data_len = data_len + #line + 1
    end
    f:close()
    indexer:add('\n')
    print('Total chars: ' .. data_len)
    print('Total vocab: ' .. #indexer)

    -- Creating torch tensor
    print('Creating torch Tensor of data...')
    data = torch.ByteTensor(data_len)
    local cur_pos = 1
    local f = assert(io.open(textfile, "r"))
    while true do
        local line = f:read()
        if not line then break end
        for c in line:gmatch('.') do
            data[cur_pos] = indexer:index(c)
            cur_pos = cur_pos + 1
        end
        data[cur_pos] = indexer:index('\n')
        cur_pos = cur_pos + 1
    end
    f:close()
    -- Saving preprocessed data for later
    print('Saving data...')
    torch.save(indexfile, indexer)
    torch.save(datafile, data)
end
-- creating training batch
local len = data:size(1)
if len % (seq_len) ~= 0 then
    data = data:sub(1, seq_len * math.floor(len / seq_len))
end

labels = data:clone()
labels:sub(1,-2):copy(data:sub(2,-1))
labels[-1] = data[1]
data_seqs = data:split(seq_len)
label_seqs = labels:split(seq_len)
ntrain = math.floor(#data_seqs * train_split)
nvalid = math.floor(#data_seqs * valid_split)
ntest = #data_seqs - ntrain - nvalid
collectgarbage()
print('Total: ' .. #data_seqs)
print('Train: ' .. ntrain)
print('Valid: ' .. nvalid)
print('Test: ' .. ntest)

Loading indexer and data ...	


Total: 22307	
Train: 17845	
Valid: 2230	
Test: 2232	


In [3]:
-- constructing model
dim_word = 64
num_lstm_layers = 1
dim_cell = 128
dim_w = 128

emb = nn.LookupTable(#indexer, dim_word)
net = lstm.LSTM({input_dim=dim_word, hidden_dim=dim_cell, num_layers=num_lstm_layers})
classifier = nn.Sequential()
classifier:add(nn.Linear(dim_cell, #indexer))
classifier:add(nn.LogSoftMax())
criterion = nn.ClassNLLCriterion()

local all_net = nn.Parallel()
all_net:add(emb)
all_net:add(net)
all_net:add(classifier)
params, grad_params = all_net:getParameters()


In [4]:
-- local num_threads = torch.getnumthreads()
-- torch.setnumthreads(1)
function perplexity(start_offset, end_offset)
    local total_logprob = 0
    local num_char = 0
    local total_seqs = end_offset - start_offset + 1
    local data = torch.zeros(total_seqs, seq_len)
    for i = 1, total_seqs do
        data[{i,{}}] = data_seqs[start_offset + i -1]
    end
    net:forget()
    local rep = emb:forward(data)
    local states = net:forward(rep)
    for t = 1, seq_len do
        local logprob = classifier:forward(states[{{},t,num_lstm_layers,{}}])
        for i = 1, total_seqs do
            total_logprob = total_logprob + logprob[i][label_seqs[start_offset + i - 1][t]]
            num_char = num_char + 1
        end
    end
--     for i = start_offset, end_offset do
--         net:forget()
--         local rep = emb:forward(data_seqs[i])
--         local states = net:forward(rep)
--         for t = 1, seq_len do
--             local logprob = classifier:forward(states[{{},t,num_lstm_layers,{}}])
--             total_logprob = total_logprob + logprob[1][label_seqs[i][t]]
--             num_char = num_char + 1
--         end
--     end
    return math.exp(- total_logprob / num_char)
end
print('Training PPL: ' .. perplexity(1,ntrain))
print('Validation PPL: ' .. perplexity(ntrain+1, ntrain+nvalid))
print('Testing PPL: ' .. perplexity(ntrain+nvalid+1, ntrain+nvalid+ntest))
-- torch.setnumthreads(num_threads)

Training PPL: 64.264155858691	


Validation PPL: 64.243316978626	


Testing PPL: 64.275596458674	


In [5]:
-- training procedure
local num_threads = torch.getnumthreads()
torch.setnumthreads(1)
local batch_size = 16
local n_epochs = 5
local learning_rate = 0.01

local mb_data = torch.zeros(batch_size, seq_len)
local mb_labels = torch.zeros(batch_size, seq_len)
local set_minibatch_data = function(mb_idx)
    for i = 1, mb_idx:size(1) do
        mb_data[{i, {}}] = data_seqs[mb_idx[i]]
        mb_labels[{i, {}}] = label_seqs[mb_idx[i]]
    end
end
local mb_grad_output = torch.zeros(batch_size, seq_len, num_lstm_layers, dim_cell)
for epoch = 1, n_epochs do
    -- setting up data for this epoch
    local shuffle = torch.randperm(ntrain)
    local batch_iter = BatchIterator(shuffle, batch_size, true)
    local e_loss = 0
    local b_count = 0
    while batch_iter:has_next() do
        -- mini batch
        set_minibatch_data(batch_iter:next_batch())
        net:forget()
        grad_params:zero()
        mb_grad_output:zero()
        -- forward
        local mb_rep = emb:forward(mb_data)
        local mb_states = net:forward(mb_rep)
        local mb_loss = 0
        local mb_predict = {}
        for t = 1, seq_len do
            mb_predict[t] = classifier:forward(mb_states[{{},t,num_lstm_layers,{}}])
            mb_loss = mb_loss + criterion:forward(mb_predict[t], mb_labels[{{},t}])
            mb_grad_output[{{},t,num_lstm_layers,{}}] = classifier:backward(
                mb_states[{{},t,num_lstm_layers,{}}],
                criterion:backward(mb_predict[t], mb_labels[{{},t}]))
            classifier:updateParameters(learning_rate)
        end
        -- classifier:updateParameters(learning_rate)
        mb_loss = mb_loss / seq_len
        e_loss = e_loss + mb_loss
        b_count = b_count + 1
        emb:backward(
            mb_data,
            net:backward(mb_rep, mb_grad_output))
        -- grad_params:div(batch_size * seq_len)
        net:updateParameters(learning_rate)
        emb:updateParameters(learning_rate)
        -- classifier:updateParameters(learning_rate)
    end
    e_loss = e_loss / b_count
    torch.setnumthreads(num_threads)
    print(string.format(
            'Loss: %f, Training PPL: %f, Validation PPL: %f',
            e_loss, perplexity(1,ntrain), perplexity(ntrain+1, ntrain+nvalid)))
    torch.setnumthreads(1)
end
torch.setnumthreads(num_threads)

Loss: 2.100083, Training PPL: 6.568220, Validation PPL: 7.185524	


Loss: 1.817854, Training PPL: 5.829960, Validation PPL: 6.546280	


Loss: 1.723388, Training PPL: 5.433668, Validation PPL: 6.093143	


Loss: 1.667566, Training PPL: 5.202855, Validation PPL: 5.917058	


Loss: 1.629772, Training PPL: 5.089554, Validation PPL: 5.819674	


In [9]:
-- net:forget()
-- local rep = emb:forward(data_seqs[1])
-- local states = net:forward(rep)
-- for t = 1, seq_len do
--     local _, d = torch.max(classifier:forward(states[{{},t,num_lstm_layers,{}}]),2)
--     print(string.format('%s (%s)', indexer:word(d[1][1]), indexer:word(label_seqs[1][t])))
-- end
net:forget()
local seed = 'What'
local chars = {}
local i = 1
for c in seed:gmatch('.') do
    chars[i] = c
    i = i + 1
end
local input = indexer:indexes(chars)
local rep = emb:forward(input)
local states = net:forward(rep)
local t = #seed
local sentence = seed
for i = 1, 80 do
    local _, d = torch.max(classifier:forward(states[{{},t,num_lstm_layers,{}}]),2)
    sentence = sentence .. indexer:word(d[1][1])
    rep = emb:forward(torch.Tensor{d[1][1]})
    states = net:forward(rep)
    t = 1
end
print(sentence)

What the countent the countent the countent the countent the countent.

First Citize	
