In [1]:
require 'hdf5'
require 'nn'
require 'nngraph'
require 'randomkit'
require 'distributions'
P = require 'itorch.Plot'

# PRELIMINARY

The code being in Lua/Torch, we can point you to these tutorials made by Sam Wiseman in CS287 to get a good overview of how to code in Lua:
- Basic Lua: http://nbviewer.jupyter.org/urls/cs287.github.io/Lectures/notebooks/TorchTutorial.ipynb
- Neural Network: http://nbviewer.jupyter.org/urls/cs287.github.io/Lectures/notebooks/NNTutorial.ipynb


## Max-Entropy Markov Model (MEMM)

We first define all the necessary functions:
- Train function
- An accuracy function
- A fonction that outputs the log-score for MEMM computation
- A viterbi implementation
- An f-score function

In [19]:
function train_model_(train_inputs, train_outputs, test_inputs, test_outputs, model, criterion, eta, nEpochs, batch)
    -- Train the model with a SGD
    
    -- Define the default values of parameters, as well as allocate memory:
    local batch = batch or 16
    local loss = torch.zeros(nEpochs)
    local av_L = 0
    local f = 0
    local df_do
    local len = train_inputs:size(2)
    for i = 1, nEpochs do
        -- Display progess
        xlua.progress(i, nEpochs)

        -- timing the epoch
        local timer = torch.Timer()
        av_L = 0
        
        for ii = 1, train_inputs:size(1), batch do
            
            -- Batchsize:
            current_batch_size = math.min(batch,train_inputs:size(1)-ii)
            
            -- reset gradients:
            model:zeroGradParameters()

            -- Forward pass (selection of inputs_batch in case the batch is not full, ie last batch)
            -- Evaluate the outputs predicted by the model:
            local pred = model:forward(train_inputs:narrow(1, ii, current_batch_size))
            -- Evaluate the loss (Non Negative Likelihood):
            f = criterion:forward(pred, train_outputs:narrow(1, ii, current_batch_size))
            av_L = av_L + f
            
            -- Evaluate gradients wrt to the loss
            df_do = criterion:backward(pred, train_outputs:narrow(1, ii, current_batch_size))
            -- Propagate backwards the gradients
            model:backward(train_inputs:narrow(1, ii, current_batch_size), df_do)
            -- Stochastic gradient descent update
            model:updateParameters(eta)
            
        end

        loss[i] = av_L/math.floor(train_inputs:size(1)/batch)
        acc_test = accuracy_(test_inputs, test_outputs, model)
        print('Epoch '..i..': '..timer:time().real)
        print('\n')
        print('Average Loss on Train: '.. loss[i])
        print('\n')
        print('Accucary on Validation: '.. acc_test)
        print('***************************************************')
        if acc_test > 0.99 then
            break
        end
    end

    return loss
end

-- Evaluate accuracy:
function accuracy_(input, output, model)

    local acc = 0.

    for i = 1, input:size(1) do
        -- Predict a distribution given input
        pred = model:forward(input[i])
        m, a = pred:view(6,1):max(1)

        if a[1][1] == output[i] then
            acc = acc + 1.
        end
    end

    return acc/input:size(1)

end

-- Computes logscore at timestep i i.e. log probabilities given all classes at previous step:
function compute_logscore_(inputs, i, model, C)
    local y = torch.zeros(C,C)
    local hot_1 = torch.zeros(C)
    for j = 1, C do
        -- Previous Class
        hot_1:zero()
        hot_1[j] = 1
        -- Store Score
        y:narrow(1,j,1):copy(model:forward(torch.cat(hot_1,inputs[i],1)))
    end
    return y
end

-- Evaluates the highest scoring sequence:
function viterbi_(inputs, init, compute_logscore, model, C)
    
    local y = torch.zeros(C,C)
    -- Formating tensors
    local initial = torch.zeros(C, 1)
    -- initial started with a start of sentence: <t>

    initial[{init,1}] = 1
    initial:log()

    -- number of classes
    local n = inputs:size(1)
    local max_table = torch.Tensor(n, C)
    local backpointer_table = torch.Tensor(n, C)
    -- first timestep
    -- the initial most likely paths are the initial state distribution
    local maxes, backpointers = (initial + compute_logscore_(inputs, 1, model, C)[init]):max(2)
    max_table[1] = maxes
    -- remaining timesteps ("forwarding" the maxes)
    for i=2,n do
        -- precompute edge scores
       
        y:copy(compute_logscore_(inputs, i, model, C))
        scores = y:transpose(1,2) + maxes:view(1, C):expand(C, C)

        -- compute new maxes 
        maxes, backpointers = scores:max(2)

        -- record
        max_table[i] = maxes
        backpointer_table[i] = backpointers
    end
    -- follow backpointers to recover max path
    local classes = torch.Tensor(n)
    maxes, classes[n] = maxes:max(1)
    for i=n,2,-1 do
        classes[i-1] = backpointer_table[{i, classes[i]}]
    end

    return classes
end

-- Evaluate accuracy of the path
function path_accuracy(pred_path, true_path)

    local acc = 0.
    local path_length = pred_path:size(1)
    
    for i = 1, path_length do
        if pred_path[i] == true_path[i] then
            acc = acc + 1.
        end
    end

    return acc/path_length

end

We now load the data:

In [30]:
myFile = hdf5.open('../HAR/preprocessed_1.hdf5','r')
f = myFile:all()
myFile:close()

In [31]:
x = f['x_train_with_past']
y = f['y_train_with_past']
x_test = f['x_test_with_past']
y_test = f['y_test_with_past']
x_test_withoutpast = f['x_test']
y_test_withoutpast = f['y_test']

n = x:size(1)
torch.manualSeed(1)
perm = torch.randperm(n):long()
x_train = x:index(1,perm):narrow(1,1,math.floor(0.9*n))
y_train = y:index(1,perm):narrow(1,1,math.floor(0.9*n))
x_val = x:index(1,perm):narrow(1,math.floor(0.9*n)+1, n-math.floor(0.9*n))
y_val = y:index(1,perm):narrow(1,math.floor(0.9*n)+1, n-math.floor(0.9*n))

We now define a simple Multi-Layer Perceptron using the nn model:

In [37]:
model = nn.Sequential()
model:add(nn.Linear(567,600))
model:add(nn.Tanh())
model:add(nn.Linear(600,300))
model:add(nn.Tanh())
model:add(nn.Linear(300,6))
model:add(nn.LogSoftMax())

We initialise the weigths uniformly between -0.05 and 0.05:

In [38]:
parameters, gradParameters = model:getParameters()
torch.manualSeed(0)
randomkit.uniform(parameters,-0.05,0.05)

We define the loss function, i.e. negative log-likelihood:

In [39]:
criterion = nn.ClassNLLCriterion()

We train the model, and stop when performance on validation/test is aborve 99%:

In [40]:
loss = train_model_(x_train, y_train, x_val, y_val, model, criterion, 0.01, 20, 16)

Epoch 1: 1.6464028358459	

	
Average Loss on Train: 0.86671255328499	

	
Accucary on Validation: 0.90625	
***************************************************	


Epoch 2: 1.6089570522308	

	
Average Loss on Train: 0.28936687498716	

	
Accucary on Validation: 0.95652173913043	
***************************************************	


Epoch 3: 1.6274170875549	

	
Average Loss on Train: 0.12721662849995	

	
Accucary on Validation: 0.98233695652174	
***************************************************	


Epoch 4: 1.5920560359955	

	
Average Loss on Train: 0.070152062749231	

	
Accucary on Validation: 0.98641304347826	
***************************************************	
Progress: 5 / 20	


Epoch 5: 1.8362309932709	

	
Average Loss on Train: 0.04656034099656	

	
Accucary on Validation: 0.98913043478261	
***************************************************	


Epoch 6: 1.6208820343018	

	
Average Loss on Train: 0.034651948400027	

	
Accucary on Validation: 0.9945652173913	
***************************************************	


We predict a path on the test set:

In [41]:
input_test = x_test_withoutpast:narrow(1,2,x_test_withoutpast:size(1)-1)
predicted_path = viterbi_(input_test, y_test_withoutpast[1], compute_logscore, model, 6)

And evaluate accuracy:

In [42]:
print(path_accuracy(predicted_path, y_test))

0.91989137813985	


## Simulated Annealing for Optimal MLP architecture

Building the right architecture for an MLP is more of an art than a science. How many layers? How dense or sparse features? What type of activation layer, how many of these? We now present a simulated annealing algorithm that can help discover the optimal architecture. The goal is to maximise accuracy on the validation set by sampling a new architecture at every iteration. This new architecture is accepted with probability depending on the difference between the current accuracy and the accuracy of the new sampled model.

The algorithm is not yet capable of inventing a random architecture, it is more a question of pruning, altering a a general, "maximal" architecture. Given a current architecture, we generate a new architecture by perturbing one layer other than the final layer. By perturbing we mean:
- activating a layer of the "maximal" architecture
- deleting a layer
- modifying the hidden dimensions
- adding an activation function between layer
- concatenating part of the inputs at various places in the architecture

In [2]:
-- HELPER (taken from https://gist.github.com/MihailJP/3931841)
function table_copy (t) 
    if type(t) ~= "table" then return t end
    local meta = getmetatable(t)
    local target = {}
    for k, v in pairs(t) do target[k] = v end
    setmetatable(target, meta)
    return target
end

In [3]:
-- Define a model given a parametrisation
-- We used the 'nngraph' module for this task. The main advantage of the nngraph module over the regular nn
-- Is that it allows us to be more flexible on how you use inner layers and how to connect them
-- The basic idea is that every nn module is transformed into a node in a network by adding a () to its call:
-- e.g. nn.Linear(4,4)()
function buildmodel(lookup, lt_hid, link1, hid1, activ1, link2, hid2, activ2, link3, activ3)
    
    -- Define default variables
    local lookup = lookup or false
    local lt_hid = lt_hid or 6
    local link1 = link1 or true
    local hid1 = hid1 or 600
    local activ1 = activ1 or true
    local link2 = link2 or false
    local hid2 = hid2 or 300
    local activ2 = activ2 or true
    local link3 = link3 or false
    local activ3 = activ3 or false

    -- Define inputs
    prev_class = nn.Identity()()
    obs = nn.Identity()()

    -- Embed the classes using a lookup table
    if lookup == true then
        prev_ = nn.Narrow(2,1,1)(prev_class)
        prev = nn.View(-1,lt_hid)(nn.LookupTable(6,lt_hid)(prev_))
        len_prev = lt_hid
    else
        prev = nn.Narrow(2,2,6)(prev_class)
        len_prev = 6
    end

    -- Concat the prev class or not ?
    -- Apply a first linear transformation
    if link1 == true then
        layer1 = nn.Linear(561 + len_prev, hid1)(nn.JoinTable(2)({prev,obs}))
    else
        layer1 = nn.Linear(561, hid1)(obs)
    end

    -- Activate output of previous layer using Tanh
    if activ1 == true then
        layer2 = nn.Tanh()(layer1)
    else
        layer2 = layer1
    end

    -- Concat the prev class or not ?
    -- Apply a second linear transformation
    if link2 == true then
        layer3 = nn.Linear(hid1+len_prev, hid2)(nn.JoinTable(2)({prev,layer2}))
    else
        layer3 = nn.Linear(hid1, hid2)(layer2)
    end
    
    -- Activate output of previous layer using Tanh
    if activ2 == true then
        layer4 = nn.Tanh()(layer3)
    else
        layer4 = layer3
    end

    -- Concat the prev class or not ?
    -- Apply a third linear transformation
    if link3 == true then
        layer5 = nn.Linear(hid2+len_prev, 6)(nn.JoinTable(2)({prev,layer4}))
    else
        layer5 = nn.Linear(hid2, 6)(layer4)
    end
    -- Activate output of previous layer using Tanh
    if activ3 == true then
        layer6 = nn.Tanh()(layer5)
    else
        layer6 = layer5
    end

    -- Define output, by taking a logsoftmax on previous output (distribution over the 6 classes)
    out = nn.LogSoftMax()(layer6)

    return nn.gModule({prev_class, obs}, {out})
end

function buildmodel_fromtable(tab)
    return buildmodel(tab[1], tab[2], tab[3], tab[4], tab[5], tab[6], tab[7], tab[8], tab[9], tab[10])
end

In [4]:
-- Assess accuracy of a model input by input
function accuracy(input_1, input_2, output, model)
    local acc = 0.
    for i = 1, input_1:size(1) do
        pred = model:forward({input_1:narrow(1,i,1),input_2:narrow(1,i,1)})
        m, a = pred:max(2)
        if a[1][1] == output[i] then
            acc = acc + 1.
        end
    end
    return acc/input_1:size(1)
end

-- Assess acccuracy on a predicted path
function path_accuracy(pred_path, true_path)
    local acc = 0.
    local path_length = pred_path:size(1)
    for i = 1, path_length do
        if pred_path[i] == true_path[i] then
            acc = acc + 1.
        end
    end
    return acc/path_length
end

In [51]:
 -- Train the model with a SGD
function train_model(train_inputs_1, train_inputs_2, train_outputs, val_inputs_1, val_inputs_2, val_outputs, model, criterion, eta, batch, nEpochs, val_stop)
    -- Define default variables:
    local batch = batch or 16
    local loss = torch.zeros(nEpochs)
    local av_L = 0
    local f = 0
    local df_do
    local len = train_inputs_1:size(2)
    local ntrain = train_inputs_1:size(1)
    
    local val_stop = val_stop or True

    for i = 1, nEpochs do
        -- timing the epoch
        local timer = torch.Timer()
        av_L = 0
        
        for ii = 1, ntrain, batch do
            
            current_batch_size = math.min(batch,ntrain-ii)
            -- reset gradients
            model:zeroGradParameters()

            -- Forward pass (selection of inputs_batch in case the batch is not full, ie last batch)
            local pred = model:forward({train_inputs_1:narrow(1, ii, current_batch_size),train_inputs_2:narrow(1, ii, current_batch_size)})
            -- Average loss computation
            f = criterion:forward(pred, train_outputs:narrow(1, ii, current_batch_size))
            av_L = av_L + f
            
            -- Backward pass
            df_do = criterion:backward(pred, train_outputs:narrow(1, ii, current_batch_size))
            model:backward({train_inputs_1:narrow(1, ii, current_batch_size),train_inputs_2:narrow(1, ii, current_batch_size)}, df_do)
            model:updateParameters(eta)
            
        end

        loss[i] = av_L/math.floor(ntrain/batch)
        acc_val = accuracy(val_inputs_1, val_inputs_2, val_outputs, model)
        if acc_val > 0.99 and val_stop then
            return loss[i], acc_val, loss
        end
    end
    return loss[nEpochs], acc_val, loss
    
end

In [6]:
-- Generate new architecture, that is different from the current with probability p
function generate_archi(current, p)
    local archi = table_copy(current)
    local tochange = math.random(10)
    if torch.uniform()<p then
        if type(archi[tochange]) == 'boolean' then
            archi[tochange] = not archi[tochange]
        elseif type(archi[tochange]) == 'number' then
            sgn = math.random(0,1)
            if sgn == 0 then
                sgn = -1
            end
            temp = archi[tochange]+ sgn* math.random(20)
            if temp>6 then
                archi[tochange] = temp
            else
                archi[tochange] = 6
            end
        end
    end
    return archi
end

In [7]:
-- Evalaute cost
function cost(val_acc)
    return 1-val_acc
end

In [8]:
-- Finds optimal architecture using Simulated Annealing:
function SA(initial_archi, train_inputs_1, train_inputs_2, train_outputs, val_inputs_1, val_inputs_2, val_outputs, eta, batch, nEpochs, nIt, T, annealing)
    -- Memory allocation
    local model_table = {}
    local archi = {}
    local costs = {}
    local train_acc
    local val_acc
    local archi_prev
    local archi_new
    local cost_prev = 0
    local cost_new = 0
    local train_acc_new = 0 
    local acc_val_new = 0
    local criterion = nn.ClassNLLCriterion()
    local model_prev
    local ct = 0
    
    -- Initialisation
    model_table[1] = buildmodel_fromtable(initial_archi)
    archi[1] = initial_archi
    
    -- Train the first model and save cost:
    train_acc, val_acc = train_model(train_inputs_1, train_inputs_2, train_outputs, val_inputs_1, val_inputs_2, val_outputs, model_table[1], criterion, 0.01, batch, nEpochs)
    costs[1] = cost(val_acc)

    archi_prev = table_copy(initial_archi)
    model_prev = model_table[1]:clone()
    cost_prev = costs[1]
    
    for i = 2,nIt do
        print(i..'/'..nIt)
        -- Generate new architecture
        archi_new = generate_archi(archi_prev, 0.5)
        -- Create model associated with this architecture
        model_new = buildmodel_fromtable(archi_new) 
        -- Train the new model
        train_acc_new, val_acc_new = train_model(train_inputs_1, train_inputs_2, train_outputs, val_inputs_1, val_inputs_2, val_outputs, model_new, criterion, eta, batch, nEpochs)
        -- Evaluate cost
        cost_new = cost(val_acc_new)
        
        -- Accept new architecture with probability equal to torch.exp(-(cost_new-cost_prev)/T)
        if torch.uniform() < torch.exp(-(cost_new-cost_prev)/T) then
            archi_prev = table_copy(archi_new)
            model_prev = model_new:clone()
            cost_prev = cost_new
            ct = ct + 1
            -- Update temperature:
            if ct == 3 then
                T = T * annealing
                ct = 0
            end
        end
        model_table[i] = model_prev:clone()
        costs[i] = cost_prev
        archi[i] = table_copy(archi_prev)
    end

    return model_table[nIt-1], archi, costs, model_table
end

In [9]:
-- Load Data:
myFile = hdf5.open('../HAR/preprocessed_1.hdf5','r')
f = myFile:all()
myFile:close()

x = f['x_train_with_past2']
y = f['y_train_with_past']
x_test = f['x_test_with_past2']
y_test = f['y_test_with_past']
x_test_withoutpast = f['x_test']
y_test_withoutpast = f['y_test']

n = x:size(1)
torch.manualSeed(1)
perm = torch.randperm(n):long()
x_train = x:index(1,perm):narrow(1,1,math.floor(0.9*n))
y_train = y:index(1,perm):narrow(1,1,math.floor(0.9*n))
x_train_1 = x_train:narrow(2,1,7)
x_train_2 = x_train:narrow(2,8,561)
x_val = x:index(1,perm):narrow(1,math.floor(0.9*n)+1, n-math.floor(0.9*n))
y_val = y:index(1,perm):narrow(1,math.floor(0.9*n)+1, n-math.floor(0.9*n))
x_val_1 = x_val:narrow(2,1,7)
x_val_2 = x_val:narrow(2,8,561)

In [10]:
criterion = nn.ClassNLLCriterion()

In [11]:
T = 1
annealing = 0.8
nIt = 20
nEpochs = 10
batch = 16
eta = 0.01
initial_archi = {true, 10, true, 300, true, true, 300, true, true, true}

In [42]:
x_train_1:narrow(1, 1, 1)

 1  1  0  0  0  0  0
[torch.DoubleTensor of size 1x7]



We start with the following architecture:

##### include pic

In [15]:
opt_model, archi, costs, model_table = SA(initial_archi, x_train_1, x_train_2, y_train, x_val_1, x_val_2, y_val, eta, batch, nEpochs, nIt, T, annealing)

2/20	


3/20	


4/20	


5/20	


6/20	


7/20	


8/20	


9/20	


10/20	


11/20	


12/20	


13/20	


14/20	


15/20	


16/20	


17/20	


18/20	


19/20	


20/20	


In [16]:
toprint = torch.zeros(20)
for key,value in pairs(costs) do
    toprint[key] = value
end

In [18]:
myFile = hdf5.open('toplot_1.hdf5', 'w')
myFile:write('toplot_1', toprint)
myFile:close()

In [17]:
plot = P():line(torch.linspace(1,20,20), toprint,'red'):title('Optimal Architecture using Simulated Annealing'):draw()
plot:yaxis('1-accuracy'):xaxis('Iteration'):redraw()

In case the pluggin doesn't show on github:

<img src="one.png">

We train the model with optimal architecture on full training:

In [22]:
opt_model = buildmodel_fromtable(archi[20])
criterion = nn.ClassNLLCriterion()

In [50]:
torch.save("opt_model", opt_model)




In [24]:
x_1 = x:narrow(2,1,7)
x_2 = x:narrow(2,8,561)

In [52]:
-- train_model(train_inputs_1, train_inputs_2, train_outputs, val_inputs_1, val_inputs_2, val_outputs, model, criterion, eta, batch, nEpochs)
loss = train_model(x_1, x_2, y, x_val_1, x_val_2, y_val, opt_model, criterion, 0.01, 20, 16, False)

In [46]:
function compute_logscore(inputs, i, model, C)
    local y = torch.zeros(C,C)
    local hot_1 = torch.zeros(C+1)
    for j = 1, C do
        hot_1:zero()
        hot_1[1] = j
        hot_1[j+1] = 1
        y:narrow(1,j,1):copy(model:forward({hot_1:view(1,7),inputs:narrow(1,i,1)}))
    end
    return y
end

-- Evaluates the highest scoring sequence:
function viterbi(inputs, init, compute_logscore, model, C)
    
    local y = torch.zeros(C,C)
    -- Formating tensors
    local initial = torch.zeros(C, 1)
    -- initial started with a start of sentence: <t>

    initial[{init,1}] = 1
    initial:log()

    -- number of classes
    local n = inputs:size(1)
    local max_table = torch.Tensor(n, C)
    local backpointer_table = torch.Tensor(n, C)
    -- first timestep
    -- the initial most likely paths are the initial state distribution
    local maxes, backpointers = (initial + compute_logscore(inputs, 1, model, C)[init]):max(2)
    max_table[1] = maxes
    -- remaining timesteps ("forwarding" the maxes)
    for i=2,n do
        -- precompute edge scores
       
        y:copy(compute_logscore(inputs, i, model, C))
        scores = y:transpose(1,2) + maxes:view(1, C):expand(C, C)

        -- compute new maxes 
        maxes, backpointers = scores:max(2)

        -- record
        max_table[i] = maxes
        backpointer_table[i] = backpointers
    end
    -- follow backpointers to recover max path
    local classes = torch.Tensor(n)
    maxes, classes[n] = maxes:max(1)
    for i=n,2,-1 do
        classes[i-1] = backpointer_table[{i, classes[i]}]
    end

    return classes
end

In [48]:
input_test = x_test_withoutpast:narrow(1,2,x_test_withoutpast:size(1)-1)
predicted_path = viterbi(input_test, y_test_withoutpast[1], compute_logscore, opt_model, 6)

In [49]:
print(path_accuracy(predicted_path, y_test))

0.92837746096402	


We now compare with another starting architecture:

In [70]:
initial_archi2 = {false, 10, true, 700, true, true, 300, true, false, false}

In [71]:
opt_model2, archi2, costs2, model_table2 = SA(initial_archi2, x_train_1, x_train_2, y_train, x_val_1, x_val_2, y_val, eta, batch, nEpochs, nIt, T, annealing)

2/20	


3/20	


4/20	


5/20	


6/20	


7/20	


8/20	


9/20	


10/20	


11/20	


12/20	


13/20	


14/20	


15/20	


16/20	


17/20	


18/20	


19/20	


20/20	


In [72]:
toprint2 = torch.zeros(20)
for key,value in pairs(costs2) do
    toprint2[key] = value
end

In [73]:
plot = P():line(torch.linspace(1,20,20), toprint2,'red'):title('Optimal Architecture using Simulated Annealing'):draw()
plot:line(torch.linspace(1,20,20), toprint,'blue'):title('Optimal Architecture using Simulated Annealing'):redraw()
plot:yaxis('1-accuracy'):xaxis('Iteration'):redraw()

Similarly, if git doesn't show:
<img src="two.png">

Picking the right starting architecture can have a significant impact on the performance of the simulated annealing algorithm.