In [1]:
using Revise
using Flux
using Zygote
using MLDatasets
using LSHFunctions
using DataStructures
using Plots;
using Profile;
using StatProfilerHTML;
using BenchmarkTools
using LinearAlgebra;
using SparseArrays
include("./Optim.jl");
using .Optim

In [3]:
#Parameters
const batch_size = 16
const k = 5  #k,L= 7,10
const L = 10;
#const sample_rate = 0.1 # 1%, proportion of nodes to sample from matrix

In [4]:
function onehotencode(y)
    t = zeros(Float64,10)
    t[y[]+1] = 1.0
    return t
end

onehotencode (generic function with 1 method)

In [5]:
train_x, train_y = MNIST.traindata();
train_x = permutedims(train_x,(3,2,1))
x_train = reshape(train_x,(size(train_x)[1],prod(size(train_x)[2:end])));
y_train = [transpose(onehotencode(y)) for y in train_y]
y_train = vcat(y_train...);
x_train = convert(Array{Float64}, x_train);

In [6]:
test_x, test_y = MNIST.testdata();
test_x = permutedims(test_x,(3,2,1))
x_test = reshape(test_x,(size(test_x)[1],prod(size(test_x)[2:end])));
y_test = [transpose(onehotencode(y)) for y in test_y]
y_test = vcat(y_test...);
x_test = convert(Array{Float64}, x_test);

In [7]:
#tid = rand(1:60000)
#println(y_train[tid,:])
#Gray.(reshape(x_train[tid,:],(28,28)))

In [8]:
moving_average(vs,n) = [sum(@view vs[i:(i+n-1)])/n for i in 1:(length(vs)-(n-1))]

moving_average (generic function with 1 method)

In [9]:
function comb(n, len) 
    Iterators.product(fill(BitArray([0,1]), len)...) |> collect |> vec 
end

comb (generic function with 1 method)

In [10]:
all_hash_codes = comb(1,k);

In [11]:
mutable struct LSHLayer
    theta::Matrix #original
    bias::Matrix #original
    thetaV::SubArray
    biasV::SubArray
    hash_funs::Vector{SimHash}
    hash_tables::Vector{Dict{Tuple,CircularBuffer{Integer}}}
    #rows::Vector
    #cols::Vector
    sampled::Vector
end

mutable struct FixLayer
    theta::Matrix #original
    bias::Matrix #original
    thetaV::SubArray
    biasV::SubArray
    #rows::Vector
end

In [12]:
#The `Layer` type constructor
function LSHLayer(in_dim::Integer,out_dim::Integer,k=6,L=6,bin_size=10)
    div_n = 10
    theta = randn(in_dim,out_dim) / div_n
    bias = randn(1,out_dim) / div_n
    cols = size(theta)[2] #number of columns/nodes
    hash_funs = [LSHFunction(cossim, k) for i in 1:L]
    hash_tables = Vector{Dict{Tuple,CircularBuffer{Integer}}}()
    for i in 1:L #create L hash tables
        ht_l::Dict{Tuple,CircularBuffer{Integer}} = Dict{Tuple,CircularBuffer{Integer}}((x) => CircularBuffer{Integer}(bin_size) for x in all_hash_codes)
        push!(hash_tables,ht_l)
        for j in 1:cols
            hash_lj = hash_funs[i](theta[:,j])
            hash_lj = Tuple(hash_lj)
            push!(hash_tables[i][hash_lj],j)
        end
    end
    LSHLayer(theta,bias,view(theta,:,:),view(bias,:,:),hash_funs,hash_tables,[])
end

function FixLayer(in_dim::Integer,out_dim::Integer)
    div_n = 10
    theta = randn(in_dim,out_dim) / div_n
    bias = randn(1,out_dim) / div_n
    FixLayer(theta,bias,view(theta,:,:),view(bias,:,:))
end

FixLayer

In [13]:
function sample_nodes(query::Vector, layer::LSHLayer)
    #`query` is the input vector for this layer
    S = Set{Int64}()
    num_ht = layers[1].hash_funs |> length
    for i in 1:num_ht
        # compute hash of query using each hashfun
        q_hash = layer.hash_funs[i](query) |> Tuple
        matches = layer.hash_tables[i][q_hash]
        union!(S,matches)
    end
    if isempty(S)
        push!(S,rand(1:size(layer.theta)[2]))
    end
    return S |> collect
end

sample_nodes (generic function with 1 method)

In [14]:
Flux.trainable(a::LSHLayer) = (a.thetaV,a.biasV)
Flux.trainable(a::FixLayer) = (a.thetaV,a.biasV)

In [15]:
function get_view(a::Matrix,r,c)
    #r,c are arrays, if empty [] then return all elements
    d1,d2 = size(a);
    return view(a, isempty(r) ? (:) : r, isempty(c) ? (:) : c)
end

get_view (generic function with 1 method)

In [16]:
# Define the function that runs an LSH layer
function (m::LSHLayer)(X::Matrix,rows::Vector)
    Zygote.ignore() do
        cols = sample_nodes(vec(X),m) #rand(1:size(m.theta)[2],90)
        cols = sort(cols);
        m.thetaV = get_view(m.theta,rows,cols)
        m.biasV = get_view(m.bias,[],cols)
        m.sampled = cols
    end
    y = X * m.thetaV .+ m.biasV
end
#no activation function applied yet

function (m::FixLayer)(X::Matrix,rows::Vector)
    Zygote.ignore() do
        m.thetaV = get_view(m.theta,rows,[])
        m.biasV = get_view(m.bias,[],[])
    end
    y = X * m.thetaV .+ m.biasV
end

In [17]:
dim2, dim3 = 1000, 500
layer1 = LSHLayer(784,dim2, k, 10) #k,L
#layer1 = FixLayer(784,dim2)
#layer2 = LSHLayer(dim2,dim3, k, L)
layer2 = FixLayer(dim2,10)
layers = [layer1, layer2];

In [18]:
function model(X::Matrix,layers::Vector)
    #layer 1
    X = Flux.normalise(X;dims=ndims(X), ϵ=1e-5)
    A1 = layers[1](X,Vector{Integer}[])
    A1 = NNlib.relu.(A1)
    #layer 2
    #rows1 = isempty(layers[1].cols) ? sample_nodes(vec(X),layers[1]) : layers[2].cols
    #=
    A1 = Flux.normalise(A1;dims=ndims(A1), ϵ=1e-5)
    A2 = layers[2](A1,layers[1].sampled)
    #println(size(A2),length(n2))
    A2 = NNlib.sigmoid.(A2)
    =#
    A2 = Flux.normalise(A1;dims=ndims(A1), ϵ=1e-5)
    A2 = layers[2](A2,layers[1].sampled)#layers[1].sampled
    A2 = NNlib.softmax(A2,dims=2)
end

#function model(X::Matrix,layers::Vector{Layer})
#    model_(X, layers)[1]
#end

model (generic function with 1 method)

In [19]:
function update_htables(layer::LSHLayer)
    num_ht = length(layer.hash_funs)
    cols = size(layer.theta)[2]
    for i in 1:num_ht #iterate tables
        for j in 1:cols
            hash_lj = layer.hash_funs[i](layer.theta[:,j])
            hash_lj = Tuple(hash_lj)
            push!(layer.hash_tables[i][hash_lj],j)
        end
    end
end

update_htables (generic function with 1 method)

In [20]:
lossfn(ŷ::Vector,y::Vector) = -1.0 * LinearAlgebra.dot(log.(ŷ),y)

function lossfn2(x::Matrix,layers::Vector,y::Vector)
    ŷ = vec(model(x, layers));
    l = -1.0 * LinearAlgebra.dot(log.(ŷ),y)
end

lossfn2 (generic function with 1 method)

In [21]:
#S = [1,50,90,112,145,240,300,301,500,505,506,511];
#g = Zygote.gradient(w -> lossfn(vec(model(randn(1,784),w,S)),[1.0,0,0,0,0,0,0,0,0,0]),layers)
#g = Zygote.gradient((ypred,ytrue) -> lossfn(ypred,ytrue),(A2,batch_y))
#println(g);

In [22]:
function get_params_view(layers)
    #idx: [rows1,cols2,rows2,cols2]
    p = []
    #i = 0
    for m in layers
        #theta = get_view(m.theta,m.rows,m.cols)
        #bias = get_view(m.bias,[],m.cols)
        append!(p,[m.thetaV,m.biasV])
        #i+=1
    end
    return p
end

get_params_view (generic function with 1 method)

In [23]:
function convert_grads(gs)
    g = []
    for l in gs[1]
        append!(g,[l[][:thetaV],l[][:biasV]])
    end
    g
end

function convert_grads2(grads, layers)
    gs = []
    iz = []
    for (l,g) in zip(layers,grads[1])
        append!(gs,[g[][:thetaV],g[][:biasV]])
        isdefined(l,:sampled) ? push!(iz,l.sampled) : push!(iz,[])
    end
    collect(zip(gs,iz))
end

convert_grads2 (generic function with 1 method)

In [24]:
#get_params_view(layers)

In [25]:
#g[1][1][][:theta]

In [26]:
model(randn(1,784), layers)

1×10 Matrix{Float64}:
 0.065631  0.0547397  0.00787681  0.100399  …  0.241617  0.155482  0.127214

In [27]:
#lossfn2(randn(1,784),layers,[1,0,0,0,0,0,0,0,0,0])

In [43]:
function train(x_train,y_train,layers,epochs=200000)
    N₀ = 50
    λ₀ = 0.6
    lr = 0.001
    update_freq = round.([N₀ * exp(λ₀*x) for x in LinRange(0,5,epochs)])
    opt = SGDM(lr, 0.9, [])#ADAM(lr)
    lossarr = []
    updates = 0
    Threads.@threads for i in 1:epochs
        rid = rand(1:60000)
        x = float(x_train[rid,:])
        x = reshape(x,(1,784))
        yt = y_train[rid,:]
        l, gs = withgradient((ms) -> lossfn2(x,ms,yt), layers)
        push!(lossarr,l)
        gs = convert_grads(gs)
        ps = get_params_view(layers);
        step!(opt,ps,gs)
        #=
        if i % update_freq[i] == 0
            update_htables(layers[1])
            updates += 1 
        end
        =#
    end
    println("Num hash table updates: $updates")
    return lossarr
end

train (generic function with 2 methods)

In [29]:
#opt = SGDM(0.001, 0.9, [])

In [30]:
#=gs_ = gradient((ms) -> lossfn2(randn(1,784),ms,y_train[1,:]), layers)
gs = convert_grads(gs_)
ps = get_params_view(layers)
step3!(opt,ps,gs)=#

In [31]:
#=
for (a,b) in zip(ps,gs)
    println(size(a),size(b))
end
=#

In [32]:
#step!(opt,ps,gs)

In [33]:
#convert_grads(gs)[4]

In [34]:
#=
ps_ = [layers[1].thetaV,layers[1].biasV,layers[2].thetaV,layers[2].biasV]#get_params_view(layers);
ps = Flux.Params(ps_);
gs = Flux.gradient(ps) do
    #lossfn2(reshape(x_train[1,:],(1,784)),layers,y_train[1,:])
    model(reshape(x_train[1,:],(1,784)),layers) |> sum
end
=#

In [35]:
#layers[1].thetaV .-= gs[1][1][][:thetaV]

In [36]:
#train(x_train,y_train,layers,50);

In [37]:
#@profilehtml train(x_train,y_train,layers,50000);

In [45]:
#@profilehtml train(x_train,y_train,1000)
@time begin
    lossarr = train(x_train,y_train,layers,25000);
end;
# naive 50k hash updates
# exponential decay: 1592 hash updates
#192 seconds LSH
#       fixed

LoadError: TaskFailedException

[91m    nested task error: [39mDimensionMismatch("arrays could not be broadcast to a common size; got a dimension with lengths 90 and 88")
    Stacktrace:
      [1] [0m[1m_bcs1[22m
    [90m    @ [39m[90m./[39m[90;4mbroadcast.jl:501[0m[90m [inlined][39m
      [2] [0m[1m_bcs[22m[90m (repeats 2 times)[39m
    [90m    @ [39m[90m./[39m[90;4mbroadcast.jl:495[0m[90m [inlined][39m
      [3] [0m[1mbroadcast_shape[22m
    [90m    @ [39m[90m./[39m[90;4mbroadcast.jl:489[0m[90m [inlined][39m
      [4] [0m[1mcombine_axes[22m
    [90m    @ [39m[90m./[39m[90;4mbroadcast.jl:484[0m[90m [inlined][39m
      [5] [0m[1minstantiate[22m
    [90m    @ [39m[90m./[39m[90;4mbroadcast.jl:266[0m[90m [inlined][39m
      [6] [0m[1mmaterialize[22m[0m[1m([22m[90mbc[39m::[0mBase.Broadcast.Broadcasted[90m{Base.Broadcast.DefaultArrayStyle{2}, Nothing, typeof(+), Tuple{Matrix{Float64}, Matrix{Float64}}}[39m[0m[1m)[22m
    [90m    @ [39m[90mBase.Broadcast[39m [90m./[39m[90;4mbroadcast.jl:883[0m
      [7] [0m[1mstep![22m[0m[1m([22m[90mopt[39m::[0mSGDM, [90mtheta[39m::[0mVector[90m{Any}[39m, [90mgrad[39m::[0mVector[90m{Any}[39m[0m[1m)[22m
    [90m    @ [39m[35mMain.Optim[39m [90m~/Dropbox/My Mac (MacBook-Air.local)/Desktop/SLIDE Pose Estimation/[39m[90;4mOptim.jl:40[0m
      [8] [0m[1mmacro expansion[22m
    [90m    @ [39m[90m./[39m[90;4mIn[43]:18[0m[90m [inlined][39m
      [9] [0m[1m(::var"#3#threadsfor_fun#23"{Matrix{Float64}, Matrix{Float64}, Vector{Any}, Vector{Any}, SGDM, UnitRange{Int64}})[22m[0m[1m([22m[90monethread[39m::[0mBool[0m[1m)[22m
    [90m    @ [39m[36mMain[39m [90m./[39m[90;4mthreadingconstructs.jl:81[0m
     [10] [0m[1m(::var"#3#threadsfor_fun#23"{Matrix{Float64}, Matrix{Float64}, Vector{Any}, Vector{Any}, SGDM, UnitRange{Int64}})[22m[0m[1m([22m[0m[1m)[22m
    [90m    @ [39m[36mMain[39m [90m./[39m[90;4mthreadingconstructs.jl:48[0m

In [44]:
plot(moving_average(lossarr,1000)) #y top is > 3, x right is 2 x 10^5
#plot(lossarr)

LoadError: UndefVarError: lossarr not defined

In [40]:
function test_acc(xs,ys)
    ncorr = 0
    ntot = size(ys)[1]
    for i in 1:ntot
        #rid = rand(1:size(xs)[1])
        x = float(xs[i,:])
        x = reshape(x,(1,784))
        yt = ys[i,:]
        ŷ = vec(model(x, layers));
        if argmax(ŷ) == argmax(yt)
            ncorr += 1
        end
    end
    println(100 * (ncorr/ntot))
end

test_acc (generic function with 1 method)

In [41]:
test_acc(x_test,y_test)

10.08


In [None]:
#63.42 -50k, 1000dim, without hash table updates
#63.83, with hash table updates
#81.56 with %50 hash updates
#80.21 without hash updates
#80.16 with exp decy hash updates
#81.3  with %25 updates
#72.78% without hashing (random columns)
#80.76 with hashing and sorted columns
#84.95 div_n = 5
#90.35 with k,L=5,7 div_n=5
#90.66 k,L = 5,10
#90.36     k,L = 5,20 div_n=5
#93.55 k,L=5,10 with SGDM
#97.06 with 2 fix layer and SGDM, but 1091.35 seconds run-time

## Archive

```julia
function get_params_view(layers,idx)
    #idx: [rows1,cols2,rows2,cols2]
    p = []
    i = 0
    for layer in layers
        theta = get_view(layer.theta,idx[2*i+1],idx[2*i+2])
        bias = get_view(layer.bias,[],idx[2*i+2])
        append!(p,[theta,bias])
        i+=1
    end
    return p
end
```

```julia
function (m::Layer)(X::Matrix,rows,cols)
    e1 = isempty(cols)
    e2 = isempty(rows)
    if e1 & e2 # neither rows nor cols given
        y = X * m.theta .+ m.bias
    elseif e2 #cols given alone
        y = X * (@view m.theta[:,cols]) .+ (@view m.bias[:,cols]);
    elseif e1 #rows given alone
        y = X * (@view m.theta[rows,:]) .+ m.bias;
    else # rows and cols given
        y = X * (@view m.theta[rows,cols]) .+ (@view m.bias[:,cols]);
    end
    return y
end
```

```julia
function rescale(x_)
    x = copy(x_)
    mn = abs(minimum(x))
    x = x .+ mn
    mx = maximum(x)
    x = x ./ mx
    x
endfunction rescale(x_)
    x = copy(x_)
    mn = abs(minimum(x))
    x = x .+ mn
    mx = maximum(x)
    x = x ./ mx
    x
end
```