In [None]:
using Iterators
using DataStructures
using Pipe
using Compat
using HDF5

# Debug helper: print the source text of an expression, then " = ", then the
# value the expression evaluates to in the caller's scope.
macro printval(ee)
    label = string(ee)  # textual form of the expression, fixed at expansion time
    esc(Expr(:call, :println, label, " = ", ee))
end

# Debug helper: print the source text of an expression followed by the type
# and the size of its value. The expression is evaluated twice (once for
# `typeof`, once for `size`).
macro pz(ee)
    label = string(ee)
    type_call = Expr(:call, :typeof, ee)
    size_call = Expr(:call, :size, ee)
    esc(Expr(:call, :println, label, "\t\t", type_call, "\t", size_call))
end

# Make modules in the current directory and in ../util loadable with `using`.
push!(LOAD_PATH, ".")
push!(LOAD_PATH, "../util")

In [None]:
############## ATIS 2 DATA
# Normalise one ATIS transcript string: apply the substitutions below in
# order, then hand the result to `basic_clean`.
function atis_clean(ss)
    # (pattern, replacement) pairs, applied top to bottom.
    substitutions = Any[
        (r"\[.*?\] ?", ""),             # remove non-word sounds
        (r"\<.*?\> ", ""),              # remove verbal deletions
        (r"\*(.*?)\*", s"\1"),          # remove mispronunciation marks, keep the word
        (r"\:|\-\s\.\s\-", ""),         # remove intra-word pauses
        (r"\w+\- ", ""),                # remove stuttered words
        (r"[!\.,\?]", ""),              # remove punctuation, not used traditionally (see sro spec)
        (r"\s+", ' '),                  # collapse repeated whitespace
    ]
    cleaned = ss
    for (pattern, replacement) in substitutions
        cleaned = replace(cleaned, pattern, replacement)
    end
    basic_clean(cleaned)
end

# Collapse every whitespace run to a single space, lowercase the text and
# strip leading/trailing whitespace.
function basic_clean(ss)
    single_spaced = replace(ss, r"\s+", ' ')
    strip(lowercase(single_spaced))
end


# True only for a non-empty ASCIIString; anything else (including `nothing`
# and non-ASCII strings) is rejected.
function valid(ss)
    isa(ss, ASCIIString) && !isempty(ss)
end


# Load the ATIS corpus: list `path`, keep files whose extension is `ext`,
# read each file, clean it with `clean`, split it into words and keep only
# sentences with more than one word.
# The bare `try ... end` has no catch clause, so a file that cannot be read
# presumably yields `nothing`, which the first `valid` filter then discards.
path, ext, clean = ("../../Resources/corpora/atis2_text/", ".sro", atis_clean)
corpus = @pipe readdir(path)  |> filter!(fn -> splitext(fn)[2]==ext, _) |> map(_) do fn
    try open(readall, path*fn) end
    end |> filter!(valid,_) |> map(clean,_) |> filter!(valid,_) |> map(s->split(s),_) |> filter(ss->length(ss)>1, _);


In [None]:
# Sentence-boundary pseudo-words. Two start and two end markers are defined;
# `trigram_buffer` puts both start markers before and both end markers after
# a sentence.
const START_MARKER1 = "**START1**"
const START_MARKER2 = "**START2**"
const END_MARKER1 = "**END1**"
const END_MARKER2 = "**END2**"

In [None]:
############# Doctor's Letters, Brown
using PyCall
@pyimport nltk.corpus as nltk_corpus
#corpus_reader=nltk_corpus.PlaintextCorpusReader("../../Resources/corpora/confidential_corpora/Doctors_Letters/files/",".*\.txt")

In [None]:
# Use NLTK's Brown corpus as an extra ("bonus") corpus: every sentence is
# lowercased word by word, and the bonus vocabulary is the union of all words.
corpus_reader=nltk_corpus.brown
bonus_corpus = Vector{ASCIIString}[[lowercase(word) for word in sent] for sent in (corpus_reader[:sents]()|> collect)]
bonus_corpus_vocab = @pipe bonus_corpus |> map(Set,_) |> reduce(union,_)
length(bonus_corpus)
# Small hand-written corpus, split into words on whitespace. This rebinds
# `corpus`, replacing whatever an earlier cell stored there.
corpus = @pipe ["name this 1922 novel about leopold bloom written by james joyce",
    "ralph waldo emerson dismissed this poet as the jingle man and james russell lowell called him three-fifths genius and two-fifths sheer fudge",
    "this is the basis of a comedy of manners first performed in 1892",
    "in a third novel a sailor abandons the patna and meets marlow who in another novel meets kurtz in the congo",
    "thus she leaves her husband and child for aleksei vronsky but all ends sadly when she leaps in front of a train",
    "we looked out at the setting sun .",
    " i went to the kitchen .",
    "how are you doing ?"
    ] |>map(split,_)


In [None]:
# Vocabulary = every word in `corpus` plus every word in the bonus corpus.
# `union!` mutates the first per-sentence Set, which is a fresh object made by
# `map(Set, _)`, so `corpus` itself is untouched.
corpus_vocab = @pipe corpus |> map(Set,_) |> reduce(union!,_)
union!(corpus_vocab, bonus_corpus_vocab)
corpus_vocab |> length

In [None]:
############## Books Corpus
# Read the Books corpus files line by line. Each line becomes one sentence,
# stored as a vector of lowercased words converted to Symbols.
# This rebinds `corpus`, replacing whatever an earlier cell stored there.
corpus = Vector{Symbol}[]

corpus_files = ["../../Resources/corpora/"*fn for fn in ["Books/books_large_p1.txt", "Books/books_large_p2.txt",]]
for fn in corpus_files
    open(fn) do corpus_fh
        for sent in eachline(corpus_fh)
            # split() with no delimiter splits on whitespace
            push!(corpus,[symbol(lowercase(word)) for word in split(sent)])
        end
    end
end



In [None]:
# Persist the in-memory corpus with Julia's native serializer.
open("books_corpus.jsz","w") do fh
    serialize(fh, corpus)
end

In [None]:
using JLD

In [None]:
# Also store the corpus in JLD format under the variable name `corpus`.
@save("books_corpus.jld", corpus)

In [None]:
# Build the vocabulary of the Symbol-based corpus, then convert it to a set of
# strings (the embedding loader below is given this string vocabulary).
corpus_vocab = Set{Symbol}()
for sent in corpus 
    union!(corpus_vocab, sent)
end
corpus_vocab = Set{ASCIIString}(map(string,corpus_vocab))
corpus_vocab |> length

In [None]:
using WordEmbeddings
# Load pretrained GloVe embeddings, passing the corpus vocabulary and its size.
# LL is the embedding matrix, word_indexes maps word => index, and
# indexed_words is the index-ordered word list.
# NOTE(review): assumes the loader only keeps words in `corpus_vocab` — confirm in WordEmbeddings.
LL, word_indexes, indexed_words =
    #load_word2vec_embeddings("word_emb_data/GoogleNews-vectors-negative300.bin", length(corpus_vocab), corpus_vocab);
    load_embeddings("word_embeddings/glove.6B.300d.txt", length(corpus_vocab), corpus_vocab);

In [None]:
# Corpus words for which no embedding was loaded.
unknown_words = setdiff(corpus_vocab,indexed_words)


In [None]:
# Kind of the opposite of a stop word: these words carry little meaning (so a
# zero embedding is acceptable) but have much structural importance, so they
# must always be in the vocabulary. The sentence markers are handled the same way.
forcewords = ["and", "a", "of", "to", "the"]
markers = [START_MARKER1,START_MARKER2,END_MARKER1,END_MARKER2]

# Record which force-words have no pretrained embedding BEFORE they are added
# below; once the loop has run every force-word has an index, so computing
# this afterwards would always give an empty list.
zeroed_words = forcewords[Bool[!haskey(word_indexes,word) for word in forcewords]]

added_cols = 0
for word in [forcewords..., markers...]
    if !(word in indexed_words)
        added_cols+=1
        push!(indexed_words, word)
        word_indexes[word] = length(indexed_words)
    end
end
# Append one zero column per newly added word so that LL keeps one column per
# entry of indexed_words. (`length(added_cols)` of an Int is always 1, which
# added a single column no matter how many words were inserted.)
LL = [LL zeros(size(LL,1),added_cols)]

zeroed_words

In [None]:
# Keep only the sentences in which every word has an embedding index.
# Words are converted with `string` first because the corpus may hold Symbols.
known_vocab = Set(indexed_words)
known_corpus = filter(sent -> all(word -> string(word) in known_vocab, sent), corpus);
length(known_corpus)

In [None]:
known_corpus

In [None]:
rand()

In [None]:
# Random boolean mask over known_corpus; an entry is true when its uniform
# draw exceeds 0.9. The mask is unseeded, so it differs between runs.
test_indexes = rand(length(known_corpus)).>0.9

In [None]:
# Serialize the training part: the sentences NOT selected by test_indexes.
open("train_books_corpus.jsz","w") do fh
    serialize(fh, known_corpus[!test_indexes])
end

In [None]:
# Build a Symbol-keyed copy of the embedding data held in `data`.
# NOTE(review): `data` is not defined anywhere in the visible cells —
# presumably a previously deserialized dict; confirm before running.
data_sym = Dict{ByteString,Any}()
data_sym["LL"]= data["LL"]
data_sym["indexed_words"] = map(Symbol, data["indexed_words"])
data_sym["word_indexes"] = [Symbol(word)=>index for (word,index) in data["word_indexes"]]

In [None]:
using JLD
# Store the embedding matrix and both index structures in a JLD file; `@write`
# saves each variable under its own name. The corpus itself is not written.
jldopen("results/data/books300d.jld", "w") do file
    #write(file, "corpus", known_corpus)
    
    @write file LL
    @write file word_indexes
    @write file indexed_words
    
end

In [None]:
################LOADING Done, Now processing

In [None]:
# Reload the serialized training corpus.
# NOTE(review): this path differs from the "train_books_corpus.jsz" written by
# the earlier cell — presumably the file was moved by hand; confirm.
train_corpus = open("results/data/books/train_books_corpus.jsz", "r") do fh
    deserialize(fh)
end


In [None]:
# Dump the training corpus as plain text, one sentence per line, framed by
# both start markers and both end markers, all separated by single spaces.
open("results/data/books/train_books_corpus.txt", "w") do fh
    prefix = string(START_MARKER1)*" "*string(START_MARKER2)*" "
    suffix = " "*string(END_MARKER1)*" "*string(END_MARKER2)*"\n"
    for sent in train_corpus
        print(fh, prefix)
        print(fh, join(sent, " "))
        print(fh, suffix)
    end
end

In [None]:
length(train_corpus)

In [None]:
#################
#Old processing (language modelling done internally)

In [None]:
# Deterministic split: the first tenth of known_corpus (integer division) is
# the test set and the remainder is the training set.
test_corpus = known_corpus[1:end÷10]
train_corpus = known_corpus[1+end÷10:end]

In [None]:
using DataStructures

# Total count held by an Accumulator: the sum of the values of its backing map.
Base.sum(acc::Accumulator) = sum(values(acc.map))

# Sum of the values of a Dict. This adds a method to Base.sum for every Dict,
# so it affects all code loaded in the same session.
Base.sum(acc::Dict) = sum(values(acc))

# Convert a table of counts (Accumulator or Dict) into relative frequencies.
# Returns a Dict with the same keys whose values are count / total, where the
# total comes from the `sum` methods defined for these containers.
function freq2prob{T,V<:Number}(acc::Union{Accumulator{T,V},Dict{T,V}})
    total = sum(acc)
    probs = Dict{T,Float64}()
    for (key, freq) in acc
        probs[key] = freq/total
    end
    probs
end

# Turn a nested co-occurrence table into a square matrix.
#
# bigrams      : Dict mapping a word to a dict-like table {other word => value}
# word_indexes : Dict mapping each word to its row/column index
# dense        : true for a dense `zeros` matrix, false (default) for `spzeros`
#
# Returns `mat` with mat[word_indexes[second], word_indexes[first]] set to
# bigrams[first][second]; the outer key selects the column.
#
# Fixes: the default was `False`, an undefined name that threw as soon as the
# argument was omitted; and `Dict{AbstractString,Int64}` only matched that
# exact parametric type (Dict is invariant), so dicts with concrete string or
# Symbol keys were rejected.
function dict2mat(bigrams::Dict, word_indexes::Dict, dense=false)
    n = length(word_indexes)
    mat = dense ? zeros(n, n) : spzeros(n, n)
    for (given, followers) in bigrams
        for (other, value) in followers
            mat[word_indexes[other], word_indexes[given]] = value
        end
    end
    mat
end


In [None]:
# Count word occurrences and within-sentence co-occurrences.
#
# For every sentence each word is counted once per position (unigram counts),
# and every ordered pair of words at two different positions of the same
# sentence is counted (co-occurrence counts).
#
# Returns (co-occurrence counts as Dict word => Dict other => count,
#          unigram counts as Dict word => count).
function collect_cooccur_stats(sentences)
    unioccur = counter(AbstractString)
    bioccur = DefaultDict(()->counter(AbstractString))

    for sent in sentences
        for (ii, word) in enumerate(sent)
            push!(unioccur, word)
            for (jj, other) in enumerate(sent)
                # skip pairing a position with itself; repeated words at
                # different positions are still counted
                ii != jj && push!(bioccur[word], other)
            end
        end
    end

    # unwrap the Accumulators to their plain backing dicts
    ([k=>v.map for (k,v) in bioccur], unioccur.map)
end

# Co-occurrence statistics of the training corpus.
bioccur_freq, unioccur_freq = collect_cooccur_stats(train_corpus)

# Dense co-occurrence matrix, add-one smoothed and normalised to sum to 1
# over the whole matrix.
bioccur_mat = dict2mat(bioccur_freq,word_indexes,true)
bioccur_mat.+=1.0 # Add one smoothing
bioccur_mat./=sum(bioccur_mat)

# Unsmoothed unigram probabilities in indexed_words order (0.0 for words that
# never occur in the training corpus).
unioccur = freq2prob(unioccur_freq)
unioccur_vec = Float64[haskey(unioccur,word) ? unioccur[word] : 0.0 for word in indexed_words]

# Add-one smoothed unigram probabilities in indexed_words order.
unioccur_vec_smoothed = Float64[word in keys(unioccur_freq) ? unioccur_freq[word] : 0.0 for word in indexed_words]
unioccur_vec_smoothed.+=1.0 # Add one smoothing
unioccur_vec_smoothed./=sum(unioccur_vec_smoothed)

In [None]:
using Gadfly
using Distributions

In [None]:
# Histogram of sentence lengths (in words) in the training corpus.
sent_lengths = map(length, train_corpus)
plot(x=sent_lengths, Geom.histogram)

In [None]:
# Fit a Gamma distribution to the sentence lengths by maximum likelihood, then
# plot a histogram of as many rounded samples as there are sentences, for
# visual comparison with the empirical histogram.
sent_length_dist = fit_mle(Gamma, sent_lengths)
plot(x=[round(rand(sent_length_dist)) for _ in 1:length(sent_lengths)], Geom.histogram)

In [None]:
# Discretise the fitted Gamma: entry k (k = 1..50) is the probability mass on
# the interval [k-0.5, k+0.5], i.e. cdf(k+0.5) - cdf(k-0.5). The vector is then
# renormalised so the 50 entries sum to 1 (mass outside [0.5, 50.5] is dropped).
length_prob = cdf(sent_length_dist, collect(1.5:1.0:50.5)) - cdf(sent_length_dist, collect(0.5:1.0:49.5))
length_prob./=sum(length_prob)

In [None]:
using PyCall
#http://www.nltk.org/howto/probability.html
@pyimport nltk
@pyimport nltk.probability as nltk_prob

In [None]:
# Pad a sentence for trigram extraction: both start markers in front, both end
# markers behind, with the sentence's words spliced in between.
trigram_buffer(sent) = [START_MARKER1, START_MARKER2, sent..., END_MARKER1, END_MARKER2]

# All trigrams of the padded training sentences, extracted with NLTK and
# concatenated into a single vector.
train_trigrams = vcat([collect(nltk.trigrams(trigram_buffer(sent))) for sent in train_corpus]...)
#tfd = pycall(nltk_prob.FreqDist, PyObject, all_trigrams)
#kn_prob_dist = nltk_prob.KneserNeyProbDist(tfd)

In [None]:
# Output file for the prepared data set; swap in one of the commented names
# when preparing a different corpus.
filename = "atis_data2.jsz"
#filename = "doctors_letters.jsz"
#filename = "brown_data.jsz"

# Bundle everything later stages need into one Dict and serialize it:
# trigrams, sentence-length distribution, embeddings with their index
# structures, the held-out test sentences and the zero-embedding words.
open(filename,"w") do fh
    data = Dict([
        ("trigrams", train_trigrams),
        ("length_prob", length_prob),
        ("LL",LL),
        ("word_indexes", word_indexes),
        ("indexed_words", indexed_words),
        ("test_set", test_corpus),
        ("zeroed_words", zeroed_words)
        ])
    serialize(fh, data)    
end

In [None]:
# Probability of a sentence under a bigram model.
#
# sent    : sentence as a single whitespace-separated string
# bigrams : nested table, bigrams[previous][next] = transition probability
#
# The sentence is framed with START_MARKER / END_MARKER and the transition
# probabilities of all adjacent pairs are multiplied together.
# NOTE(review): START_MARKER and END_MARKER are not defined in the visible
# cells (only the numbered START_MARKER1/2 and END_MARKER1/2 are) — confirm
# where they come from before running.
function likelyhood(sent, bigrams)
    tokens = split(sent)
    tokens = [START_MARKER; tokens; END_MARKER]

    prob = 1.0
    for (prev, nxt) in zip(tokens[1:end-1], tokens[2:end])
        prob *= bigrams[prev][nxt]
    end
    prob
end


# Draw one word at random from a table word => probability by inverse-CDF
# sampling: accumulate the probabilities in the dict's iteration order and
# return the first word whose running total reaches a uniform random cutoff.
#
# Throws an ErrorException when the total never reaches the cutoff, i.e. the
# table is empty or its values sum to less than the drawn cutoff. The original
# `assert(False, ...)` referenced the undefined name `False`, so this case
# raised UndefVarError instead of the intended failure.
function select_word{S<:AbstractString,V}(unigrams::Dict{S,V})
    cutoff = rand()
    total = 0.0
    for (word, prob) in unigrams
        total += prob
        if total >= cutoff
            return word
        end
    end
    error("Should never reach here: cumulative probability $total is below the cutoff $cutoff")
end

# Generate a random sentence from a bigram model: start at START_MARKER and
# repeatedly sample the next word with `select_word` until END_MARKER is
# drawn. Returns the words (without the markers) joined by single spaces.
# NOTE(review): START_MARKER and END_MARKER are not defined in the visible
# cells — confirm where they come from before running.
function random_walk(bigrams)
    words = []
    cur = START_MARKER
    while cur != END_MARKER
        cur = select_word(bigrams[cur])
        # the end marker terminates the walk and is not part of the output
        cur != END_MARKER && push!(words, cur)
    end
    join(words, " ")
end

# Sample one sentence and print it, a tab, and its probability under the model.
# NOTE(review): `kbigrams` is not defined in the visible cells.
walk =random_walk(kbigrams) 
print(walk*"\t")
print(likelyhood(walk,kbigrams))

In [None]:
LL

Magic Mass-Sharing Co-occurrence PMF, Inspired by Bengio 2003
---

In [None]:
using Packing
using Optim
push!(LOAD_PATH, "../Optimisation")
using AdaDelta

In [None]:
# Start 11 worker processes for the distributed training below.
addprocs(11)

In [None]:
# Parameters of the lookup network, defined on every process.
# `forward` computes softmax(U*tanh(H*x + d) + b), where x is a sum of
# columns of C (see `expand_training_case`).
@everywhere immutable LU_NN{N<:Number}
    C::Matrix{N} #lookup matrix
    H::Matrix{N} # input-to-hidden weights
    d::Vector{N} # hidden bias
    U::Matrix{N} # hidden-to-output weights
    b::Vector{N} # output bias
end

In [None]:
# Cache: sentence length K => the boolean split masks of length K used by
# prepare_cases, so they are enumerated only once per length.
subset_indexes_for_len = Dict{Int,Vector{Vector{Bool}}}()

# Expand one sentence into training cases. Each case is a tuple
# (indices of the "given" words, indices of the remaining words), one per
# boolean mask over the sentence positions. The first and last masks of the
# product enumeration are dropped — presumably the all-true and all-false
# masks, so both sides are non-empty. The number of cases grows as 2^K.
function prepare_cases(sentence)
    xs = Int[word_indexes[word] for word in sentence]
    K = length(xs)
    masks::Vector{Vector{Bool}} = get!(subset_indexes_for_len, K) do
        every_mask = Vector{Bool}[collect(pr) for pr in  product([(true,false) for k in 1:K]...)]
        every_mask[2:end-1]
    end

    CT = Tuple{Vector{Int},Vector{Int}}
    CT[(xs[mask], xs[!mask]) for mask in masks]
end

In [None]:
# Flatten the per-sentence cases of the whole known corpus into one vector.
training_cases = @pipe known_corpus |> map(prepare_cases,_) |> vcat(_...)

In [None]:
# Build a randomly initialised network: every parameter is drawn from a
# standard normal and scaled by 0.01.
#   C is dEmb x dOut, H is dHidden x dEmb, d has length dHidden,
#   U is dOut x dHidden, b has length dOut.
function LU_NN(dEmb, dHidden, dOut)
    scale = 0.01
    # drawn in field order (C, H, d, U, b)
    C = scale*(randn((dEmb,dOut)))
    H = scale*(randn((dHidden,dEmb)))
    d = scale*(randn(dHidden))
    U = scale*(randn((dOut,dHidden)))
    b = scale*(randn(dOut))
    LU_NN(C, H, d, U, b)
end

In [None]:
# Softmax of a score vector: exp(xs) normalised to sum to 1.
# The maximum score is subtracted before exponentiating. This leaves the
# result mathematically unchanged but keeps exp() from overflowing to Inf
# (which produced NaN) on large scores. Empty input yields empty output.
@everywhere function softmax(xs)
    isempty(xs) && return map(exp, xs)
    shifted = xs .- maximum(xs)
    numer = map(exp, shifted)
    numer./sum(numer)
end


# Logistic sigmoid, 1 / (1 + exp(-x)), applied element-wise.
@everywhere function σ(xs)
    denom = 1+exp(-xs)
    1.0./denom
end

#########################################



# Back-propagate an error signal through a linear layer with weights W:
# returns W' * δ_above.
@everywhere δ(δ_above, W) = W'*δ_above

# Back-propagate through a tanh activation.
# `a` is the activation output tanh(z), so the derivative is 1 - a^2.
# The local `const` was dropped: it has no effect on a local variable and
# later Julia versions reject it.
@everywhere function δ_tanh(δ_above, a)
    dz = 1 .- a.^2
    δ_above.*dz
end

# Back-propagate through a logistic-sigmoid activation.
# `a` is the activation output σ(z), so the derivative is a - a^2 = a(1 - a).
# The local `const` was dropped: it has no effect on a local variable and
# later Julia versions reject it.
@everywhere function δ_σ(δ_above, a)
    dz = a .- a.^2
    δ_above.*dz
end

# Error signal at the pre-softmax scores for a softmax output combined with
# cross-entropy loss: actual - expected.
@everywhere δ_softmax_ce(actual, expected) = actual-expected

# Derivative of the squared-error loss with respect to the output:
# -(expected - actual).
@everywhere δ_output_sq_loss(actual, expected) = -(expected-actual)

#############################################

# Squared-error loss: half the sum of squared differences.
@everywhere function sq_loss(actual, expected)
    residual = expected-actual
    0.5*sum(residual.^2)
end

# Cross-entropy between the predicted distribution `actual` and the target
# distribution `expected`: -sum(expected .* log(actual)).
# Asserts that every predicted probability is strictly positive.
@everywhere function ce_loss(actual, expected)
    @assert(all(actual.>0))
    weighted_logs = expected.*log(actual)
    -sum(weighted_logs)
end

# Forward pass: hidden = tanh(H*x + d), output = softmax(U*hidden + b).
# Returns (output distribution, hidden activation); the hidden activation is
# returned as well so back-propagation can reuse it.
@everywhere function forward(x,nn::LU_NN)
    hidden = tanh(nn.H*x+nn.d)
    scores = nn.U*hidden + nn.b
    (softmax(scores), hidden)
end

# One forward and backward pass for a single input under cross-entropy loss.
#
# xx              : input vector
# nn              : network parameters
# expected_output : target distribution over the outputs
#
# Returns (Δx, ΔH, Δd, ΔU, Δb, err): the gradients with respect to the input,
# H, d, U and b, followed by the loss value.
@everywhere function feedforward_backprop(xx,nn::LU_NN, expected_output)
    actual_output, hidden = forward(xx,nn)
    err = ce_loss(actual_output, expected_output)

    # output layer: softmax combined with cross-entropy
    δ_out = δ_softmax_ce(actual_output, expected_output)
    # hidden layer: back through U, then through the tanh non-linearity
    δ_hid = δ_tanh(δ(δ_out, nn.U), hidden)
    # input: back through H
    δ_in = δ(δ_hid, nn.H)

    (δ_in, δ_hid*xx', δ_hid, δ_out*hidden', δ_out, err)
end

In [None]:
# Return the contiguous chunk of `xs` that belongs to worker `id`.
#
# xs      : indexable collection to split
# id      : process id, defaults to the calling process
# nchunks : number of chunks, defaults to the number of workers
#
# Assumes workers are numbered sequentially from 2 (worker 2 gets the first
# chunk); any other id throws an ArgumentError. The last chunk also takes the
# remainder, so the chunks together cover every element of `xs`.
#
# Fixes: the chunk size was div(len, nchunks+1), so the tail of `xs` (at least
# one chunk's worth) was never assigned to any worker; the stray debug print
# of the index range was removed.
@everywhere function mysubarray(xs, id=myid(), nchunks=nworkers())
    chunk = id - 2  # zero-based chunk number
    (0 <= chunk < nchunks) || throw(ArgumentError(
        "process id $id does not map to one of $nchunks chunks (expected ids 2:$(nchunks+1))"))
    len = length(xs)
    chunk_size = div(len, nchunks)
    start_index = chunk*chunk_size + 1
    end_index = (chunk == nchunks-1) ? len : start_index+chunk_size-1
    sub(xs, start_index : end_index)
end

# Turn a training case of word indices into network input and target.
#
# given_iis  : indices of the given words; the input is the sum of their
#              columns of nn.C (a zero vector when there are none)
# target_iis : indices of the words to predict
#
# Returns (input vector, target vector). The target has the length of nn.b and
# places mass 1/length(target_iis) on each target index, so it sums to 1; an
# index listed twice receives twice the mass.
#
# Fix: the original added the share to the WHOLE target vector on every
# iteration, so the target always ended up as all ones instead of a
# distribution concentrated on the target indices.
@everywhere function expand_training_case(given_iis, target_iis, nn)
    given_sowe = if length(given_iis)>0
        sum([nn.C[:,g_ii] for g_ii in given_iis])
    else
        zeros(eltype(nn.C), size(nn.C,1))
    end
    target = zeros(eltype(nn.b), length(nn.b))
    share = 1.0/length(target_iis)
    for t_ii in target_iis
        target[t_ii] += share
    end
    given_sowe,target
end

# Gradients and loss for a single training case.
# Returns (ΔC, ΔH, Δd, ΔU, Δb, err). ΔC is zero except in the columns of the
# given words, each of which accumulates the input gradient divided by the
# number of given words.
@everywhere function train_one(given_iis, target_iis, nn)
    sowe, target = expand_training_case(given_iis, target_iis, nn)
    grads = feedforward_backprop(sowe, nn, target)
    Δx = grads[1]

    ΔC = zeros(nn.C)
    for col in given_iis
        @inbounds ΔC[:, col] += Δx./length(given_iis)
    end
    (ΔC, grads[2], grads[3], grads[4], grads[5], grads[6])
end


# Compute averaged gradients and loss over the training cases in parallel.
# Each worker processes its own chunk (chosen by `mysubarray`) and returns
# per-parameter totals; the master sums the workers' totals and divides by the
# total number of training cases.
# Returns a tuple (ΔC, ΔH, Δd, ΔU, Δb, err).
# NOTE(review): the divisor is the full case count; check that the per-worker
# chunks from `mysubarray` jointly cover all cases.
function train_all(training_cases,nn)
    
    # Sum fun(case) element-wise over `cases`. fun must return six items in
    # the order (ΔC, ΔH, Δd, ΔU, Δb, err).
    function accumulate_training_over(cases,fun)
        totals = Any[  
            zeros(nn.C),
            zeros(nn.H),
            zeros(nn.d),
            zeros(nn.U),
            zeros(nn.b),
            0.0]
        for case in cases
            Δs = fun(case)
            for ii in 1:length(totals)
                totals[ii]+=Δs[ii]
            end
        end
        totals
    end
    
    # Runs on a worker: accumulate over that worker's chunk of the cases.
    function train_remote()
        accumulate_training_over(mysubarray(training_cases),
                                 gt_iis -> train_one(gt_iis[1], gt_iis[2], nn) )
    end
    
    # One remote job per worker; each element is a reference to its result.
    r_updates = [@spawnat(id, train_remote())  for id in workers()]
        
    # Reuse the accumulator on the master with `fetch` to combine the results.
    totals = accumulate_training_over(r_updates, fetch)
    ([tot./length(training_cases) for tot in totals]...)
end
    

In [None]:
# Objective and gradient for the optimiser, computed without caching.
#
# θ    : packed parameter vector; unpacked into the global `nn_outer`
# grad : output vector, overwritten with the gradient of the returned value
#
# Returns the mean training loss plus an L2 penalty 0.01*sum(θ.^2).
#
# Fix: the penalty was added to the returned loss but its derivative
# (2*0.01*θ) was missing from `grad`, so the gradient did not match the
# objective handed to the optimiser.
# Assumes `pack!` writes the Δs into `grad` in the same layout as θ
# (C, H, d, U, b), matching `θ=pack(nn_outer)` — confirm in Packing.
function uncached_loss_and_loss_grad!(θ::Vector, grad::Vector)
    unpack!(θ, nn_outer)
    train_results = train_all(training_cases, nn_outer)
    Δs = train_results[1:end-1]
    err = train_results[end]
    pack!(grad, Δs...)

    λ = 0.01  # L2 regularisation strength
    for ii in eachindex(grad)
        grad[ii] += 2*λ*θ[ii]
    end
    err+λ*sum(θ.^2)
end

# Memo table: parameter vector => (loss, gradient). It grows without bound
# over an optimisation run.
_loss_and_loss_grad=Dict{Vector{Float64},Tuple{Float64, Vector{Float64}}}()

# Cached objective and gradient: returns the loss for θ and overwrites `grad`
# with the gradient, reusing an earlier result when the same θ was seen before.
#
# Fix: the cache key is now a copy of θ. Storing the caller's vector itself
# meant that an in-place update of θ by the caller (optimisers may reuse their
# buffers) silently changed the stored key, corrupting lookups in the table.
function loss_and_loss_grad!(θ::Vector, grad::Vector)
    if haskey(_loss_and_loss_grad,θ)
        err, grad[:]= _loss_and_loss_grad[θ]
    else
        err = uncached_loss_and_loss_grad!(θ, grad)
        _loss_and_loss_grad[copy(θ)] = (err, copy(grad))
    end
    err
end

# Mean cross-entropy loss over all training cases for the packed parameters θ.
# θ is unpacked into the global `nn_outer`, and the per-case losses are summed
# with a parallel (+) reduction.
# NOTE(review): unlike `uncached_loss_and_loss_grad!`, this value does not
# include the 0.01*sum(θ.^2) penalty, so the two functions report different
# objectives for the same θ — confirm which one is intended.
function loss!(θ::Vector)  
    global nn_outer
    unpack!(θ, nn_outer)
    total_loss = @parallel (+) for (g_ii, t_ii) in training_cases
        x, target = expand_training_case(g_ii, t_ii,nn_outer)
        actual, _ = forward(x,nn_outer)
        ce_loss(actual, target)    
    end
    total_loss/length(training_cases)

end

# Gradient-only entry point for the optimiser: delegates to the cached combined
# routine, which fills `grad`, and passes its return value (the loss) through.
loss_grad!(θ::Vector, grad::Vector) = loss_and_loss_grad!(θ, grad)


In [None]:
using MultivariateStats

In [None]:
# Build the network to be trained: 9-dimensional embeddings, 6 hidden units
# and one output per vocabulary word.
dEmb = 9
nn_outer = LU_NN(dEmb, 6, length(indexed_words));
# Start the output bias at the unsmoothed unigram probabilities.
nn_outer.b[:] = unioccur_vec;

# Start the lookup matrix at the pretrained embeddings reduced to dEmb
# dimensions with PCA.
nn_outer.C[:] = transform(fit(PCA, LL; maxoutdim=dEmb), LL) |> vec


# Packed parameter vector handed to the optimiser.
θ=pack(nn_outer)

In [None]:
loss!(θ)

In [None]:
# Wrap the loss, gradient and combined routines for Optim and run 10 L-BFGS
# iterations from θ, keeping the trace; then print the optimiser's summary fields.
opt_func = DifferentiableFunction(loss!,loss_grad!,loss_and_loss_grad!)

@time res = optimize(opt_func, θ, method=:l_bfgs, show_trace = true, store_trace = true, iterations = 10);
#@time res = adadelta(opt_func, θ, show_trace = true, iterations = 500);
@printval res.f_calls 
@printval res.g_calls 
@printval res.iterations
@printval res.f_minimum
@printval res.gr_converged
@printval res.x_converged                       
@printval res.f_converged 


In [None]:
# Sanity check: feed the embedding of "find" through the network and show the
# word with the highest predicted probability.
sowe = nn_outer.C[:,word_indexes["find"]]
ns,_ = forward(sowe,nn_outer)
indexed_words[findmax(ns)[2]]

In [None]:
@printval res.gr_converged
@printval res.x_converged                       
@printval res.f_converged 


In [None]:
ns

In [None]:
var(bb)

In [None]:
var(WW)

In [None]:
known_corpus[end-2]

In [None]:
LL[:,word_indexes["first"]] + LL[:,word_indexes["class"]]

In [None]:
# Gradient-check fixture: a small random network (8-dim embeddings, 16 hidden
# units, 19 outputs). This rebinds the global `nn_outer`.
nn_outer = LU_NN(8, 16, 19);

# Input: the sum of the embeddings of words 1, 2 and 3.
xx=sum([nn_outer.C[:,g_ii] for g_ii in [1,2,3]])
# Target: mass 1/3 on each of outputs 10, 11 and 12.
target = zeros(nn_outer.b)
target[[10,11,12]]=1.0/3


using ForwardDiff

# Scalar objective for the gradient check: unpack θ into the input x and the
# parameters H, d, U, b, run the forward pass and return the loss against the
# global `target`.
#
# Fix: the loss is now the cross-entropy. `feedforward_backprop`, whose
# gradients and loss this function is compared against, uses `ce_loss` with
# the softmax/cross-entropy error signal, so differentiating `sq_loss` here
# compared two different objectives.
function f(θ)
    x,H,d,U,b = unpack(θ,size(xx),(16,8),16,(19,16),19)
    nn =  LU_NN(0.0*H, H,d,U,b)#using 0.0*H as a dummy value for C

    actual, _=forward(x,nn)
    ce_loss(actual, target)
end

# Analytic gradients for the gradient check: unpack θ into the input x and the
# parameters H, d, U, b and run one back-propagation pass against the global
# `target`. Returns (Δx, ΔH, Δd, ΔU, Δb, err).
function calc_ag(θ)
    x,H,d,U,b = unpack(θ,size(xx),(16,8),16,(19,16),19)
    dummy_C = 0.0*H  # C is not used by the forward/backward pass
    feedforward_backprop(x, LU_NN(dummy_C, H,d,U,b), target)
end

# Gradient function of f obtained by automatic differentiation.
g = ForwardDiff.gradient(f)

# Evaluate the analytic and the automatic gradients at the same packed point.
t=pack(xx,nn_outer.H,nn_outer.d,nn_outer.U,nn_outer.b)
a_x, a_H, a_d, a_U, a_b, a_err = calc_ag(t)
dg = g(t)  # automatic-differentiation gradient at t
d_x, d_H, d_d, d_U, d_b = unpack(dg,size(xx),(16,8),16,(19,16),19)


# Compare the loss values, then show the largest absolute difference (and its
# position) between the two gradients for each parameter.
@printval f(t) == a_err
@printval findmax(abs(d_x .- a_x))
@printval findmax(abs(d_H .- a_H))
@printval findmax(abs(d_d .- a_d))
@printval findmax(abs(d_U .- a_U))
@printval findmax(abs(d_b .- a_b))

In [None]:
[a_err f(t)]

In [None]:
f(t)

In [None]:
a_err