In [1]:
using Iterators
using DataStructures
using Pipe
using Compat

# Debugging aid: prints the source text of an expression, then " = ", then the
# value the expression evaluates to in the caller's scope.
macro printval(ee)
    label = string(ee)
    esc(:(println($label, " = ", $ee)))
end

# Debugging aid: prints the source text of an expression followed by the type
# and the size of its value (evaluated in the caller's scope), tab separated.
macro pz(ee)
    label = string(ee)
    esc(:(println($label, "\t\t", typeof($ee), "\t", size($ee))))
end

# Add the notebook directory and ../util to the module search path so that
# modules located there can be loaded with `using`.
push!(LOAD_PATH, ".")
push!(LOAD_PATH, "../util")

4-element Array{ByteString,1}:
 "/home/ubuntu/build/julia-master/usr/local/share/julia/site/v0.5"
 "/home/ubuntu/build/julia-master/usr/share/julia/site/v0.5"      
 "."                                                              
 "../util"                                                        

In [2]:
using Packing

In [3]:
using DataStructures

# Total of all counts held by an Accumulator (sums the values of its `map` field).
Base.sum(acc::Accumulator) = sum(values(acc.map))

# Sum of a Dict's values; the keys are ignored.
# NOTE(review): this adds a method to a Base function for a Base type.
Base.sum(acc::Dict) = sum(values(acc))

# Convert a table of counts into relative frequencies.
#
# Arguments:
#   acc - an Accumulator or a Dict mapping keys to numeric counts.
# Returns a Dict{T,Float64} mapping each key to count / (sum of all counts).
# The total comes from `sum(acc)`, i.e. the `Base.sum` methods defined for
# Accumulator and Dict in this notebook.
function freq2prob{T,V<:Number}(acc::Union{Accumulator{T,V},Dict{T,V}})
    total = sum(acc)
    probs = Dict{T,Float64}()
    for (key, freq) in acc
        probs[key] = freq/total
    end
    probs
end

freq2prob (generic function with 1 method)

In [4]:
# Normalise one transcript string. The rewrites are applied strictly in the order
# written, each one operating on the output of the previous one, and the result is
# lowercased and stripped of surrounding whitespace.
function clean(ss)
    text = replace(ss, r"\[.*?\] ?", "")           # remove non-word sounds: [...] and one optional following space
    text = replace(text, r"\<.*?\> ", "")          # remove verbal deletions: <...> and the following space
    text = replace(text, r"\*(.*?)\*", s"\1")      # remove mispronunciation marks, keeping the word between them
    text = replace(text, r"\:|\-\s\.\s\-", "")     # remove intra-word pauses
    text = replace(text, r"\w+\- ", "")            # remove stuttered words
    text = replace(text, r"[!\.,\?]", "")          # remove punctuation, as it is not used traditionally (see sro spec)
    text = replace(text, r"\s+", ' ')              # collapse runs of whitespace into a single space
    # Disabled step, kept for reference (merge two-letter abbreviations):
    #   replace(_, r"([A-Z])\s([A-Z])", s"\1\2")
    text = replace(text, r".*[~\(\)\-\<\>#'].*", "") # blank the text if any unfixable markup character remains
    strip(lowercase(text))
end


clean (generic function with 1 method)

In [5]:
# Sentinel tokens used below as pseudo-words for the start and the end of a sentence.
const START_MARKER = "**START**"
const END_MARKER = "**END**"

"**END**"

In [6]:
# Directory whose files are listed and read to build the corpus below.
path="../../Resources/corpora/atis2_text/"
# True only for a non-empty ASCIIString; every other value (including `nothing`
# and strings of other types) is rejected.
function valid(ss)
    isa(ss, ASCIIString) && !isempty(ss)
end
    
# Build the corpus in one pipeline: take the first four directory entries, keep the
# ones with a ".sro" extension, read each file (the `try` has no `catch`, so a failed
# read yields `nothing`), drop entries rejected by `valid`, clean the text, re-validate
# (cleaning can leave an empty string) and split each transcript on whitespace.
# NOTE(review): `[1:4]` restricts the run to the first four directory entries, which
# looks like a debugging limit — confirm before relying on the resulting statistics.
corpus = @pipe readdir(path)[1:4] |> filter!(fn -> splitext(fn)[2]==".sro", _) |> map(_) do fn
    try open(readall, path*fn) end
    end |> filter!(valid,_) |> map(clean,_) |> filter!(valid,_) |> map(s->split(s),_);
# Set of all distinct tokens across the corpus; the last line displays its size.
corpus_vocab = @pipe corpus |> map(Set,_) |> reduce(union,_)
length(corpus_vocab)

14

In [7]:
using WordEmbeddings
# Load pretrained word2vec vectors, passing the corpus vocabulary and its size.
# NOTE(review): judging by how the results are used below, LL is an embedding matrix
# with one column per word, word_indexes maps word => column index and indexed_words
# lists words by index — confirm against WordEmbeddings.load_word2vec_embeddings.
LL, word_indexes, indexed_words = load_word2vec_embeddings("word_emb_data/GoogleNews-vectors-negative300.bin", length(corpus_vocab), corpus_vocab);

In [8]:
# Corpus words that are missing from the loaded embedding vocabulary.
setdiff(corpus_vocab,indexed_words)


1-element Array{UTF8String,1}:
 "to"

In [9]:
# "Force words" are roughly the opposite of stop words: they carry little meaning
# (hence an all-zero embedding) but have much structural importance, so they are
# added to the vocabulary by hand.
forcewords = ["and", "a", "of", "to"]
for word in forcewords
    @assert(!(word in indexed_words)) # a forced word must not already be in the vocabulary
    push!(indexed_words, word)
    word_indexes[word] = length(indexed_words) # index of the entry just appended
end
# Append one all-zero column to the embedding matrix per forced word.
LL = [LL zeros(size(LL,1),length(forcewords))]

300x17 Array{Float64,2}:
  0.00374603   0.0746853    0.0348454    …  -0.0964167   0.0  0.0  0.0  0.0
 -0.0389198    0.0979106   -0.00010466       0.0181234   0.0  0.0  0.0  0.0
  0.0913317    0.0464506    0.0456797        0.0547328   0.0  0.0  0.0  0.0
  0.0120003    0.0498661    0.0319173        0.0739436   0.0  0.0  0.0  0.0
 -0.0705745   -0.062845     0.0150802       -0.026279    0.0  0.0  0.0  0.0
  0.105343    -0.112483     0.012518     …  -0.0349782   0.0  0.0  0.0  0.0
  0.0599364    0.0327887   -0.0415803       -0.104391    0.0  0.0  0.0  0.0
 -0.0573418   -0.110662    -0.0796467        0.0424088   0.0  0.0  0.0  0.0
  0.0381414    0.0409858   -0.0415803        0.0025033   0.0  0.0  0.0  0.0
  0.0110921    0.0281208    0.0983872        0.00602604  0.0  0.0  0.0  0.0
 -0.0653852   -0.0530539   -0.0629561    …  -0.0424088   0.0  0.0  0.0  0.0
 -0.0313953   -0.0710421   -0.0480223       -0.0366093   0.0  0.0  0.0  0.0
  0.0537093    0.0120111   -0.0325029       -0.131939    0.0  0

In [10]:
# Register the sentence-boundary markers as vocabulary entries. Each must not be
# present already; its index is the position it was appended at.
@assert(!(START_MARKER in indexed_words))
push!(indexed_words, START_MARKER)
word_indexes[START_MARKER] = length(indexed_words)

@assert(!(END_MARKER in indexed_words))
push!(indexed_words, END_MARKER)
word_indexes[END_MARKER] = length(indexed_words)
# Two extra all-zero embedding columns, one per marker.
LL = [LL zeros(size(LL,1),2)]

300x19 Array{Float64,2}:
  0.00374603   0.0746853    0.0348454    …  0.0  0.0  0.0  0.0  0.0  0.0
 -0.0389198    0.0979106   -0.00010466      0.0  0.0  0.0  0.0  0.0  0.0
  0.0913317    0.0464506    0.0456797       0.0  0.0  0.0  0.0  0.0  0.0
  0.0120003    0.0498661    0.0319173       0.0  0.0  0.0  0.0  0.0  0.0
 -0.0705745   -0.062845     0.0150802       0.0  0.0  0.0  0.0  0.0  0.0
  0.105343    -0.112483     0.012518     …  0.0  0.0  0.0  0.0  0.0  0.0
  0.0599364    0.0327887   -0.0415803       0.0  0.0  0.0  0.0  0.0  0.0
 -0.0573418   -0.110662    -0.0796467       0.0  0.0  0.0  0.0  0.0  0.0
  0.0381414    0.0409858   -0.0415803       0.0  0.0  0.0  0.0  0.0  0.0
  0.0110921    0.0281208    0.0983872       0.0  0.0  0.0  0.0  0.0  0.0
 -0.0653852   -0.0530539   -0.0629561    …  0.0  0.0  0.0  0.0  0.0  0.0
 -0.0313953   -0.0710421   -0.0480223       0.0  0.0  0.0  0.0  0.0  0.0
  0.0537093    0.0120111   -0.0325029       0.0  0.0  0.0  0.0  0.0  0.0
  ⋮                       

In [11]:
# Vocabulary as a Set for fast membership tests.
known_vocab = Set(indexed_words)
# Keep only the sentences in which every word belongs to the known vocabulary.
known_corpus = filter(sent -> all(word -> word in known_vocab, sent), corpus);

In [None]:
################LOADING Done, Now processing

In [None]:


# Count unigram and bigram frequencies over tokenised sentences.
#
# Arguments:
#   sentences - iterable of sentences, each an ordered collection of word strings.
# Returns the tuple `(bigrams, unigrams)`:
#   bigrams  - Dict mapping a first word to a Dict of {following word => count}.
#              START_MARKER appears as the word before the first word of a sentence
#              and END_MARKER as the word following the last one.
#   unigrams - Dict mapping each word to its number of occurrences (the markers are
#              not counted as unigrams).
# Empty sentences are skipped: they have no first word to pair with START_MARKER.
function collect_grams_stats(sentences)
    unigrams = counter(AbstractString)
    bigrams = DefaultDict(()->counter(AbstractString))

    for sent in sentences
        isempty(sent) && continue
        prev = START_MARKER
        for word in sent
            push!(unigrams, word)
            push!(bigrams[prev], word)
            prev = word
        end
        push!(bigrams[prev], END_MARKER)
    end

    # Wrapped in an explicit Dict(...) like the other dictionaries built in this
    # notebook, rather than relying on the bare dict-comprehension syntax; callers
    # dispatch on `::Dict`.
    Dict([k=>v.map for (k,v) in bigrams]), unigrams.map
end




In [None]:
using StatsBase
# modified from https://github.com/JoFrhwld/GoodTuring.jl/blob/master/GoodTuring.jl
# Simple Good-Turing smoothing (Gale & Sampson) of a table of observed counts.
#
# Arguments:
#   speciesCountDict - Dict mapping each species (e.g. a word) to its observed count.
# Returns the tuple `(sgtDict, sgtProbDict, p0)`:
#   sgtDict     - Dict species => smoothed probability of that species.
#   sgtProbDict - Dict count r => smoothed probability of one species seen r times.
#   p0          - probability mass reserved for unseen species (N_1 / total count,
#                 or 0.0 when no species was seen exactly once).
# Throws ArgumentError when the table is empty.
function simpleGoodTuring(speciesCountDict::Dict)
    isempty(speciesCountDict) && throw(ArgumentError("speciesCountDict must not be empty"))
    speciesCountVec = collect(values(speciesCountDict))

    totalCounts = sum(speciesCountVec)
    cofcDict = countmap(speciesCountVec)   # count of counts: r => N_r
    r = sort(collect(keys(cofcDict)))

    N = size(r,1)
    Nr = [cofcDict[r[i]] for i in 1:N]

    p0 = haskey(cofcDict, 1.0) ? cofcDict[1.0] / totalCounts : 0.0

    # Log-log linear regression of the averaged counts Z_r on r.
    Z = sgtZ(r,Nr)
    logr = map(log,r)
    logZ = map(log,Z)

    X = hcat(ones(N), logr)
    coefs = X\logZ
    intercept = coefs[1]
    slope = coefs[2]

    # Once the regression estimate `y` has been chosen for some r, it is used for
    # every larger r as well.
    useY = false
    rSmooth = Array{Float64}(N)
    for i in 1:N
        thisr = r[i]

        #y = ((thisr+1.0)^(slope+1.0))/(thisr^slope)
        #The above is the much simplified form of the below (Performance identical output differs by 10^-16)
        y = (thisr+1.0) * exp(slope * log(thisr+1.0) + intercept) / exp(slope * log(thisr) + intercept)

        # The Turing estimate needs N_{r+1}; without it only `y` is available.
        if !haskey(cofcDict, thisr+1)
            useY = true
        end

        if useY
            rSmooth[i] = y
        else
            thisNr = cofcDict[thisr]
            thisNr1 = cofcDict[thisr+1]
            x = (thisr+1) * thisNr1/thisNr   # Turing estimate

            # Switch-over threshold: 1.96 standard deviations of the Turing
            # estimate. The bracketed product is its approximate variance, so the
            # square root is required.
            t = 1.96 * sqrt(((thisr+1)^2) * (thisNr1 / thisNr^2) * (1 + (thisNr1 / thisNr)))

            if abs(x-y) > t
                rSmooth[i] = x
            else
                useY = true
                rSmooth[i] = y
            end
        end
    end

    # Renormalise so that the seen species share the mass 1 - p0.
    smoothTot = sum(Nr.*rSmooth)
    sgtProb  = (1.0 - p0) .* (rSmooth/smoothTot)
    sgtProbDict = Dict([r[i] => sgtProb[i] for i in 1:N])
    sgtDict = Dict([sp=>sgtProbDict[speciesCountDict[sp]] for sp in keys(speciesCountDict)])

    sgtDict, sgtProbDict, p0
end


# Averaging transform used by Simple Good-Turing: for the q-th observed count,
# Z = 2*Nr[q] / (k - i), where i is the previous observed count (0 for the first)
# and k the next observed count (2*r[end] - i for the last).
#
# Arguments:
#   r  - observed counts in ascending order.
#   Nr - number of species having each of those counts.
# Returns a Vector{Float64} with one Z value per entry of `r`.
function sgtZ(r::Array, Nr::Array)
    n = length(r)
    below(q) = q == 1 ? 0 : r[q-1]
    top = 2*r[end] - below(n)
    above(q) = q == n ? top : r[q+1]
    Float64[(2*Nr[q])/(above(q)-below(q)) for q = 1:n]
end

# Vector convenience method: treats position ii of `speciesCountVec` as a species
# with that count and returns the smoothed probabilities in the same order.
function simpleGoodTuring(speciesCountVec::Vector)
    countsByIndex = Dict([ii=>vv for (ii,vv) in enumerate(speciesCountVec)])
    # The Dict method returns (sgtDict, sgtProbDict, p0); only the per-species
    # probabilities are needed here.
    sgtD = simpleGoodTuring(countsByIndex)[1]
    [sgtD[ii] for ii in 1:length(speciesCountVec)]
end

In [None]:
# Bigram probabilities with Good-Turing smoothing and back-off to unigram frequencies.
#
# Arguments:
#   bigram_freq   - Dict first word => Dict {following word => count}.
#   unigrams_freq - Dict word => count.
# Returns a Dict first word => Dict {following word => probability}. For each first
# word the observed followers get their Simple Good-Turing probability; the unseen
# mass p0 is shared among the remaining unigram words in proportion to their
# unigram counts.
function katz_bigrams(bigram_freq::Dict, unigrams_freq::Dict)
    k_bigrams = Dict()

    for prev_word in keys(bigram_freq)
        smoothed,_,p0 = simpleGoodTuring(bigram_freq[prev_word])
        k_bigrams[prev_word] = smoothed

        # Words never seen after prev_word.
        backoff_keys = setdiff(keys(unigrams_freq),keys(smoothed))
        # Nothing to back off to: every unigram word was observed after prev_word.
        isempty(backoff_keys) && continue

        #share the p0 probability mass between them
        total = sum([unigrams_freq[key] for key in backoff_keys])
        for second in backoff_keys
            smoothed[second] = p0*unigrams_freq[second]/total
        end
    end
    k_bigrams
end

In [None]:
# Scratch cell: evaluates to the `Vector` type.
Vector

In [None]:
# Turn a nested dictionary of pair values into a square matrix.
#
# Arguments:
#   bigrams      - Dict first word => Dict {second word => value}.
#   word_indexes - Dict word => index; its length is the number of rows and columns.
#                  Any Dict is accepted, since Dict type parameters are invariant
#                  and a narrower annotation rejects otherwise suitable maps.
#   dense        - `true` for a dense matrix of zeros, `false` (default) for a
#                  sparse one.
# Returns a matrix `mat` with `mat[index of second, index of first] = value`;
# entries for pairs that do not occur stay zero.
function dict2mat(bigrams::Dict, word_indexes::Dict, dense=false)
    n = length(word_indexes)
    mat = dense ? zeros(n, n) : spzeros(n, n)
    for (first_word, followers) in bigrams
        col = word_indexes[first_word]
        for (second_word, value) in followers
            mat[word_indexes[second_word], col] = value
        end
    end
    mat
end

In [None]:
# Count n-grams on the known-vocabulary corpus, smooth them into bigram
# probabilities and lay those out as a dense matrix indexed by word_indexes.
bigram_freq, unigram_freq = collect_grams_stats(known_corpus);
kbigrams=katz_bigrams(bigram_freq, unigram_freq)
kbigrams_mat = dict2mat(kbigrams,word_indexes,true)


In [None]:
using Gadfly
using Distributions

In [None]:
# Number of words in each sentence, shown as a histogram.
sent_lengths = map(length, known_corpus)
plot(x=sent_lengths, Geom.histogram)

In [None]:
# Fit a Gamma distribution to the sentence lengths by maximum likelihood, then plot
# a histogram of as many rounded random draws as there are sentences, for visual
# comparison with the empirical histogram.
sent_length_dist = fit_mle(Gamma, sent_lengths)
plot(x=[round(rand(sent_length_dist)) for _ in 1:length(sent_lengths)], Geom.histogram)

In [None]:
# Probability of each integer sentence length n = 1..50 under the fitted
# distribution, taken as cdf(n + 0.5) - cdf(n - 0.5). The ranges are materialised
# with `collect`, because `[a:b]` only builds a vector through a deprecated
# concatenation rule.
length_prob=cdf(sent_length_dist,collect(1.5:1.0:50.5))-cdf(sent_length_dist,collect(0.5:1.0:49.5))


In [None]:
# Count word occurrences and within-sentence co-occurrences.
#
# Arguments:
#   sentences - iterable of sentences, each an indexable collection of word strings.
# Returns the tuple `(bioccur, unioccur)`:
#   bioccur  - Dict word => Dict {other word => count}; every ordered pair of
#              distinct positions in a sentence is counted once, so a word that is
#              repeated in a sentence co-occurs with itself.
#   unioccur - Dict word => number of occurrences.
function collect_cooccur_stats(sentences)
    unioccur = counter(AbstractString)
    bioccur = DefaultDict(()->counter(AbstractString))

    for sent in sentences
        for ii in 1:length(sent)
            push!(unioccur, sent[ii])
            for jj in 1:length(sent)
                ii == jj && continue   # a position does not co-occur with itself
                push!(bioccur[sent[ii]], sent[jj])
            end
        end
    end

    # Wrapped in an explicit Dict(...) like the other dictionaries built in this
    # notebook, rather than relying on the bare dict-comprehension syntax; callers
    # dispatch on `::Dict`.
    Dict([k=>v.map for (k,v) in bioccur]), unioccur.map
end

In [None]:
bioccur_freq, unioccur_freq = collect_cooccur_stats(known_corpus)

# Dense co-occurrence counts, add-one smoothed, then normalised so that the whole
# matrix sums to 1.
bioccur_mat = dict2mat(bioccur_freq,word_indexes,true)
bioccur_mat.+=1.0 # Add one smoothing
bioccur_mat./=sum(bioccur_mat)

# Unsmoothed occurrence probabilities laid out in vocabulary order; words that never
# occur in the corpus (e.g. the sentence markers) get 0.0.
unioccur = freq2prob(unioccur_freq)
unioccur_vec = Float64[get(unioccur, word, 0.0) for word in indexed_words]

In [None]:
# Occurrence counts in vocabulary order (0.0 for words that never occur), add-one
# smoothed and normalised to sum to 1.
unioccur_vec_smoothed = Float64[word in keys(unioccur_freq) ? unioccur_freq[word] : 0.0 for word in indexed_words]
unioccur_vec_smoothed.+=1.0 # Add-one smoothing
unioccur_vec_smoothed./=sum(unioccur_vec_smoothed)

In [None]:
# Sums of the normalised co-occurrence matrix along dimension 2.
sum(bioccur_mat,2)

In [None]:
# Write the prepared statistics and the vocabulary to "atis_data.jsz" with `serialize`.
# Note that the unsmoothed `unioccur_vec` is stored, not `unioccur_vec_smoothed`.
open("atis_data.jsz","w") do fh
    data = Dict([
        ("bigrams", kbigrams_mat),
        ("bioccur", bioccur_mat),
        ("unioccur", unioccur_vec),
        
        ("length_prob", length_prob),
        ("LL",LL),
        ("word_indexes", word_indexes),
        ("indexed_words", indexed_words),
        ])
    serialize(fh, data)    
end

In [None]:
# Probability of a sentence under a bigram model.
#
# Arguments:
#   sent    - sentence as a single string; it is split on whitespace.
#   bigrams - nested lookup such that `bigrams[prev][cur]` is the probability of
#             `cur` following `prev`.
# Returns the product of the bigram probabilities over the token sequence
# START_MARKER, words..., END_MARKER. Every consecutive pair is looked up directly.
function likelyhood(sent, bigrams)
    tokens = [START_MARKER; split(sent); END_MARKER]
    prob = 1.0
    for (prev, cur) in zip(tokens[1:end-1], tokens[2:end])
        prob *= bigrams[prev][cur]
    end
    prob
end


# Draw one word at random from a discrete distribution.
#
# Arguments:
#   unigrams - Dict word => probability.
# Returns the first word (in the Dict's iteration order) at which the running
# probability total reaches a uniform random cutoff. If the total never reaches the
# cutoff — floating-point rounding, or probabilities that sum to slightly less than
# one — the last word visited is returned instead of failing.
# Raises an error for an empty Dict.
function select_word{S<:AbstractString,V}(unigrams::Dict{S,V})
    isempty(unigrams) && error("select_word: cannot sample from an empty distribution")
    cutoff = rand()
    total = 0.0
    local fallback
    for (word, prob) in unigrams
        total += prob
        if total >= cutoff
            return word
        end
        fallback = word
    end
    fallback
end

# Generate a random sentence: starting from START_MARKER, repeatedly sample the next
# word from `bigrams[current word]` until END_MARKER is drawn. Returns the sampled
# words joined by single spaces; END_MARKER itself is not included.
function random_walk(bigrams)
    words = []
    cur = select_word(bigrams[START_MARKER])
    while cur != END_MARKER
        push!(words, cur)
        cur = select_word(bigrams[cur])
    end
    join(words, " ")
end

# Sample one sentence from the smoothed bigram model and print it, followed by a tab
# and its probability under the same model.
walk =random_walk(kbigrams) 
print(walk*"\t")
print(likelyhood(walk,kbigrams))

In [None]:
# Display the embedding matrix.
LL

Frame's Magic Mass-Sharing Co-occurrence PMF, Inspired by Bengio 2003
---

In [None]:
# Start 11 additional worker processes; the training code below spawns work on workers().
addprocs(11)

In [12]:
# Build a Task that produces training cases for one sentence.
#
# The sentence's words are mapped to indices through the global `word_indexes`. For
# every n_givens from 1 to the sentence length, each word position is independently
# marked as "given" with probability 1/n_givens, and the task produces the pair
# (indices of the given words, indices of the remaining words). With n_givens == 1
# every word is given, so the second vector is empty.
function prepare_cases(sentence)
    Task() do
        word_iis = Int[word_indexes[word] for word in sentence]
        n_words = length(word_iis)
        for n_givens in 1:n_words
            keep_mask = rand(n_words) .< 1.0/(n_givens)
            produce(word_iis[keep_mask], word_iis[~keep_mask])
        end
    end
end

prepare_cases (generic function with 1 method)

In [104]:
# Half the sum of squared differences between `expected` and `actual`.
@everywhere function loss(actual, expected)
    residual = expected - actual
    sum(0.5 * residual.^2)
end

# Forward pass of a single tanh layer: tanh(W*x + b).
@everywhere function forward(x,W,b)
    preactivation = W*x + b
    tanh(preactivation)
end


# Propagate an error signal down through the weights: W' * δ_above.
@everywhere δ(δ_above, W) = W' * δ_above


# Error signal at the output layer for squared-error loss with tanh activation:
# the loss derivative -(expected - actual) times the tanh derivative 1 - actual.^2,
# element by element.
@everywhere function δ_output(actual, expected)
    # Plain locals: `const` has no effect on a local variable, and the explicit
    # element-wise `.-` does not depend on scalar-minus-array promotion.
    dz = 1 .- actual.^2
    δ_above = -(expected-actual)
    δ_above.*dz
end


# One forward and backward pass through the tanh layer.
#
# Arguments:
#   x, W, b         - input, weights and bias passed to `forward`.
#   expected_output - target the layer output is compared with.
# Returns `(Δx, ΔW, Δb, err)`: the gradients of the loss with respect to x, W and b,
# and the loss value itself.
@everywhere function feedforward_backprop(x,W,b, expected_output)
    output = forward(x,W,b)
    δ_top = δ_output(output, expected_output)
    # Δx = W' * δ_top, ΔW = δ_top * x', Δb = δ_top
    δ(δ_top, W), δ_top*x', δ_top, loss(output, expected_output)
end



In [105]:
# Element type of a training case: (given word indices, remaining word indices).
training_case_type = Tuple{Vector{Int64},Vector{Int64}}
# One case-producing task per sentence, chained together and collected into an array.
training_cases = @pipe chain(map(prepare_cases, known_corpus)...)|> collect(training_case_type,_)

# Model parameters, initialised with small Gaussian noise:
#   CC - dEmb x vocabulary, one embedding column per word
#   WW - vocabulary x dEmb, output weights
#   bb - one output bias per vocabulary entry
dEmb = 60
CC = 0.01*randn((dEmb, length(indexed_words)))
WW = 0.01*randn((length(indexed_words),dEmb))
bb = 0.001*randn((length(indexed_words)));


In [106]:
using ForwardDiff

# Loss as a function of one flat parameter vector θ, for automatic differentiation.
# θ is unpacked with the shapes of the globals xx, WW and bb, and the layer output
# is compared with the global `target`.
function f(θ)
    params = unpack(θ,size(xx),size(WW),size(bb))
    loss(forward(params...), target)
end

# Analytic counterpart of `f`: unpacks θ in the same way and returns what
# feedforward_backprop returns for the global `target`, i.e. (Δx, ΔW, Δb, err).
function calc_ag(θ)
    params = unpack(θ,size(xx),size(WW),size(bb))
    feedforward_backprop(params..., target)
end

# Gradient function of `f` built by ForwardDiff; used below as the reference against
# which the hand-written backpropagation is checked.
g = ForwardDiff.gradient(f)

g (generic function with 1 method)

In [107]:
# Test fixture for the gradient check: the input is the sum of embedding columns
# 1, 2 and 3, and the target is 1.0 at positions 10, 11 and 12 and zero elsewhere.
xx=sum([CC[:,g_ii] for g_ii in [1,2,3]])
target = zeros(bb)
target[[10,11,12]]=1.0


1.0

In [108]:
# Flatten (xx, WW, bb) into one vector, differentiate automatically and split the
# gradient back into its x, W and b parts.
t=pack(xx,WW,bb)
dg = g(t)
dxg,dWg,dbg = unpack(dg,size(xx),size(WW),size(bb));

In [109]:
# Display the test input vector.
xx

60-element Array{Float64,1}:
 -0.0177167  
  0.0122861  
 -0.0158791  
 -0.00724171 
  0.0123465  
  0.000929592
 -0.00882174 
  0.000146004
  0.0185064  
  0.0185556  
  0.0175436  
 -0.0151407  
  0.00792393 
  ⋮          
 -0.00320848 
  0.00928898 
 -0.0170634  
  0.011532   
 -0.00959649 
 -0.0217515  
  0.00173057 
 -0.00970517 
  0.0216696  
  0.0147313  
 -0.0104509  
  0.00730887 

In [110]:
# Analytic gradients and loss from the hand-written backpropagation, at the same point.
axg,aWg,abg,a_err = calc_ag(t)

([-0.0219776,0.00116707,-0.0248392,0.00408607,0.0126314,0.000491081,0.0261137,0.0240766,0.0170245,0.0120003  …  0.00632649,-0.0191959,0.00181156,-0.00988777,0.00265303,0.0199349,0.00601105,-0.00931994,0.00456385,0.00307503],
19x60 Array{Float64,2}:
  2.14185e-5  -1.48532e-5   1.9197e-5   …   1.26346e-5  -8.836e-6  
 -2.27823e-5   1.57989e-5  -2.04193e-5     -1.34391e-5   9.39863e-6
  2.10763e-5  -1.46158e-5   1.88902e-5      1.24327e-5  -8.69483e-6
 -8.33888e-6   5.78279e-6  -7.47396e-6     -4.91903e-6   3.44013e-6
 -3.16561e-5   2.19527e-5  -2.83727e-5     -1.86737e-5   1.30594e-5
  3.39399e-5  -2.35364e-5   3.04196e-5  …   2.00208e-5  -1.40016e-5
 -7.98472e-7   5.53718e-7  -7.15653e-7     -4.71011e-7   3.29402e-7
  2.00946e-5  -1.39351e-5   1.80104e-5      1.18536e-5  -8.28984e-6
 -5.37954e-7   3.73057e-7  -4.82157e-7     -3.17334e-7   2.21928e-7
  0.0177154   -0.0122851    0.0158779       0.0104501   -0.00730831
  0.0177089   -0.0122806    0.0158721   …   0.0104463   -0.00730564
  0

In [111]:
# The loss computed by `f` and by the backpropagation routine should be identical.
f(t) == a_err

true

In [112]:
# Largest absolute difference (and its position) between the two input gradients.
findmax(abs(dxg.-axg))

(6.938893903907228e-18,3)

In [113]:
# Largest absolute difference (and its position) between the two weight gradients.
findmax(abs(dWg.-aWg))

(1.3877787807814457e-17,372)

In [114]:
# Largest absolute difference (and its position) between the two bias gradients.
findmax(abs(dbg.-abg))

(1.1102230246251565e-16,10)

In [94]:
# NOTE(review): `x` is not defined in the visible notebook, so this cell fails with
# UndefVarError (see the recorded output). `xx`, the input vector the gradients were
# taken with respect to, is presumably what was meant — confirm.
[dxg./x axg./x (dxg-axg)./x]*100

LoadError: LoadError: UndefVarError: x not defined
while loading In[94], in expression starting on line 1

In [None]:
# Absolute difference between the two input gradients, next to the input vector
# itself. The input vector is named `xx`; `x` is not defined in this notebook.
[abs(dxg.-axg) xx]

In [None]:
# Select the share of `xs` that belongs to one worker process.
#
# Arguments:
#   xs      - indexable collection to split.
#   id      - process id of the worker (default: the calling process). Worker ids are
#             assumed to run from 2 to nchunks+1; id 1 (a master process doing the
#             work itself) is treated as the first chunk.
#   nchunks - number of shares to split `xs` into (default: number of workers).
# Returns a `sub` view of a contiguous range of `xs`. The ranges for ids
# 2..nchunks+1 are disjoint and together cover all of `xs`; when the length is not
# divisible by nchunks, some shares are one element longer than others.
@everywhere function mysubarray(xs, id=myid(), nchunks=nworkers())
    len = length(xs)
    chunk = max(id - 2, 0)
    # Integer boundaries at chunk*len/nchunks, so no element is left unassigned.
    start_index = div(chunk*len, nchunks) + 1
    end_index = div((chunk+1)*len, nchunks)
    sub(xs, start_index : end_index)
end

# Gradients and loss for a single training case.
#
# Arguments:
#   given_iis  - indices of the words given as context.
#   target_iis - indices of the words to predict.
#   C, W, b    - embedding matrix (one column per word), output weights, output bias.
# Returns `(ΔC, ΔW, Δb, err)`: gradients with respect to C, W and b, and the loss.
@everywhere function train_one(given_iis, target_iis, C, W, b)
    # Network input: sum of the embeddings of the given words (zeros when none given).
    given_sowe = length(given_iis)>0 ? sum([C[:,g_ii] for g_ii in given_iis]) : zeros(C[:,1])

    # Target: equal weight on each target word, zero elsewhere. Only the target
    # positions are incremented, not the whole vector.
    target = zeros(b)
    for t_ii in target_iis
        target[t_ii] += 1.0/length(target_iis)
    end

    Δx,ΔW,Δb, err= feedforward_backprop(given_sowe,W,b, target)

    # The input is a plain sum of embedding columns, so each given column receives
    # the full input gradient (accumulated when an index occurs more than once).
    ΔC = zeros(C)
    for g_ii in given_iis
        ΔC[:, g_ii] += Δx
    end
    ΔC,ΔW,Δb, err
end
    


# Gradients and loss averaged over the training cases, computed in parallel.
#
# Arguments:
#   training_cases - collection of (given indices, target indices) pairs.
#   C, W, b        - current parameters, passed on to train_one.
# Returns `(ΔC, ΔW, Δb, err)`, each being the sum over the processed cases divided
# by length(training_cases).
function train_all(training_cases,C, W, b)
    
    # Runs on a worker: accumulates gradients and loss over the slice of
    # training_cases that mysubarray selects for the process it executes on.
    function train_remote()
        total_ΔC=zeros(C)
        total_ΔW=zeros(W)
        total_Δb=zeros(b)
        total_err = 0.0
        for (g_iis, t_iis) in mysubarray(training_cases)
            ΔC, ΔW, Δb,err = train_one(g_iis, t_iis, C, W, b)
            @inbounds total_ΔC+=ΔC
            @inbounds total_ΔW+=ΔW
            @inbounds total_Δb+=Δb
            total_err+=err
        end

        total_ΔC, total_ΔW, total_Δb, total_err
    end
    
    # Start one remote computation per worker process.
    r_updates = [@spawnat(id, train_remote())  for id in workers()]
        
    
    # Fetch each worker's partial sums (blocking until ready) and add them up.
    total_ΔC=zeros(C)
    total_ΔW=zeros(W)
    total_Δb=zeros(b)
    total_err = 0.0
    for r_update in r_updates
        ΔC, ΔW, Δb, err = fetch(r_update)
        @inbounds total_ΔC+=ΔC
        @inbounds total_ΔW+=ΔW
        @inbounds total_Δb+=Δb
        total_err+=err
    end
    # Normalise by the total number of training cases.
    total_ΔC./length(training_cases),
    total_ΔW./length(training_cases),
    total_Δb./length(training_cases),
    total_err./length(training_cases)
        
end
    

In [None]:
# Objective and gradient in the form the optimiser expects.
#
# Arguments:
#   θ    - flat parameter vector.
#   grad - vector that receives the flat gradient.
# Returns the loss averaged over the training cases.
# NOTE(review): `unpack!` presumably writes θ into the globals CC, WW and bb and
# returns them, and `pack!` presumably flattens the gradients into `grad` — confirm
# against the Packing module.
function loss_and_loss_grad!(θ::Vector, grad::Vector)
    C, W, b = unpack!(θ, CC,WW,bb)
    ΔC, ΔW, Δb, err = train_all(training_cases, C, W, b )
    pack!(grad, ΔC, ΔW, Δb)
    # train_all already returns the error averaged over the training cases, on the
    # same scale as the gradients; it must not be divided by the case count again.
    err
end

# Placeholder for a memoised variant of loss_and_loss_grad!. The cache below is never
# filled and the function only raises an error.
_loss_and_loss_grad=Dict{Vector{Float64},Tuple{}}()
function cached_loss_and_loss_grad!(θ::Vector, grad::Vector)
    error("cached_loss_and_loss_grad! is not defined yet")
end

# Loss only: evaluates loss_and_loss_grad! with a throw-away gradient buffer of the
# same shape as θ and returns its result.
loss!(θ::Vector) = loss_and_loss_grad!(θ, similar(θ))

# Gradient only: evaluates loss_and_loss_grad! so that the gradient at θ is written
# into `storage`, the buffer supplied by the caller.
function loss_grad!(θ::Vector, storage::Vector)
    loss_and_loss_grad!(θ, storage)
end


In [None]:
using Optim

# Bundle the objective, its gradient and the combined evaluation for Optim.
opt_func = DifferentiableFunction(loss!,loss_grad!,loss_and_loss_grad!)

# Minimise with L-BFGS from the current parameters, for at most 200 iterations,
# timing the run and recording the trace; then print the result's summary fields.
@time res = optimize(opt_func, pack(CC,WW,bb), method=:l_bfgs, show_trace = true, store_trace = true, iterations = 200);
@printval res.f_calls 
@printval res.g_calls 
@printval res.iterations
@printval res.f_minimum
@printval res.gr_converged
@printval res.x_converged                       
@printval res.f_converged 


In [None]:
`git commit -m="Fixed gradient, not 100% certain how, I beleive its cos the tanh graient was beign ta