In [1]:
using Iterators
using DataStructures
using Pipe
using Compat

macro printval(ee)
    ee_expr = @sprintf "%s" string(ee)
    esc(:(println($ee_expr," = ", $ee)))
end

macro pz(ee)
    ee_expr = @sprintf "%s" string(ee)
    esc(:(println($ee_expr,"\t\t",typeof($ee), "\t", size($ee))))
end

push!(LOAD_PATH, ".")



3-element Array{ByteString,1}:
 "/home/ubuntu/build/julia-master/usr/local/share/julia/site/v0.5"
 "/home/ubuntu/build/julia-master/usr/share/julia/site/v0.5"      
 "."                                                              

In [2]:
using DataStructures

function Base.sum(acc::Accumulator)
    sum(values(acc.map))
end

function freq2prob{T,V<:Number}(acc::Accumulator{T,V})
    ret=Dict{T,Float64}()
    total = sum(acc)
    for (k,v) in acc
        ret[k]=v/total
    end
    ret
end

freq2prob (generic function with 1 method)

In [3]:
function clean(ss)
    @pipe (ss 
    |> replace(_, r"\[.*?\] ?","")  #Remove Nonword sounds
    |> replace(_, r"\<.*?\> ","")  #Remove Verbal Deltions
    |> replace(_, r"\*(.*?)\*",s"\1") #Remove mispronounciation marks
    |> replace(_,r"\:|\-\s\.\s\-", "") #remove intraword pauses
    |> replace(_,r"\w+\- ","") #remove stuttered words
    |> replace(_, r"[!\.,\?]","")        #Remove punctation as it is not used traditionally (see sro spec)
    
    |> replace(_, r"\s+",' ') #Remove repeated spaces
    #|> replace(_, r"([A-Z])\s([A-Z])", s"\1\2") #Merge len(2) abbrev
    |> replace(_, r".*[~\(\)\-\<\>#'].*","")#Remove everything if anything unfixable found
    |> lowercase
    |> strip 
    )
end


clean (generic function with 1 method)

In [4]:
const START_MARKER = "**START**"
const END_MARKER = "**END**"

function collect_grams_stats(sentences)
    unigrams = counter(AbstractString)
    bigrams = DefaultDict(()->counter(AbstractString))
    
    for sent in sentences
        push!(bigrams[START_MARKER], sent[1])
        for ii in 1:length(sent)-1
            push!(unigrams,sent[ii])
            push!(bigrams[sent[ii]], sent[ii+1])    
        end
        push!(unigrams,sent[end])
        push!(bigrams[sent[end]], END_MARKER)
    end
    
    
    [k=>v.map for (k,v) in bigrams], unigrams.map
end




collect_grams_stats (generic function with 2 methods)

In [5]:
path="../../Resources/corpora/atis2_text/"
function valid(ss)
    typeof(ss) <: ASCIIString  && length(ss)>0
end
    
corpus = @pipe readdir(path) |> filter!(fn -> splitext(fn)[2]==".sro", _) |> map(_) do fn
    try open(readall, path*fn) end
    end |> filter!(valid,_) |> map(clean,_) |> filter!(valid,_) |> map(s->split(s),_);
corpus_vocab = @pipe corpus |> map(Set,_) |> reduce(union,_)
length(corpus_vocab)

1131

In [6]:
using WordEmbeddings
LL, word_indexes, indexed_words = load_word2vec_embeddings("word_emb_data/GoogleNews-vectors-negative300.bin", length(corpus_vocab), corpus_vocab);

In [7]:
@assert(!(START_MARKER in indexed_words))
push!(indexed_words, START_MARKER)
word_indexes[START_MARKER] = length(indexed_words)

@assert(!(END_MARKER in indexed_words))
push!(indexed_words, END_MARKER)
word_indexes[END_MARKER] = length(indexed_words)
LL = [LL zeros(size(LL,1),2)]

300x1117 Array{Float64,2}:
  0.0529562  -0.00851202  …   0.0656386    -0.0548293   0.0  0.0
  0.0654598  -0.0342245      -0.0860691     0.0239878   0.0  0.0
  0.0661953   0.0322839      -0.00994359   -0.0190492   0.0  0.0
  0.0470722   0.0458679       0.0804181     0.0826471   0.0  0.0
  0.0522207  -0.0131429       0.00717243    0.0620861   0.0  0.0
 -0.0820086  -0.0462207   …   0.0376009     0.0465646   0.0  0.0
 -0.0614145  -0.00094823     -0.0634651    -0.0786156   0.0  0.0
 -0.11621    -0.0522188       0.0436866    -0.0516041   0.0  0.0
  0.0156294   0.0465735      -0.0669426     0.00902063  0.0  0.0
  0.0992929   0.0624509       0.120845      0.0364857   0.0  0.0
 -0.0856861  -0.122785    …   0.00863951    0.0395094   0.0  0.0
 -0.028133   -0.0287556      -0.039557     -0.0588609   0.0  0.0
  0.0522207   0.0515131      -0.126061     -0.0903071   0.0  0.0
  ⋮                       ⋱                             ⋮       
 -0.0318105   0.0294613      -0.0539019    -0.0199563   0.0  0.



In [8]:
known_vocab = Set(indexed_words)
known_corpus = filter(corpus) do sent
    for word in sent
        if !( word in known_vocab)
            return false
        end
    end
    true
end;

In [9]:
using StatsBase
# modified from https://github.com/JoFrhwld/GoodTuring.jl/blob/master/GoodTuring.jl
function simpleGoodTuring(speciesCountDict::Dict)
    speciesCountVec = collect(values(speciesCountDict))
        
    totalCounts = sum(speciesCountVec)
    cofcDict = countmap(speciesCountVec)
    r = sort(collect(keys(cofcDict)))

    N = size(r,1)
    Nr = [cofcDict[r[i]] for i in 1:N]

    p0 = haskey(cofcDict, 1.0) ? cofcDict[1.0] / totalCounts : 0.0
    
    Z = sgtZ(r,Nr)
    logr = map(log,r)
    logZ = map(log,Z)

    X = hcat(ones(N), logr)
    Y = copy(logZ )
    coefs = X\Y
    intercept = coefs[1]
    slope = coefs[2]

    useY = false
    rSmooth = Array{Float64}(N)
    for i in 1:N
        @inbounds thisr = r[i]
        
        #y = ((thisr+1.0)^(slope+1.0))/(thisr^slope)
        #The above is the much simplified form of the below (Performance identical output differs by 10^-16)
        y = (thisr+1.0) * exp(slope * log(thisr+1.0) + intercept) / exp(slope * log(thisr) + intercept)

        if !in(thisr+1, r)
            useY = true
        end

        if useY
            rSmooth[i] = y
        else
            x = (thisr+1) * cofcDict[thisr + 1]/cofcDict[thisr]
            thisNr = cofcDict[thisr]
            thisNr1 = cofcDict[thisr+1]

            t = 1.96 * ((thisr+1)^2) * (thisNr1 / thisNr^2) * (1 + (thisNr1 / thisNr))

            if abs(x-y) > t
                @inbounds rSmooth[i] = x
            else
                useY = true
                @inbounds rSmooth[i] = y
            end
        end
    end

    smoothTot = sum(Nr.*rSmooth)
    sgtProb  = (1.0 - p0) .* (rSmooth/smoothTot)
    sgtProbDict = Dict([r[i] => sgtProb[i] for i in 1:N])
    sgtDict = Dict([sp=>sgtProbDict[speciesCountDict[sp]] for sp in keys(speciesCountDict)])

    sgtDict, sgtProbDict, p0
end


function sgtZ(r::Array, Nr::Array)
    j = r
    i = [0; j[1:end-1]]
    lastK = 2*j[end] - i[end]
    k = [j[2:end]; lastK]
    Float64[(2*Nr[iter])/(k[iter]-i[iter]) for iter = 1:length(j)]
end


sgtZ (generic function with 1 method)

In [10]:

function katz_bigrams(bigram_freq, unigrams_freq)
    k_bigrams = Dict()
    
    for first in keys(bigram_freq)
        smoothed,_,p0 = simpleGoodTuring(bigram_freq[first])
        k_bigrams[first] = smoothed
        
        backoff_keys = setdiff(keys(unigram_freq),keys(smoothed))
        #share the p0 proability mass between them
        total = sum([unigrams_freq[key] for key in backoff_keys])
        for second in backoff_keys
            k_bigrams[first][second]=p0.*unigrams_freq[second]./total
        end
    end
    k_bigrams
end

katz_bigrams (generic function with 1 method)

In [13]:
function likelyhood(sent, bigrams)
    words = split(sent)
    words = [START_MARKER; words; END_MARKER]
    
    p=1.0
    for ii in 1:length(words)-1
        p*=bigrams[words[ii]][words[ii+1]]
    end
    p
end
likelyhood("what are the ground transport", kbigrams)

2.227339596503655e-6

In [14]:
function select_word{S<:AbstractString,V<:Number}(unigrams::Dict{S,V})
    cutoff = rand()
    total = 0.0
    for next_word in keys(unigrams)
        total+=unigrams[next_word]
        if total>=cutoff
            return next_word
        end
    end
    assert(False, "Should never reach here") 
end

function random_walk(bigrams)
    words=[]
    cur = START_MARKER
    while(cur!=END_MARKER)
        cur = select_word(bigrams[cur])
        push!(words,cur)
    end
    words = words[1:end-1]
    join(words, " ")
end
walk =random_walk(bigrams) 
print(walk*"\t")
print(likelyhood(walk,kbigrams))

list those flights leave boston from san francisco	3

In [15]:
function bigrams_dict2mat(bigrams::Dict, word_indexes::Dict{AbstractString,Int64}, dense=False)
    mat  = (dense ? zeros: spzeros)(length(word_indexes),length(word_indexes))
    for first in keys(bigrams)
        for second in keys(bigrams[first])
            mat[word_indexes[second], word_indexes[first]] = bigrams[first][second]
        end
    end
    mat
end

bigrams_dict2mat (generic function with 2 methods)

In [17]:
bigram_freq, unigram_freq = collect_grams_stats(known_corpus,false);
kbigrams=katz_bigrams(bigram_freq, unigram_freq)
kbigrams_mat = bigrams_dict2mat(kbigrams,word_indexes,true)


1117x1117 Array{Float64,2}:
 0.00058826   0.00155313   0.0025072    …  0.0  0.00266888   0.00450161   0.0
 0.000582436  0.00337239   0.00118728      0.0  0.00631926   0.000333665  0.0
 0.0077247    0.0213303    0.00158872      0.0  0.00631926   0.000946355  0.0
 0.000934227  0.00246655   0.0108999       0.0  0.0042385    0.0233828    0.0
 0.000938886  0.00247886   0.00400159      0.0  0.0646614    0.0063889    0.0
 6.29031e-5   0.000166077  0.000268097  …  0.0  0.000285385  0.000257788  0.0
 1.16487e-6   3.0755e-6    4.96475e-6      0.0  5.28491e-6   1.92168e-6   0.0
 5.82436e-6   1.53775e-5   0.00118728      0.0  2.64245e-5   9.60841e-6   0.0
 0.136584     0.0676306    0.00782777      0.0  0.00631926   0.00617917   0.0
 0.00013163   0.00080584   0.000561017     0.0  0.0153623    0.000946355  0.0
 1.63082e-5   4.30571e-5   6.95065e-5   …  0.0  7.39887e-5   0.000333665  0.0
 1.39785e-5   3.6906e-5    5.9577e-5       0.0  6.34189e-5   2.30602e-5   0.0
 5.00895e-5   0.000132247  0.0002134

.079353973670225e-10

In [24]:
function collect_cooccur_stats(sentences)
    unioccur = counter(AbstractString)
    bioccur = DefaultDict(()->counter(AbstractString))
    
    for sent in sentences
        for ii in 1:length(sent)
            push!(unioccur, sent[ii])
            for jj in 1:length(sent)
                if ii==jj
                    continue
                end
                push!(bioccur[sent[ii]], sent[jj])    
            end
        end       
    end
    
    [k=>v.map for (k,v) in bioccur], unioccur.map
    
end

collect_cooccur_stats (generic function with 2 methods)

In [26]:
bioccur_freq, unioccur_freq = collect_cooccur_stats(known_corpus)
kbioccur=katz_bigrams(bioccur_freq, unioccur_freq)
kbioccur_mat = bigrams_dict2mat(kbioccur,word_indexes,true)


1117x1117 Array{Float64,2}:
 0.00394297   0.00665376   0.0223628    …  0.0  0.041846     0.0  0.0
 0.00698873   0.00229434   0.0103838       0.0  0.0104231    0.0  0.0
 0.0128295    0.00568419   0.00418172      0.0  0.00917374   0.0  0.0
 0.025275     0.0282375    0.0316822       0.0  0.02423      0.0  0.0
 0.0176551    0.00859329   0.032126        0.0  0.0166903    0.0  0.0
 0.000490458  0.000382943  0.000648748  …  0.0  0.000462635  0.0  0.0
 9.08256e-6   9.34236e-6   1.20138e-5      0.0  8.56732e-6   0.0  0.0
 4.54128e-5   0.000265052  0.000369868     0.0  4.28366e-5   0.0  0.0
 0.0582956    0.0874184    0.0534292       0.0  0.0317775    0.0  0.0
 0.00318217   0.00109041   0.00905354      0.0  0.0116743    0.0  0.0
 0.000153281  0.000265052  0.00111239   …  0.0  0.000119942  0.0  0.0
 0.00041492   0.000265052  0.000144166     0.0  0.000102808  0.0  0.0
 0.000908921  0.00109041   0.000369868     0.0  0.000368395  0.0  0.0
 ⋮                                      ⋱                    ⋮

In [27]:
open("atis_data.jsz","w") do fh
    data = Dict([
        ("bigrams", kbigrams_mat),
        ("bioccur", kbioccur_mat),
        ("LL",LL),
        ("word_indexes", word_indexes),
        ("indexed_words", indexed_words),
        ])
    serialize(fh, data)    
end