In [1]:
# import FunctionalCollections
push!(LOAD_PATH, ".")
push!(LOAD_PATH, "../util/")
import Iterators
import Pipe
import Compat
import JLD
import DataStructures
import DataStructuresExtended
@everywhere using FunctionalCollections
@everywhere using Iterators
@everywhere using Pipe
@everywhere using Compat
@everywhere using JLD
@everywhere using DataStructures
@everywhere using DataStructuresExtended

"""
    @printval expr

Print the source text of `expr`, then `" = "`, then the value of `expr`
evaluated in the caller's scope.
"""
macro printval(ee)
    label = string(ee)  # textual form of the expression as the caller wrote it
    call = Expr(:call, :println, label, " = ", ee)
    esc(call)
end

"""
    @pz expr

Print the source text of `expr` followed by the type and the size of its
value, separated by tabs. `expr` is evaluated in the caller's scope.
"""
macro pz(ee)
    label = string(ee)  # textual form of the expression as the caller wrote it
    type_call = Expr(:call, :typeof, ee)
    size_call = Expr(:call, :size, ee)
    esc(Expr(:call, :println, label, "\t\t", type_call, "\t", size_call))
end





In [19]:
# Sentinel symbols for sentence boundaries: two start markers and two end markers.
# prepare_model places them, in this order, at the front of the word list it builds.
# (Presumably the training corpus was padded with the same markers -- TODO confirm.)
@everywhere const START_MARKER1 = Symbol("**START1**")
@everywhere const START_MARKER2 = Symbol("**START2**")
@everywhere const END_MARKER1 = Symbol("**END1**")
@everywhere const END_MARKER2 = Symbol("**END2**")
# Short alias for Symbol.
@everywhere typealias S Symbol
# A state is an ordered pair of values of the same type T.
@everywhere typealias State{T} Tuple{T,T}
# Positions of the start and end nodes in the node list built by prepare_model,
# which asserts these exact values.
@everywhere const START_NODE_INDEX = 1
@everywhere const END_NODE_INDEX = 2

In [3]:
"""
    load_counts(filename)

Read trigram counts from the text file `filename`.

Each non-blank line must hold at least four whitespace-separated fields:
`word1 word2 word3 count`. Fields after the fourth are ignored. Blank lines
are skipped. If the same trigram appears on several lines, the last count wins.

Returns a `Dict{Tuple{Symbol,Symbol,Symbol},Int}` mapping each trigram to its count.

Throws an `ArgumentError` naming the file and line number when a non-blank
line has fewer than four fields.
"""
function load_counts(filename)
    counts = Dict{Tuple{Symbol,Symbol,Symbol},Int}()
    open(filename, "r") do fh
        for (line_number, line) in enumerate(eachline(fh))
            fields = split(line)
            # A blank (or whitespace-only) line carries no trigram; skip it
            # instead of failing in the destructuring below.
            isempty(fields) && continue
            if length(fields) < 4
                throw(ArgumentError(string(
                    filename, ":", line_number,
                    ": expected `word1 word2 word3 count`, got ",
                    length(fields), " field(s)")))
            end
            word1, word2, word3, occurrences = fields
            trigram = (Symbol(word1), Symbol(word2), Symbol(word3))
            counts[trigram] = parse(Int, occurrences)
        end
    end
    counts
end

load_counts (generic function with 1 method)

In [4]:
"""
    Kneser-Ney estimate of a trigram probability distribution. This is a version
    of back-off that counts how likely an n-gram is provided the n-1-gram had
    been seen in training. It is built from a dictionary of trigram counts (the
    counterpart of the trigram FreqDist that NLTK's ProbDistI implementation
    trains on). Optionally, a discount different from the default can be
    specified. The default discount is 0.75.
    Adapted from: http://www.nltk.org/_modules/nltk/probability.html
"""
type KneserNeyProbDist{T}
    # Field order is significant: the outer constructor fills these positionally.

    # Count of each trigram (w0, w1, w2).
    trigrams :: Accumulator{Tuple{T,T,T},Int}
    # For each pair (w0, w1): summed counts of all trigrams that start with it.
    bigrams  :: Accumulator{Tuple{T,T},Int}
    # For each word: number of distinct trigrams that have it as the middle word.
    trigrams_contain :: Accumulator{T,Int}
    # Amount subtracted from a seen trigram's count in `prob`.
    discount :: Float64
    # For each pair (w0, w1): number of distinct trigrams that start with it.
    wordtypes_after :: Accumulator{Tuple{T,T},Int}
    # For each pair (w1, w2): number of distinct trigrams that end with it.
    wordtypes_before :: Accumulator{Tuple{T,T},Int}
end

In [5]:


"""
:param trigrams: The trigram frequency distribution upon which to base
    the estimation
:param discount: The discount applied when retrieving counts of
    trigrams
"""
function KneserNeyProbDist{T}(trigrams :: Dict{Tuple{T,T,T},Int}, discount=0.75)
    # Auxiliary tallies, all derived from the trigram table in one pass.
    starts_with = counter(Tuple{T,T})   # distinct trigrams per leading pair
    ends_with = counter(Tuple{T,T})     # distinct trigrams per trailing pair
    pair_totals = counter(Tuple{T,T})   # summed trigram counts per leading pair
    middle_of = counter(T)              # distinct trigrams per middle word

    for (gram, occurrences) in trigrams
        head, middle, tail = gram
        leading = (head, middle)
        push!(middle_of, middle)
        push!(pair_totals, leading, occurrences)
        push!(starts_with, leading)
        push!(ends_with, (middle, tail))
    end

    # Positional arguments follow the field order of the type.
    KneserNeyProbDist{T}(counter(trigrams), pair_totals, middle_of, discount, starts_with, ends_with)
end
    

"""
    prob(self, w0, w1, w2)

Kneser-Ney probability of the trigram `(w0, w1, w2)` under `self`.

A trigram seen in training gets its discounted count divided by the count of
its leading pair. An unseen trigram whose leading pair `(w0, w1)` and trailing
pair `(w1, w2)` were both seen gets the backed-off estimate. Anything else
gets `0.0`.
"""
function prob{T}(self::KneserNeyProbDist{T}, w0::T,w1::T, w2::T)
    # Case 1: the trigram itself was seen during training.
    if (w0, w1, w2) in keys(self.trigrams)
        @assert self.trigrams[(w0,w1,w2)]>self.discount
        return (self.trigrams[(w0,w1,w2)] - self.discount)/self.bigrams[(w0, w1)]
    end

    # Case 3: neither the trigram nor its coarser context was seen.
    if !((w0,w1) in keys(self.bigrams)) || !((w1,w2) in keys(self.wordtypes_before))
        return 0.0
    end

    # Case 2: back off using the pair statistics.
    aftr = self.wordtypes_after[(w0, w1)]
    bfr = self.wordtypes_before[(w1, w2)]

    # Probability mass left over after discounting the seen trigrams (the alphas).
    leftover_prob = aftr * self.discount  / self.bigrams[(w0, w1)]
    @assert leftover_prob>0
    # The beta weight, including its normalization.
    beta = bfr / (self.trigrams_contain[w1] - aftr)
    @assert beta>0
    return leftover_prob * beta
end

prob (generic function with 1 method)

In [6]:
# Load the serialized trigram-count dictionary from disk.
# NOTE(review): `deserialize` should only be used on files from a trusted source.
ccs = open(deserialize,"results/data/books/train_books_corpus.3gram.jsz", "r")

Dict{Tuple{Symbol,Symbol,Symbol},Int64} with 84718470 entries:
  (:fully,:in,:eating)     => 1
  (:shielding,:he,:tried)  => 1
  (symbol("'ve"),:had,:my… => 2
  (:slices,:again,:with)   => 2
  (:high,:over,:alicante)  => 2
  (:of,:cosmetic,:satisfa… => 1
  (:armor,symbol(","),:pi… => 1
  (:place,:the,:male)      => 2
  (:wink,:bens,:way)       => 2
  (:little,:too,:attentiv… => 4
  (:alan,:hard,:across)    => 1
  (:saw,:wyatt,:talking)   => 1
  (:title,symbol(";"),:fr… => 1
  (:you,:stay,:focussed)   => 2
  (:',:clean,:will)        => 3
  (:favorite,:pole,:.)     => 1
  (:the,:maudlin,:cry)     => 2
  (:faces,:of,symbol("111… => 1
  (:pulled,:abby,:against) => 6
  (:and,:queens,:have)     => 1
  (:not,:compare,:what)    => 1
  (:super,symbol(","),:th… => 1
  (:comes,symbol("''"),:r… => 2
  (symbol("**START2**"),:… => 6
  (:designed,:those,:.)    => 6
  ⋮                        => ⋮

In [7]:

# Build the Kneser-Ney model from the raw trigram counts (default discount).
kn_lm = KneserNeyProbDist(ccs)
# Rebind `ccs` so this name no longer references the raw dictionary
# (presumably to let it be garbage-collected -- TODO confirm).
ccs=0
# Number of distinct trigrams held by the model.
kn_lm.trigrams |> length

84718470

In [11]:
# Time a single trigram lookup. `Symbol(...)` is used (not the deprecated
# lowercase `symbol`) to match the rest of the file.
@time prob(kn_lm, Symbol("'m"), :so, :fast)

  

8.129023866814073e-6

0.000025 seconds (10 allocations: 320 bytes)


In [12]:
using JuMP
using MathProgBase
using GLPKMathProgInterface
using Gurobi

  likely near /home/ubuntu/.julia/v0.5/GLPK/src/GLPK.jl:812


In [35]:
"""
Returns a vector of sets of node indexes; each set is a subtour.
The first subtour returned is the non-cyclic one -- the path from the start node to the end node.
"""
function get_subtours(x::Matrix{JuMP.Variable})
    # Read the solved values and keep the arcs that are switched on.
    x_val = getValue(x)
    x_iis,x_jjs, _  = findnz(x_val .>= 1 - 1e-6) #It just has to be close to 1 to be true
    # Map each node to its successor along the selected arcs.
    nodes_chain = Dict([ii=>jj for (ii,jj) in zip(x_iis,x_jjs)])

    subtours = IntSet[]
    ii = START_NODE_INDEX
    # The END node is always in the same subtour as the START node. Seeding it
    # here also makes the walk stop when it reaches END, so END is never
    # looked up in `nodes_chain`.
    push!(subtours, IntSet(END_NODE_INDEX))
    while true
        # Walk the current subtour until the next node is already part of it.
        while true
            push!(subtours[end], ii)
            jj = nodes_chain[ii]
            delete!(nodes_chain, ii)  # each arc is consumed exactly once
            if jj ∈ subtours[end]
                break
            end
            ii = jj
        end

        # Any arcs left over belong to subtours that have not been visited yet.
        isempty(nodes_chain) && break
        ii = first(keys(nodes_chain)) #start new subtour
        push!(subtours, IntSet())
    end
    subtours
end

"""
Core tour goes from start to end (inclusive)
"""
function get_coretours(x_links)
    # x_links yields (from, to) node-index pairs; index them by their source node.
    successor = Dict([from=>to for (from,to) in x_links])

    visited = IntSet()
    current = START_NODE_INDEX
    # Follow the chain from the start node, recording every node passed,
    # until the end node is reached.
    while true
        current == END_NODE_INDEX && break
        push!(visited, current)
        current = successor[current]
    end
    # The end node is part of the core tour too.
    push!(visited, END_NODE_INDEX)
    visited
end

# Collect the indexes of every node that shares its first element with some
# node of `subtour`.
#   subtour              -- iterable of node indexes
#   nodes                -- node list; each node is a pair (first, second)
#   node_indexes_for_1st -- maps a first element to the indexes of all nodes starting with it
# Returns an IntSet of node indexes.
function get_hyperclass(subtour,nodes,node_indexes_for_1st)
    hyperclass = IntSet()
    for node_index in subtour
        leading, _ = nodes[node_index]
        union!(hyperclass, node_indexes_for_1st[leading])
    end
    hyperclass
end

# Read the ordered sentence out of a solved arc matrix.
#   x_val             -- arc values; entries above 1-1e-6 count as selected
#   unordered_markers -- the marker/word list that the nodes index into
#   nodes             -- node list; each node is a pair of indexes into unordered_markers
# Returns a new collection of the same type as `unordered_markers`, holding the
# words in path order.
function get_sentence{T}(x_val::Matrix,unordered_markers::T,nodes::Vector{State{Int}})
    active_rows, active_cols, _ = findnz(x_val.>1-1e-6)
    successor = Dict([from=>to for (from,to) in zip(active_rows, active_cols)])

    # Hop over the start node and the node right after it: they are the start nodes.
    cursor = successor[successor[START_NODE_INDEX]]

    sent = T()
    while cursor != END_NODE_INDEX
        word_index = nodes[cursor][1]
        push!(sent, unordered_markers[word_index])
        cursor = successor[cursor]
    end
    sent
end

get_sentence (generic function with 1 method)

In [14]:
# Example sentences, each split on whitespace into words and then shuffled,
# giving one unordered bag of words per sentence.
eg_corpus = @pipe ["name this 1922 novel about leopold bloom written by james joyce",#* " .",
    "ralph waldo emerson dismissed this poet as the jingle man and james russell lowell called him three-fifths genius and two-fifths sheer fudge",# * " .",
    "this is the basis of a comedy of manners first performed in 1892",#*" .",
    "in a third novel a sailor abandons the patna and meets marlow who in another novel meets kurtz in the congo",
    "thus she leaves her husband and child for aleksei vronsky but all ends sadly when she leaps in front of a train",
    "we looked out at the setting sun .",
    " i went to the kitchen .",
    "how are you doing ?"
    ] |>map(split,_) |> map(shuffle,_)

8-element Array{Array{SubString{ASCIIString},1},1}:
 SubString{ASCIIString}["about","name","james","leopold","bloom","novel","joyce","1922","by","written","this"]                                                                                      
 SubString{ASCIIString}["three-fifths","and","him","as","genius","called","two-fifths","emerson","waldo","the"  …  "poet","sheer","man","lowell","fudge","dismissed","jingle","ralph","james","and"]
 SubString{ASCIIString}["manners","is","of","comedy","this","of","first","basis","1892","performed","in","the","a"]                                                                                 
 SubString{ASCIIString}["a","sailor","novel","the","and","the","in","patna","congo","meets"  …  "abandons","another","in","marlow","third","kurtz","meets","who","a","novel"]                       
 SubString{ASCIIString}["aleksei","front","when","she","of","leaves","but","for","thus","sadly"  …  "train","husband","ends","leaps","she","all","a","child","vr

In [15]:
# Pick the bag of words to reorder: the last (shortest) example sentence.
test_bag=eg_corpus[end]

# Alternative test bags kept for reference, with solver timings where recorded.
#test_bag = shuffle(corpus[1022]) #length 28
#test_bag = shuffle(corpus[1028]) #length 20  (Gurobi 4.2, GLPK : 2064.6 seconds)
#test_bag = shuffle(corpus[1122]) #length 19 
#test_bag = shuffle(corpus[1000]) #length 17 (Gurobi: 5.8 seconds. GLPK:  569.9seconds)
#test_bag = shuffle(["he was the greatest of man , a hero to many ."|> split])
#test_bag = shuffle(["this is but a silly joke -- that could never happen !"|> split])
#test_bag =  shuffle(["this","is","the" ,"basis","of","a" ,"comedy" ,"of","manners","first","performed","in","1892", "."])
#test_bag =  shuffle(["this","is","the" ,"basis","of","a" ,"comedy" ,"of","manners","."])
#test_bag =  shuffle(["this","is","the" ,"basis","of","a","fine","comedy", "."])
#^length 9, Gurobi 1.0, GLPK 5.3
#test_bag =  shuffle(["this","is","the" ,"basis","of","a" ,"comedy", "."])
#test_bag =  shuffle(["it", "is", "so", "very", "good", "."])
#test_bag =  shuffle(["it", "is", "very", "good", "."])
#test_bag =  shuffle(["it", "is", "good", "."])
#test_bag =  ["no", "way", "."]
#test_bag =  shuffle(["no", "."])
#@time best_order(test_bag, lm, mem_limit=1000)
# Convert the words to Symbols, the token type the language model is keyed on.
# `Symbol` (not the deprecated lowercase `symbol`) matches the rest of the file.
test_bag = map(Symbol, test_bag)

5-element Array{Symbol,1}:
 :doing
 :how  
 :?    
 :are  
 :you  

In [47]:
"""
    prepare_model(unordered_words, lm, silent=true)

Build the JuMP integer program (Gurobi solver, one thread) whose solution is an
ordering of `unordered_words`.

Every node is a pair of indexes into `unordered_markers` (the four boundary
markers followed by the words). Binary variable `x[a,b]` selects the transition
from node `a` to node `b`. A transition is only allowed when the second element
of `a` equals the first element of `b`, and its weight is
`log(lm(w1, w2, w3))` for the three words involved. Transitions with
probability zero are forbidden. The objective maximizes the summed log
probability of the selected transitions. A lazy-constraint callback cuts off
solutions whose start-to-end path does not account for every selected arc.

`lm` is called as `lm(w1, w2, w3)` and must return a probability.
`silent` controls Gurobi's `OutputFlag` (0 when true, 1 otherwise).

Returns the tuple `(m, x, nodes, unordered_markers)`.
"""
function prepare_model(unordered_words::Vector, lm::Function, silent=true)
    m=Model(solver=GurobiSolver(OutputFlag=silent? 0:1, Threads=1))

    unordered_markers = [START_MARKER1; START_MARKER2; END_MARKER1; END_MARKER2; unordered_words...]
    #NOTE(review): an earlier comment here said this list lacks END_MARKER2, but END_MARKER2 is included above
    #The two hand-added nodes below must land at the indexes the helper functions expect
    @assert START_NODE_INDEX == 1
    @assert END_NODE_INDEX == 2

    nodes = State{Int}[] #Named by word index

    #Lookup tables: word index -> indexes of the nodes having it as first / second element
    node_indexes_for_1st = Dict{Int, Vector{Int}}()
    node_indexes_for_2nd = Dict{Int, Vector{Int}}()

    #Append the node (ii,jj) and register its index in both lookup tables
    function add_node!(ii,jj)
        push!(nodes, (ii,jj))

        node_indexes_for_i_1st = get!(()->Int[], node_indexes_for_1st, ii)
        push!(node_indexes_for_i_1st, length(nodes))

        node_indexes_for_j_2nd = get!(()->Int[], node_indexes_for_2nd, jj) 
        push!(node_indexes_for_j_2nd, length(nodes))
        #println("node:$(length(nodes)) |  $(unordered_markers[ii])($ii), $(unordered_markers[jj])($jj)")
    end

    add_node!(1, 2) #That is START_MARKER1-> START_MARKER2
    add_node!(3, 4) #That is END_MARKER1-> END_MARKER2

    #Create every other node (ii,jj) with ii!=jj
    for ii in 1:length(unordered_markers)
        wi = unordered_markers[ii]
        if wi∈(END_MARKER1,START_MARKER1) continue end  #Covered these

        for jj in 1:length(unordered_markers)
            if ii==jj continue end
            wj = unordered_markers[jj]
            if wj∉(START_MARKER1,START_MARKER2,END_MARKER2)
                #but wj can be  END_MARKER1
                add_node!(ii,jj)
            end
        end
    end

    #One binary variable per ordered pair of nodes: 1 means that transition is taken
    @defVar(m, x[1:length(nodes), 1:length(nodes)], Bin)

    #If you enter a node you must also leave it
    for (cc, center_node) in enumerate(nodes)
        if cc ∉ (START_NODE_INDEX, END_NODE_INDEX) #the beginning and end nodes don't have this requirement
            #Everything that enters this node, must leave this node
            @addConstraint(m, sum{x[ii,cc], ii=1:length(nodes)} == sum{x[cc,jj], jj=1:length(nodes)})
        end
    end

    #Every word (and end marker) must be entered, in second position, exactly once
    for class_index in 1:length(unordered_markers)
        w_class =  unordered_markers[class_index]
        if w_class∉(START_MARKER1,START_MARKER2)
            #Not required to make a transition so that START_MARKER1 or 2 ever occur in second position
            jjs = node_indexes_for_2nd[class_index]
            @addConstraint(m, sum{x[ii,jj],ii=1:length(nodes), jj=jjs}==1)
        end

        #The following constraint is not required, as if it shows up in the second position other rules make it certain to show up in the first
        #if w_class∉(END_MARKER1,END_MARKER2)
        #    #Not rquired to make a transition so that END_MARKER1 or 2 ever occur in second position
        #    iis=node_indexes_for_1st[class_index]
        #    @addConstraint(m, sum{x[ii,jj], ii=iis,jj=1:length(nodes)}==1)
        #    rules[w_class*"*"]=Set(product(iis,1:length(nodes)))
        #end
    end

    #Log-probability of each allowed transition; forbidden transitions get x fixed to 0 instead
    log_trans_prob = spzeros(length(nodes), length(nodes))
    for (from_node_index, from_node) in enumerate(nodes)
        w1 = unordered_markers[from_node[1]]
        w2 = unordered_markers[from_node[2]]
        #If what was in the second state element does not end up in the first state element then it is not allowed.
        can_transition_to = Set(get(node_indexes_for_1st, from_node[2], Int[]))
            #You can transition to any node which has your second element as its first element
        for (to_node_index, to_node) in enumerate(nodes)
            if to_node_index in can_transition_to
                @assert(from_node[2]==to_node[1])
                w3= unordered_markers[to_node[2]]
                log_tp = log(lm(w1,w2,w3))
                if log_tp>-Inf
                    log_trans_prob[from_node_index, to_node_index] = log_tp
                    continue
                else
                    #Banned as prob zero transitions not allowed
                    @addConstraint(m, x[from_node_index,to_node_index]==0)
                end

            else
                #It is not a legal transition
                @addConstraint(m, x[from_node_index,to_node_index]==0)
            end
        end
    end
    
    #Lazy-constraint callback: when the selected arcs do not all lie on the
    #start-to-end path, require at least one arc leaving the path's hyperclass
    function eleminate_subtours(cb)
        x_val = getValue(x)
        x_iis,x_jjs, _  = findnz(x_val .>= 1 - 1e-6) #It just has to be close to 1 to be true
        x_links=zip(x_iis,x_jjs)
        coretour = get_coretours(x_links)
        #A path over k nodes uses k-1 arcs, so equality means every selected arc is on the core tour
        if length(coretour)==length(x_links)+1
            #no subtour to eliminate
            return
        end
        core_hyperclass = get_hyperclass(coretour, nodes, node_indexes_for_1st)

        #Also the first subtour must go to one of the other subtours 
        arcs_outof_coretour = AffExpr()
        for ii in core_hyperclass
            for jj in 1:length(nodes)
                if jj∉core_hyperclass
                    arcs_outof_coretour += x[ii,jj]
                end
            end
        end
        @addLazyConstraint(cb, arcs_outof_coretour >=1)
    end

    addLazyCallback(m, eleminate_subtours,fractional=false)

    #new Linear way using logprobs
    @setObjective(m, Max, sum{log_trans_prob[i,j]*x[i,j], i=1:length(nodes), j=1:length(nodes)})
    
    m,x, nodes,unordered_markers
end

prepare_model (generic function with 2 methods)

In [50]:
# Trigram probability function backed by the global Kneser-Ney model `kn_lm`.
lm(w0, w1, w2) = prob(kn_lm, w0, w1, w2)

# Find the most probable ordering of `unordered_words` under the trigram
# probability function `lm`.
# Returns (generated_order, solution_prob, time_to_solve, status). When the
# solver status is not :Optimal, the input words are returned unchanged as the
# order. The timing covers both building and solving the model.
function best_order(unordered_words::Vector, lm::Function, silent=true)
    tic()
    model, arcs, nodes, unordered_markers = prepare_model(unordered_words, lm, silent)
    status = solve(model,suppress_warnings=silent)
    time_to_solve = toq()

    # The objective is a sum of log probabilities; exponentiate to get a probability.
    solution_prob = e^getObjectiveValue(model)

    if status==:Optimal
        generated_order = get_sentence(getValue(arcs), unordered_markers, nodes)
    else
        generated_order = unordered_words
    end
    (generated_order, solution_prob, time_to_solve, status)
end


best_order (generic function with 2 methods)

In [51]:
# Reorder the test bag with the Kneser-Ney trigram model.
best_order(test_bag, lm)


([:how,:are,:you,:doing,:?],2.0009120789132414e-6,0.130566766,:Optimal)

In [52]:
# Reorder every example bag; the words are converted to Symbols first.
map(sent -> best_order(map(Symbol, sent), lm), eg_corpus)

8-element Array{Tuple{Array{Symbol,1},Float64,Float64,Symbol},1}:
 ([:about,:name,:james,:leopold,:bloom,:novel,:joyce,symbol("1922"),:by,:written,:this],NaN,0.307629377,:Infeasible)                                                                                      
 ([symbol("three-fifths"),:and,:him,:as,:genius,:called,symbol("two-fifths"),:emerson,:waldo,:the  …  :poet,:sheer,:man,:lowell,:fudge,:dismissed,:jingle,:ralph,:james,:and],NaN,4.864707588,:Infeasible)
 ([:manners,:is,:of,:comedy,:this,:of,:first,:basis,symbol("1892"),:performed,:in,:the,:a],NaN,0.433388144,:Infeasible)                                                                                   
 ([:a,:sailor,:novel,:the,:and,:the,:in,:patna,:congo,:meets  …  :abandons,:another,:in,:marlow,:third,:kurtz,:meets,:who,:a,:novel],NaN,4.18042117,:Infeasible)                                          
 ([:aleksei,:front,:when,:she,:of,:leaves,:but,:for,:thus,:sadly  …  :train,:husband,:ends,:leaps,:she,:all,:a,:child,:vro

In [None]:

# Return the linear constraints of `m` that Gurobi flags as belonging to an
# irreducible inconsistent subsystem (IIS).
# NOTE(review): assumes `m` uses the Gurobi solver and has already been solved,
# so that `m.internalModel.inner` is the underlying Gurobi model -- confirm.
function get_irreducible_inconsistent_subsystem(m::JuMP.Model)
    inner = m.internalModel.inner
    total = Gurobi.num_constrs(inner)
    Gurobi.computeIIS(inner)
    # One integer flag per constraint; nonzero marks membership in the IIS.
    flags = Gurobi.get_intattrarray(inner, "IISConstr",  1, total)
    members = find(flags)
    m.linconstr[members]
end

In [None]:
# Constraints that make the model infeasible.
# NOTE(review): needs a top-level model `m`; the `m` built inside best_order is
# local to that function.
iis = get_irreducible_inconsistent_subsystem(m)

In [None]:
# Pick the IIS constraint with the most terms. `findmax` returns
# (largest value, its index); `indmax` returns only the index, so it cannot be
# destructured into two variables.
count,constraint_index = findmax([length(c.terms.coeffs) for c in iis])
constraint = iis[constraint_index]

In [None]:
tts

In [None]:
# Toy model for trying out the IIS helper: two integer variables, no objective.
m=Model(solver=GurobiSolver())
@defVar(m,x, Int)
@defVar(m,y, Int)
@addConstraint(m,y<=1000) #Harmless on its own
@addConstraint(m,x>=6)    #Conflicts with x+y<=11 when combined with y>=6
@addConstraint(m,y>=6)    #Conflicts with x+y<=11 when combined with x>=6
#@addConstraint(m,x+y<=11) #Disabled: enabling it makes the model infeasible (x+y would have to be >=12)

status = solve(m)

#@assert status==:Infeasible
#get_irreducible_inconsistent_subsystem(m)

In [None]:
getObjectiveV

In [None]:
# Multiply, over every cell (i,j) of `x`, the larger of (x[i,j]-1)^2 and
# trans_prob[i,j], printing each cell as it is visited.
# `trans_prob` is read from global scope; `iis` and `jjs` are not used.
function get_prod(x, iis, jjs)
    net = 1.0
    for row in 1:size(x,1), col in 1:size(x,2)
        cell = x[row,col]
        weight = trans_prob[row,col]
        println(row,",", col, " =", cell, " ", weight)
        net *= max((cell-1)^2, weight)
    end
    net
end
# Hand-built arc selection 1->3->6->7 used to try out get_prod.
# NOTE(review): relies on top-level `x`, `iis`, `jjs` and `trans_prob` existing;
# `values` also shadows Base.values.
values = falses(size(x))
values[1,3] = 1
values[3,6] = 1
values[6,7] = 1
println("----------\n")
get_prod(values, iis,jjs)


In [None]:
# Load the Brown corpus through NLTK as lowercased sentences (vectors of words).
# NOTE(review): `@pyimport` presumably comes from PyCall, and `train_language_model`
# is not defined in the visible part of this file -- confirm both are in scope.
@pyimport nltk.corpus as nltk_corpus
corpus_reader=nltk_corpus.brown
corpus = Vector{ASCIIString}[[lowercase(word) for word in sent] for sent in (corpus_reader[:sents]()|> collect)]
# Language model trained on that corpus, with the log-likelihood option enabled.
const log_lm = train_language_model(corpus, loglikelyhood=true)