In [36]:
using PyCall
@pyimport nltk

In [37]:
# Load word embeddings from a whitespace-separated text file.
#
# Each non-blank line is expected to hold a word followed by the numeric
# components of its vector. Returns a Dict mapping every word to its
# Vector{Float64}. Blank lines are ignored.
function load_embeddings(embedding_file)
    embeddings = Dict{String,Vector{Float64}}()
    #sizehint!(embeddings, 268810)
    # The do-block form of `open` closes the handle when the block exits,
    # including when parsing throws part-way through the file.
    open(embedding_file) do io
        for line in eachline(io)
            fields = split(line)
            # A blank line (e.g. at the end of the file) has no word field.
            isempty(fields) && continue
            word = fields[1]
            # Typed comprehension keeps the element type Float64 even when a
            # line carries a word but no components.
            embeddings[word] = Float64[parsefloat(field) for field in fields[2:end]]
        end
    end
    embeddings
end

load_embeddings (generic function with 1 method)

In [38]:
# Lookup table from words to embedding vectors, with a fallback vector that
# is handed out for words missing from the table.
type WordEmbeddingResolver
    # Maps each known word to its embedding vector.
    word2vec::Dict{String,Vector{Float64}}
    # Vector returned by get_embedding for out-of-vocabulary words.
    oov_vec::Vector{Float64} 
end


# Build a resolver from `embedding_file`, using an all-zero vector as the
# embedding of out-of-vocabulary words.
#
# The zero vector takes its size from an entry of the loaded table. If the
# file yields no embeddings at all, the out-of-vocabulary vector is empty.
function WordEmbeddingResolver(embedding_file::String)
    word2vec = load_embeddings(embedding_file)
    oov_vec = Float64[]
    # Size the fallback from whichever entry is iterated first instead of
    # from a hard-coded word, so vocabularies without "the" also load.
    for known_vec in values(word2vec)
        oov_vec = zeros(size(known_vec)) #Default it to zeros
        break
    end
    WordEmbeddingResolver(word2vec, oov_vec)
end

# Build a resolver from `embedding_file`, taking the out-of-vocabulary vector
# from the file itself: the entry stored under `oov_word` is removed from the
# table and used as the fallback. The fallback vector is printed as it is
# extracted.
function WordEmbeddingResolver(embedding_file::String, oov_word::String)
    table = load_embeddings(embedding_file)
    # pop! both fetches the vector and drops `oov_word` from the vocabulary.
    fallback = pop!(table, oov_word)
    print(fallback)
    return WordEmbeddingResolver(table, fallback)
end


# Return the embedding stored for `word`, or the resolver's
# out-of-vocabulary vector when the word is not in the table.
function get_embedding(resolver::WordEmbeddingResolver, word::String)
    table = resolver.word2vec
    found = haskey(table, word) ? table[word] : resolver.oov_vec
    return found::Vector{Float64}
end

# Report whether `word` has its own entry in the resolver's table
# (the out-of-vocabulary fallback does not count).
function has_word(resolver::WordEmbeddingResolver, word::String)
    table = resolver.word2vec
    return haskey(table, word)
end

# Return the set of all words that have an entry in the resolver's table.
function vocab_words(resolver::WordEmbeddingResolver)
    known_words = keys(resolver.word2vec)
    return Set(known_words)
end

vocab_words (generic function with 1 method)

In [39]:
# Load the global `embeddings` resolver that get_sentence_vectors and
# eval_to_tree read. The active line uses the EMBEDDING_SIZE=50 file and takes
# its "*UNKNOWN*" entry as the out-of-vocabulary vector; the commented-out
# lines are alternative embedding files.
#embeddings =  WordEmbeddingResolver("ACL2012_wordvectors.txt");
embeddings =  WordEmbeddingResolver("embeddings-scaled.EMBEDDING_SIZE=50.txt", "*UNKNOWN*");
#embeddings =  WordEmbeddingResolver("embeddings-scaled.EMBEDDING_SIZE=200.txt", "*UNKNOWN*");


[0.0398781252434,0.048215044877,0.0176077427638,-0.0251486353356,-0.0627525391573,0.025162565373,0.0117941274751,0.06136177709,0.018282828886,0.0289890409378,0.0305114695273,-0.0174257424723,0.0267882660686,-0.0147157088929,-0.00180382572305,-0.0240110330944,0.0348314405236,0.0331470930104,-0.072770576263,-0.0168486166738,-0.0239016486614,-0.0129813224306,-0.0289427776461,-0.0087953471783,-0.00791740590907,0.067884451273,-0.00875883697995,-0.00378532326298,-0.0147556102437,-0.0290714884049,-0.00577404736729,0.00911313898167,-0.0117291610829,-0.0562154301399,-0.0797217562057,-0.00686779437354,0.030256762279,0.00549994945996,-0.00159508308038,-0.0682754357922,-0.0136058665581,-0.0243803014322,-0.0181979406072,0.00254263276224,-0.0295248721186,-0.0390803609727,-0.0137563336122,0.0252651578644,0.0304082419565,0.0337323822856]

In [40]:
#Make sentences
# Split a flat vector of tokens into sentences.
#
# A sentence ends at (and includes) each ".", "!" or "?" token. Tokens after
# the last terminator form a final sentence. No empty sentence is produced
# when the input already ends with a terminator or when the input is empty.
# Returns a vector whose elements are slices of `ss`.
function split_sentences(ss)
    # Element type is the type of a slice of `ss`, so the result is typed
    # even when no sentence is found.
    sentences = typeof(ss[1:0])[]
    first_tok = 1
    for ii in 1:length(ss)
        tok = ss[ii]
        if tok == "." || tok == "!" || tok == "?"
            push!(sentences, ss[first_tok:ii])
            first_tok = ii + 1
        end
    end
    # Trailing tokens without a closing terminator still count as a sentence.
    if first_tok <= length(ss)
        push!(sentences, ss[first_tok:end])
    end
    sentences
end
    

split_sentences (generic function with 1 method)

In [41]:
# Turn a document string into one embedding matrix per sentence.
#
# The document is tokenized with nltk.word_tokenize and split into sentences
# by split_sentences. For each sentence, the embeddings of its tokens (looked
# up in the global `embeddings` resolver) are concatenated horizontally.
function get_sentence_vectors(document_str::String)
    tokens = nltk.word_tokenize(document_str)
    sentences = split_sentences(tokens)

    [hcat(map(word -> get_embedding(embeddings, word), sentence)...)
        for sentence in sentences]
end

get_sentence_vectors (generic function with 1 method)

http://nlp.stanford.edu/pubs/SocherLinNgManning_ICML2011.pdf

http://repository.cmu.edu/cgi/viewcontent.cgi?article=1054&context=robotics

http://en.wikipedia.org/wiki/Brown_Corpus#Part-of-speech_tags_used

In [42]:
# Softmax taken along the first dimension of `z`, so the entries along that
# dimension sum to one.
#
# The maximum along that dimension is subtracted before exponentiating. This
# leaves the result mathematically unchanged but keeps exp from overflowing
# to Inf (and the ratio from becoming NaN) for large inputs.
function softmax(z)
    shifted = z .- maximum(z, 1)
    # Exponentiate once and reuse for both numerator and normaliser.
    expz = exp(shifted)
    expz ./ sum(expz, 1)
end

softmax (generic function with 1 method)

In [45]:
# Weight matrices of the recursive neural network. Each matrix has one extra
# trailing column that multiplies the bias row appended by the eval_* functions.
type RNN
    # Used by eval_score on a node embedding.
    W_score::Matrix{Float64}
    # Used by eval_label on a node embedding, ahead of the softmax.
    W_label::Matrix{Float64}
    # Used by eval_embedding on two stacked child embeddings.
    W::Matrix{Float64}
end

# Create an RNN with small random weights (standard normal draws scaled by
# 0.01) for embeddings of width `input_width` and `num_labels` output labels.
function RNN(input_width::Int, num_labels::Int)
    scale = 0.01
    # One extra column per matrix for the bias input.
    biased_width = input_width + 1
    w_score = scale * randn(1, biased_width)
    w_label = scale * randn(num_labels, biased_width)
    w_merge = scale * randn(input_width, 2 * input_width + 1)
    return RNN(w_score, w_label, w_merge)
end

# Compute the parent embedding(s) for child embeddings `input_A` and `input_B`.
#
# The two inputs are stacked vertically, a row of ones is appended as the
# bias input, and the result is tanh(rnn.W * stacked). Each column of the
# inputs is treated as a separate pair.
function eval_embedding(rnn::RNN, input_A, input_B)
    # Stack once and reuse it for both the column count and the product.
    children = [input_A; input_B]
    bias_in = ones(1, size(children, 2))
    tanh(rnn.W*[children; bias_in])
end

# Score the embedding(s) in `pp`: append a row of ones as the bias input and
# return tanh(rnn.W_score * augmented), one score per column of `pp`.
function eval_score(rnn::RNN, pp)
    ncols = size(pp, 2)
    augmented = vcat(pp, ones(1, ncols))
    return tanh(rnn.W_score * augmented)
end

# Label distribution for the embedding(s) in `pp`: append a row of ones as
# the bias input, apply rnn.W_label and pass the result through softmax.
function eval_label(rnn::RNN, pp)
    ncols = size(pp, 2)
    augmented = vcat(pp, ones(1, ncols))
    return softmax(rnn.W_label * augmented)
end

eval_label (generic function with 1 method)

In [95]:
# Boolean adjacency matrix for a sentence of `sentence_len` words: entry
# (ii, jj) is true exactly when the two positions are immediate neighbours.
function word_adjancy_matrix(sentence_len::Int)
    adjacency = fill(false, sentence_len, sentence_len)
    # Only the first super- and sub-diagonal are set.
    for kk in 1:sentence_len-1
        adjacency[kk, kk+1] = true
        adjacency[kk+1, kk] = true
    end
    return adjacency
end



# Greedily build a binary tree over `sentence` using the RNN `rr`.
#
# Starting with one node per token, each iteration scores every pair of
# currently adjacent nodes, merges the highest-scoring pair into a new node,
# and rewires the adjacency matrix so the new node takes over the neighbours
# of the two merged nodes. The loop ends when no adjacent pair is left.
#
# Token embeddings are looked up in the global `embeddings` resolver.
#
# Returns (tree, embedding, score_total): `tree` is the last entry of
# `backlookup` (a nested 2-tuple once at least one merge has happened),
# `embedding` is the last column of `ss`, and `score_total` is the sum of the
# scores of the merges that were chosen.
function eval_to_tree(rr::RNN, sentence::AbstractArray{String})
    # backlookup[k] describes node k: a token, or a tuple of two merged nodes.
    backlookup = sentence
    # ss holds one embedding column per node, in the same order as backlookup.
    ss = hcat([get_embedding(embeddings, token) for token in backlookup]...)
    AA = word_adjancy_matrix(size(ss,2))
    
    score_total = 0.0
    while(any(AA))
        # Row and column indices of every true entry, i.e. every candidate pair.
        iis, jjs = findn(AA)

        # Candidate parent embedding and score for each pair, one per column.
        pps = eval_embedding(rr, ss[:,iis],ss[:,jjs])
        scores = eval_score(rr, pps)
        best_pair_ind = indmax(scores)
        
         
        ii_best, jj_best = sort([iis[best_pair_ind], jjs[best_pair_ind]])
        #The above makes a more readable output, but doesn't do anything useful,
        #it is same as below:
        #ii_best = iis[best_pair_ind]
        #jj_best = jjs[best_pair_ind]
        # Note: pp_best is the column computed from the unsorted (iis, jjs) pair.
        pp_best = pps[:,best_pair_ind]
        score_total+=scores[best_pair_ind]
        
        # Append the merged node as a new last column / last backlookup entry.
        ss = [ss pp_best]
        backlookup = [backlookup, (backlookup[ii_best],backlookup[jj_best])]
        
        #Adjust Adjacency Matrix

        # The merged pair is no longer adjacent to each other; clearing this
        # first keeps the two children out of the new node's neighbour list.
        AA[ii_best,jj_best] = false
        AA[jj_best,ii_best] = false
        # The new node is adjacent to anything either child was adjacent to.
        AA = [AA; AA[ii_best,:] | AA[jj_best,:]] # Add row
        AA = [AA AA[:,ii_best] | AA[:,jj_best]] # Add col
        # Retire the two children so they cannot be merged again.
        AA[ii_best,:] = AA[jj_best,:] = false
        AA[:,ii_best] = AA[:,jj_best] = false #Remove anything that was adjacent to old

    end
    tree = backlookup[end]
    embedding = ss[:,end]
    tree, embedding, score_total
end
    

eval_to_tree (generic function with 2 methods)

In [105]:
# Tokenize `sentence` with nltk.word_tokenize and return the tokens as a
# one-dimensional array of strings.
function tokenize(sentence::String)
    raw_tokens = nltk.word_tokenize(sentence)
    return convert(Vector{String}, raw_tokens)
end

tokenize (generic function with 1 method)

In [107]:
# Demo: tokenize one sentence, create a randomly initialised RNN for
# embeddings of width 50 with 3 labels, and build a tree for the sentence.
sentence = "The house has a window."
sentence_toks = tokenize(sentence)
rr = RNN(50,3)
eval_to_tree(rr,sentence_toks)

(("The",("house",(".",("window",("has","a"))))),[-0.0168769,-0.0329426,0.0146024,0.0112469,-0.0151907,-0.00793882,0.011336,0.0431983,-0.0707231,0.0647944  …  0.0165571,-0.0100425,0.0304522,0.00663008,0.00529902,-0.0242028,0.0194972,-0.00380375,0.0236347,0.0576142],-0.06525092570204405)

6-element Array{String,1}:
 "The"   
 "house" 
 "has"   
 "a"     
 "window"
 "."     

In [56]:
# Read the part-of-speech tag names from a tag description file.
#
# Only lines containing "||" are considered; the tag is the text before the
# first "||" on such a line. `tags_file` defaults to "brown_tags.txt", so
# calling get_all_pos_tags() with no argument reads that file.
function get_all_pos_tags(tags_file="brown_tags.txt")
    lines = split(open(readall, tags_file), "\n")
    desc_lines = filter(line -> contains(line, "||"), lines)
    [split(line, "||")[1] for line in desc_lines]
end
# Read the tag list once and keep it in the global `pos_tags`.
pos_tags = get_all_pos_tags();

In [111]:
@pyimport nltk.corpus as nltk_corpus
# Preview the tagged sentences of the Brown corpus by printing the index and
# the words of the first few sentences.
for (ii,tagged_sent) in enumerate(nltk_corpus.brown[:tagged_sents]())
    # Unzip the sentence into its words and its tags
    # (presumably tagged_sent is a sequence of (word, tag) pairs - TODO confirm).
    sent, tags = [zip(tagged_sent...)...]
    
    println(ii, sent)
    # NOTE(review): the check runs after the println, so sentences 1 through 11
    # are printed before the loop stops - confirm whether 10 was intended.
    if ii>10
        break
    end
end

1("The","Fulton","County","Grand","Jury","said","Friday","an","investigation","of","Atlanta's","recent","primary","election","produced","``","no","evidence","''","that","any","irregularities","took","place",".")
2("The","jury","further","said","in","term-end","presentments","that","the","City","Executive","Committee",",","which","had","over-all","charge","of","the","election",",","``","deserves","the","praise","and","thanks","of","the","City","of","Atlanta","''","for","the","manner","in","which","the","election","was","conducted",".")
3("The","September-October","term","jury","had","been","charged","by","Fulton","Superior","Court","Judge","Durwood","Pye","to","investigate","reports","of","possible","``","irregularities","''","in","the","hard-fought","primary","which","was","won","by","Mayor-nominate","Ivan","Allen","Jr.",".")
4("``","Only","a","relative","handful","of","such","reports","was","received","''",",","the","jury","said",",","``","considering","the","widespread","interest","i