In [1]:
using PyCall
@pyimport nltk
@pyimport lda

In [2]:
function read_srt(f)
    text_lines = []
    while !eof(f)
        _seq_no = readline(f)
        _time_str = readline(f)
        
        while true
            text_line = strip(readline(f))
            if text_line == ""
                break
            end
            text_line = replace(text_line, "&#39;", "'")
            text_line = replace(text_line, "&gt;&gt;", "")
            push!(text_lines, text_line)
        end
    end
    join(text_lines, ' ')
end

function get_tokens()
    DATA_DIR = "../../data/"
    CAPTIONS_DIR = "compilers_captions/"
    SRT_NAMES = map((s) -> DATA_DIR * CAPTIONS_DIR * s, readdir(DATA_DIR * CAPTIONS_DIR))

    docs = []

    for name in SRT_NAMES
        open(name) do f
            doc = read_srt(f)
            push!(docs, doc)
        end
    end

    tokens = []
    for doc in docs
        doc = replace(doc, "'", "")
        doc = lowercase(doc)
        push!(tokens, nltk.word_tokenize(doc))
    end
    tokens
end

docs = get_tokens()

95-element Array{Any,1}:
 Any["welcome","to","this","course","on","compilers",".","my","name","is"  …  "and","talk","about","these","five","phases","in","more","detail","."]                 
 Any["welcome","back",",","in","this","second","half","of","the","lecture"  …  "well","look","at","each","of","these","phases","in","detail","."]                    
 Any["hello",".","in","this","video","were","going","to","talk","about"  …  "and","an","existing","systems","to","accommodate","those","new","applications","."]     
 Any["hello",",","in","this","and","the","next","few","videos","im"  …  "look","at","some","more","complex","examples","of","cool","programming","."]                
 Any["welcome","back",",","in","this","video","were","going","to","look"  …  "of","a","cool","program","with","some","non-trivial","data","structures","."]          
 Any["hello","again",".","in","this","video","were","gon","na","wrap"  …  "features","as","well","as","the","ones","weve","covered","here","."]  

In [28]:
function get_tdmatrix(docs)
    vocabulary = Set()
    for tokens in docs
        union!(vocabulary, tokens)
    end

    inverse_vocab = [i => s for (i, s) in enumerate(vocabulary)]
    vocabulary = [s => i for (i, s) in enumerate(vocabulary)]

    tdmatrix = zeros(Int64, length(docs), length(vocabulary))

    for (i, tokens) in enumerate(docs)
        for token in tokens
            tdmatrix[i, vocabulary[token],] += 1
        end
    end
    tdmatrix, vocabulary, inverse_vocab
end

tdmatrix, vocabulary, inverse_vocab = get_tdmatrix(docs)

(
95x4873 Array{Int64,2}:
 0  0  0  0  0  0  0  0  0  0  0   0  …  0  0  0  0  0  0  0  0  0  0  0  0
 0  0  0  0  0  0  0  0  0  0  0   0     0  0  0  0  0  0  3  0  0  0  0  0
 0  0  1  0  0  0  0  0  0  0  0   0     0  0  0  3  0  0  1  0  1  0  0  0
 0  0  0  0  0  0  0  0  1  0  0   0     0  0  0  0  0  0  1  0  0  0  0  0
 0  0  0  0  0  0  0  0  0  0  0   0     0  0  0  0  0  0  0  0  0  0  0  1
 0  0  0  0  3  0  0  0  0  0  0   0  …  0  0  0  0  0  0  0  0  0  0  0  0
 0  0  0  0  0  0  0  0  0  0  0   0     0  0  0  0  0  0  0  0  0  0  0  0
 0  0  0  0  0  0  0  0  0  0  0   0     0  0  0  0  0  0  0  0  0  0  0  0
 0  0  0  0  0  0  0  0  0  0  0  10     0  0  0  0  0  0  0  0  0  0  0  0
 0  0  0  0  0  0  0  0  0  0  0   2     0  1  0  0  0  0  1  0  0  0  0  0
 0  0  0  0  0  0  0  0  0  0  0   1  …  0  0  0  0  0  0  1  0  0  0  0  0
 0  0  0  0  0  0  0  0  0  0  0   0     0  0  0  0  1  0  0  0  0  0  0  0
 0  0  0  0  0  0  0  2  0  1  0   0     0  0  0  0  0  0  0  

In [18]:
model = lda.LDA(n_topics=20, n_iter=500)

PyObject <lda.lda.LDA instance at 0x31d197e60>

In [19]:
model[:fit](tdmatrix)

INFO:lda:n_documents: 95
INFO:lda:vocab_size: 4873
INFO:lda:n_words: 194679
INFO:lda:n_topics: 20
INFO:lda:n_iter: 500
INFO:lda:<0> log likelihood: -1800458
INFO:lda:<10> log likelihood: -1472392
INFO:lda:<20> log likelihood: -1400293
INFO:lda:<30> log likelihood: -1373203
INFO:lda:<40> log likelihood: -1358392
INFO:lda:<50> log likelihood: -1349379
INFO:lda:<60> log likelihood: -1340203
INFO:lda:<70> log likelihood: -1332266
INFO:lda:<80> log likelihood: -1323487
INFO:lda:<90> log likelihood: -1314525
INFO:lda:<100> log likelihood: -1305047
INFO:lda:<110> log likelihood: -1295415
INFO:lda:<120> log likelihood: -1289779
INFO:lda:<130> log likelihood: -1283737
INFO:lda:<140> log likelihood: -1274579
INFO:lda:<150> log likelihood: -1263606
INFO:lda:<160> log likelihood: -1255049
INFO:lda:<170> log likelihood: -1248998
INFO:lda:<180> log likelihood: -1241040
INFO:lda:<190> log likelihood: -1236153
INFO:lda:<200> log likelihood: -1231460
INFO:lda:<210> log likelihood: -1226458
INFO:lda:<22

PyObject <lda.lda.LDA instance at 0x31d197e60>

In [20]:
topic_word = model[:topic_word_]

20x4873 Array{Float64,2}:
 3.04439e-6  3.04439e-6   3.04439e-6   …  3.04439e-6  3.04439e-6 
 1.2567e-7   1.2567e-7    1.2567e-7       1.2567e-7   1.2567e-7  
 4.12928e-6  4.12928e-6   4.12928e-6      0.00206877  4.12928e-6 
 3.55572e-7  3.55572e-7   0.000142584     3.55572e-7  3.55572e-7 
 2.48897e-6  0.000500282  2.48897e-6      2.48897e-6  2.48897e-6 
 1.17762e-6  1.17762e-6   1.17762e-6   …  1.17762e-6  1.17762e-6 
 1.84375e-6  1.84375e-6   1.84375e-6      1.84375e-6  1.84375e-6 
 3.18397e-6  3.18397e-6   3.18397e-6      3.18397e-6  3.18397e-6 
 3.84064e-6  3.84064e-6   3.84064e-6      3.84064e-6  3.84064e-6 
 2.38907e-6  2.38907e-6   2.38907e-6      2.38907e-6  2.38907e-6 
 4.37045e-5  4.32718e-7   4.32718e-7   …  4.32718e-7  4.32718e-7 
 2.83388e-6  2.83388e-6   2.83388e-6      2.83388e-6  2.83388e-6 
 1.75755e-6  1.75755e-6   1.75755e-6      1.75755e-6  1.75755e-6 
 2.68476e-6  2.68476e-6   2.68476e-6      2.68476e-6  2.68476e-6 
 3.27254e-6  3.27254e-6   3.27254e-6      3.27254e

In [35]:
n_top_words = 8
n_topics, ~ = size(topic_word)
for i in 1:n_topics
    sorted_indices = sortperm(vec(topic_word[i, :]))
    words = [inverse_vocab[j] for j in sorted_indices[1:n_top_words]]
    println(words)
end

Any["right-recursive","rearrangement","youd","shouldn","null","everywhere","whose","rj"]
Any["right-recursive","rearrangement","youd","shouldn","null","everywhere","whose","rj"]
Any["right-recursive","rearrangement","youd","shouldn","null","everywhere","rj","loads"]
Any["right-recursive","rearrangement","shouldn","null","everywhere","whose","rj","loads"]
Any["right-recursive","youd","shouldn","null","everywhere","whose","rj","favor"]
Any["right-recursive","rearrangement","youd","shouldn","null","everywhere","whose","rj"]
Any["right-recursive","rearrangement","youd","shouldn","null","whose","rj","loads"]
Any["right-recursive","rearrangement","youd","shouldn","null","everywhere","whose","rj"]
Any["right-recursive","rearrangement","youd","shouldn","null","everywhere","whose","rj"]
Any["right-recursive","rearrangement","youd","shouldn","null","everywhere","whose","loads"]
Any["rearrangement","youd","shouldn","null","everywhere","whose","rj","loads"]
Any["right-recursive","rearrangement","y

In [34]:
topic_word

20x4873 Array{Float64,2}:
 3.04439e-6  3.04439e-6   3.04439e-6   …  3.04439e-6  3.04439e-6 
 1.2567e-7   1.2567e-7    1.2567e-7       1.2567e-7   1.2567e-7  
 4.12928e-6  4.12928e-6   4.12928e-6      0.00206877  4.12928e-6 
 3.55572e-7  3.55572e-7   0.000142584     3.55572e-7  3.55572e-7 
 2.48897e-6  0.000500282  2.48897e-6      2.48897e-6  2.48897e-6 
 1.17762e-6  1.17762e-6   1.17762e-6   …  1.17762e-6  1.17762e-6 
 1.84375e-6  1.84375e-6   1.84375e-6      1.84375e-6  1.84375e-6 
 3.18397e-6  3.18397e-6   3.18397e-6      3.18397e-6  3.18397e-6 
 3.84064e-6  3.84064e-6   3.84064e-6      3.84064e-6  3.84064e-6 
 2.38907e-6  2.38907e-6   2.38907e-6      2.38907e-6  2.38907e-6 
 4.37045e-5  4.32718e-7   4.32718e-7   …  4.32718e-7  4.32718e-7 
 2.83388e-6  2.83388e-6   2.83388e-6      2.83388e-6  2.83388e-6 
 1.75755e-6  1.75755e-6   1.75755e-6      1.75755e-6  1.75755e-6 
 2.68476e-6  2.68476e-6   2.68476e-6      2.68476e-6  2.68476e-6 
 3.27254e-6  3.27254e-6   3.27254e-6      3.27254e