# Download & Preprocess the IMDB Dataset

In [1]:
function pretty_print_review_and_label(i)
    println(labels[i] * "\t\t" * reviews[i][:80]*"...")
end
g = open("reviews.txt", "r");
reviews = map(x -> x[1:end-1], readlines(g))
close(g)

g = open("labels.txt", "r")
labels = map(x -> x[1:end-1], readlines(g))
close(g)

# Preprocess dataset:

f = open("reviews.txt")
raw_reviews = readlines(f)
close(f)

f = open("labels.txt")
raw_labels = readlines(f)
close(f)

tokens = collect(map(x -> Set(split(x, " ")), raw_reviews))

vocab = Set()
for sent in tokens
    for word in sent
        if length(word)>0
            push!(vocab, word)
        end
    end
end
vocab = collect(vocab)

word2index = Dict()
for (i,word) in enumerate(vocab)
    word2index[word] = i
end

input_dataset = []
for sent in tokens
    sent_indices = []
    for word in sent
        try
            push!(sent_indices, word2index[word])
        catch
            nothing
        end
    end
    push!(input_dataset, Set(sent_indices))
end

target_dataset = []
for label in raw_labels
    if label == "positive"
        push!(target_dataset, 1)
    else
        push!(target_dataset, 0)
    end
end

# The Surprising Power of Averaged Word Vectors

In [3]:
using Statistics:mean
norms = sum(weights_0_1 .* weights_0_1;dims=1)
normed_weights = weights_0_1 .* norms

function make_sent_vect(words)
    indices = map(x -> word2index[x],collect(filter(x -> haskey(word2index,x),words)))
    return mean(normed_weights[:,indices];dims=2)
end

reviews2vectors = []
for review in tokens # tokenized reviews
    push!(reviews2vectors, make_sent_vect(review))
end

reviews2vectors = reduce(hcat, reviews2vectors)
function most_similar_reviews(review)
    v = make_sent_vect(review)
    scores = Dict()
    for (i,val) in enumerate(v' * reviews2vectors)
        scores[i] = val
    end
    most_similar = []
    
    scores = sort(collect(scores), by = x -> x[2])
    for i in scores[1:5]
        idx,score = i
        push!(most_similar,raw_reviews[idx][1:50])
    end
    return most_similar
end

most_similar_reviews(["boring","awful"])

5-element Array{Any,1}:
 "just read the original story which is written by p"
 "more eeriness and dark secrets released in the fin"
 "an enjoyable game  which offers fun and easy game "
 "i have seen a lot of movies . . . this is the firs"
 "cheap  amateurish  unimaginative  exploitative . ."

# Matrices that Change Absolutely Nothing

In [4]:
using LinearAlgebra: I

a = [1,2,3]
b = [0.1,0.2,0.3]
c = [-1,-0.5,0]
d = [0,0,0]

Identity = Matrix(I,(3,3))

3×3 Array{Bool,2}:
 1  0  0
 0  1  0
 0  0  1

In [5]:
println(Identity * a)
println(Identity * b)
println(Identity * c)
println(Identity * d)

[1, 2, 3]
[0.1, 0.2, 0.3]
[-1.0, -0.5, 0.0]
[0, 0, 0]


In [6]:
this = [2,4,6]
movie = [10,10,10]
rocks = [1,1,1]

println(this + movie + rocks)
println(Identity * this + Identity * movie + Identity * rocks)

[13, 15, 17]
[13, 15, 17]


# Forward Propagation in Julia

In [7]:
using Random

function softmax(x)
    temp = exp.(x)
    return temp ./ sum(temp;dims=1)
end

word_vects = Dict()
word_vects["yankees"] = [0.,0.,0.]
word_vects["bears"] = [0.,0.,0.]
word_vects["braves"] = [0.,0.,0.]
word_vects["red"] = [0.,0.,0.]
word_vects["socks"] = [0.,0.,0.]
word_vects["lose"] = [0.,0.,0.]
word_vects["defeat"] = [0.,0.,0.]
word_vects["beat"] = [0.,0.,0.]
word_vects["tie"] = [0.,0.,0.]

sent2output = rand(length(word_vects),3)

Identity = 1.0 .* Matrix(I,(3,3));

In [8]:
layer_0 = word_vects["red"]
layer_1 = Identity * layer_0 + word_vects["socks"]
layer_2 = Identity * layer_1 + word_vects["defeat"]

pred = softmax(sent2output * layer_2)
println(pred)

[0.1111111111111111, 0.1111111111111111, 0.1111111111111111, 0.1111111111111111, 0.1111111111111111, 0.1111111111111111, 0.1111111111111111, 0.1111111111111111, 0.1111111111111111]


# How do we Backpropagate into this?

In [10]:
y = [1,0,0,0,0,0,0,0,0] # target one-hot vector for "yankees"

pred_delta = pred .- y
layer_2_delta = sent2output' * pred_delta
defeat_delta = layer_2_delta .* 1 # can ignore the "1" like prev. chapter
layer_1_delta = Identity * layer_2_delta
socks_delta = layer_1_delta * 1 # again... can ignore the "1"
layer_0_delta = Identity * layer_1_delta
alpha = 0.01
word_vects["red"] .-= layer_0_delta .* alpha
word_vects["socks"] .-= socks_delta .* alpha
word_vects["defeat"] .-= defeat_delta .* alpha
Identity .-= layer_0' * layer_1_delta .* alpha
Identity .-= layer_1' * layer_2_delta .* alpha
sent2output .-= pred_delta * layer_2' .* alpha;

# Let's Train it!

In [11]:
raw = readlines("tasksv11/en/qa1_single-supporting-fact_train.txt")
close(f)

tokens = []
for line in raw[1:1000]
    push!(tokens, split(lowercase(line)," ")[2:end])
end


In [12]:
vocab = Set()
for sent in tokens
    for word in sent
        if length(word)>0
            push!(vocab, word)
        end
    end
end
vocab = collect(vocab)

word2index = Dict()
for (i,word) in enumerate(vocab)
    word2index[word] = i
end

function words2indices(sentence)
    idx =[]
    for word in sentence
        push!(idx, word2index[word])
    end
    return idx
end

function softmax(x)
    e_x = exp.(x .- maximum(x))
    return e_x ./ sum(e_x ;dims=2)
end

softmax (generic function with 1 method)

In [13]:
using Random: seed!
seed!(1)
embed_size = 10

# word embeddings
embed = (rand(embed_size, length(vocab)) .- 0.5) .* 0.1

# embedding -> embedding (initially the identity matrix)
recurrent = 1.0 .* Matrix(I,(embed_size, embed_size));

# sentence embedding for empty sentence
start = zeros(embed_size,1)

# embedding -> output weights
decoder = (rand(length(vocab), embed_size) .- 0.5) .* 0.1

# one hot lookups (for loss function)
one_hot = 1.0 .* Matrix(I,(length(vocab), length(vocab)));

# Forward Propagation with Arbitrary Length

In [14]:
function predict(sent)
    layers = []
    layer = Dict()
    layer["hidden"] = start
    push!(layers, layer)
    
    loss = 0
    
    # forward propagate
    preds = []
    for target_i=1:length(sent)
        
        layer = Dict()
        # try to predict the next term

        layer["pred"] = softmax(layers[end]["hidden"]' * decoder')  ##why transpose?
        
        loss += -log(layer["pred"][sent[target_i]])
        
        # generate the next hidden state
        
        layer["hidden"] = recurrent * layers[end]["hidden"] .+ embed[:,sent[target_i]]
        push!(layers, layer)
    end
    return layers, loss
end


predict (generic function with 1 method)

# Weight Update with Arbitrary Length

In [15]:
# forward

for iter=1:30000
    alpha = 0.001
    sent = words2indices(tokens[(iter % length(tokens))+1][2:end])
    
    layers,loss = predict(sent)

    # back propagate
    for layer_idx in reverse(1:length(layers))
        
        layer = layers[layer_idx]
          
        if (layer_idx > 1)
            target = sent[layer_idx-1]
            
            layer["output_delta"] = layer["pred"] - reshape(one_hot[:,target], (1,:))
            new_hidden_delta =  decoder' * layer["output_delta"]'
            
            # if the last layer - don't pull from a 
            # later one becasue it doesn't exist
            if(layer_idx == length(layers))
                layer["hidden_delta"] = new_hidden_delta
            else
                layer["hidden_delta"] = new_hidden_delta + recurrent' * layers[layer_idx+1]["hidden_delta"] 
            end
        else
            layer["hidden_delta"] = recurrent' * layers[layer_idx+1]["hidden_delta"]
        end
    end
    
    start -= layers[1]["hidden_delta"] .* (alpha / length(sent))
    
    for (layer_idx,layer) in enumerate(layers[2:end])
        decoder .-= layer["output_delta"]' * layers[layer_idx]["hidden"]' .* (alpha / length(sent))

        embed_idx = sent[layer_idx]
        embed[:,embed_idx] .-= dropdims(layers[layer_idx]["hidden_delta"] .* (alpha / length(sent));dims=2)
        recurrent -= layer["hidden_delta"] * layers[layer_idx]["hidden"]' .* (alpha / length(sent))
    end
    if((iter-1) % 1000 == 0)
        
        println("Perplexity: $(exp(loss/length(sent)))")
    end
end

Perplexity: 82.12804928343257
Perplexity: 81.9288000682471
Perplexity: 81.65027823771905
Perplexity: 81.13600268626917
Perplexity: 80.05772120028277
Perplexity: 77.52590840620103
Perplexity: 70.03713721328808
Perplexity: 41.84456883637827
Perplexity: 25.843694020403923
Perplexity: 19.57887591173625
Perplexity: 18.07436760577069
Perplexity: 16.47333734739069
Perplexity: 14.158483856457426
Perplexity: 11.069994257133942
Perplexity: 8.408635577993751
Perplexity: 7.179980870657079
Perplexity: 6.520578072729803
Perplexity: 6.156516553473082
Perplexity: 5.885074372332072
Perplexity: 5.594645689787941
Perplexity: 5.337171014994522
Perplexity: 5.147295077016582
Perplexity: 5.010296680607723
Perplexity: 4.910781155742089
Perplexity: 4.840917261834581
Perplexity: 4.796960480945621
Perplexity: 4.7757649057505756
Perplexity: 4.7589698400308365
Perplexity: 4.660993584220425
Perplexity: 4.401230989635085


In [16]:
sent_index = 5

l,_ = predict(words2indices(tokens[sent_index]))

println(tokens[sent_index])

for (i, each_layer) in enumerate(l[2:end-1])
    input = tokens[sent_index][i]
    True = tokens[sent_index][i+1]
    pred = vocab[argmax(each_layer["pred"][1,:])]
    println("Prev Input: $(input)  True: $(True)  Pred: $(pred)")
end


SubString{String}["sandra", "moved", "to", "the", "garden."]
Prev Input: sandra  True: moved  Pred: is
Prev Input: moved  True: to  Pred: to
Prev Input: to  True: the  Pred: the
Prev Input: the  True: garden.  Pred: bathroom.
