# Neural Machine Translation

**Reference:** Sutskever, Ilya, Oriol Vinyals, and Quoc V. Le. "Sequence to sequence learning with neural networks." In Advances in neural information processing systems, pp. 3104-3112. 2014. ([Paper](https://papers.nips.cc/paper/5346-sequence-to-sequence-learning-with-neural-networks), [Sample code](https://github.com/tensorflow/nmt))

In [1]:
import Pkg; using Pkg; Pkg.add("Knet")

[32m[1m  Updating[22m[39m registry at `~/.julia/registries/General`
[32m[1m  Updating[22m[39m git-repo `https://github.com/JuliaRegistries/General.git`
[?25l[2K[?25h[32m[1m Resolving[22m[39m package versions...
[32m[1m  Updating[22m[39m `~/.julia/environments/v1.2/Project.toml`
[90m [no changes][39m
[32m[1m  Updating[22m[39m `~/.julia/environments/v1.2/Manifest.toml`
[90m [no changes][39m


In [2]:
Pkg.add("IterTools"); Pkg.add("AutoGrad")

[32m[1m Resolving[22m[39m package versions...
[32m[1m  Updating[22m[39m `~/.julia/environments/v1.2/Project.toml`
[90m [no changes][39m
[32m[1m  Updating[22m[39m `~/.julia/environments/v1.2/Manifest.toml`
[90m [no changes][39m
[32m[1m Resolving[22m[39m package versions...
[32m[1m  Updating[22m[39m `~/.julia/environments/v1.2/Project.toml`
[90m [no changes][39m
[32m[1m  Updating[22m[39m `~/.julia/environments/v1.2/Manifest.toml`
[90m [no changes][39m


In [3]:
using Knet, Test, Base.Iterators, IterTools, Random # , LinearAlgebra, StatsBase
using AutoGrad: @gcheck  # to check gradients, use with Float64
#Knet.atype() = KnetArray{Float32}  # determines what Knet.param() uses.
# Debugging aid: `@size z s` asserts that `size(z) == s`. On failure the assertion
# message is the concatenation of `summary(z)`, the text "!=" and `s`.
macro size(z, s)
    check = :(@assert (size($z) == $s) string(summary($z),!=,$s))
    return esc(check)
end

@size (macro with 1 method)

## Part -1. Types from the last project

Please copy the following types and related functions from the last project: `Vocab`,
`TextReader`, `Embed`, `Linear`, `mask!`.

In [4]:
# Vocabulary: a two-way mapping between word strings and integer ids.
struct Vocab
    w2i::Dict{String,Int}   # word -> id
    i2w::Vector{String}     # id -> word
    unk::Int                # id of the unknown-word token
    eos::Int                # id of the end-of-sentence token
    tokenizer               # function that splits a line into tokens
end

# Build a `Vocab` from the text file `file`.
#
# Keywords:
# - `tokenizer`: function applied to every line to obtain its tokens.
# - `vocabsize`: upper bound on the number of entries kept. The `unk` and `eos`
#   entries are part of the candidate list that is truncated.
# - `mincount`: tokens seen fewer than `mincount` times are dropped.
# - `unk`, `eos`: strings used for the unknown and end-of-sentence tokens.
#
# Ids are assigned in ascending order of count, so the most frequent entries
# receive the largest ids.
function Vocab(file::String; tokenizer=split, vocabsize=Inf, mincount=1, unk="<unk>", eos="<s>")
    reserved = 100000           # count given up-front to the special tokens
    counts = Dict()
    counts[unk] = reserved
    counts[eos] = reserved

    for line in eachline(file)
        for token in tokenizer(line)
            counts[token] = get(counts, token, 0) + 1
        end
    end

    # Entries sorted from rarest to most frequent, then cut at `mincount`.
    ascending = sort(collect(counts), by = entry -> entry[2])
    start = findfirst(entry -> entry[2] >= mincount, ascending)
    kept = ascending[start:end]

    # Keep only the `vocabsize` most frequent entries (the tail of the list).
    if length(kept) > vocabsize
        kept = kept[end - vocabsize + 1:end]
    end

    index = Dict()
    words = []
    for (word, _) in kept
        get!(index, word, 1 + length(index))
        push!(words, word)
    end

    return Vocab(index, words, index[unk], index[eos], tokenizer)
end
#=
function Vocab(file::String; tokenizer=split, vocabsize=Inf, mincount=1, unk="<unk>", eos="<s>")
    M = 1000000000000000
    wdict = Dict()
    wcount = Dict()
    w2i(x) = get!(wdict, x, 1+length(wdict))
    w2c(key) = haskey(wcount, key) ? wcount[key] = wcount[key] + 1 : get!(wcount, key, 1)
    wcount[eos] = M; wcount[unk] = M - 1
    i2w = []; 

    
    for line in eachline(file)
        words = tokenizer(line)
        w2c.(words)
    end
    
    sortedcount = sort(collect(wcount), by=x->x[2])
    words = sortedcount[findfirst(x-> x[2]>=mincount, sortedcount):length(sortedcount)]
    
    #vocabsize excludes unk & eos
    if(length(words) > vocabsize)
        words = words[length(words) - vocabsize + 1 : length(words)]
    end
    
    words = reverse(words)

    map(x-> w2i(x[1]) , words)
    map(x-> push!(i2w, x[1]), words)
    
    Vocab(wdict, i2w, wdict[unk], wdict[eos], tokenizer)
end
=#

# Iterable over the lines of a text file. Iteration (see `Base.iterate` below)
# yields one vector of word ids per line, looked up in `vocab`.
struct TextReader
    file::String   # path of the file to read
    vocab::Vocab   # vocabulary providing the tokenizer and the word -> id map
end

# Iteration protocol for `TextReader`. The state is the open file handle; it is
# opened on the first call and closed once end-of-file is reached. Each step
# reads one line, tokenizes it with `r.vocab.tokenizer` and maps every token to
# its id, falling back to `r.vocab.unk` for tokens missing from the vocabulary.
function Base.iterate(r::TextReader, s=nothing)
    io = s === nothing ? open(r.file, "r") : s
    if eof(io)
        close(io)
        return nothing
    end
    lookup(token) = get(r.vocab.w2i, token, r.vocab.unk)
    tokens = r.vocab.tokenizer(readline(io))
    return map(lookup, tokens), io
end

# Iterator traits: the number of lines is not known in advance, and every
# element is declared to be a vector of word ids.
Base.IteratorSize(::Type{TextReader}) = Base.SizeUnknown()
Base.IteratorEltype(::Type{TextReader}) = Base.HasEltype()
Base.eltype(::Type{TextReader}) = Vector{Int}

# Embedding layer: column `i` of `w` is the embedding vector of word id `i`.
struct Embed
    w
end

# Create an embedding layer with an (embedsize, vocabsize) parameter matrix.
Embed(vocabsize::Int, embedsize::Int) = Embed(param(embedsize, vocabsize))

# Look up the embeddings of the ids in `x`. The result has size
# (embedsize, size(x)...): the id array gains a leading embedding dimension.
function (l::Embed)(x)
    embedsz = size(l.w, 1)
    ids = collect(flatten(x))        # ids in column-major order
    return reshape(l.w[:, ids], (embedsz, size(x)...))
end

# Affine layer computing `w * x .+ b`.
struct Linear
    w   # weight matrix, created as (outputsize, inputsize)
    b   # bias vector, created with length outputsize
end

# Create a layer mapping `inputsize` features to `outputsize` features.
Linear(inputsize::Int, outputsize::Int) =
    Linear(param(outputsize, inputsize), param0(outputsize))

# Apply the layer; the bias is broadcast over the columns of `w * x`.
(l::Linear)(x) = l.w * x .+ l.b

# Zero out, in place, the padding at the end of every row of the matrix `a`.
# The first `pad` of each trailing run of `pad` values is kept and the rest of
# that run is set to 0. `pad` values followed by a non-pad value are untouched.
# Returns `a`.
function mask!(a,pad)
    nrows, ncols = size(a)
    for row in 1:nrows
        # Walk back from the end of the row to the last non-pad entry.
        lastword = ncols
        while lastword >= 1 && a[row, lastword] == pad
            lastword -= 1
        end
        # Column lastword+1 holds the pad to keep; everything after it is masked.
        for col in (lastword + 2):ncols
            a[row, col] = 0
        end
    end
    return a
end

mask! (generic function with 1 method)

## Part 0. Load data

We will use the Turkish-English pair from the [TED Talks Dataset](https://github.com/neulab/word-embeddings-for-nmt) for our experiments.

In [5]:
# Directory expected to hold the Turkish-English files used below.
datadir = "datasets/tr_to_en"

# Download and unpack the dataset archive when the data directory is missing.
if !isdir(datadir)
    download("http://www.phontron.com/data/qi18naacl-dataset.tar.gz", "qi18naacl-dataset.tar.gz")
    run(`tar xzf qi18naacl-dataset.tar.gz`)
end


# Build the vocabularies and readers only once per session: the whole block is
# skipped when `tr_vocab` is already defined in Main.
if !isdefined(Main, :tr_vocab)
    # Vocabularies come from the training files; words seen fewer than 5 times are dropped.
    tr_vocab = Vocab("$datadir/tr.train", mincount=5)
    en_vocab = Vocab("$datadir/en.train", mincount=5)
    # One reader per language and per split, all sharing the training vocabularies.
    tr_train = TextReader("$datadir/tr.train", tr_vocab)
    en_train = TextReader("$datadir/en.train", en_vocab)
    tr_dev = TextReader("$datadir/tr.dev", tr_vocab)
    en_dev = TextReader("$datadir/en.dev", en_vocab)
    tr_test = TextReader("$datadir/tr.test", tr_vocab)
    en_test = TextReader("$datadir/en.test", en_vocab)
    @info "Testing data"
    @test length(tr_vocab.i2w) == 38126
    @test length(first(tr_test)) == 16
    @test length(collect(tr_test)) == 5029
end

┌ Info: Testing data
└ @ Main In[5]:18


[32m[1mTest Passed[22m[39m

## Part 1. Minibatching

For minibatching we are going to design a new iterator: `MTData`. This iterator is built
on top of two TextReaders `src` and `tgt` that produce parallel sentences for source and
target languages.

In [6]:
# Minibatch iterator over two parallel `TextReader`s. Sentence pairs are grouped
# into buckets and each bucket is turned into a batch by `batchmaker`.
struct MTData
    src::TextReader        # reader for source language data
    tgt::TextReader        # reader for target language data
    batchsize::Int         # desired batch size
    maxlength::Int         # skip if source sentence above maxlength
    batchmajor::Bool       # batch dims (B,T) if batchmajor=false (default) or (T,B) if true.
    bucketwidth::Int       # batch sentences with length within bucketwidth of each other
    buckets::Vector        # sentences collected in separate arrays called buckets for each length range
    batchmaker::Function   # function that turns a bucket into a batch.
end

#batchsize 128
# Keyword constructor for `MTData`.
#
# Keywords:
# - `batchmaker`: function `(d, bucket) -> batch` used to build a batch.
# - `batchsize`: number of sentence pairs that triggers a batch.
# - `maxlength`: source sentences longer than this are skipped during iteration.
# - `batchmajor`: stored in the `batchmajor` field (see the struct definition).
# - `bucketwidth`: width of the source-length range covered by one bucket.
# - `numbuckets`: number of buckets. The default is at most 128 and at least 1,
#   so a `maxlength` smaller than `bucketwidth` still leaves one bucket to
#   collect sentences in (previously this produced zero buckets and iteration
#   failed on the first sentence).
function MTData(src::TextReader, tgt::TextReader; batchmaker = arraybatch, batchsize = 128, maxlength = typemax(Int),
                batchmajor = false, bucketwidth = 10, numbuckets = max(1, min(128, maxlength ÷ bucketwidth)))
    buckets = [ [] for i in 1:numbuckets ] # buckets[i] is an array of sentence pairs with similar length
    return MTData(src, tgt, batchsize, maxlength, batchmajor, bucketwidth, buckets, batchmaker)
end

# Iterator traits: the number of batches is not known in advance, and every
# element is declared to be a 2-tuple whose entries share one type.
Base.IteratorSize(::Type{MTData}) = Base.SizeUnknown()
Base.IteratorEltype(::Type{MTData}) = Base.HasEltype()
Base.eltype(::Type{MTData}) = NTuple{2}

### iterate(::MTData)

Define the `iterate` function for the `MTData` iterator. `iterate` should return a
`(batch, state)` pair or `nothing` if there are no more batches.  The `batch` is a
`(x::Matrix{Int},y::Matrix{Int})` pair where `x` is a `(batchsize,srclength)` batch of
source language sentences and `y` is a `(batchsize,tgtlength)` batch of parallel target
language translations. The `state` is a pair of `(src_state,tgt_state)` which can be used
to iterate `d.src` and `d.tgt` to get more sentences.  `iterate(d)` without a second
argument should initialize `d` by emptying its buckets and calling `iterate` on the inner
iterators `d.src` and `d.tgt` without a state. Please review the documentation on
iterators from the last project.

To keep similar length sentences together `MTData` uses arrays of similar length sentence
pairs called buckets.  Specifically, the `(src_sentence,tgt_sentence)` pairs coming from
`src` and `tgt` are pushed into `d.buckets[i]` when the length of the source sentence is
in the range `((i-1)*d.bucketwidth+1):(i*d.bucketwidth)`. When one of the buckets reaches
`d.batchsize`, `d.batchmaker` is called with the full bucket to produce a 2-D batch, the
bucket is emptied, and the batch is returned. If `src` and `tgt` are exhausted, the
remaining partially full buckets are turned into batches and returned in any order. If the
source sentence length is larger than `length(d.buckets)*d.bucketwidth`, the last bucket
is used.

Sentences above a certain length can be skipped using the `d.maxlength` field, and
transposed `x,y` arrays can be produced using the `d.batchmajor` field.

In [7]:
# Turn bucket `i` of `d` into a batch with `d.batchmaker`, empty the bucket and
# return the batch. With `d.batchmajor` both arrays of the batch are transposed
# into materialized matrices.
function _flushbucket!(d::MTData, i::Integer)
    batch = d.batchmaker(d, d.buckets[i])
    if d.batchmajor
        batch = (permutedims(batch[1]), permutedims(batch[2]))
    end
    d.buckets[i] = []
    return batch
end

# Iteration protocol for `MTData`.
#
# Returns `(batch, state)` or `nothing` when no batch is left. `state` is the
# pair of stateful iterators wrapping `d.src` and `d.tgt`. Calling without a
# state empties all buckets and restarts both readers.
#
# Sentence pairs are read until a bucket reaches `d.batchsize` or the input is
# exhausted. Pairs whose source is longer than `d.maxlength` are skipped. The
# bucket index is `cld(length, d.bucketwidth)` clamped to the valid range, so
# over-long sentences use the last bucket and empty sentences use the first
# one instead of raising a BoundsError.
#
# Once either reader is exhausted, the remaining non-empty buckets are returned
# one per call. This also covers the cases where the last pair was skipped by
# `d.maxlength` or where the two readers have different lengths; previously the
# partially filled buckets were silently dropped in those cases.
function Base.iterate(d::MTData, state=nothing)
    if state === nothing
        for i in eachindex(d.buckets)
            d.buckets[i] = []
        end
        src = Iterators.Stateful(d.src)
        tgt = Iterators.Stateful(d.tgt)
    else
        src, tgt = state
    end

    while !isempty(src) && !isempty(tgt)
        src_sentence = popfirst!(src)
        tgt_sentence = popfirst!(tgt)
        src_length = length(src_sentence)
        src_length > d.maxlength && continue

        i = clamp(cld(src_length, d.bucketwidth), 1, length(d.buckets))
        push!(d.buckets[i], (src_sentence, tgt_sentence))

        # Emit the bucket that was just extended when it is full, or when this
        # was the last pair of the input.
        exhausted = isempty(src) && isempty(tgt)
        if exhausted || length(d.buckets[i]) == d.batchsize
            return (_flushbucket!(d, i), (src, tgt))
        end
    end

    # No more pairs to read: hand out whatever is left, one bucket per call.
    i = findfirst(!isempty, d.buckets)
    i === nothing && return nothing
    return (_flushbucket!(d, i), (src, tgt))
end

### arraybatch

Define `arraybatch(d, bucket)` to be used as the default `d.batchmaker`. `arraybatch`
takes an `MTData` object and an array of sentence pairs `bucket` and returns a
`(x::Matrix{Int},y::Matrix{Int})` pair where `x` is a `(batchsize,srclength)` batch of
source language sentences and `y` is a `(batchsize,tgtlength)` batch of parallel target
language translations. Note that the sentences in the bucket do not have any `eos` tokens
and they may have different lengths. `arraybatch` should copy the source sentences into
`x` padding shorter ones on the left with `eos` tokens. It should copy the target
sentences into `y` with an `eos` token in the beginning and end of each sentence and
shorter sentences padded on the right with extra `eos` tokens.

In [8]:
# Default `batchmaker`: turn `bucket`, a collection of `(source, target)`
# sentence pairs (vectors of word ids without `eos`), into a pair of matrices
# `(x, y)` with one sentence per row.
#
# - `x` has one column per word of the longest source sentence; shorter
#   sentences are padded on the left with the source `eos` id.
# - `y` has two more columns than the longest target sentence: each row is
#   `eos`, the sentence, `eos`, followed by `eos` padding on the right.
#
# The sentences stored in `bucket` are not modified (the previous version
# pushed the `eos` markers and the padding into the target sentences
# themselves). An empty bucket gives a 0×0 `x` and a 0×2 `y`.
function arraybatch(d::MTData, bucket)
    srceos = d.src.vocab.eos
    tgteos = d.tgt.vocab.eos
    nsent = length(bucket)
    srclen = isempty(bucket) ? 0 : maximum(length(pair[1]) for pair in bucket)
    tgtlen = 2 + (isempty(bucket) ? 0 : maximum(length(pair[2]) for pair in bucket))

    # Start from all-eos matrices and copy the words into place.
    x = fill(srceos, nsent, srclen)
    y = fill(tgteos, nsent, tgtlen)
    for (row, (srcsent, tgtsent)) in enumerate(bucket)
        x[row, (srclen - length(srcsent) + 1):srclen] = srcsent   # right-aligned
        y[row, 2:(length(tgtsent) + 1)] = tgtsent                 # after the leading eos
    end

    return (x, y)
end

arraybatch (generic function with 1 method)

In [9]:
@info "Testing MTData"
# Batch iterators over the train/dev/test splits with the default settings.
dtrn = MTData(tr_train, en_train)
ddev = MTData(tr_dev, en_dev)
dtst = MTData(tr_test, en_test)
x,y = first(dtst)

# Expected batch count and the shape of the first test batch.
@test length(collect(dtst)) == 48
@test size.((x,y)) == ((128,10),(128,24))
# Source rows are padded with eos on the left ...
@test x[1,1] == tr_vocab.eos
@test x[1,end] != tr_vocab.eos
# ... while target rows start and end with eos.
@test y[1,1] == en_vocab.eos;
@test y[1,2] != en_vocab.eos
@test y[1,end] == en_vocab.eos

┌ Info: Testing MTData
└ @ Main In[9]:1


[32m[1mTest Passed[22m[39m

## Part 2. Sequence to sequence model without attention

In this part we will define a simple sequence to sequence encoder-decoder model for
machine translation.

In [10]:
# Sequence-to-sequence encoder-decoder model without attention.
mutable struct S2S_v1
    srcembed::Embed     # source language embedding
    encoder::RNN        # encoder RNN (can be bidirectional)
    tgtembed::Embed     # target language embedding
    decoder::RNN        # decoder RNN
    projection::Linear  # converts decoder output to vocab scores
    dropout::Real       # dropout probability to prevent overfitting
    srcvocab::Vocab     # source language vocabulary
    tgtvocab::Vocab     # target language vocabulary
end

### S2S_v1 constructor

Define the S2S_v1 constructor using your predefined layer types (Embed, Linear), and the
Knet RNN type. Please review the RNN documentation using `@doc RNN`, paying attention to
the following options in particular: `numLayers`, `bidirectional`, `dropout`, `dataType`,
`usegpu`. The last two are important if you experiment with array types other than the
default `KnetArray{Float32}`: make sure the RNNs use the same array type as the other
layers. Note that if the encoder is bidirectional, its `numLayers` should be half of the
decoder so that their hidden states match in size.

In [11]:
# Build an `S2S_v1` model.
#
# Positional arguments: the hidden size shared by both RNNs, the source and
# target embedding sizes, and the source and target vocabularies.
#
# Keywords:
# - `layers`: number of encoder layers.
# - `bidirectional`: whether the encoder is bidirectional. If so, the decoder
#   gets `2 * layers` layers; otherwise it gets `layers`.
# - `dropout`: dropout probability, passed to both RNNs and stored in the model.
function S2S_v1(hidden::Int,         # hidden size for both the encoder and decoder RNN
                srcembsz::Int,       # embedding size for source language
                tgtembsz::Int,       # embedding size for target language
                srcvocab::Vocab,     # vocabulary for source language
                tgtvocab::Vocab;     # vocabulary for target language
                layers=1,            # number of layers
                bidirectional=false, # whether encoder RNN is bidirectional
                dropout=0)           # dropout probability
    srcwords = length(srcvocab.i2w)
    tgtwords = length(tgtvocab.i2w)
    declayers = bidirectional == true ? 2 * layers : layers

    # Layers are created in this order: embeddings, encoder, decoder, projection.
    srcembed = Embed(srcwords, srcembsz)
    tgtembed = Embed(tgtwords, tgtembsz)
    encoder = RNN(srcembsz, hidden; rnnType = :lstm, numLayers = layers,
                  bidirectional = bidirectional, dropout = dropout, h = 0)
    decoder = RNN(tgtembsz, hidden; rnnType = :lstm, numLayers = declayers,
                  dropout = dropout, h = 0)
    projection = Linear(hidden, tgtwords)

    return S2S_v1(srcembed, encoder, tgtembed, decoder, projection, dropout, srcvocab, tgtvocab)
end

S2S_v1

### S2S_v1 loss function

Define the S2S_v1 loss function that takes `src`, a source language minibatch, and `tgt`,
a target language minibatch and returns either a `(total_loss, num_words)` pair if
`average=false`, or `(total_loss/num_words)` average if `average=true`.

Assume that `src` and `tgt` are integer arrays of size `(B,Tx)` and `(B,Ty)` respectively,
where `B` is the batch size, `Tx` is the length of the longest source sequence, `Ty` is
the length of the longest target sequence. The `src` sequences only contain words, the
`tgt` sequences surround the words with `eos` tokens at the start and end. This allows
columns `tgt[:,1:end-1]` to be used as the decoder input and `tgt[:,2:end]` as the desired
decoder output.

Assume any shorter sentences in the batches have been padded with extra `eos` tokens on
the left for `src` and on the right for `tgt`. Don't worry about masking `src` for the
encoder, it doesn't have a significant effect on the loss. However do mask `tgt` before
`nll`: you do not want the padding tokens to be counted in the loss calculation.

Please review `@doc RNN`: in particular the `r.c` and `r.h` fields can be used to get/set
the cell and hidden arrays of an RNN (note that `0` and `nothing` act as special values).

RNNs take a dropout value at construction and apply dropout to the input of every layer if
it is non-zero. You need to handle dropout for other layers in the loss function or in
layer definitions as necessary.

In [12]:
# Loss of the model on one minibatch.
#
# `src` is a batch of source sentences and `tgt` a batch of target sentences
# with one sentence per row. Columns `1:end-1` of `tgt` are fed to the decoder
# and columns `2:end` are the expected outputs. Padding in the expected outputs
# is masked with `mask!` before `nll` is called.
#
# The result is whatever `nll` returns for the given `average` flag.
# NOTE(review): `s.dropout` is not applied anywhere in this method.
function (s::S2S_v1)(src, tgt; average=true)
    # Run the encoder from a fresh state; only its final state is used.
    s.encoder.h = 0
    s.encoder.c = 0
    s.encoder(s.srcembed(src))

    # The decoder starts from a copy of the encoder state.
    s.decoder.h = copy(s.encoder.h)
    s.decoder.c = copy(s.encoder.c)
    hidden = s.decoder(s.tgtembed(tgt[:,1:end-1]))

    # Flatten the last two dimensions so every position is scored independently.
    hiddensize, batchsize, steps = size(hidden)
    scores = s.projection(reshape(hidden, (hiddensize, batchsize*steps)))

    gold = mask!(tgt[:,2:end], s.tgtvocab.eos)
    return nll(scores, gold; average = average)
end

In [13]:
@info "Testing S2S_v1"
# Fix the seed so that the freshly initialized model below is reproducible.
Knet.seed!(1)

model = S2S_v1(512, 512, 512, tr_vocab, en_vocab; layers=2, bidirectional=true, dropout=0.2)
(x,y) = first(dtst)
# Your loss can be slightly different due to different ordering of words in the vocabulary.
# The reference vocabulary starts with eos, unk, followed by words in decreasing frequency.
# The word count is compared exactly, the summed loss only approximately.
@test model(x,y; average=false)[2] == (14097.471f0, 1432)[2] #our version
@test model(x,y; average=false)[1] ≈ (14097.471f0, 1432)[1]
#@test model(x,y; average=false) == (14097.471f0, 1432) ,original

┌ Info: Testing S2S_v1
└ @ Main In[13]:1


[32m[1mTest Passed[22m[39m

### Loss for a whole dataset

Define a `loss(model, data)` which returns a `(Σloss, Nloss)` pair if `average=false` and
a `Σloss/Nloss` average if `average=true` for a whole dataset. Assume that `data` is an
iterator of `(x,y)` pairs such as `MTData` and `model(x,y;average)` is a model like
`S2S_v1` that computes loss on a single `(x,y)` pair.

In [14]:
# Loss of `model` over a whole dataset.
#
# `data` is an iterable of `(x, y)` pairs and `model(x, y; average=false)` must
# return a `(loss, count)` pair. The pairs are summed over all batches; the
# result is `Σloss / Σcount` when `average` is true and `(Σloss, Σcount)`
# otherwise.
function loss(model, data; average=true)
    total, count = 0, 0
    for (x, y) in data
        batchloss, batchcount = model(x, y; average=false)
        total += batchloss
        count += batchcount
    end
    return average ? total / count : (total, count)
end

loss (generic function with 1 method)

In [15]:
@info "Testing loss"
# The summed loss is compared approximately and the word count exactly.
#@test loss(model, dtst, average=false) == (1.0429117f6, 105937) ,true
@test loss(model, dtst, average=false)[1] ≈ (1.0429117f6, 105937)[1] #our version
@test loss(model, dtst, average=false)[2] == (1.0429117f6, 105937)[2] #our version
# Your loss can be slightly different due to different ordering of words in the vocabulary.
# The reference vocabulary starts with eos, unk, followed by words in decreasing frequency.
# Also, because we do not mask src, different batch sizes may lead to slightly different
# losses. The test above gives (1.0429178f6, 105937) with batchsize==1.

┌ Info: Testing loss
└ @ Main In[15]:1


[32m[1mTest Passed[22m[39m

### Training S2S_v1

The following function can be used to train our model. `trn` is the training data, `dev`
is used to determine the best model, `tst...` can be zero or more small test datasets for
loss reporting. It returns the model that does best on `dev`.

In [16]:
# Train `model` on `trn` with Adam and return a copy of the model that reached
# the lowest loss on `dev`.
#
# `tst...` are optional extra datasets whose loss is only reported. The report
# callback is handed to `progress!` with `steps=100` (presumably called every
# 100 iterations; confirm against the Knet documentation). It returns the
# losses on `dev` and on every `tst` dataset as a tuple.
function train!(model, trn, dev, tst...)
    bestmodel = deepcopy(model)
    bestloss = loss(model, dev)

    function report(_)
        current = [ loss(model, d) for d in (dev,tst...) ]
        # Keep a snapshot whenever the dev loss improves.
        if current[1] < bestloss
            bestmodel = deepcopy(model)
            bestloss = current[1]
        end
        return (current...,)
    end

    progress!(report, adam(model, trn), steps=100)
    return bestmodel
end

train! (generic function with 1 method)

You should be able to get under 3.40 dev loss with the following settings in 10
epochs. The training speed on a V100 is about 3 mins/epoch or 40K words/sec, K80 is about
6 times slower. Using settings closer to the Luong paper (per-sentence loss rather than
per-word loss, SGD with lr=1, gclip=1 instead of Adam), you can get to 3.17 dev loss in
about 25 epochs. Using dropout and shuffling batches before each epoch significantly
improve the dev loss. You can play around with hyperparameters but I doubt results will
get much better without attention. To verify your training, here is the dev loss I
observed at the beginning of each epoch in one training session:
`[9.83, 4.60, 3.98, 3.69, 3.52, 3.41, 3.35, 3.32, 3.30, 3.31, 3.33]`

In [17]:
@info "Training S2S_v1"

# NOTE(review): the model is loaded unconditionally from "s2s_v1.jld2" here,
# while the commented-out save/load lines below use other file names
# ("s2s_v2.jld2", "s2s_vDY.jld2"). Confirm which checkpoint is intended.
model = Knet.load("s2s_v1.jld2","model")
epochs = 1
ctrn = collect(dtrn)
# `epochs` passes over the training batches, reshuffled before every pass
# (a single pass with the current setting, despite the name).
trnx10 = collect(flatten(shuffle!(ctrn) for i in 1:epochs))
# The first 20 (shuffled) training batches, used only for loss reporting.
trn20 = ctrn[1:20]
dev38 = collect(ddev)
# Uncomment this to train the model (This takes about 30 mins on a V100):
#model = train!(model, trnx10, dev38, trn20)
# Uncomment this to save the model:
#Knet.save("s2s_v2.jld2","model",model)
# Uncomment this to load the model:
#model = Knet.load("s2s_vDY.jld2","model")

┌ Info: Training S2S_v1
└ @ Main In[17]:1


38-element Array{Tuple{T,T} where T,1}:
 ([38124 38124 … 38123 38126; 38124 38124 … 38123 38126; … ; 38124 38124 … 38123 38126; 38124 38124 … 36514 38126], [18853 18815 … 18853 18853; 18853 18821 … 18853 18853; … ; 18853 18829 … 18853 18853; 18853 18847 … 18853 18853])
 ([37131 36447 … 37057 38126; 38124 38124 … 38102 38126; … ; 38124 38124 … 35612 38126; 38124 38124 … 38106 38126], [18853 18798 … 18856 18853; 18853 18849 … 18853 18853; … ; 18853 18837 … 18853 18853; 18853 18837 … 18853 18853])
 ([38124 38124 … 29405 38126; 38124 38124 … 38123 38126; … ; 38124 38124 … 38123 38126; 38124 38124 … 37789 38126], [18853 18837 … 18853 18853; 18853 18845 … 18853 18853; … ; 18853 18854 … 18853 18853; 18853 18774 … 18853 18853])
 ([38124 38073 … 37495 38126; 38124 38124 … 38043 38126; … ; 38124 38124 … 38123 38126; 38124 38124 … 37559 38126], [18853 18847 … 18853 18853; 18853 18821 … 18853 18853; … ; 18853 18844 … 18853 18853; 18853 18845 … 18853 18853])
 ([38124 38124 … 10541 38126; 38124 3812

### Generating translations

With a single argument, a `S2S_v1` object should take it as a batch of source sentences
and generate translations for them. After passing `src` through the encoder and copying
its hidden states to the decoder, the decoder is run starting with an initial input of all
`eos` tokens. Highest scoring tokens are appended to the output and used as input for the
subsequent decoder steps.  The decoder should stop generating when all sequences in the
batch have generated `eos` or when `stopfactor * size(src,2)` decoder steps are reached. A
correctly shaped target language batch should be returned.

In [18]:
# Greedy translation of a batch of source sentences.
#
# `src` holds one source sentence per row. The encoder is run on `src`, its
# final state is copied to the decoder, and the decoder is then fed its own
# highest-scoring output one step at a time, starting from `eos`.
#
# Returns an integer matrix with one row per sentence. Column 1 is the initial
# `eos`; every further column is one decoding step. Decoding stops when every
# row has produced `eos` at least once, or when the output (including the
# initial `eos` column) is `stopfactor * size(src, 2)` columns wide.
#
# Works for any batch size: the previous version concatenated a length-B
# vector with a 1×B row and therefore failed unless B == 1.
function (s::S2S_v1)(src::Matrix{Int}; stopfactor = 3)
    batchsize = size(src, 1)
    eos = s.tgtvocab.eos
    maxwidth = stopfactor * size(src, 2)

    # Encode from a fresh state and hand the final state to the decoder.
    s.encoder.h = 0
    s.encoder.c = 0
    s.encoder(s.srcembed(src))
    s.decoder.h = copy(s.encoder.h)
    s.decoder.c = copy(s.encoder.c)

    input = fill(eos, batchsize)                  # current decoder input, one id per sentence
    output = reshape(copy(input), batchsize, 1)   # generated ids, one column per step
    finished = falses(batchsize)                  # rows that have produced eos

    while !all(finished) && size(output, 2) < maxwidth
        scores = s.projection(s.decoder(s.tgtembed(input)))
        # Row index of the best score in every column = predicted word id.
        input = reshape(map(x->x[1], argmax(scores, dims = 1)), batchsize)
        output = hcat(output, input)
        finished .|= (input .== eos)
    end

    return output
end

In [19]:
# Utility to convert int arrays to sentence strings
# Convert an array of word ids into a sentence string.
#
# Leading `vocab.eos` ids are skipped and the sentence ends just before the
# next `vocab.eos` (or at the end of the array). The words are joined with
# single spaces. An array containing only `eos` ids gives "".
function int2str(y,vocab)
    ids = vec(y)
    start = findfirst(w -> !isequal(w, vocab.eos), ids)
    start === nothing && return ""
    stop = findnext(isequal(vocab.eos), ids, start)
    finish = stop === nothing ? length(ids) : stop - 1
    return join(vocab.i2w[ids[start:finish]], " ")
end

int2str (generic function with 1 method)

In [None]:
@info "Generating some translations"
# Batches of a single sentence pair each, so one random batch is one sentence.
d = MTData(tr_dev, en_dev, batchsize=1) |> collect
(src,tgt) = rand(d)
out = model(src)
# Print the source, the reference translation and the model output as text.
println("SRC: ", int2str(src,model.srcvocab))
println("REF: ", int2str(tgt,model.tgtvocab))
println("OUT: ", int2str(out,model.tgtvocab))
# Here is a sample output:
# SRC: çin'e 15 şubat 2006'da ulaştım .
# REF: i made it to china on february 15 , 2006 .
# OUT: i got to china , china , at the last 15 years .

### Calculating BLEU

BLEU is the most commonly used metric to measure translation quality. The following should
take a model and some data, generate translations and calculate BLEU.

In [20]:
# Translate the source side of `d` with `s2s`, write the translations to a
# temporary file, and score them against the target file `d.tgt.file` with the
# Moses `multi-bleu.perl` script. The script is downloaded into the working
# directory if it is not already there, and its output goes to the console.
#
# The data is re-read with `batchsize=1`. Returns the path of the temporary
# file holding the translations.
function bleu(s2s,d::MTData)
    d = MTData(d.src,d.tgt,batchsize=1)
    reffile = d.tgt.file
    hypfile,hyp = mktemp()
    try
        for (x,y) in progress(collect(d))
            g = s2s(x)
            for i in 1:size(y,1)
                println(hyp, int2str(g[i,:], d.tgt.vocab))
            end
        end
    finally
        # Release the file handle even when translation fails part-way.
        close(hyp)
    end
    isfile("multi-bleu.perl") || download("https://github.com/moses-smt/mosesdecoder/raw/master/scripts/generic/multi-bleu.perl", "multi-bleu.perl")
    run(pipeline(`cat $hypfile`,`perl multi-bleu.perl $reffile`))
    return hypfile
end

bleu (generic function with 1 method)

Calculating dev BLEU takes about 45 secs on a V100. We get about 8.0 BLEU which is pretty
low. As can be seen from the sample translations a loss of ~3+ (perplexity ~20+) or a BLEU
of ~8 is not sufficient to generate meaningful translations.

In [None]:
@info "Calculating BLEU"
# Score the current model on the dev set; returns the path of the hypothesis file.
bleu(model, ddev)

To improve the quality of translations we can use more training data, different training
and model parameters, or preprocess the input/output: e.g. splitting Turkish words to make
suffixes look more like English function words may help. Other architectures,
e.g. attention and transformer, perform significantly better than this simple S2S model.

*This notebook was generated using [Literate.jl](https://github.com/fredrikekre/Literate.jl).*