In [1]:
import pipeline
import filepaths as fp

import torch
from rnn_model import EncoderRNN, AttnDecoderRNN
from rnn_model_train import trainIters
from rnn_model_predict import predict_all, predict

In [6]:
def train_model(index_array_pairs, s_vocab_size, t_vocab_size, 
                max_length):
    
    # create Encoder/Decoder models 
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    encoder = EncoderRNN(s_vocab_size, hidden_size).to(device)
    attn_decoder = AttnDecoderRNN(hidden_size, t_vocab_size, max_length, dropout_p).to(device)

    # train models and return losses to plot
    plot_every = n_epochs * len(index_array_pairs)/200.
    print_every=n_epochs * len(index_array_pairs)/25.
    plot_losses = trainIters(
        index_array_pairs, encoder, attn_decoder, n_epochs, max_length, 
        print_every, plot_every = plot_every, 
        learning_rate = learning_rate, max_hours = max_hours,
        clip = clip)
    
    # return trained models and info to plot the losses
    return encoder, attn_decoder, plot_losses, plot_every


In [None]:
#### TOY DATA

hidden_size = 256 
dropout_p = 0.1
learning_rate = 0.01
n_epochs = 50
max_hours = 9
clip = 10

MAX_LENGTH = 24
(encoder, attn_decoder, slang, tlang, plot_losses, max_bpe_length) = pipeline.run(
    fp.spath_toy, fp.tpath_toy, 
    fp.spath_toy, fp.tpath_toy, 
    train_model, predict_all, 
    max_sentence_length = MAX_LENGTH, 
    replace_unknown_words = False, 
    use_bpe = True, num_operations = 80, vocab_threshold = 1,
    padding = False)


Data files preprocessed ...

28 inputs constructed for training ...

0m 8s (- 3m 16s) (56 4%) 4.4199
0m 20s (- 3m 51s) (112 8%) 4.1312
0m 28s (- 3m 30s) (168 12%) 3.9607
0m 38s (- 3m 23s) (224 16%) 3.7780
0m 47s (- 3m 9s) (280 20%) 3.5683
0m 54s (- 2m 53s) (336 24%) 3.3428
1m 2s (- 2m 40s) (392 28%) 3.1180
1m 9s (- 2m 28s) (448 32%) 2.9107
1m 17s (- 2m 17s) (504 36%) 2.7221
1m 26s (- 2m 9s) (560 40%) 2.5532
1m 34s (- 1m 59s) (616 44%) 2.3895
1m 41s (- 1m 50s) (672 48%) 2.2542


In [None]:
#### TUTORIAL DATA - no BPE

hidden_size = 256 
dropout_p = 0.1
learning_rate = 0.01
n_iters = 75000
max_hours = 2


MAX_LENGTH = 10
(encoder, attn_decoder, slang, tlang, plot_losses) = pipeline.run(
    fp.spath_tutorial, fp.tpath_tutorial, 
    fp.spath_tutorial, fp.tpath_tutorial, 
    train_model, predict_all, 
    max_sentence_length = MAX_LENGTH, 
    replace_unknown_words = False, 
    use_bpe = False, num_operations = 200, vocab_threshold = 2,
    padding = False)

In [None]:
#### TUTORIAL DATA

hidden_size = 256 
dropout_p = 0.1
learning_rate = 0.01
n_iters = 75000
max_hours = 9


MAX_LENGTH = 10
(encoder, attn_decoder, slang, tlang, plot_losses) = pipeline.run(
    fp.spath_tutorial, fp.tpath_tutorial, 
    fp.spath_tutorial, fp.tpath_tutorial, 
    train_model, predict_all, 
    max_sentence_length = MAX_LENGTH, 
    replace_unknown_words = False, 
    use_bpe = True, num_operations = 200, vocab_threshold = 2,
    padding = False)


In [None]:
#### TRAIN and TEST DATA using BPE

hidden_size = 256 
dropout_p = 0.1
learning_rate = 0.01
n_iters = 300000
max_hours = 10
clip = 8
use_bpe = True
replace_unknown_words = True
padding = False
MAX_LENGTH = 17

(encoder, attn_decoder, slang, tlang, plot_losses, max_bpe_length) = pipeline.run(
    fp.spath_train, fp.tpath_train, 
    fp.spath_val, fp.tpath_val, 
    train_model, predict_all, 
    max_sentence_length = MAX_LENGTH, 
    replace_unknown_words = True, 
    use_bpe = True, num_operations = 400, vocab_threshold = 5, 
    padding = False)


In [None]:
#### TRAIN and TEST DATA no BPE

hidden_size = 256 
dropout_p = 0.1
learning_rate = 0.01
n_iters = 80000
max_hours = 8

MAX_LENGTH = 25
(encoder, attn_decoder, slang, tlang, plot_losses) = pipeline.run(
    fp.spath_train, fp.tpath_train, 
    fp.spath_test, fp.tpath_test, 
    train_model, predict_all, 
    max_sentence_length = MAX_LENGTH, 
    replace_unknown_words = True, 
    use_bpe = False, 
    padding = False)


In [None]:
import filepaths as fp
import data_preparation as dp

from plots import showLosses, showAttention
from data_processing import preprocess, postprocess

def sanity_check(spath_train, tpath_train, 
        spath_test, tpath_test, 
        fn_train, fn_predict_all,
        max_sentence_length = 50, 
        replace_unknown_words = True, 
        use_bpe = True, num_operations = 400, vocab_threshold = 5,
        padding = True):

    # data preprocessing
    (spath_train_pp, tpath_train_pp, spath_test_pp, tpath_test_pp) = preprocess(
        spath_train, tpath_train, spath_test, tpath_test, 
        max_sentence_length,
        replace_unknown_words, 
        use_bpe, num_operations, vocab_threshold)

    print (f'Data files preprocessed ...')
    print ()
    
    # data structures for training
    (slang, tlang, index_array_pairs, s_index_arrays_test, max_bpe_length) = dp.prepare_data(
        spath_train_pp, tpath_train_pp, spath_test_pp, padding)
        
    return slang, tlang, index_array_pairs, max_bpe_length



In [None]:
hidden_size = 256 
dropout_p = 0.1
learning_rate = 0.01
n_iters = 300000
max_hours = 10
clip = 8
use_bpe = True
replace_unknown_words = True
padding = False

MAX_LENGTH = 17
slang, tlang, index_array_pairs, max_bpe_length = sanity_check(
    fp.spath_train, fp.tpath_train, 
    fp.spath_test, fp.tpath_test, 
    sanity_ckeck, predict_all, 
    max_sentence_length = MAX_LENGTH, 
    replace_unknown_words = replace_unknown_words, 
    use_bpe = use_bpe, 
    padding = padding)


In [None]:
for i in [0,1, 10000, 20000, 28317, 28318]:
    print(dp.sentenceFromIndexes(slang, index_array_pairs[i][0]))
    print(dp.sentenceFromIndexes(tlang, index_array_pairs[i][1]))
    print()

In [4]:
#encoder, attn_decoder, slang, tlang

EncoderRNN(
  (embedding): Embedding(114, 256)
  (gru): GRU(256, 256)
)

In [128]:
import data_preparation as dp

s_sentence = 'une a@@ b@@ e@@ il@@ le pl@@ an@@ ant a@@ u@@ -@@ de@@ s@@ su@@ s de f@@ l@@ e@@ ur@@ s v@@ i@@ ol@@ e@@ t@@ t@@ es et or@@ an@@ g@@ es .'
s_words = s_sentence.split(' ')
s_indices = dp.indexesFromSentence(
    slang, 
    s_sentence
)

t, a = predict(
        encoder, attn_decoder, s_indices, max_bpe_length)

t_words = dp.wordsFromIndexes(tlang,t)
A = a.numpy() # 19 output * 60 input

In [97]:
s_sentence.replace('@@ ', '')

'une abeille planant au-dessus de fleurs violettes et oranges .'

In [130]:
import numpy as np

merge_indices = [i for i, w in enumerate(s_words) if w.endswith('@@')]
resulting_columns = []
merge_column = np.array([])
resulting_words = []
merge_word = ''
for i, column in enumerate(A.T):
    if not merge_column.any():
        merge_column = column
        merge_word = s_words[i] if i < len(s_words) else ''
    else:
        merge_column = (merge_column + column)
        merge_word += s_words[i]
    if i not in merge_indices:
        resulting_words.append(merge_word)
        resulting_columns.append(merge_column)
        merge_column = np.array([])

In [131]:
resulting_words

['une',
 'a@@b@@e@@il@@le',
 'pl@@an@@ant',
 'a@@u@@-@@de@@s@@su@@s',
 'de',
 'f@@l@@e@@ur@@s',
 'v@@i@@ol@@e@@t@@t@@es',
 'et',
 'or@@an@@g@@es',
 '.',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '']

In [132]:
X = np.column_stack(resulting_columns)
X.shape

(19, 35)

In [133]:
merge_indices = [i for i, w in enumerate(t_words) if w.endswith('@@')]
resulting_rows = []
merge_row = np.array([])
resulting_words = []
merge_word = ''
for i, row in enumerate(X):
    if not merge_row.any():
        merge_row = row
        merge_word = t_words[i] if i < len(t_words) else ''
    else:
        merge_row = (merge_row + row)/2.
        merge_word += t_words[i]
    if i not in merge_indices:
        resulting_words.append(merge_word)
        resulting_rows.append(merge_row)
        merge_row = np.array([])

In [134]:
len(resulting_rows)

9

In [135]:
resulting_words

['a',
 'wat@@er@@f@@all',
 'in',
 'a',
 'for@@e@@s@@t',
 'with',
 'm@@an@@y',
 't@@re@@es',
 'EOS']

In [137]:
sum(resulting_rows[5])

0.999999986961484

In [None]:
resulting_rows = []
merge_row = np.array([])
for i, row in enumerate(A):
    if i > len(merge_indices):
        break    
    if not merge_column.any():
        merge_column = column
    else:
        merge_column = (merge_column + column)/2.
    if i not in merge_indices:
        resulting_columns.append(merge_column)
        merge_column = np.array([])

In [139]:
mj = np.column_stack(resulting_columns)

In [140]:
mj.shape

(19, 35)

In [142]:
sum(mj[5])

1.0000000128056854

In [147]:
def merge_bpe_s(s_words, A):
    merge_indices = [i for i, w in enumerate(s_words) if w.endswith('@@')]
    resulting_columns = []
    merge_column = np.array([])
    resulting_words = []
    merge_word = ''
    for i, column in enumerate(A.T):
        if not merge_column.any():
            merge_column = column
            merge_word = s_words[i] if i < len(s_words) else ''
        else:
            merge_column = (merge_column + column)
            merge_word += s_words[i]
        if i not in merge_indices:
            resulting_words.append(merge_word)
            resulting_columns.append(merge_column)
            merge_column = np.array([])
    return resulting_words, np.column_stack(resulting_columns)
        
def merge_bpe_t(t_words, X):
    merge_indices = [i for i, w in enumerate(t_words) if w.endswith('@@')]
    resulting_rows = []
    merge_row = np.array([])
    resulting_words = []
    merge_word = ''
    for i, row in enumerate(X):
        if not merge_row.any():
            merge_row = row
            merge_word = t_words[i] if i < len(t_words) else ''
        else:
            merge_row = (merge_row + row)/2.
            merge_word += t_words[i]
        if i not in merge_indices:
            resulting_words.append(merge_word)
            resulting_rows.append(merge_row)
            merge_row = np.array([])
    return resulting_words, np.row_stack(resulting_rows)
        
def merge_bpe(s_words, output_w, a):
    s_words_merged, X = merge_bpe_s(s_words, A)
    t_words_merged, attentions = merge_bpe_t(t_words, X)
    return (s_words_merged, t_words_merged, attentions)

q,r,s = merge_bpe(s_words, t_words, A)

In [153]:
len(q)

35

In [156]:
sum(s[0])

1.000000053900294

In [13]:
#from itertools import chain
chain(encoder.parameters(), attn_decoder.parameters())

<generator object chain at 0x7fb5ecfcd0a0>

In [10]:
def xchain(*iterables):
    # chain('ABC', 'DEF') --> A B C D E F
    for it in iterables:
        for element in it:
            yield element

In [12]:
encoder.parameters()

<generator object Module.parameters at 0x7fb5ecfcd728>

In [7]:
import math
math.ceil(1.2)

2