In [1]:
import numpy as np
import mxnet as mx
from collections import namedtuple, Counter
from unidecode import unidecode
from itertools import groupby
from mxnet.io import DataIter
from random import shuffle

import deepdish as dd

import operator
import pickle
import re
import warnings

  chunks = self.iterencode(o, _one_shot=True)


In [2]:
# Get rid of annoying Python deprecation warnings from built-in JSON encoder
warnings.filterwarnings("ignore", category=DeprecationWarning)   

In [3]:
# Decode text as UTF-8
# Remove diacritical signs and convert to Latin alphabet
# Separate punctuation as separate "words"
def tokenize_text(fname, vocab=None, invalid_label=0, start_label=1, sep_punctuation=True):
    lines = unidecode(open(fname).read().decode('utf-8')).split('\n')
    lines = [x for x in lines if x]
    lines = map(lambda x: re.findall(r"\w+|[^\w\s]", x, re.UNICODE), lines)    
    sentences, vocab = mx.rnn.encode_sentences(lines, vocab=vocab, invalid_label=invalid_label, start_label=start_label)
    return sentences, vocab

Dataset = namedtuple(
    'Dataset', 
    ['src_sent', 'src_vocab', 'inv_src_vocab', 'targ_sent', 'targ_vocab', 'inv_targ_vocab'])

def invert_dict(d):
    return {v: k for k, v in d.iteritems()}


def get_data(src_path, targ_path, start_label=1, invalid_label=0, pad_symbol='<PAD>'):
    src_sent, src_vocab = tokenize_text(src_path, start_label=start_label,
                                invalid_label=invalid_label)
    
    src_vocab[pad_symbol] = invalid_label
    inv_src_vocab = invert_dict(src_vocab)

    targ_sent, targ_vocab = tokenize_text(targ_path, start_label=start_label, #new_start+1,
                                          invalid_label=invalid_label)
    
    targ_vocab[pad_symbol] = invalid_label
    inv_targ_vocab = invert_dict(targ_vocab)
    
    return Dataset(
        src_sent=src_sent, src_vocab=src_vocab, inv_src_vocab=inv_src_vocab,
        targ_sent=targ_sent, targ_vocab=targ_vocab, inv_targ_vocab=inv_targ_vocab)

In [4]:
def persist_dataset(dataset, path):
    with open(path, 'wb+') as fileobj:
        pickle.dump(dataset, fileobj)
        
def load_dataset(path):
    with open(path, 'rb') as fileobj:
        return pickle.load(fileobj)

In [5]:
dataset = \
    get_data(
        src_path='./data/europarl-v7.es-en.en_small',
        targ_path='./data/europarl-v7.es-en.es_small',
        start_label=1,
        invalid_label=0
    )

In [84]:
class TwoDBisect:
    def __init__(self, buckets):
        self.buckets = sorted(buckets, key=operator.itemgetter(0, 1))
        self.x, self.y = zip(*buckets)
        self.x, self.y = np.array(list(self.x)), np.array(list(self.y))

    def twod_bisect(self, source, target):    
        offset1 = np.searchsorted(self.x, len(source), side='left')
        offset2 = np.where(self.y[offset1:] >= len(target))[0]        
        return self.buckets[offset1 + offset2[0]] 

class Seq2SeqIterator:    
    
    def __init__(self, dataset, buckets=None, batch_size=32, max_sent_len=None):
        self.src_sent = dataset.src_sent
        self.targ_sent = dataset.targ_sent
        # make this default to the maximum of the ???
        if buckets:
            z = zip(*buckets)
            self.max_sent_len = max(z[0], z[1])
            print(self.max_sent_len)
        else:
            self.max_sent_len = max_sent_len
        if max_sent_len:
            self.src_sent, self.targ_sent = self.filter_long_sent(self.src_sent, self.targ_sent, self.max_sent_len) 
        self.src_vocab = dataset.src_vocab
        self.targ_vocab = dataset.targ_vocab
        self.inv_src_vocab = dataset.inv_src_vocab
        self.inv_targ_vocab = dataset.inv_targ_vocab
        # Can't filter smaller counts per bucket if those sentences still exist!
        self.buckets = buckets if buckets else self.gen_buckets(
            self.src_sent, self.targ_sent, filter_smaller_counts_than=1, max_sent_len=max_sent_len)
        self.bisect = TwoDBisect(self.buckets)
        self.max_sent_len = max_sent_len
    
    def group_lengths(self):
#         short_sentences = filter(lambda x: len(x[0]) <= max_len and len(x[1]) <= max_len, zip(self.src_sent, self.targ_sent))
        tuples = []
        ctr = 0
        for src, targ in zip(self.src_sent, self.targ_sent):
            try:
                print(src)
                print(targ)
                len_tup = self.bisect.twod_bisect(src, targ)
                rev_src = src[::-1] 
                tuples.append((src, targ, len_tup))
                sorted_tuples = sorted(tuples, key=operator.itemgetter(2))
            except Exception as e:
                print(e)
                print(src)
                print(targ)
                break

        
    @staticmethod 
    def filter_long_sent(src_sent, targ_sent, max_len):
        return filter(lambda x: len(x[0]) <= max_len and len(x[1]) <= max_len, zip(src_sent, targ_sent))

    @staticmethod
    def gen_buckets(src_sent, targ_sent, filter_smaller_counts_than=None, max_sent_len=60, min_sent_len=1):
        length_pairs = map(lambda x: (len(x[0]), len(x[1])), zip(src_sent, targ_sent))
        counts = list(Counter(length_pairs).items())
        c_sorted = sorted(counts, key=operator.itemgetter(0, 1))
        buckets = [i[0] for i in c_sorted if i[1] >= filter_smaller_counts_than and 
                   (max_sent_len is None or i[0][0] <= max_sent_len) and
                   (max_sent_len is None or i[0][1] <= max_sent_len) and
                   (min_sent_len is None or i[0][0] >= min_sent_len) and
                   (min_sent_len is None or i[0][1] >= min_sent_len)]
        return buckets

#    def reset(self):
#         self.curr_idx = 0
#         random.shuffle(self.idx)
#         for buck in self.data:
#             np.random.shuffle(buck)

#         self.nddata = []
#         self.ndlabel = []
#         for buck in self.data:
#             label = np.empty_like(buck)
#             label[:, :-1] = buck[:, 1:]
#             label[:, -1] = self.invalid_label
#             self.nddata.append(ndarray.array(buck, dtype=self.dtype))
#             self.ndlabel.append(ndarray.array(label, dtype=self.dtype))

#     def next(self):
#         if self.curr_idx == len(self.idx):
#             raise StopIteration
#         i, j = self.idx[self.curr_idx]
#         self.curr_idx += 1

#         if self.major_axis == 1:
#             data = self.nddata[i][j:j+self.batch_size].T
#             label = self.ndlabel[i][j:j+self.batch_size].T
#         else:
#             data = self.nddata[i][j:j+self.batch_size]
#             label = self.ndlabel[i][j:j+self.batch_size]

#         return DataBatch([data], [label], pad=0,
#                          bucket_key=self.buckets[i],
#                          provide_data=[(self.data_name, data.shape)],
#                          provide_label=[(self.label_name, label.shape)])

In [85]:
i1 = Seq2SeqIterator(dataset)


In [None]:
i1.group_lengths()

[1, 2, 3, 4]
[1, 2, 3, 4, 5]
[5, 6, 7, 3, 4, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 5, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 3, 30, 31, 24, 32, 25, 33, 34, 35, 36]
[6, 7, 8, 3, 4, 5, 2, 9, 10, 11, 12, 8, 13, 14, 4, 15, 16, 11, 17, 18, 19, 20, 21, 22, 23, 4, 24, 25, 26, 27, 28, 29, 30]
[37, 16, 38, 24, 39, 40, 41, 16, 3, 42, 43, 44, 45, 43, 46, 22, 47, 16, 48, 3, 49, 29, 25, 50, 2, 51, 52, 25, 53, 2, 54, 55, 31, 56, 57, 58, 36]
[31, 32, 33, 34, 35, 11, 8, 36, 37, 38, 2, 39, 40, 37, 41, 42, 43, 44, 30, 45, 46, 11, 47, 48, 4, 49, 4, 50, 51, 33, 52, 53, 4, 54, 55, 56, 57, 30]
[59, 40, 60, 25, 61, 11, 62, 63, 29, 3, 64, 2, 3, 65, 66, 67, 16, 68, 62, 69, 70, 4, 36]
[20, 21, 33, 58, 59, 60, 61, 8, 62, 63, 47, 64, 65, 11, 66, 8, 67, 4, 68, 3, 4, 5, 30]
[71, 3, 72, 16, 5, 73, 19, 22, 74, 25, 75, 43, 76, 77, 16, 38, 25, 50, 2, 78, 40, 60, 16, 11, 79, 2, 80, 3, 81, 82, 16, 83, 84, 2, 3, 85, 86, 16, 29, 3, 87, 51, 2, 3, 8, 88, 36]
[69, 70, 71, 4, 24, 42, 72, 11, 4, 73, 74, 75, 76, 2

In [70]:
src_sent = dataset.src_sent
targ_sent = dataset.targ_sent

sent_len = lambda x: map(lambda y: len(y), x)
max_len = lambda x: max(sent_len(x))
min_len = lambda x: min(sent_len(x))

min_len = min(min(sent_len(src_sent)), min(sent_len(targ_sent)))
# max_len = max(max(sent_len(src_sent)), max(sent_len(targ_sent)))

# min_len = min
max_len = 65
increment = 5

all_pairs = [(i, j) for i in range(
        min_len,max_len+increment,increment
    ) for j in range(
        min_len,max_len+increment,increment
    )]

In [71]:
i2 = Seq2SeqIterator(dataset, all_pairs)

In [73]:
i2.group_lengths()

[1, 2, 3, 4]
[1, 2, 3, 4, 5]
[5, 6, 7, 3, 4, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 5, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 3, 30, 31, 24, 32, 25, 33, 34, 35, 36]
[6, 7, 8, 3, 4, 5, 2, 9, 10, 11, 12, 8, 13, 14, 4, 15, 16, 11, 17, 18, 19, 20, 21, 22, 23, 4, 24, 25, 26, 27, 28, 29, 30]
[37, 16, 38, 24, 39, 40, 41, 16, 3, 42, 43, 44, 45, 43, 46, 22, 47, 16, 48, 3, 49, 29, 25, 50, 2, 51, 52, 25, 53, 2, 54, 55, 31, 56, 57, 58, 36]
[31, 32, 33, 34, 35, 11, 8, 36, 37, 38, 2, 39, 40, 37, 41, 42, 43, 44, 30, 45, 46, 11, 47, 48, 4, 49, 4, 50, 51, 33, 52, 53, 4, 54, 55, 56, 57, 30]
[59, 40, 60, 25, 61, 11, 62, 63, 29, 3, 64, 2, 3, 65, 66, 67, 16, 68, 62, 69, 70, 4, 36]
[20, 21, 33, 58, 59, 60, 61, 8, 62, 63, 47, 64, 65, 11, 66, 8, 67, 4, 68, 3, 4, 5, 30]
[71, 3, 72, 16, 5, 73, 19, 22, 74, 25, 75, 43, 76, 77, 16, 38, 25, 50, 2, 78, 40, 60, 16, 11, 79, 2, 80, 3, 81, 82, 16, 83, 84, 2, 3, 85, 86, 16, 29, 3, 87, 51, 2, 3, 8, 88, 36]
[69, 70, 71, 4, 24, 42, 72, 11, 4, 73, 74, 75, 76, 2