In [1]:
import io
import spacy

from torchtext import data
from torchtext import datasets

In [None]:
def preprocessing(f_orig,f_txt):
    xml_tags = ['<url', '<keywords', '<talkid', '<description',
                    '<reviewer', '<translator', '<title', '<speaker']
    
    with io.open(f_txt, mode='w', encoding='utf-8') as fd_txt,
         io.open(f_orig, mode='r', encoding='utf-8') as fd_orig:
            
            for l in fd_orig:
                if not any(tag in l for tag in xml_tags):
                    fd_txt.write(l.strip() + '\n')

In [None]:
preprocessing('./de-en/train.tags.de-en.en','./data/train.en.txt')

In [None]:
preprocessing('./de-en/train.tags.de-en.de','./data/train.de.txt')

In [2]:
spacy_de = spacy.load('de')
spacy_en = spacy.load('en')


def tokenize_en(text):
    return [tok.text for tok in spacy_en.tokenizer(text)]
def tokenize_de(text):
    return [tok.text for tok in spacy_de.tokenizer(text)]

DE = data.Field(tokenize=tokenize_de,
                init_token='<SOS>',
                eos_token='<EOS>',
                fix_length=20,
                lower=True,
                batch_first=True)
EN = data.Field(tokenize=tokenize_en,
                init_token='<SOS>',
                eos_token='<EOS>',
                lower=True,
                fix_length=20,
                batch_first=True)

In [3]:
train = datasets.TranslationDataset(path='./data/train', 
                                    exts=('.de.txt', '.en.txt'),
                                    fields=(DE, EN))

In [4]:
test = datasets.TranslationDataset(path='./data/test', 
                                   exts=('.de.txt', '.en.txt'),
                                   fields=(DE, EN))

In [5]:
DE.build_vocab(train.src, min_freq=3)
EN.build_vocab(train, max_size=50000)

In [6]:
train_iter = data.BucketIterator(dataset=train, 
                                 batch_size=32,
                                 sort_key=lambda x: data.interleave_keys(len(x.src), len(x.trg)))

In [7]:
test_iter = data.BucketIterator(dataset=test, 
                                batch_size=1)

In [10]:
test_batch = next(iter(test_iter))

In [6]:
batch = next(iter(train_iter))

In [12]:
test_batch.src

tensor([[   2,    6,  195,  437,   13,   82, 2076,    0,    5,    3,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1]], device='cuda:0')

In [None]:
batch.trg

In [13]:
' '.join([EN.vocab.itos[i] for i in test_batch.trg[0]])

u'<SOS> and of course , we all share the same adaptive imperatives . <EOS> <pad> <pad> <pad> <pad> <pad> <pad>'

In [None]:
batch.src == 1