In [6]:
import math
import os
from tempfile import TemporaryDirectory
from typing import Tuple

import torch
from torch import nn, Tensor
import torch.nn.functional as F
from torch.nn import TransformerEncoder, TransformerEncoderLayer
from torch.utils.data import dataset
from torchtext.datasets import WikiText2
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torchdata.datapipes.iter import FileOpener, IterableWrapper


In [7]:
# Build the vocabulary from the tokenized training split; tokens that are
# not in the vocabulary are mapped to the index of the '<unk>' special.
tokenizer = get_tokenizer('basic_english')
train_iter = WikiText2(split='train')
vocab = build_vocab_from_iterator(
    (tokenizer(line) for line in train_iter),
    specials=['<unk>'],
)
unk_index = vocab['<unk>']
vocab.set_default_index(unk_index)

def data_process(raw_text_iter: dataset.IterableDataset) -> Tensor:
    """Converts raw text into a flat Tensor of token ids.

    Every item of ``raw_text_iter`` is run through the module-level
    ``tokenizer`` and then ``vocab``; the resulting ids are concatenated in
    iteration order. Items that produce no ids are skipped.

    Args:
        raw_text_iter: Iterable yielding the raw text items to convert.

    Returns:
        A flat ``torch.long`` tensor holding the ids of all items back to
        back. When no item produces any id (this includes an empty
        iterable), an empty ``torch.long`` tensor is returned, because
        ``torch.cat`` raises on an empty sequence.
    """
    chunks = []
    for item in raw_text_iter:
        ids = torch.tensor(vocab(tokenizer(item)), dtype=torch.long)
        # Keep only items that actually contributed ids.
        if ids.numel() > 0:
            chunks.append(ids)
    if not chunks:
        # Nothing to concatenate: hand back an empty result instead of
        # letting ``torch.cat`` fail on an empty sequence.
        return torch.empty(0, dtype=torch.long)
    return torch.cat(chunks)

# The vocab build above already iterated through ``train_iter``, so request
# fresh iterators for every split before converting them to flat tensors.
train_iter, val_iter, test_iter = WikiText2()
train_data, val_data, test_data = map(
    data_process, (train_iter, val_iter, test_iter)
)

In [10]:
# Tokenize every item yielded by ``train_iter``, keeping one token list per item.
data0 = list(map(tokenizer, train_iter))

In [20]:
# Display the token list stored at index 4 of ``data0`` as a spot check.
data0[4]

['the',
 'game',
 'began',
 'development',
 'in',
 '2010',
 ',',
 'carrying',
 'over',
 'a',
 'large',
 'portion',
 'of',
 'the',
 'work',
 'done',
 'on',
 'valkyria',
 'chronicles',
 'ii',
 '.',
 'while',
 'it',
 'retained',
 'the',
 'standard',
 'features',
 'of',
 'the',
 'series',
 ',',
 'it',
 'also',
 'underwent',
 'multiple',
 'adjustments',
 ',',
 'such',
 'as',
 'making',
 'the',
 'game',
 'more',
 '<unk>',
 'for',
 'series',
 'newcomers',
 '.',
 'character',
 'designer',
 '<unk>',
 'honjou',
 'and',
 'composer',
 'hitoshi',
 'sakimoto',
 'both',
 'returned',
 'from',
 'previous',
 'entries',
 ',',
 'along',
 'with',
 'valkyria',
 'chronicles',
 'ii',
 'director',
 'takeshi',
 'ozawa',
 '.',
 'a',
 'large',
 'team',
 'of',
 'writers',
 'handled',
 'the',
 'script',
 '.',
 'the',
 'game',
 "'",
 's',
 'opening',
 'theme',
 'was',
 'sung',
 'by',
 'may',
 "'",
 'n',
 '.']