# Text Preprocessing

Typical text preprocessing steps:

1. Load data.

2. Tokenize.

3. Build vocabulary.

4. Numericalize (map tokens to integer indices).
    
5. Form a data iterator.

In [1]:
import os
import re
import random
import collections
import torch

## Load Data

In [2]:
#@save
def read_novels():
    """Read the novels under ``./data/novels`` as cleaned, lowercase lines.

    Every entry of the folder whose name does not start with ``"."`` is read
    as a text file. Files are visited in sorted file-name order so that the
    result is reproducible (``os.listdir`` returns entries in arbitrary
    order). In each line, every run of non-letter characters is replaced by
    a single space; the line is then stripped and lowercased.

    Returns:
        list[str]: The non-empty cleaned lines of all files.
    """
    folder_path = "./data/novels"
    lines = []
    for file_name in sorted(os.listdir(folder_path)):
        if file_name.startswith("."):
            continue  # skip hidden (dot-prefixed) entries
        # The context manager closes each file as soon as it has been read
        # instead of leaving the handle open until garbage collection.
        with open(os.path.join(folder_path, file_name), "r") as f:
            lines += f.readlines()
    lines = [re.sub('[^A-Za-z]+', ' ', line).strip().lower() for line in lines]
    return [line for line in lines if line]

## Tokenize

In [3]:
#@save
def tokenize(lines, token='char'):
    """Split text lines into word or character tokens.

    Args:
        lines: Iterable of strings, one per text line.
        token: ``'word'`` to split each line on whitespace, or ``'char'``
            (the default) to split each line into single characters.

    Returns:
        list[list[str]]: One list of tokens per input line.

    Raises:
        ValueError: If ``token`` is neither ``'word'`` nor ``'char'``.
    """
    if token == 'word':
        return [line.split() for line in lines]
    if token == 'char':
        return [list(line) for line in lines]
    # Fail loudly: printing and implicitly returning None would let the bad
    # argument surface much later as an empty vocabulary or a TypeError.
    raise ValueError('unknown token type: ' + repr(token))

## Build Vocabulary

In [4]:
#@save
class Vocab:
    """Vocabulary mapping tokens to integer indices and back.

    Index 0 is always the unknown token ``'<unk>'``. It is followed by the
    reserved tokens and then by the corpus tokens in order of decreasing
    frequency. Every token appears exactly once, so ``idx_to_token`` and
    ``token_to_idx`` are always inverse mappings of each other.
    """
    def __init__(self, tokens=None, min_freq=0, reserved_tokens=None):
        """Build the vocabulary.

        Args:
            tokens: List of token lists (one inner list per line).
                ``None`` is treated as an empty corpus.
            min_freq: Corpus tokens whose frequency is below this value are
                left out of the vocabulary, so they map to the unknown index.
            reserved_tokens: Extra tokens placed right after ``'<unk>'``
                regardless of their frequency. ``None`` means no extras.
        """
        if tokens is None:
            tokens = []
        if reserved_tokens is None:
            reserved_tokens = []
        # Count and sort frequencies. ``sorted`` is stable, so tokens with
        # equal counts keep the counter's iteration order.
        counter = collections.Counter(
            token for line in tokens for token in line)
        self.token_freqs = sorted(counter.items(), key=lambda x: x[1],
                                  reverse=True)
        # The index for the unknown token is 0
        self.unk = 0
        candidates = ['<unk>'] + list(reserved_tokens)
        candidates += [token for token, freq in self.token_freqs
                       if freq >= min_freq]
        self.idx_to_token, self.token_to_idx = [], dict()
        for token in candidates:
            # Dict membership is O(1) and also drops repeats, e.g. a
            # reserved token listed twice or '<unk>' passed as reserved;
            # without this, '<unk>' could end up at an index != self.unk.
            if token not in self.token_to_idx:
                self.token_to_idx[token] = len(self.idx_to_token)
                self.idx_to_token.append(token)

    def __len__(self):
        """Return the number of distinct tokens, including ``'<unk>'``."""
        return len(self.idx_to_token)

    def __getitem__(self, tokens):
        """tokens -> indices

        A single token returns its index (``self.unk`` when the token is not
        in the vocabulary). A list or tuple returns a list of the results,
        recursing into nested lists/tuples.
        """
        if not isinstance(tokens, (list, tuple)):
            return self.token_to_idx.get(tokens, self.unk)
        return [self.__getitem__(token) for token in tokens]

    def to_tokens(self, indices):
        """indices -> tokens

        A single index returns its token. A list or tuple returns a list of
        the results, recursing into nested lists/tuples to mirror
        ``__getitem__``.
        """
        if not isinstance(indices, (list, tuple)):
            return self.idx_to_token[indices]
        return [self.to_tokens(index) for index in indices]

## Numericalize

In [5]:
#@save
def load_corpus_novels():
    """Load the novels as a flat list of token indices plus the vocabulary.

    Returns:
        tuple: ``(corpus, vocab)``. ``corpus`` is a flat list with one
        vocabulary index per token (``tokenize`` is called with its default
        token type), with index-0 entries dropped. ``vocab`` is the
        ``Vocab`` built from those tokens.
    """
    token_rows = tokenize(read_novels())
    vocab = Vocab(token_rows)
    # Look each token up once, then discard the ones that mapped to index 0.
    all_ids = (vocab[tok] for row in token_rows for tok in row)
    corpus = [idx for idx in all_ids if idx != 0]
    return corpus, vocab

## Form a Data Iterator

In [6]:
#@save
def seq_data_iter(corpus, batch_size, num_steps):
    """Yield ``(X, Y)`` minibatches of consecutive subsequences.

    Args:
        corpus: Flat list of integer token indices.
        batch_size: Number of rows in every yielded tensor.
        num_steps: Number of columns (time steps) in every yielded tensor.

    Yields:
        tuple: ``(X, Y)`` tensors of shape ``(batch_size, num_steps)``, where
        ``Y`` holds the corpus entries one position after those in ``X``.
    """
    # Skip a random number (0..num_steps inclusive) of leading tokens.
    start = random.randint(0, num_steps)
    # Largest multiple of batch_size that still leaves one trailing token
    # for the shifted-by-one targets.
    usable = (len(corpus) - start - 1) // batch_size * batch_size

    inputs = torch.tensor(corpus[start:start + usable])
    inputs = inputs.reshape(batch_size, -1)
    targets = torch.tensor(corpus[start + 1:start + usable + 1])
    targets = targets.reshape(batch_size, -1)

    # Columns that do not fill a whole num_steps window are dropped.
    for batch in range(inputs.shape[1] // num_steps):
        lo = batch * num_steps
        hi = lo + num_steps
        yield inputs[:, lo:hi], targets[:, lo:hi]

In [7]:
#@save
class SeqDataLoader:
    """Iterable that produces sequence minibatches of the novel corpus."""
    def __init__(self, batch_size, num_steps):
        """Load the corpus and vocabulary and store the batching parameters."""
        corpus, vocab = load_corpus_novels()
        self.corpus = corpus
        self.vocab = vocab
        self.batch_size = batch_size
        self.num_steps = num_steps
        # Stored on the instance; __iter__ calls whatever is stored here.
        self.data_iter_fn = seq_data_iter

    def __iter__(self):
        """Return a new batch iterator built by ``self.data_iter_fn``."""
        make_batches = self.data_iter_fn
        return make_batches(self.corpus, self.batch_size, self.num_steps)

## Putting It All Together

In [8]:
#@save
def load_data_novels(batch_size, num_steps):
    """Build the novel-dataset loader and return it with its vocabulary.

    Args:
        batch_size: Passed through to ``SeqDataLoader``.
        num_steps: Passed through to ``SeqDataLoader``.

    Returns:
        tuple: ``(loader, vocab)`` where ``loader`` is the ``SeqDataLoader``
        and ``vocab`` is its ``vocab`` attribute.
    """
    loader = SeqDataLoader(batch_size, num_steps)
    vocab = loader.vocab
    return loader, vocab

In [9]:
# Build a loader with batch_size=2 and num_steps=5, then show only the
# first (X, Y) minibatch.
data_iter, vocab = load_data_novels(batch_size=2, num_steps=5)
for x, y in data_iter:
    # One call, newline-separated: same output as printing x and then y.
    print(x, y, sep="\n")
    break

tensor([[12,  6, 16,  2,  1],
        [25, 13,  2,  8,  3]])
tensor([[ 6, 16,  2,  1,  8],
        [13,  2,  8,  3,  6]])
