# Preprocess text

In [None]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

In [None]:
#export
from exp.nb_11a import *

## Data

We will use the IMDB dataset.

In [None]:
path = datasets.untar_data(datasets.URLs.IMDB)

In [None]:
path.ls()

[PosixPath('/home/jhoward/.fastai/data/imdb/unsup'),
 PosixPath('/home/jhoward/.fastai/data/imdb/imdb.vocab'),
 PosixPath('/home/jhoward/.fastai/data/imdb/tmp_lm'),
 PosixPath('/home/jhoward/.fastai/data/imdb/ld.pkl'),
 PosixPath('/home/jhoward/.fastai/data/imdb/train'),
 PosixPath('/home/jhoward/.fastai/data/imdb/test'),
 PosixPath('/home/jhoward/.fastai/data/imdb/README'),
 PosixPath('/home/jhoward/.fastai/data/imdb/tmp_clas')]

An ItemList that will read the texts in the corresponding filenames.

In [None]:
#export
def read_file(fn):
    "Return the full text content of the file `fn`, decoded as UTF-8."
    with open(fn, mode='r', encoding='utf8') as handle:
        content = handle.read()
    return content
    
class TextList(ItemList):
    "An `ItemList` of texts; `get` reads the file content when an item is a `Path`."
    @classmethod
    def from_files(cls, path, extensions='.txt', recurse=True, include=None, **kwargs):
        "Build the list from the files returned by `get_files` for `path`."
        fnames = get_files(path, extensions, recurse=recurse, include=include)
        return cls(fnames, path, **kwargs)

    def get(self, i):
        "Return the content of the file `i` if it is a `Path`, otherwise `i` itself."
        return read_file(i) if isinstance(i, Path) else i

In [None]:
il = TextList.from_files(path, include=['train', 'test', 'unsup'])

In [None]:
len(il.items)

100000

For text classification, we will split by the grandparent folder as before, but for language modeling, we take all the texts and just put 10% aside for validation.

In [None]:
sd = SplitData.split_by_func(il, partial(random_splitter, p_valid=0.1))

In [None]:
sd

SplitData
Train: TextList (89832 items)
[PosixPath('/home/jhoward/.fastai/data/imdb/unsup/5003_0.txt'), PosixPath('/home/jhoward/.fastai/data/imdb/unsup/13043_0.txt'), PosixPath('/home/jhoward/.fastai/data/imdb/unsup/1072_0.txt'), PosixPath('/home/jhoward/.fastai/data/imdb/unsup/35637_0.txt'), PosixPath('/home/jhoward/.fastai/data/imdb/unsup/30019_0.txt'), PosixPath('/home/jhoward/.fastai/data/imdb/unsup/25190_0.txt'), PosixPath('/home/jhoward/.fastai/data/imdb/unsup/27746_0.txt'), PosixPath('/home/jhoward/.fastai/data/imdb/unsup/44322_0.txt'), PosixPath('/home/jhoward/.fastai/data/imdb/unsup/2255_0.txt'), PosixPath('/home/jhoward/.fastai/data/imdb/unsup/31544_0.txt')...]
Path: /home/jhoward/.fastai/data/imdb
Valid: TextList (10168 items)
[PosixPath('/home/jhoward/.fastai/data/imdb/unsup/24609_0.txt'), PosixPath('/home/jhoward/.fastai/data/imdb/unsup/42624_0.txt'), PosixPath('/home/jhoward/.fastai/data/imdb/unsup/5542_0.txt'), PosixPath('/home/jhoward/.fastai/data/imdb/unsup/25667_0.tx

## Tokenizing

We need to tokenize the dataset first. We will use a processor for this, in conjunction with the [spacy library](https://spacy.io/).

In [None]:
#export
import spacy,html

Before even tokenizing, we will apply a bit of preprocessing on the texts to clean them up:

In [None]:
#export
# Special tokens. These names are referenced by the text rules defined in this notebook and by `default_spec_tok`.
BOS, EOS, UNK, PAD, TK_REP, TK_WREP, TK_UP, TK_MAJ = "xxbos xxeos xxunk xxpad xxrep xxwrep xxup xxmaj".split()

def sub_br(t):
    r"Replace every `<br>`-style tag in `t` (optional spaces, optional `/`, any case) by \n"
    return re.sub(r'<\s*br\s*/?>', "\n", t, flags=re.IGNORECASE)

def spec_add_spaces(t):
    "Surround every `/` and `#` found in `t` with one space on each side."
    special_chars = re.compile(r'([/#])')
    return special_chars.sub(r' \1 ', t)

def rm_useless_spaces(t):
    "Collapse every run of two or more space characters in `t` into a single space."
    space_runs = re.compile(' {2,}')
    return space_runs.sub(' ', t)

def replace_rep(t):
    "Replace a non-space character repeated 4+ times in a row by `TK_REP`, the repeat count and the character."
    def _sub(match):
        # Group 1 is the first occurrence, group 2 holds all the following repeats.
        char, rest = match.group(1), match.group(2)
        return f' {TK_REP} {len(rest)+1} {char} '
    return re.sub(r'(\S)(\1{3,})', _sub, t)
    
def replace_wrep(t):
    "Replace word repetitions by `TK_WREP`, a repetition count and the repeated chunk."
    def _sub(match):
        # Group 1 is a word with its trailing non-word characters, group 2 the 3+ exact repeats that follow.
        word, repeats = match.group(1), match.group(2)
        n_rep = len(repeats.split()) + 1
        return f' {TK_WREP} {n_rep} {word} '
    return re.sub(r'(\b\w+\W+)(\1{3,})', _sub, t)

def fixup_text(x):
    "Clean up various messy artifacts seen in documents, then unescape HTML and squeeze repeated spaces."
    # Applied strictly in this order: each replacement sees the output of the previous ones.
    replacements = (
        ('#39;', "'"), ('amp;', '&'), ('#146;', "'"), ('nbsp;', ' '), ('#36;', '$'),
        ('\\n', "\n"), ('quot;', "'"), ('<br />', "\n"), ('\\"', '"'), ('<unk>', UNK),
        (' @.@ ', '.'), (' @-@ ', '-'), ('\\', ' \\ '),
    )
    for old, new in replacements:
        x = x.replace(old, new)
    return re.sub(r'  +', ' ', html.unescape(x))
    
# `sub_br` has to run before `spec_add_spaces`: once "/" is padded with spaces, "<br/>" has become
# "<br / >", which the `sub_br` pattern (no whitespace allowed between "/" and ">") cannot match.
default_pre_rules = [fixup_text, sub_br, replace_rep, replace_wrep, spec_add_spaces, rm_useless_spaces]
# These tokens are inserted at the front of the vocab in this order, so the order fixes their indices:
# `UNK` comes first because `NumericalizeProcessor` builds `otoi` as a `defaultdict(int)` (missing -> 0),
# and `PAD` second because `pad_collate` defaults to `pad_idx=1`.
# `EOS` is listed as well since `add_eos_bos` appends it to every text.
# NOTE: data numericalized (and pickled) with a previous ordering must be regenerated.
default_spec_tok = [UNK, PAD, BOS, EOS, TK_REP, TK_WREP, TK_UP, TK_MAJ]

In [None]:
#export
def replace_all_caps(x):
    "Lower-case each ALL-CAPS token of `x` (longer than one character) and put `TK_UP` in front of it."
    out = []
    for tok in x:
        if len(tok) > 1 and tok.isupper(): out.extend([TK_UP, tok.lower()])
        else: out.append(tok)
    return out

def deal_caps(x):
    "Lower-case every token of `x`, drop empty ones, and put `TK_MAJ` before each Capitalized token."
    out = []
    for tok in x:
        if tok == '': continue
        # "Capitalized" = upper-case first letter followed by at least one character, all lower-case.
        is_capitalized = tok[0].isupper() and len(tok) > 1 and tok[1:].islower()
        if is_capitalized: out.append(TK_MAJ)
        out.append(tok.lower())
    return out

def add_eos_bos(x):
    "Return the token list `x` with `BOS` prepended and `EOS` appended."
    with_bos = [BOS] + x
    return with_bos + [EOS]

# `replace_all_caps` must run before `deal_caps`: `deal_caps` lower-cases every token, so if it ran
# first no token would be upper-case anymore and `TK_UP` would never be emitted.
default_post_rules = [replace_all_caps, deal_caps, add_eos_bos]

In [None]:
#export
from spacy.symbols import ORTH
from fastai.core import parallel

class TokenizeProcessor(Processor):
    """Tokenize texts with a blank spacy tokenizer, applying `pre_rules` before and `post_rules` after.

    `lang`: language code given to `spacy.blank`.
    `chunksize`: number of texts handed to `proc_chunk` at a time.
    `pre_rules` / `post_rules`: functions applied to each raw text / each list of tokens
        (`default_pre_rules` / `default_post_rules` when `None`).
    `max_workers`: value forwarded to `parallel` as `max_workers`.
    """
    def __init__(self, lang="en", chunksize=5000, pre_rules=None, post_rules=None, max_workers=8):
        self.chunksize,self.max_workers = chunksize,max_workers
        self.tokenizer = spacy.blank(lang).tokenizer
        # Register the special tokens so the tokenizer keeps each of them as a single token.
        for w in default_spec_tok:
            self.tokenizer.add_special_case(w, [{ORTH: w}])
        self.pre_rules  = default_pre_rules  if pre_rules  is None else pre_rules
        self.post_rules = default_post_rules if post_rules is None else post_rules

    def proc_chunk(self, chunk, *args):
        "Tokenize the texts in `chunk` and return one list of tokens per text. Extra positional args are ignored."
        chunk = [compose(t, self.pre_rules) for t in chunk]
        docs = [[d.text for d in doc] for doc in self.tokenizer.pipe(chunk)]
        docs = [compose(t, self.post_rules) for t in docs]
        return docs

    def __call__(self, items):
        "Tokenize `items` (texts, or `Path`s of files to read) chunk by chunk and return one token list per item."
        if len(items) == 0: return []
        if isinstance(items[0], Path): items = [read_file(i) for i in items]
        chunks = [items[i: i+self.chunksize] for i in range(0, len(items), self.chunksize)]
        toks = parallel(self.proc_chunk, chunks, max_workers=self.max_workers)
        # Flatten the per-chunk results in one pass (`sum(toks, [])` copies the growing list every step).
        return [doc for chunk_toks in toks for doc in chunk_toks]

    def proc1(self, item):
        "Tokenize the single text `item`."
        return self.proc_chunk([item])[0]

    def deprocess(self, toks): return [self.deproc1(tok) for tok in toks]
    def deproc1(self, tok):
        "Join the tokens of one text back with spaces."
        return " ".join(tok)

## Numericalizing

In [None]:
#export
import collections

class NumericalizeProcessor(Processor):
    """Map lists of tokens to lists of vocab indices.

    `vocab`: list of tokens (index -> token); when `None` it is built on the first call.
    `max_vocab`: number of most common tokens considered when building the vocab.
    `min_freq`: minimum count a token needs to be kept when building the vocab.
    """
    def __init__(self, vocab=None, max_vocab=60000, min_freq=2):
        self.vocab,self.max_vocab,self.min_freq = vocab,max_vocab,min_freq

    def __call__(self, items):
        # The vocab is defined on the first use.
        if self.vocab is None:
            freq = collections.Counter(p for o in items for p in o)
            self.vocab = [o for o,c in freq.most_common(self.max_vocab) if c >= self.min_freq]
            # Put the special tokens at the front of the vocab, in the order of `default_spec_tok`.
            for o in reversed(default_spec_tok):
                if o in self.vocab: self.vocab.remove(o)
                self.vocab.insert(0, o)
        self._build_otoi()
        return [self.proc1(o) for o in items]

    def _build_otoi(self):
        "Build the token -> index mapping from `self.vocab` and record the index used for unknown tokens."
        assert self.vocab is not None
        self.otoi = collections.defaultdict(int,{v:k for k,v in enumerate(self.vocab)})
        # Unknown tokens go to the index of `UNK` wherever it sits in the vocab (0 if the vocab has no `UNK`).
        self.unk_idx = self.otoi.get(UNK, 0)

    def proc1(self, item):
        "Numericalize one list of tokens; tokens missing from the vocab get `self.unk_idx`."
        # Allows `proc1` to be used before `__call__` when a vocab was given at construction.
        if getattr(self, 'unk_idx', None) is None: self._build_otoi()
        unk_idx = self.unk_idx
        # `.get` (unlike `otoi[o]`) does not insert every unknown token in the mapping.
        return [self.otoi.get(o, unk_idx) for o in item]

    def deprocess(self, idxs):
        assert self.vocab is not None
        return [self.deproc1(idx) for idx in idxs]
    def deproc1(self, idx):
        "Turn one list of indices back into tokens."
        return [self.vocab[i] for i in idx]

When we do language modeling, we will infer the labels from the text during training, so there's no need to label. The training loop expects labels however, so we need to add dummy ones.

In [None]:
proc_tok,proc_num = TokenizeProcessor(),NumericalizeProcessor()

In [None]:
%time ll = label_by_func(sd, lambda x: 0, proc_x = [proc_tok,proc_num])

CPU times: user 20 s, sys: 3.87 s, total: 23.8 s
Wall time: 1min 14s


In [None]:
ll.train.x_obj(0)

"xxbos i had always been interested in watching this well - regarded xxmaj british ghost story \x96 but was still pretty much blown away by it , being generally deemed too low - key for complete success . xxmaj the film ( the u.k. equivalent to the uninvited [ 1944 ] ) is notable for xxmaj james xxmaj mason 's playing of a character role much older than his 36 years ; he 's fine as always , but is matched by xxmaj barbara xxmaj mullen as his wife \x96 and the whole proves a nice showcase , too , for the young xxmaj dennis xxmaj price as a doctor . xxmaj the latter falls for and eventually treats xxmaj margaret xxmaj lockwood , xxmaj mullen 's ingénue companion \x96 who , on the old couple 's inexpensive acquisition of a fashionable but notorious country - house , becomes possessed by the spirit of the latest female occupant ( she had been ill - treated by both masters and servants and would die separated from her lover , another medic ) . xxmaj directed by a former cinematographer ( hi

In [None]:
# Use a context manager so the file is flushed and closed once the dump is done.
with open(path/'ld.pkl', 'wb') as f: pickle.dump(ll, f)

In [None]:
# Only load pickle files you created yourself: unpickling can execute arbitrary code.
with open(path/'ld.pkl', 'rb') as f: ll = pickle.load(f)

## Batching

We have a bit of work to convert our `LabelList` into a `DataBunch` as we don't just want batches of IMDB reviews. We want to stream through all the texts concatenated. We also have to prepare the targets, which are the next words in the text.

In [None]:
#export
class LanguageModelPreLoader():
    """Dataset view over all the texts of `data.x` concatenated into one stream.

    The stream is cut into `bs` rows of `n_batch` tokens; item `idx` is a window of `bptt` tokens
    taken from row `idx % bs`, paired with the same window shifted by one token (the targets).
    """
    def __init__(self, data, bs=64, bptt=70, shuffle=False):
        self.data, self.bs, self.bptt, self.shuffle = data, bs, bptt, shuffle
        n_tokens = sum(len(t) for t in data.x)
        self.n_batch = n_tokens // bs
        self.batchify()

    def __len__(self):
        # One token of each row is held back so that the shifted target window always exists.
        n_windows = (self.n_batch - 1) // self.bptt
        return n_windows * self.bs

    def __getitem__(self, idx):
        row, window = idx % self.bs, idx // self.bs
        start = window * self.bptt
        stop = start + self.bptt
        source = self.batched_data[row]
        return source[start:stop], source[start+1:stop+1]

    def batchify(self):
        "Concatenate the texts (in random order if `shuffle`) and reshape the stream to `bs` rows."
        texts = self.data.x
        if self.shuffle: texts = texts[torch.randperm(len(texts))]
        stream = torch.cat([tensor(t) for t in texts])
        n_kept = self.n_batch * self.bs   # the tokens past this point do not fill a full column
        self.batched_data = stream[:n_kept].view(self.bs, self.n_batch)

In [None]:
dl = DataLoader(LanguageModelPreLoader(ll.valid, shuffle=True), batch_size=64)

In [None]:
iter_dl = iter(dl)
x1,y1 = next(iter_dl)
x2,y2 = next(iter_dl)

In [None]:
x1.size(),y1.size()

(torch.Size([64, 70]), torch.Size([64, 70]))

In [None]:
vocab = proc_num.vocab

In [None]:
" ".join(vocab[o] for o in x1[0])

'xxbos xxmaj this is another xxmaj jean - xxmaj claude xxmaj van xxmaj damme action flick , but a definite notch above many of his other films , at least in production values . xxmaj the photography and stereo sound are excellent . \n\n xxmaj actually , this is pretty simple stuff and old storyline of a low - key good guy rescuing the girl from local xxbos . xxmaj'

In [None]:
" ".join(vocab[o] for o in x2[0])

"here , xxmaj van xxmaj damme defends xxmaj rosanna xxmaj arquette and her small kids . \n\n xxmaj in another respect , the movie is just another avenue for the two stars to show off their incredible bodies . xxmaj neither has ever been shy about doing that and certainly are n't in this movie . xxmaj my only objection is having one of the kids make a sexual remark"

In [None]:
" ".join(vocab[o] for o in y1[0])

'xxmaj this is another xxmaj jean - xxmaj claude xxmaj van xxmaj damme action flick , but a definite notch above many of his other films , at least in production values . xxmaj the photography and stereo sound are excellent . \n\n xxmaj actually , this is pretty simple stuff and old storyline of a low - key good guy rescuing the girl from local xxbos . xxmaj here'

In [None]:
#export
def get_lm_dls(train_ds, valid_ds, bs, bptt, **kwargs):
    "Return the (train, valid) language-model `DataLoader`s; only train is shuffled, valid uses batches of `2*bs`."
    train_pl = LanguageModelPreLoader(train_ds, bs, bptt, shuffle=True)
    train_dl = DataLoader(train_pl, batch_size=bs, **kwargs)
    valid_pl = LanguageModelPreLoader(valid_ds, bs, bptt, shuffle=False)
    valid_dl = DataLoader(valid_pl, batch_size=2*bs, **kwargs)
    return (train_dl, valid_dl)

In [None]:
#export
def lm_databunchify(sd, bs, bptt, **kwargs):
    "Wrap the language-model dataloaders built from `sd.train` and `sd.valid` in a `DataBunch`."
    train_dl, valid_dl = get_lm_dls(sd.train, sd.valid, bs, bptt, **kwargs)
    return DataBunch(train_dl, valid_dl)

In [None]:
bs,bptt = 64,70
data = lm_databunchify(ll, bs, bptt)

## Batching for classification

When we want to tackle classification, gathering the data will be a bit different: first we will label our texts with the folder they come from, and then we will need to apply padding to batch them together. To avoid mixing very long texts with very short ones, we will also use `Sampler` to sort (with a bit of randomness for the training set) our samples by length.

In [None]:
# Only the `train` and `test` folders are included here (not `unsup`).
il = TextList.from_files(path, include=['train', 'test'])
# Split with `grandparent_splitter`, using `test` as the validation name.
sd = SplitData.split_by_func(il, partial(grandparent_splitter, valid_name='test'))
# Label with `parent_labeler`; texts are tokenized then numericalized, labels go through `CategoryProcessor`.
ll = label_by_func(sd, parent_labeler, proc_x = [TokenizeProcessor(), NumericalizeProcessor()], proc_y=CategoryProcessor())

In [None]:
# Use a context manager so the file is flushed and closed once the dump is done.
with open(path/'ll_clas.pkl', 'wb') as f: pickle.dump(ll, f)

In [None]:
# Only load pickle files you created yourself: unpickling can execute arbitrary code.
with open(path/'ll_clas.pkl', 'rb') as f: ll = pickle.load(f)

We saw samplers in notebook 03.

In [None]:
#export
from torch.utils.data import Sampler

class SortSampler(Sampler):
    "Yield the indices of `data_source` sorted by decreasing `key(index)`."
    def __init__(self, data_source, key):
        self.data_source = data_source
        self.key = key

    def __len__(self): return len(self.data_source)

    def __iter__(self):
        order = list(range(len(self.data_source)))
        order.sort(key=self.key, reverse=True)
        return iter(order)

class SortishSampler(Sampler):
    """Yield the indices of `data_source` roughly sorted by decreasing `key(index)`.

    Indices are shuffled, cut into megabatches of `bs*50` that are each sorted by `key`, then cut into
    batches of `bs`. The batch whose first element has the largest key goes first, the batch then in
    last position stays last, and the batches in between are shuffled.
    `key` receives elements of an index tensor (not Python ints).
    """
    def __init__(self, data_source, key, bs):
        self.data_source,self.key,self.bs = data_source,key,bs

    def __len__(self) -> int: return len(self.data_source)

    def __iter__(self):
        n = len(self.data_source)
        if n == 0: return iter([])   # nothing to sample: argmax/cat below would fail on empty input
        idxs = torch.randperm(n)
        mega_sz = self.bs*50
        megabatches = [idxs[i:i+mega_sz] for i in range(0, n, mega_sz)]
        sorted_idx = torch.cat([tensor(sorted(s, key=self.key, reverse=True)) for s in megabatches])
        batches = [sorted_idx[i:i+self.bs] for i in range(0, n, self.bs)]
        # A single batch is already in its final order (and must not be yielded twice as first AND last).
        if len(batches) == 1: return iter(batches[0])
        max_idx = int(torch.argmax(tensor([self.key(ck[0]) for ck in batches])))  # find the chunk with the largest key,
        batches[0],batches[max_idx] = batches[max_idx],batches[0]                 # then make sure it goes first.
        # Shuffle everything between the first and the last batch; with two batches this is empty.
        middle = [batches[i+1] for i in torch.randperm(len(batches)-2)]
        return iter(torch.cat([batches[0], *middle, batches[-1]]))

Padding

In [None]:
#export
def pad_collate(samples, pad_idx=1, pad_first=False):
    """Collate `samples` (pairs of `(sequence of ids, label)`) into a padded batch.

    Returns `(res, labels)`: `res` is a long tensor with one row per sample, as wide as the longest
    sequence and filled with `pad_idx` around the data (before it if `pad_first`, after it otherwise);
    `labels` is the tensor of the second elements of the samples.
    """
    max_len = max(len(s[0]) for s in samples)
    res = torch.full((len(samples), max_len), pad_idx, dtype=torch.long)
    for i,s in enumerate(samples):
        n = len(s[0])
        # An empty sequence leaves its row fully padded. It also must not reach the `pad_first` branch
        # written as `-n:`, since `-0:` selects the whole row and the assignment fails.
        if n == 0: continue
        seq = torch.as_tensor(s[0], dtype=torch.long)
        if pad_first: res[i, max_len-n:] = seq
        else:         res[i, :n] = seq
    return res, tensor([s[1] for s in samples])

In [None]:
# The sort key is the length of the text of each sample; `int(t)` converts the index handed over
# by the sampler (an element of an index tensor) before it is used on `ll.train`.
train_sampler = SortishSampler(ll.train.x, key=lambda t: len(ll.train[int(t)][0]), bs=bs)
# `pad_collate` pads the texts of a batch to the same length.
train_dl = DataLoader(ll.train, batch_size=bs, sampler=train_sampler, collate_fn=pad_collate)

In [None]:
iter_dl = iter(train_dl)
x,y = next(iter_dl)

In [None]:
x.size()

torch.Size([64, 3311])

In [None]:
#export
def get_clas_dls(train_ds, valid_ds, bs, **kwargs):
    "Return the (train, valid) classification `DataLoader`s: length-sorted samplers and `pad_collate` batching."
    def _train_len(t): return len(train_ds.x[t])
    def _valid_len(t): return len(valid_ds.x[t])
    train_sampler = SortishSampler(train_ds.x, key=_train_len, bs=bs)
    valid_sampler = SortSampler(valid_ds.x, key=_valid_len)
    train_dl = DataLoader(train_ds, batch_size=bs, sampler=train_sampler, collate_fn=pad_collate, **kwargs)
    valid_dl = DataLoader(valid_ds, batch_size=bs*2, sampler=valid_sampler, collate_fn=pad_collate, **kwargs)
    return (train_dl, valid_dl)

In [None]:
#export
def clas_databunchify(sd, bs, **kwargs):
    "Wrap the classification dataloaders built from `sd.train` and `sd.valid` in a `DataBunch`."
    train_dl, valid_dl = get_clas_dls(sd.train, sd.valid, bs, **kwargs)
    return DataBunch(train_dl, valid_dl)

In [None]:
bs,bptt = 64,70
# `clas_databunchify` takes `(sd, bs, **kwargs)`: there is no `bptt` for classification, and passing
# it as a third positional argument raises a TypeError.
data = clas_databunchify(ll, bs)

In [None]:
iter_dl = iter(data.train_dl)
x,y = next(iter_dl)

In [None]:
iter_dl = iter(data.valid_dl)
x,y = next(iter_dl)

## Export

In [None]:
!python notebook2script.py 12_text.ipynb

Converted 12_text.ipynb to exp/nb_12.py
