# Data Preprocessing

In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

In [2]:
# export
from exp.nb_11a import *

## Data Setup

In [3]:
path = datasets.untar_data(datasets.URLs.IMDB)

In [4]:
path.ls()

[PosixPath('/storage/data/imdb/README'),
 PosixPath('/storage/data/imdb/imdb.vocab'),
 PosixPath('/storage/data/imdb/tmp_lm'),
 PosixPath('/storage/data/imdb/tmp_clas'),
 PosixPath('/storage/data/imdb/unsup'),
 PosixPath('/storage/data/imdb/test'),
 PosixPath('/storage/data/imdb/train')]

In [5]:
# export
def read_file(fn):
    with open(fn, 'r', encoding='utf-8') as f:
        return f.read()

class TextList(ItemList):
    @classmethod
    def from_files(cls, path, extensions='.txt', recurse=True, include=None, **kwargs):
        return cls(get_files(path, extensions, recurse=recurse, include=include), path, **kwargs)
    
    def get(self, i):
        if isinstance(i, Path):
            return read_file(i)
        return i
    

In [6]:
il = TextList.from_files(path, include=['train', 'test', 'unsup'])

In [7]:
len(il.items)

100000

In [8]:
txt = il[0]
txt

"The worst movie I've ever seen, hands down. It is ten times more a rip-off of Lake Placid than it is a sequel. Director David Flores clearly did not go to film school, and the way his cast delivers they're lines, you'd think they were learning English for the first time. Not even Cloris Leachman tries. The first Nintendo games had more convincing special effects. Needless to say I didn't make it to the end of Lake Placid 2, but you don't need to watch more than five minutes to know that this is the biggest waste of celluloid in modern film. Do not let your love of the original tempt you to try this, even if you know it's bad. It's a turkey, no not even a turkey, it's nothing."

In [9]:
sd = SplitData.split_by_func(il, partial(random_splitter, p_valid=0.1))

In [10]:
sd

SplitData
Train: TextList (89994 items)
[PosixPath('/storage/data/imdb/unsup/23490_0.txt'), PosixPath('/storage/data/imdb/unsup/46293_0.txt'), PosixPath('/storage/data/imdb/unsup/44643_0.txt'), PosixPath('/storage/data/imdb/unsup/18801_0.txt'), PosixPath('/storage/data/imdb/unsup/37510_0.txt'), PosixPath('/storage/data/imdb/unsup/17529_0.txt'), PosixPath('/storage/data/imdb/unsup/45937_0.txt'), PosixPath('/storage/data/imdb/unsup/7388_0.txt'), PosixPath('/storage/data/imdb/unsup/20708_0.txt'), PosixPath('/storage/data/imdb/unsup/1499_0.txt')...]
Path: /storage/data/imdb
Valid: TextList (10006 items)
[PosixPath('/storage/data/imdb/unsup/16722_0.txt'), PosixPath('/storage/data/imdb/unsup/6943_0.txt'), PosixPath('/storage/data/imdb/unsup/24382_0.txt'), PosixPath('/storage/data/imdb/unsup/26671_0.txt'), PosixPath('/storage/data/imdb/unsup/18669_0.txt'), PosixPath('/storage/data/imdb/unsup/20899_0.txt'), PosixPath('/storage/data/imdb/unsup/14424_0.txt'), PosixPath('/storage/data/imdb/unsup/

## Tokenizing

We need to tokenize the dataset first, which is splitting a sentence in individual tokens. Those tokens are the basic words or punctuation signs with a few tweaks: don't for instance is split between do and n't. We will use a processor for this, in conjunction with the [spacy library](https://spacy.io/).

In [11]:
# export
import spacy
import html

In [12]:
# export
# special tokens
UNK, PAD, BOS, EOS, TK_REP, TK_WREP, TK_UP, TK_MAJ = "xxunk xxpad xxbos xxeos xxrep xxwrep xxup xxmaj".split()

def sub_br(t):
    "Replaces the <br /> by \n"
    re_br = re.compile(r'<\s*br\s*/?>', re.IGNORECASE)
    return re_br.sub("\n", t)

def spec_add_spaces(t):
    "Add spaces around / and #"
    return re.sub(r'([/#])', r' \1 ', t)

def rm_useless_spaces(t):
    "Remove multiple spaces"
    return re.sub(' {2,}', ' ', t)

def replace_rep(t):
    "Replace repetitions at the character level: cccc -> TK_REP 4 c"
    def _replace_rep(m:Collection[str]) -> str:
        c,cc = m.groups()
        return f' {TK_REP} {len(cc)+1} {c} '
    re_rep = re.compile(r'(\S)(\1{3,})')
    return re_rep.sub(_replace_rep, t)
    
def replace_wrep(t):
    "Replace word repetitions: word word word -> TK_WREP 3 word"
    def _replace_wrep(m:Collection[str]) -> str:
        c,cc = m.groups()
        return f' {TK_WREP} {len(cc.split())+1} {c} '
    re_wrep = re.compile(r'(\b\w+\W+)(\1{3,})')
    return re_wrep.sub(_replace_wrep, t)

def fixup_text(x):
    "Various messy things we've seen in documents"
    re1 = re.compile(r'  +')
    x = x.replace('#39;', "'").replace('amp;', '&').replace('#146;', "'").replace(
        'nbsp;', ' ').replace('#36;', '$').replace('\\n', "\n").replace('quot;', "'").replace(
        '<br />', "\n").replace('\\"', '"').replace('<unk>',UNK).replace(' @.@ ','.').replace(
        ' @-@ ','-').replace('\\', ' \\ ')
    return re1.sub(' ', html.unescape(x))
    
default_pre_rules = [fixup_text, replace_rep, replace_wrep, spec_add_spaces, rm_useless_spaces, sub_br]
default_spec_tok = [UNK, PAD, BOS, EOS, TK_REP, TK_WREP, TK_UP, TK_MAJ]

In [13]:
replace_rep('cccc')

' xxrep 4 c '

In [14]:
replace_wrep('word word word word word ')

' xxwrep 5 word  '

These rules are applies after the tokenization on the list of tokens.

In [15]:
#export
def replace_all_caps(x):
    "Replace tokens in ALL CAPS by their lower version and add `TK_UP` before."
    res = []
    for t in x:
        if t.isupper() and len(t) > 1: res.append(TK_UP); res.append(t.lower())
        else: res.append(t)
    return res

def deal_caps(x):
    "Replace all Capitalized tokens in by their lower version and add `TK_MAJ` before."
    res = []
    for t in x:
        if t == '': continue
        if t[0].isupper() and len(t) > 1 and t[1:].islower(): res.append(TK_MAJ)
        res.append(t.lower())
    return res

def add_eos_bos(x): return [BOS] + x + [EOS]

default_post_rules = [deal_caps, replace_all_caps, add_eos_bos]

In [16]:
replace_all_caps(['I', 'AM', 'SHOUTING'])

['I', 'xxup', 'am', 'xxup', 'shouting']

In [17]:
deal_caps(['My', 'name', 'is', 'Jeremy'])

['xxmaj', 'my', 'name', 'is', 'xxmaj', 'jeremy']

Since tokenizing and applying those rules takes a bit of time, we'll parallelize it using `ProcessPoolExecutor` to go faster.

In [18]:
#export
from spacy.symbols import ORTH
from concurrent.futures import ProcessPoolExecutor

def parallel(func, arr, max_workers=4):
    if max_workers<2: results = list(progress_bar(map(func, enumerate(arr)), total=len(arr)))
    else:
        with ProcessPoolExecutor(max_workers=max_workers) as ex:
            return list(progress_bar(ex.map(func, enumerate(arr)), total=len(arr)))
    if any([o is not None for o in results]): return results

In [19]:
#export
class TokenizeProcessor(Processor):
    def __init__(self, lang="en", chunksize=2000, pre_rules=None, post_rules=None, max_workers=4): 
        self.chunksize,self.max_workers = chunksize,max_workers
        self.tokenizer = spacy.blank(lang).tokenizer
        for w in default_spec_tok:
            self.tokenizer.add_special_case(w, [{ORTH: w}])
        self.pre_rules  = default_pre_rules  if pre_rules  is None else pre_rules
        self.post_rules = default_post_rules if post_rules is None else post_rules

    def proc_chunk(self, args):
        i,chunk = args
        chunk = [compose(t, self.pre_rules) for t in chunk]
        docs = [[d.text for d in doc] for doc in self.tokenizer.pipe(chunk)]
        docs = [compose(t, self.post_rules) for t in docs]
        return docs

    def __call__(self, items): 
        toks = []
        if isinstance(items[0], Path): items = [read_file(i) for i in items]
        chunks = [items[i: i+self.chunksize] for i in (range(0, len(items), self.chunksize))]
        toks = parallel(self.proc_chunk, chunks, max_workers=self.max_workers)
        return sum(toks, [])
    
    def proc1(self, item): return self.proc_chunk([item])[0]
    
    def deprocess(self, toks): return [self.deproc1(tok) for tok in toks]
    def deproc1(self, tok):    return " ".join(tok)

In [20]:
tp = TokenizeProcessor()

In [21]:
txt[:250]

"The worst movie I've ever seen, hands down. It is ten times more a rip-off of Lake Placid than it is a sequel. Director David Flores clearly did not go to film school, and the way his cast delivers they're lines, you'd think they were learning Englis"

In [22]:
' • '.join(tp(il[:100])[0])[:400]

"xxbos • xxmaj • the • worst • movie • i • 've • ever • seen • , • hands • down • . • xxmaj • it • is • ten • times • more • a • rip • - • off • of • xxmaj • lake • xxmaj • placid • than • it • is • a • sequel • . • xxmaj • director • xxmaj • david • xxmaj • flores • clearly • did • not • go • to • film • school • , • and • the • way • his • cast • delivers • they • 're • lines • , • you • 'd • thi"

## Numericalizing

In [23]:
#export
import collections

class NumericalizeProcessor(Processor):
    def __init__(self, vocab=None, max_vocab=60000, min_freq=2): 
        self.vocab,self.max_vocab,self.min_freq = vocab,max_vocab,min_freq
    
    def __call__(self, items):
        #The vocab is defined on the first use.
        if self.vocab is None:
            freq = Counter(p for o in items for p in o)
            self.vocab = [o for o,c in freq.most_common(self.max_vocab) if c >= self.min_freq]
            for o in reversed(default_spec_tok):
                if o in self.vocab: self.vocab.remove(o)
                self.vocab.insert(0, o)
        if getattr(self, 'otoi', None) is None:
            self.otoi = collections.defaultdict(int,{v:k for k,v in enumerate(self.vocab)}) 
        return [self.proc1(o) for o in items]
    def proc1(self, item):  return [self.otoi[o] for o in item]
    
    def deprocess(self, idxs):
        assert self.vocab is not None
        return [self.deproc1(idx) for idx in idxs]
    def deproc1(self, idx): return [self.vocab[i] for i in idx]

When we do language modeling, we will infer the labels from the text during training, so there's no need to label. The training loop expects labels however, so we need to add dummy ones.

In [24]:
proc_tok,proc_num = TokenizeProcessor(max_workers=8),NumericalizeProcessor()

In [25]:
%time ll = label_by_func(sd, lambda x: 0, proc_x = [proc_tok,proc_num])

CPU times: user 33.2 s, sys: 9.78 s, total: 43 s
Wall time: 4min 44s


Once the items have been processed they will become list of numbers, we can still access the underlying raw data in `x_obj` (or `y_obj` for the targets, but we don't have any here).

In [26]:
ll.train.x_obj(0)

"xxbos xxmaj the worst movie i 've ever seen , hands down . xxmaj it is ten times more a rip - off of xxmaj lake xxmaj placid than it is a sequel . xxmaj director xxmaj david xxmaj flores clearly did not go to film school , and the way his cast delivers they 're lines , you 'd think they were learning xxmaj english for the first time . xxmaj not even xxmaj cloris xxmaj leachman tries . xxmaj the first xxmaj nintendo games had more convincing special effects . xxmaj needless to say i did n't make it to the end of xxmaj lake xxmaj placid 2 , but you do n't need to watch more than five minutes to know that this is the biggest waste of celluloid in modern film . xxmaj do not let your love of the original tempt you to try this , even if you know it 's bad . xxmaj it 's a turkey , no not even a turkey , it 's nothing . xxeos"

In [27]:
pickle.dump(ll, open(datasets.Config.model_path()/'imdb'/'ld.pkl', 'wb'))

In [25]:
ll = pickle.load(open(datasets.Config.model_path()/'imdb'/'ld.pkl', 'rb'))

## Batching

We have a bit of work to convert our `LabelList` in a `DataBunch` as we don't just want batches of IMDB reviews. We want to stream through all the texts concatenated. We also have to prepare the targets that are the newt words in the text. All of this is done with the next object called `LM_PreLoader`. At the beginning of each epoch, it'll shuffle the articles (if `shuffle=True`) and create a big stream by concatenating all of them. We divide this big stream in `bs` smaller streams. That we will read in chunks of bptt length.

[Jump_to lesson 12 video](https://course.fast.ai/videos/?lesson=12&t=5565)

In [28]:
# Just using those for illustration purposes, they're not used otherwise.
from IPython.display import display,HTML
import pandas as pd

Let's say our stream is:

In [29]:
stream = """
In this notebook, we will go back over the example of classifying movie reviews we studied in part 1 and dig deeper under the surface. 
First we will look at the processing steps necessary to convert text into numbers and how to customize it. By doing this, we'll have another example of the Processor used in the data block API.
Then we will study how we build a language model and train it.\n
"""
tokens = np.array(tp([stream])[0])

In [30]:
bs, seq_len = 6, 15
d_tokens = np.array([tokens[i * seq_len : (i + 1) * seq_len] for i in range(bs)])
df = pd.DataFrame(d_tokens)
display(df)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,xxbos,\n,xxmaj,in,this,notebook,",",we,will,go,back,over,the,example,of
1,classifying,movie,reviews,we,studied,in,part,1,and,dig,deeper,under,the,surface,.
2,\n,xxmaj,first,we,will,look,at,the,processing,steps,necessary,to,convert,text,into
3,numbers,and,how,to,customize,it,.,xxmaj,by,doing,this,",",we,'ll,have
4,another,example,of,the,xxmaj,processor,used,in,the,data,block,api,.,\n,xxmaj
5,then,we,will,study,how,we,build,a,language,model,and,train,it,.,\n\n


Then if we have a `bptt` of 5, we would go over those three batches.

In [31]:
bs, bptt = 6, 5

for k in range(3):
    d_tokens = np.array([tokens[i * seq_len + k * bptt : i * seq_len + (k + 1) * bptt] for i in range(bs)])
    df = pd.DataFrame(d_tokens)
    display(HTML(df.to_html(index=True, header=False)))

0,1,2,3,4,5
0,xxbos,\n,xxmaj,in,this
1,classifying,movie,reviews,we,studied
2,\n,xxmaj,first,we,will
3,numbers,and,how,to,customize
4,another,example,of,the,xxmaj
5,then,we,will,study,how


0,1,2,3,4,5
0,notebook,",",we,will,go
1,in,part,1,and,dig
2,look,at,the,processing,steps
3,it,.,xxmaj,by,doing
4,processor,used,in,the,data
5,we,build,a,language,model


0,1,2,3,4,5
0,back,over,the,example,of
1,deeper,under,the,surface,.
2,necessary,to,convert,text,into
3,this,",",we,'ll,have
4,block,api,.,\n,xxmaj
5,and,train,it,.,\n\n


In [32]:
# export
class LM_Preloader():
    def __init__(self, data, bs=64, bptt=70, shuffle=False):
        self.data = data
        self.bs = bs
        self.bptt = bptt
        self.shuffle = shuffle
        total_len = sum([len(text) for text in data.x])
        self.n_batch = total_len // bs
        self.batchify()
    
    def __len__(self):
        return ((self.n_batch - 1) // self.bptt) * self.bs
    def __getitem__(self, idx):
        source = self.batched_data[idx % self.bs]
        seq_idx = (idx // self.bs) * self.bptt
        return source[seq_idx : seq_idx + self.bptt], source[seq_idx + 1 : seq_idx + 1 + self.bptt]
    def batchify(self):
        texts = self.data.x
        if self.shuffle:
            texts = texts[torch.randperm(len(texts))]
        stream = torch.cat([torch.tensor(t) for t in texts])
        self.batched_data = stream[:self.n_batch * self.bs].view(self.bs, self.n_batch)

In [33]:
dl = DataLoader(LM_Preloader(ll.valid, shuffle=True), batch_size=64)

In [34]:
iter_dl = iter(dl)
x1,y1 = next(iter_dl)
x2,y2 = next(iter_dl)

In [35]:
x1.size(), y1.size()

(torch.Size([64, 70]), torch.Size([64, 70]))

In [36]:
vocab = proc_num.vocab

In [37]:
" ".join(vocab[o] for o in x1[0])

'xxbos xxmaj well . xxmaj astronaut xxmaj steve xxmaj west sits in a plastic space capsule , commenting that " you have n\'t lived until you \'ve seen the sun through the rings of xxmaj saturn " , all the while the obvious mid - day sunlight is streaming through the window , when suddenly he has a nose bleed . xxmaj next , xxmaj west is back home in'

In [38]:
" ".join(vocab[o] for o in y1[0])

'xxmaj well . xxmaj astronaut xxmaj steve xxmaj west sits in a plastic space capsule , commenting that " you have n\'t lived until you \'ve seen the sun through the rings of xxmaj saturn " , all the while the obvious mid - day sunlight is streaming through the window , when suddenly he has a nose bleed . xxmaj next , xxmaj west is back home in some'

In [39]:
" ".join(vocab[o] for o in x2[0])

'some secret hospital , a melting gelatinous mass who goes berserk and causes a chunky nurse to run through a fake glass door . xxmaj apparently , xxmaj west " gets stronger as he melts " , which makes about as much sense as anything in this hopelessly xxunk , xxunk - brained moovie . xxmaj then this dopey " xxmaj army xxmaj brass " , who looks kind of'

And let's prepare some convenience function to do this quickly.

In [40]:
# export
def get_lm_dls(train_ds, valid_ds, bs, bptt, **kwargs):
    return (DataLoader(LM_Preloader(train_ds, bs, bptt, shuffle=True), batch_size=bs, **kwargs),
            DataLoader(LM_Preloader(valid_ds, bs, bptt, shuffle=False), batch_size=2*bs, **kwargs))

def lm_databunchify(sd, bs, bptt, **kwargs):
    return DataBunch(*get_lm_dls(sd.train, sd.valid, bs, bptt, **kwargs))

In [41]:
bs,bptt = 64,70
data = lm_databunchify(ll, bs, bptt)

## Batching for classification

When we will want to tackle classification, gathering the data will be a bit different: first we will label our texts with the folder they come from, and then we will need to apply padding to batch them together. To avoid mixing very long texts with very short ones, we will also use `Sampler` to sort (with a bit of randomness for the training set) our samples by length.

First the data block API calls shold look familiar.

[Jump_to lesson 12 video](https://course.fast.ai/videos/?lesson=12&t=5877)

In [42]:
proc_cat = CategoryProcessor()

In [43]:
il = TextList.from_files(path, include=['train', 'test'])
sd = SplitData.split_by_func(il, partial(grandparent_splitter, valid_name='test'))
ll = label_by_func(sd, parent_labeler, proc_x = [proc_tok, proc_num], proc_y=proc_cat)

In [44]:
pickle.dump(ll, open(datasets.Config.model_path()/'imdb'/'ll_clas.pkl', 'wb'))

In [45]:
ll = pickle.load(open(datasets.Config.model_path()/'imdb'/'ll_clas.pkl', 'rb'))

Let's check the labels seem consistent with the texts.

In [46]:
[(ll.train.x_obj(i), ll.train.y_obj(i)) for i in [1,12552]]

[("xxbos xxmaj visually stunning ? xxmaj most definitely . i have seen few films look this good in some time . xxmaj sky xxmaj captain and the xxmaj world of xxmaj tomorrow uses striking cinematography , computer graphics , and creative futuristic designs to create a world that is historically familiar yet something quite fresh . xxmaj the time period seems to be the 1930s or early 40s . xxmaj the movie tells of recent attacks on xxmaj new xxmaj york xxmaj city by xxunk armies stealing generators and the like for some inexplicable reason . xxmaj also , mysterious disappearances of relevant scientific minds coincide . xxmaj who can stop them and save the world ? xxmaj alright , it does n't take a leap of faith to know it is the xxmaj sky xxmaj captain himself with his wisecracking reporter girlfriend always hot for a lead , and in the wings his trusty , thoroughly competent sidekick . xxmaj what xxmaj sky xxmaj captain has in atmosphere and graphics it lacks in storytelling and characte

We saw samplers in notebook 03. For the validation set, we will simply sort the samples by length, and we begin with the longest ones for memory reasons (it's better to always have the biggest tensors first).

In [47]:
# export
from torch.utils.data import Sampler

class SortSampler(Sampler):
    def __init__(self, data_source, key):
        self.data_source = data_source
        self.key = key
    def __len__(self):
        return len(self.data_source)
    def __iter__(self):
        return iter(sorted(list(range(len(self.data_source))), key=self.key, reversed=True))

For the training set, we want some kind of randomness on top of this. So first, we shuffle the texts and build megabatches of size `50 * bs`. We sort those megabatches by length before splitting them in 50 minibatches. That way we will have randomized batches of roughly the same length.

Then we make sure to have the biggest batch first and shuffle the order of the other batches. We also make sure the last batch stays at the end because its size is probably lower than batch size.

In [None]:
# export
class SortishSampler(Sampler):
    def __init__(self, data_source, key, bs):
        self.data_source = data_source
        self.key = key
        self.bs = bs
    
    def __len__(self) -> int:
        return len(self.data_source)
    def __iter__(self):
        idxs = torch.randperm(len(self.data_source))
        megabatches = [idxs[i:i+self.bs*50] for i in range(0, len(idxs), self.bs*50)]
        sorted_idx = torch.cat([tensor(sorted(s, key=self.key, reverse=True)) for s in megabatches])
        