In [None]:
# default_exp data

# Data

> How do we prepare the datasets for our experiments?

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
#export
import torchtext

from inspect import signature
from fastai.text.all import *
from sklearn.feature_extraction.text import CountVectorizer


### How to generate text pairs using the fast.ai library?

- We want to show how the low-level API provided by fast.ai can be used to build a custom dataset for our task.
- The task is to identify whether two texts are duplicates of each other.
- Each item has the form `((t1, t2), label)`, where `t1` and `t2` are the two texts and `label` is a boolean indicating whether they are duplicates.
- We will use the transforms provided by fast.ai to build the dataset and the dataloaders that our model consumes.

In [None]:
class SentenceSimTuple(fastuple):
    "Tuple of two sentences that can render itself as a one-row table"
    def show(self, ctx=None, **kwargs):
        "Display the first two elements side by side under the columns `a` and `b`"
        first, second = self[0], self[1]
        row = pd.DataFrame({'a': [first], 'b': [second]})
        display_df(row)

In [None]:
class TextPairGetter(ItemTransform):
    "Pull the two text fields named `s1` and `s2` out of an item and return them as a pair"
    def __init__(self, s1='a', s2='b',target='target'):
        # `target` is stored alongside the text keys but is not read by `encodes`.
        store_attr('s1,s2,target')

    def encodes(self, o):
        "Return `(o[s1], o[s2])`"
        return tuple(o[key] for key in (self.s1, self.s2))

In [None]:
class BOWVectorizer(ItemTransform):
    "Turn a pair of texts into a pair of dense bag-of-words arrays using the fitted vectorizer `vec`"
    def __init__(self, vec):
        store_attr('vec')

    def _to_bow(self, text):
        "Vectorize one text; multiplying by `1.` turns the counts into floats"
        return self.vec.transform(np.array([text])).toarray() * 1.

    def _to_words(self, bow):
        "Map one bag-of-words array back to its vocabulary terms, joined by spaces"
        terms = self.vec.inverse_transform(bow)
        return TitledStr(' '.join(terms[0]))

    def encodes(self, o):
        "Vectorize both texts of the pair `o`"
        return self._to_bow(o[0]), self._to_bow(o[1])

    def decodes(self, o):
        "Rebuild a `SentenceSimTuple` of term strings from the two bag-of-words arrays in `o`"
        return SentenceSimTuple((self._to_words(o[0]), self._to_words(o[1])))

In [None]:
# Toy frame with four sentence pairs (columns 'a' and 'b') and a 0/1 'target' column,
# used below to exercise the pair transforms.
sample = pd.DataFrame({'a': ['this is a good life', 'slow life', 'am i good', 'waiting for'],
                       'b': ['take it easy', 'I am on the moon, rythm is right.', 'truely madly', 'for waiting'],
                       'target': [0, 1, 0, 1]
                      })

In [None]:
# Fit the bag-of-words vocabulary on every sentence from both text columns.
vec  = CountVectorizer()
vec  = vec.fit(sample['a'].tolist() + sample['b'].tolist())
# Two pipelines: the first extracts and vectorizes the text pair, the second
# reads the 'target' column and categorizes it.
dset = Datasets(sample, [[TextPairGetter(), BOWVectorizer(vec)], [ItemGetter('target'), Categorize()]])

In [None]:
# Decode the first item and unpack the result into input and target.
x, y = dset.decode(dset[0])

In [None]:
# Show the item at index 1 of the dataset.
show_at(dset, 1)

Unnamed: 0,a,b
0,life slow,am is moon on right rythm the


1


In [None]:
# Build dataloaders over the toy dataset with a batch size of 2.
dls = dset.dataloaders(bs=2)

In [None]:
# Inspect the (private) `_types` attribute of the dataloaders.
dls._types

{tuple: [{tuple: [torch.Tensor, torch.Tensor]},
  fastai.torch_core.TensorCategory]}

In [None]:
# Pull a single batch and unpack it into inputs and targets.
x, y = dls.one_batch()

### How to override the `show_batch` method for our dataset?

In [None]:
@typedispatch
def show_batch(x:tuple, y, samples, ctxs=None, max_n=10, trunc_at=150, **kwargs):
    "Show `samples` whose input may be a pair of texts, truncating each text to `trunc_at` words, as a DataFrame"
    if ctxs is None: ctxs = get_empty_df(min(len(samples), max_n))
    # Number of leading text columns in each sample: one by default, two once a
    # nested (t1, t2) input has been flattened into (t1, t2, *rest).
    n_texts = 1
    if isinstance(samples[0][0], tuple):
        n_texts = 2
        samples = L((*s[0], *s[1:]) for s in samples)
    # Truncate every text column exactly once; previously the pair branch fell
    # through to a second, redundant truncation of the first column.
    if trunc_at is not None:
        samples = L((*(t.truncate(trunc_at) for t in s[:n_texts]), *s[n_texts:]) for s in samples)

    # Delegate the per-column rendering to the generic implementation, then display the result.
    ctxs = show_batch[object](x, y, samples, max_n=max_n, ctxs=ctxs, **kwargs)
    display_df(pd.DataFrame(ctxs))

In [None]:
# Render a batch; the tuple-typed `show_batch` override above should be the one dispatched here.
dls.show_batch()

Unnamed: 0,text,text_,category
0,am good,madly truely,0
1,for waiting,for waiting,1


## Quora Question Pairs Dataset

In [None]:
# Location of the raw Quora duplicate-question data. `~` is expanded here so the
# paths do not depend on the consumer expanding it.
BASE_DIR      = Path('~/data/dl_nlp').expanduser()
RAW_DATA_PATH = BASE_DIR / 'data' / 'quodup'


train = pd.read_csv(RAW_DATA_PATH / 'train.csv')
# Shuffle with a fixed seed so the positional train/validation split made later
# is identical on every Restart & Run All, then renumber the rows 0..n-1.
train = train.sample(frac=1., random_state=42).reset_index(drop=True)

# fill empty questions with ''
train.loc[:, 'question1'] = train.question1.fillna('')
train.loc[:, 'question2'] = train.question2.fillna('')

train.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,2751,5459,5460,What is the reason behind having one small testicle and b=one bigger comparatively?,What is the reason behind having one small testicle and other is bigger comparatively?,1
1,129269,207694,207695,How do I build up a readership for my blog?,What are some ways of gauging the readership of a blog?,0
2,157094,245715,245716,Should company provide a signed job offer letter or email job offer should be fine?,Will IT companies call me for sure if I have offer letter provided by them?,0
3,111342,182406,182407,Can a F1 student visa holder become a Uber driver in USA?,Can I drive for Uber with my H1-B work visa or F1 student visa in the US?,1
4,157746,246570,246571,When can we expect APPSC Group-I/II notification?,What is the ratio for APPSC AE notification?,0


In [None]:
#slow
%%time

# Pass the positions of the last 20% of rows to `IndexSplitter` to build the splits.
splits      = IndexSplitter(np.arange(len(train)-int(.2 * len(train)), len(train)))(train)
# Gather the unique questions of the first split only (both question columns) into one 'text' column.
combined_df = pd.DataFrame({'text': list(train.iloc[splits[0]]['question1'].unique()) + list(train.iloc[splits[0]]['question2'].unique())})
# Tokenize them; only the second return value (`cnt`) is kept, and its keys are later used as the vocab.
_, cnt      = tokenize_df(combined_df, text_cols='text')

CPU times: user 16.7 s, sys: 3.14 s, total: 19.9 s
Wall time: 55.2 s


In [None]:
#export
class NumericalizePair(Numericalize):
    "Numericalize the token lists stored under `q1` and `q2` of an item into a pair of `TensorText`"
    def _to_ids(self, toks):
        "Look every token up in `self.o2i` and wrap the ids in a `TensorText`"
        ids = [self.o2i[tok] for tok in toks]
        return TensorText(tensor(ids))

    def encodes(self, o):
        "Return `(ids of o['q1'], ids of o['q2'])`"
        return self._to_ids(o['q1']), self._to_ids(o['q2'])

In [None]:
%%time
# Text pipeline: tokenize both question columns into 'q1'/'q2', then numericalize the pair
# with a vocab taken from the keys of `cnt`. Label pipeline: categorized `is_duplicate`.
dset = Datasets(train, [[Tokenizer.from_df('question1', tok_text_col='q1'), Tokenizer.from_df('question2', tok_text_col='q2'), 
                          NumericalizePair(vocab=list(cnt.keys()))], [ItemGetter('is_duplicate'), Categorize()]], splits=splits)

CPU times: user 1min 27s, sys: 7.18 s, total: 1min 35s
Wall time: 2min 32s


In [None]:
# Inspect the first encoded item: a pair of token-id tensors plus the label.
dset[0]

((TensorText([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13,  8, 14, 15, 16]),
  TensorText([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11, 590,   3,
           14,  15,  16])),
 TensorCategory(1))

In [None]:
# Decode the first item back to text to check the round trip.
# NOTE(review): the stored output shows every character separated by spaces -
# possibly both Tokenizer transforms join the tokens on decode; confirm.
dset.decode(dset[0])

(('x x b o s   x x m a j   w h a t   i s   t h e   r e a s o n   b e h i n d   h a v i n g   o n e   s m a l l   t e s t i c l e   a n d   b   =   o n e   b i g g e r   c o m p a r a t i v e l y   ?',
  'x x b o s   x x m a j   w h a t   i s   t h e   r e a s o n   b e h i n d   h a v i n g   o n e   s m a l l   t e s t i c l e   a n d   o t h e r   i s   b i g g e r   c o m p a r a t i v e l y   ?'),
 '1')

In [None]:
#export
class Pad_Chunk_Pair(ItemTransform):
    "Pad both texts of every `((text_a, text_b), *labels)` sample by chunks of size `seq_len` and collate the batch into a dict"
    def __init__(self, pad_idx=1, pad_first=True, seq_len=72,decode=True,**kwargs):
        # `decode` is kept in the signature for compatibility but is not used by this class.
        # It is deliberately not stored: an instance attribute named `decode` would
        # shadow the transform's own `decode` method.
        store_attr('pad_idx, pad_first, seq_len')
        super().__init__(**kwargs)

    def before_call(self, b):
        "Set `self.max_len_a` and `self.max_len_b` (longest first/second text in `b`) before encodes"
        lens_a, lens_b = [], []
        for sample in b:
            xa, xb = sample[0]
            if isinstance(xa, TensorText): lens_a.append(xa.shape[0])
            if isinstance(xb, TensorText): lens_b.append(xb.shape[0])
        # Fail with an explicit message instead of the opaque error `max()` gives on an empty list.
        if not lens_a or not lens_b:
            raise ValueError("Pad_Chunk_Pair needs at least one `TensorText` on each side of the pair")
        self.max_len_a = max(lens_a)
        self.max_len_b = max(lens_b)

    def __call__(self, b, **kwargs):
        "Record the per-side maximum lengths, then run the transform on the whole batch as one tuple"
        self.before_call(b)
        return super().__call__(tuple(b), **kwargs)

    def _pad_side(self, texts, pad_len):
        "Apply `pad_chunk` with target length `pad_len` to every tensor in `texts` and collate the results"
        return default_collate([pad_chunk(t, pad_idx=self.pad_idx, pad_first=self.pad_first,
                                          seq_len=self.seq_len, pad_len=pad_len) for t in texts])

    def encodes(self, batch):
        "Return a 1-tuple holding a dict with padded `pa`/`pb` and, when the samples carry labels, `labels`"
        labels = default_collate([s[1:] for s in batch])
        inps = {'pa': self._pad_side([s[0][0] for s in batch], self.max_len_a),
                'pb': self._pad_side([s[0][1] for s in batch], self.max_len_b)}
        # Only the first label column is kept; the key is omitted when samples have no labels.
        if len(labels): inps['labels'] = labels[0]
        return (inps, )

In [None]:
#export
class Undict(Transform):
    "Decode a batch dict with `pa`/`pb` (and optional `labels`) keys back into a tuple"
    def decodes(self, x:dict):
        "Return `(pa, pb, labels)`, `(pa, pb)` when there are no labels, or `x` unchanged if it is not a pair dict"
        # Previously `res` was unbound here, so any other dict raised UnboundLocalError.
        if 'pa' not in x or 'pb' not in x: return x
        res = (x['pa'], x['pb'])
        # Unlabeled batches carry no 'labels' key, so do not require it.
        if 'labels' in x: res = res + (x['labels'],)
        return res

In [None]:
# Chunk size handed to both the padding transform and the dataloaders.
seq_len    = 72
# Loader hooks:
#   before_batch - pads each pair and collates the samples into a dict;
#   after_batch  - `Undict` only defines `decodes`, so it acts when a batch is decoded;
#   create_batch - `fa_convert` replaces the default batch creation, presumably
#                  because `before_batch` has already collated the samples (confirm).
dls_kwargs = {
              'before_batch': Pad_Chunk_Pair(seq_len=seq_len),
              'after_batch': Undict(),
              'create_batch': fa_convert
             }

dls        = dset.dataloaders(bs=128, seq_len=seq_len, **dls_kwargs)

In [None]:
# Fetch one batch from the padded pair dataloaders.
x = dls.one_batch()

In [None]:
# Display the raw batch (a tuple wrapping the dict of padded inputs and labels).
x

({'pa': TensorText([[  0,   1,   2,  ...,   1,   1,   1],
          [  0,   1,  17,  ...,   1,   1,   1],
          [  0,   1,  38,  ...,   1,   1,   1],
          ...,
          [  0,   1, 189,  ...,   1,   1,   1],
          [  0,   1,  17,  ...,   1,   1,   1],
          [  0,   1, 170,  ...,   1,   1,   1]], device='cuda:0'),
  'pb': TensorText([[  0,   1,  18,  ...,   1,   1,   1],
          [  0,   1,  17,  ...,   1,   1,   1],
          [  0,   1,  18,  ...,   1,   1,   1],
          ...,
          [  0,   1, 189,  ...,   1,   1,   1],
          [  0,   1,  38,  ...,   1,   1,   1],
          [  0,   1, 170,  ...,   1,   1,   1]], device='cuda:0'),
  'labels': TensorCategory([1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0,
          1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0,
          0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0,
          0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,

In [None]:
# Decode the batch; per the stored output the dict comes back as a (pa, pb, labels) tuple.
xd = dls.decode(x)
xd

((TensorText([[  0,   1,   2,  ...,   1,   1,   1],
          [  0,   1,  17,  ...,   1,   1,   1],
          [  0,   1,  38,  ...,   1,   1,   1],
          ...,
          [  0,   1, 189,  ...,   1,   1,   1],
          [  0,   1,  17,  ...,   1,   1,   1],
          [  0,   1, 170,  ...,   1,   1,   1]]),
  TensorText([[  0,   1,  18,  ...,   1,   1,   1],
          [  0,   1,  17,  ...,   1,   1,   1],
          [  0,   1,  18,  ...,   1,   1,   1],
          ...,
          [  0,   1, 189,  ...,   1,   1,   1],
          [  0,   1,  38,  ...,   1,   1,   1],
          [  0,   1, 170,  ...,   1,   1,   1]]),
  TensorCategory([1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0,
          1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0,
          0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0,
          0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1,
          1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 

In [None]:
#export
def load_dataset(base_dir='~/data/dl_nlp', random_state=None):
    """Load the shuffled Quora question-pair training frame.

    Reads `<base_dir>/data/quodup/train.csv`, shuffles all rows, renumbers
    the index 0..n-1 and replaces missing questions with empty strings.

    Args:
        base_dir: Root data directory; a leading `~` is expanded.
        random_state: Seed forwarded to `DataFrame.sample`. The default
            `None` keeps the shuffle unseeded, as before.

    Returns:
        The shuffled `pandas.DataFrame` with `question1` and `question2`
        free of missing values.
    """
    raw_data_path = Path(base_dir).expanduser() / 'data' / 'quodup'

    train = pd.read_csv(raw_data_path / 'train.csv')
    # Shuffle every row, then drop the old index so that positions and labels agree.
    train = train.sample(frac=1., random_state=random_state).reset_index(drop=True)

    # fill empty questions with ''
    for col in ('question1', 'question2'):
        train[col] = train[col].fillna('')

    return train