In [1]:
import joblib
import os

from torchtext.data import Field, BucketIterator
from torchtext.data import TabularDataset
import torch

import pandas as pd
from tqdm import tqdm_notebook as tqdm
import re
from torchtext.data import Iterator, BucketIterator

# Config

In [2]:
# --- Run switches -----------------------------------------------------------
development = True  # not read by any of the cells visible in this notebook section
use_cuda = True     # request the GPU; the device cell falls back to CPU if CUDA is missing

# --- Data location ----------------------------------------------------------
sample_data_folder = "cnndm-pj/"

# --- Length / count limits --------------------------------------------------
# NOTE(review): X / Y presumably stand for article / summary — confirm. None of
# these limits is referenced by the cells visible here.
MIN_LEN_X, MAX_LEN_X = 10, 400
MIN_LEN_Y, MAX_LEN_Y = 10, 100
MIN_NUM_X, MAX_NUM_X = 1, 1
MAX_NUM_Y = None

# --- Special tokens and patterns --------------------------------------------
W_LS, W_RS = "<s>", "</s>"
SUMM_BEGIN_TOKEN = r"-lrb- .* -rrb-"  # regular-expression pattern

# --- Pretrained embeddings --------------------------------------------------
PRETRAINED_VECTOR = "glove.6B.200d"  # handed to TEXT.build_vocab(vectors=...)

In [3]:
# Use the GPU only when it is both requested (use_cuda) and actually available;
# otherwise everything runs on the CPU.
device = torch.device(
    "cuda" if use_cuda and torch.cuda.is_available() else "cpu"
)

# Using TorchText to load the data

In [9]:
# Single text field definition: spaCy tokenisation, lower-casing, and explicit
# start/end-of-sequence markers around every tokenised text.
TEXT = Field(
    lower=True,
    tokenize="spacy",
    tokenizer_language="en_core_web_sm",
    init_token="<sos>",
    eos_token="<eos>",
)

In [90]:
%%time
# Column layout shared by all three files: (attribute name, field) pairs, both
# columns processed by the same TEXT field.
tv_datafields = [("article", TEXT), ("summary", TEXT)]
# Kept as its own list for any later cell that still refers to it.
tst_datafields = list(tv_datafields)

# Load train / validation / test in one call so that all three splits are
# guaranteed to use the same folder (from the config cell), the same fields and
# the same CSV settings.
trn, vld, tst = TabularDataset.splits(
    path=sample_data_folder,  # root directory where the data lies
    train="train_processed.txt",
    validation="val_processed.txt",
    test="test_processed.txt",
    format="csv",
    skip_header=True,  # the files start with a header row; do not process it as data
    fields=tv_datafields,
    csv_reader_params={"delimiter": "|"},  # columns are pipe-separated
)

Wall time: 2min 51s


In [91]:
%%time
# Removing incomplete examples, with no article or summary
def drop_incomplete_examples(dataset, expected_fields=2):
    """Remove, in place, every example that does not carry all expected fields.

    An example counts as complete when its attribute dictionary holds exactly
    ``expected_fields`` entries (here: article and summary).

    The list is rebuilt by filtering instead of calling ``list.remove`` while
    looping over that same list: removing during iteration shifts the remaining
    items left, so the element right after each removed one is never inspected
    and runs of bad examples survive.

    Args:
        dataset: object exposing a mutable ``examples`` list.
        expected_fields: number of attributes a complete example must have.

    Returns:
        The number of examples that were removed.
    """
    complete = [ex for ex in dataset.examples if len(vars(ex)) == expected_fields]
    removed = len(dataset.examples) - len(complete)
    # Slice assignment keeps the same list object that `dataset` already holds.
    dataset.examples[:] = complete
    return removed


for split_name, split in (("Train", trn), ("Validation", vld), ("Test", tst)):
    count = drop_incomplete_examples(split)
    print("Removed {} samples from {}".format(count, split_name))

Removed 0 samples from Train
Removed 0 samples from Validation
Removed 0 samples from Test
Wall time: 107 ms


In [92]:
%%time
# The vocabulary is fitted on the training split only, so validation and test
# text never influences it.
TEXT.build_vocab(
    trn,
    max_size=50000,
    min_freq=2,
    vectors=PRETRAINED_VECTOR,
)

Wall time: 5.77 s


In [93]:
def article_length(example):
    """Sort key for bucketing: the length of the example's ``article`` attribute."""
    return len(example.article)


# Train / validation iterators that group examples of similar article length.
train_iter, val_iter = BucketIterator.splits(
    (trn, vld),             # datasets the iterators draw from, in output order
    batch_sizes=(64, 64),   # one batch size per dataset
    device=device,          # the torch.device selected in the device cell
    sort_key=article_length,
    sort_within_batch=False,
    repeat=False,
)

# Plain iterator over the test split, created with sorting switched off.
test_iter = Iterator(
    tst,
    batch_size=64,
    device=device,
    sort=False,
    sort_within_batch=False,
    repeat=False,
)

In [100]:
# Take the first batch produced by the training iterator and stop right away;
# it stays bound to the name `batch` for interactive inspection.
for batch in train_iter:
    break

In [113]:
class BucketDataLoader(BucketIterator):
    """A ``BucketIterator`` that remembers which fields are the input and the target.

    Iterating over the loader is unchanged from ``BucketIterator``: it yields
    batch objects whose fields are reachable by attribute, which is what
    ``BatchWrapper`` relies on when it wraps this loader.

    Args:
        x_var: name of the batch attribute holding the model input.
        y_var: name of the batch attribute holding the target.
        **kwargs: forwarded untouched to ``BucketIterator.__init__``
            (``dataset``, ``batch_size``, ``device``, ``sort_key``, ...).
    """

    def __init__(self, x_var, y_var, **kwargs):
        self.x_var = x_var
        self.y_var = y_var
        super().__init__(**kwargs)

    def batch(self, data, batch_size, batch_size_fn=None):
        """Split ``data`` into minibatches and yield ``(x, y)`` tensor pairs.

        The previous version could never run: it had no ``self`` parameter,
        called ``super().batch`` although the parent class has no such method
        (batching is a module-level torchtext function), and applied
        ``getattr`` to a generator instead of to a batch.

        Args:
            data: iterable of examples belonging to ``self.dataset``.
            batch_size: minibatch size passed to torchtext's batching helper.
            batch_size_fn: optional custom size function, passed through as is.

        Yields:
            Tuples ``(getattr(batch, x_var), getattr(batch, y_var))``.
        """
        # Imported locally and aliased: the notebook rebinds the top-level name
        # `batch` (the first-batch inspection cell), which would shadow a
        # module-level import of the same helper.
        from torchtext.data import Batch, batch as chunk_examples

        for examples in chunk_examples(data, batch_size, batch_size_fn):
            # Numericalise and pad the raw examples the same way iteration does.
            minibatch = Batch(examples, self.dataset, self.device)
            # we assume only one input in this wrapper
            yield getattr(minibatch, self.x_var), getattr(minibatch, self.y_var)

In [115]:
# Length-bucketed loader over the training split.
train_iter = BucketDataLoader(
    x_var="article",
    y_var="summary",
    dataset=trn,
    batch_size=64,
    device=device,
    # the BucketIterator needs to be told what function it should use to group the data
    sort_key=lambda example: len(example.article),
    sort_within_batch=False,
    repeat=False,  # passed because this iterator is meant to be wrapped by another layer
)

In [117]:
# Length-bucketed loader over the validation split.
val_iter = BucketDataLoader(
    x_var="article",
    y_var="summary",
    dataset=vld,
    batch_size=64,
    device=device,
    # the BucketIterator needs to be told what function it should use to group the data
    sort_key=lambda example: len(example.article),
    sort_within_batch=False,
    repeat=False,  # passed because this iterator is meant to be wrapped by another layer
    # Mark this as a non-training iterator, as BucketIterator.splits does for
    # every dataset after the first: without it the iterator defaults to
    # train=True and reshuffles the validation data on every pass.
    train=False,
)

In [94]:
class BatchWrapper:
    """Adapt an iterable of batch objects into an iterable of ``(x, y)`` pairs.

    Args:
        dl: the wrapped iterable; each item must expose the two attributes below.
        x_var: attribute name of the input on each batch.
        y_var: attribute name of the target on each batch.
    """

    def __init__(self, dl, x_var, y_var):
        self.dl = dl
        self.x_var = x_var
        self.y_var = y_var

    def __len__(self):
        """Number of batches, as reported by the wrapped iterable."""
        return len(self.dl)

    def __iter__(self):
        """Yield ``(x, y)`` for every batch of the wrapped iterable."""
        for minibatch in self.dl:
            # we assume only one input and one target in this wrapper
            inputs = getattr(minibatch, self.x_var)
            targets = getattr(minibatch, self.y_var)
            yield (inputs, targets)

# Wrap each iterator so that looping over it gives (article, summary) pairs.
train_dl, valid_dl, test_dl = (
    BatchWrapper(dl=iterator, x_var="article", y_var="summary")
    for iterator in (train_iter, val_iter, test_iter)
)

In [116]:
# Peek at the first (article, summary) pair produced by the wrapped training iterator.
next(iter(train_dl))

(tensor([[   2,    2,    2,  ...,    2,    2,    2],
         [ 788,   33, 1391,  ..., 5702,   55,   33],
         [2106,    4,    0,  ..., 5494,  254,    4],
         ...,
         [   1,    1,    1,  ...,    1,    1,    1],
         [   1,    1,    1,  ...,    1,    1,    1],
         [   1,    1,    1,  ...,    1,    1,    1]]),
 tensor([[   2,    2,    2,  ...,    2,    2,    2],
         [  20,   20,   20,  ...,   20,   20,   20],
         [ 460,    4, 1391,  ...,  128,   55,    4],
         ...,
         [   1,  483,    1,  ...,    1,    1,    1],
         [   1,  362,    1,  ...,    1,    1,    1],
         [   1,    3,    1,  ...,    1,    1,    1]]))

In [74]:
# Look up which token the vocabulary maps to index 2.
TEXT.vocab.itos[2]

'<sos>'

In [75]:
# Shape of the first element (the article side) of the first training pair.
next(iter(train_dl))[0].shape

torch.Size([416, 64])