In [2]:
from torchtext.vocab import vocab
from collections import Counter, OrderedDict
from datasets import load_dataset
import re

def split_tokens(sentence):
    """Normalise a sentence and split it into word tokens.

    The text is lower-cased and every character other than `a-z`, `@`, `#`
    and the space character is removed; the remainder is split on runs of
    spaces.

    Args:
        sentence: raw input text.

    Returns:
        List of non-empty token strings; an empty list when nothing is left
        after cleaning.
    """
    cleaned = re.sub(r"[^a-z@# ]", "", sentence.lower())
    # After cleaning, the space is the only whitespace left, so str.split()
    # splits on the same separators as re.split(r" +", ...) but does not emit
    # '' tokens for leading/trailing spaces or for an empty sentence.
    return cleaned.split()

# load the emotion corpus; the vocabulary is built from the training split only
dataset = load_dataset('emotion')
train_data = dataset['train']

# collect every token and every label of the training split in a single pass
all_words, all_labels = [], []
for sample in train_data:
    all_words.extend(split_tokens(sample['text']))
    all_labels.append(sample['label'])

# build vocab - using vocab object of torchtext
# most_common() lists (token, count) pairs from most to least frequent
counter = Counter(all_words)
sorted_by_freq_tuples = counter.most_common()
my_vocab = vocab(OrderedDict(sorted_by_freq_tuples), specials=['<pad>', '<unk>'])
# tokens missing from the vocabulary resolve to the id of '<unk>'
my_vocab.set_default_index(my_vocab['<unk>'])

# count label - number of distinct classes seen in the training split
num_labels = len(set(all_labels))

No config specified, defaulting to: emotion/split
Found cached dataset emotion (/home/phuongnm/.cache/huggingface/datasets/emotion/split/1.0.0/cca5efe2dfeb58c1d098e0f9eeb200e9927d889b5a03c67097275dfb5fe463bd)
100%|██████████| 3/3 [00:00<00:00, 133.58it/s]


In [None]:
from torch.utils.data import DataLoader
import torch

def convert_sentence_to_ids(sentence, vocab):
    """Convert a raw sentence into a list of vocabulary ids.

    The sentence is tokenised with `split_tokens` (defined above) and each
    token is looked up in `vocab`.

    Args:
        sentence: raw text of one sample.
        vocab: token-to-id lookup supporting `vocab[token]`. The notebook
            passes the torchtext vocab built above, whose default index is
            set to the id of '<unk>'.

    Returns:
        List of ids, one per token, in sentence order.
    """
    tokens = split_tokens(sentence)
    word_ids = [vocab[token] for token in tokens]
    return word_ids


def get_max_sentence_length_in_batch(batch_input_ids):
    """Return the length of the longest sample in a batch.

    Args:
        batch_input_ids: sequence of samples, each a sequence of token ids.

    Returns:
        Maximum number of ids found in any sample; 0 for an empty batch.
    """
    # default=0 keeps an empty batch from raising ValueError in max()
    max_sentence_length = max((len(sample_ids) for sample_ids in batch_input_ids), default=0)
    return max_sentence_length


def add_padding(batch_input_ids, padding_id):
    """Right-pad every sample of a batch to the length of the longest one.

    Example: [[1,2,3,4],[6,7,8],[9]] with padding_id 0 becomes
    [[1,2,3,4],[6,7,8,0],[9,0,0,0]].

    Args:
        batch_input_ids: sequence of samples, each a sequence of token ids.
        padding_id: id appended to samples shorter than the longest one.

    Returns:
        New list of lists, all of equal length. The input batch and its
        samples are left unmodified.
    """
    max_sample_len_in_batch = get_max_sentence_length_in_batch(batch_input_ids=batch_input_ids)

    # list(...) copies each sample so the caller's lists are never mutated;
    # multiplying by a zero count adds nothing to the longest sample.
    padded_word_ids = [
        list(sample_ids) + [padding_id] * (max_sample_len_in_batch - len(sample_ids))
        for sample_ids in batch_input_ids
    ]
    return padded_word_ids


class BatchPreprocessor(object):
    """Collate callable that turns a list of raw samples into batch tensors.

    Each sample is indexed with the keys 'text' and 'label'. Calling the
    object returns a tuple `(inputs, labels, masks)`:
      - inputs: LongTensor of padded token ids, one row per sample
      - labels: LongTensor holding the integer label of each sample
      - masks:  BoolTensor shaped like `inputs`, True where an id equals the
                id of '<pad>'
    """

    def __init__(self, vocab):
        # vocab is used both for the token lookup and for the '<pad>' id
        self.vocab = vocab

    def __call__(self, batch):
        # convert text to numbers: one list of ids per sample
        token_id_rows = [
            convert_sentence_to_ids(sample['text'], self.vocab) for sample in batch
        ]

        # padding makes all sentences the same length so they fit in a tensor
        padding_id = self.vocab["<pad>"]
        padded_batch = add_padding(batch_input_ids=token_id_rows, padding_id=padding_id)

        # label processing
        labels = [int(sample['label']) for sample in batch]

        # make the input tensor and flag the positions holding the padding id
        inputs = torch.LongTensor(padded_batch)
        masks = inputs == padding_id

        return (inputs, torch.LongTensor(labels), torch.BoolTensor(masks))

batch_size = 5

# the dataset must support index selection so that DataLoader can draw samples from it
test_loader = DataLoader(
    dataset['test'],
    batch_size=batch_size,
    collate_fn=BatchPreprocessor(my_vocab),
    shuffle=True,
)

# show only the first batch as a sanity check (nothing is printed for an empty loader)
e = next(iter(test_loader), None)
if e is not None:
    print('First epoch data:')
    # e is the (inputs, labels, masks) tuple produced by BatchPreprocessor
    for caption, tensor in zip(('input data\n', 'label data\n', 'padding mask data\n'), e):
        print(caption, tensor)

### The correct output looks like

```python
First epoch data:
input data
 tensor([[    2,    21,   700,     5,   113,    54,    13,    65,    28,    14,
            49,     2,   411,    71,    10,    11,   321,   173,    19,    13,
            99,   145,    41,   848,     4,     2,    40,   118,   165,     3,
            77,   385,  1112,     5,    64,    37,    62,    44,    11,   173],
        [    2,     3,    15,   714,     4,   154,     5,   644,    54,    22,
            61,   200,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0],
        [    2,   252,    13,   818,    43,   140,     9,     2,   141,   586,
            30,  1015,  1217,     4,    20,  1856,   615,    11,    59,     5,
          1873,    11,  2544,     5,   575,    12,    19,    61,   100,    12,
             6,  1226,   123,   272,     0,     0,     0,     0,     0,     0],
        [    2,   747,     5,    39,    15,    45,     1,    77, 10083,     4,
            60,     9, 10083,  1658,     2,     3,     9,     6,   134,   591,
          6395,     2,    65,    14,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0],
        [    2,   145,    48,     3,    23, 10265,    23,   679,    12,    78,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0]])
label data
 tensor([0, 1, 5, 1, 0])
padding mask data
 tensor([[False, False, False, False, False, False, False, False, False, False,
         False, False, False, False, False, False, False, False, False, False,
         False, False, False, False, False, False, False, False, False, False,
         False, False, False, False, False, False, False, False, False, False],
        [False, False, False, False, False, False, False, False, False, False,
         False, False,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True],
        [False, False, False, False, False, False, False, False, False, False,
         False, False, False, False, False, False, False, False, False, False,
         False, False, False, False, False, False, False, False, False, False,
         False, False, False, False,  True,  True,  True,  True,  True,  True],
        [False, False, False, False, False, False, False, False, False, False,
         False, False, False, False, False, False, False, False, False, False,
         False, False, False, False,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True],
        [False, False, False, False, False, False, False, False, False, False,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True]])
```