<a href="https://colab.research.google.com/github/nazzang49/boost-camp-projects/blob/main/assignments/(P02)Padding_Practice.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Practice Padding in NLP
- (URL) https://wikidocs.net/83544
- (REF) https://pytorch.org/docs/stable/generated/torch.nn.utils.rnn.pad_sequence.html

In [6]:
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
import torch

In [7]:
# pad_sequence demo: three random sequences of 25, 15 and 30 steps
# (300 features per step) are stacked into one batch-first tensor.
# Shorter sequences are filled with zeros up to the longest length (30).
a, b, c = (torch.rand(steps, 300) for steps in (25, 15, 30))
d = pad_sequence(sequences=[a, b, c], batch_first=True)
d

tensor([[[0.2821, 0.0819, 0.3674,  ..., 0.3741, 0.3288, 0.4672],
         [0.3820, 0.3417, 0.4454,  ..., 0.2695, 0.6993, 0.6052],
         [0.0491, 0.8702, 0.8983,  ..., 0.2812, 0.7589, 0.3905],
         ...,
         [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
         [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
         [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000]],

        [[0.8623, 0.6030, 0.9990,  ..., 0.8997, 0.2210, 0.6327],
         [0.4806, 0.1996, 0.7181,  ..., 0.1942, 0.3888, 0.3936],
         [0.9138, 0.0201, 0.1585,  ..., 0.1650, 0.5211, 0.4323],
         ...,
         [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
         [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
         [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000]],

        [[0.4232, 0.0578, 0.3817,  ..., 0.6019, 0.2982, 0.0140],
         [0.2613, 0.6804, 0.5823,  ..., 0.3885, 0.2454, 0.4629],
         [0.4679, 0.3488, 0.8611,  ..., 0.4941, 0.5971, 0.

In [None]:
# Padded batch -> PackedSequence, with rows NOT ordered by length.
# enforce_sorted=False allows any row order; the resulting pack carries
# sorted_indices / unsorted_indices describing the reordering.
seq = torch.tensor([
    [1, 2, 0],
    [3, 0, 0],
    [4, 5, 6],
])
lens = [2, 1, 3]  # number of real (non-pad) entries in each row
unsorted_packed = pack_padded_sequence(
    input=seq,
    lengths=lens,
    batch_first=True,
    enforce_sorted=False,
)
print(unsorted_packed)

PackedSequence(data=tensor([4, 1, 3, 5, 2, 6]), batch_sizes=tensor([3, 2, 1]), sorted_indices=tensor([2, 0, 1]), unsorted_indices=tensor([1, 2, 0]))


In [None]:
# Padded batch -> PackedSequence, with rows already ordered longest-first.
# With enforce_sorted=True (the default) `lens` must be in descending order;
# no index permutation is stored in the resulting pack.
seq = torch.tensor([
    [4, 5, 6],
    [1, 2, 0],
    [3, 0, 0],
])
lens = [3, 2, 1]
sorted_packed = pack_padded_sequence(
    input=seq,
    lengths=lens,
    batch_first=True,
    enforce_sorted=True,
)
print(sorted_packed)

PackedSequence(data=tensor([4, 1, 3, 5, 2, 6]), batch_sizes=tensor([3, 2, 1]), sorted_indices=None, unsorted_indices=None)


In [None]:
# PackedSequence -> padded batch.
# Unpacking the *unsorted* pack returns rows and lengths in the original
# input order, not in the longest-first order used inside the pack.
seq_unpacked, lens_unpacked = pad_packed_sequence(
    sequence=unsorted_packed,
    batch_first=True,
)
print(seq_unpacked, lens_unpacked, sep="\n")

tensor([[1, 2, 0],
        [3, 0, 0],
        [4, 5, 6]])
tensor([2, 1, 3])


## Padding tokenized sentences with PyTorch

In [1]:
# Toy corpus: each sentence is stored as a list of word tokens.
# Written as space-separated strings and split, which yields the same nested list.
sentences = [
    text.split()
    for text in (
        'barber person',
        'barber good person',
        'barber huge person',
        'knew secret',
        'secret kept huge secret',
        'huge secret',
        'barber kept word',
        'barber kept word',
        'barber kept secret',
        'keeping keeping huge secret driving barber crazy',
        'barber went huge mountain',
    )
]
sentences

[['barber', 'person'],
 ['barber', 'good', 'person'],
 ['barber', 'huge', 'person'],
 ['knew', 'secret'],
 ['secret', 'kept', 'huge', 'secret'],
 ['huge', 'secret'],
 ['barber', 'kept', 'word'],
 ['barber', 'kept', 'word'],
 ['barber', 'kept', 'secret'],
 ['keeping', 'keeping', 'huge', 'secret', 'driving', 'barber', 'crazy'],
 ['barber', 'went', 'huge', 'mountain']]

In [2]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [None]:
# Word -> integer id.
# Fit the tokenizer's vocabulary on the corpus, then encode every sentence
# as a list of ids using that vocabulary.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts=sentences)
encoded = tokenizer.texts_to_sequences(texts=sentences)
encoded

In [None]:
# pad_sequence expects a list of tensors, so wrap each encoded sentence
# (a list of word ids) in its own tensor.
tmp_list = [torch.tensor(word_ids) for word_ids in encoded]
tmp_list

In [16]:
# Right-pad every sentence tensor with zeros so that all rows share the
# length of the longest sentence; batch_first=True puts sentences on dim 0.
padded_result = pad_sequence(
    sequences=tmp_list,
    batch_first=True,
    padding_value=0.0,
)
padded_result

tensor([[ 1,  5,  0,  0,  0,  0,  0],
        [ 1,  8,  5,  0,  0,  0,  0],
        [ 1,  3,  5,  0,  0,  0,  0],
        [ 9,  2,  0,  0,  0,  0,  0],
        [ 2,  4,  3,  2,  0,  0,  0],
        [ 3,  2,  0,  0,  0,  0,  0],
        [ 1,  4,  6,  0,  0,  0,  0],
        [ 1,  4,  6,  0,  0,  0,  0],
        [ 1,  4,  2,  0,  0,  0,  0],
        [ 7,  7,  3,  2, 10,  1, 11],
        [ 1, 12,  3, 13,  0,  0,  0]])

In [23]:
# Recover each sentence's true length by counting its non-zero entries.
# Assumes 0 occurs only as the padding value, never as a real word id
# (the padded batch above shows ids starting at 1).
# Vectorised: one element-wise comparison and a row-wise sum replace the
# nested Python loop over every tensor element; .tolist() yields plain ints,
# the same list the loop produced.
len_list = (padded_result != 0).sum(dim=1).tolist()
len_list

[2, 3, 3, 2, 4, 2, 3, 3, 3, 7, 4]

In [21]:
# Packing: drop the padding and keep only the real tokens.
# The sentences are not ordered by length, hence enforce_sorted=False.
packed_sequence = pack_padded_sequence(
    input=padded_result,
    lengths=len_list,
    batch_first=True,
    enforce_sorted=False,
)
packed_sequence

PackedSequence(data=tensor([ 7,  2,  1,  1,  1,  1,  1,  1,  1,  9,  3,  7,  4, 12,  8,  3,  4,  4,
         4,  5,  2,  2,  3,  3,  3,  5,  5,  6,  6,  2,  2,  2, 13, 10,  1, 11]), batch_sizes=tensor([11, 11,  8,  3,  1,  1,  1]), sorted_indices=tensor([ 9,  4, 10,  1,  2,  6,  7,  8,  0,  3,  5]), unsorted_indices=tensor([ 8,  3,  4,  9,  1, 10,  5,  6,  7,  0,  2]))

In [24]:
# Unpacking: PackedSequence -> padded batch plus a tensor of per-row lengths.
# The recovered lengths get their own name. Rebinding `len_list` here would
# replace the Python list built in the earlier cell with a tensor, so the
# notebook's state would depend on the order in which cells were run.
unpacked_sequence, unpacked_lens = pad_packed_sequence(packed_sequence, batch_first=True)
print(unpacked_sequence)
print(unpacked_lens)

tensor([[ 1,  5,  0,  0,  0,  0,  0],
        [ 1,  8,  5,  0,  0,  0,  0],
        [ 1,  3,  5,  0,  0,  0,  0],
        [ 9,  2,  0,  0,  0,  0,  0],
        [ 2,  4,  3,  2,  0,  0,  0],
        [ 3,  2,  0,  0,  0,  0,  0],
        [ 1,  4,  6,  0,  0,  0,  0],
        [ 1,  4,  6,  0,  0,  0,  0],
        [ 1,  4,  2,  0,  0,  0,  0],
        [ 7,  7,  3,  2, 10,  1, 11],
        [ 1, 12,  3, 13,  0,  0,  0]])
tensor([2, 3, 3, 2, 4, 2, 3, 3, 3, 7, 4])
