<a href="https://colab.research.google.com/github/petitmi/Deep_learning-Sequential_data/blob/main/TEST_Padding_Embedding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## pad_sequence, pack_padded_sequence

In [5]:
import torch
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence

# Variable-length sequences
seq1 = torch.tensor([1, 2, 3])
seq2 = torch.tensor([4, 5])
seq3 = torch.tensor([6, 7, 8, 9])

# List of sequences
sequences = [seq1, seq2, seq3]

# Pad sequences
padded_sequences = pad_sequence(sequences, batch_first=True, padding_value=0)
# tensor([[1, 2, 3, 0],
#         [4, 5, 0, 0],
#         [6, 7, 8, 9]])

# Original sequence lengths
lengths = torch.tensor([len(seq) for seq in sequences])

# Sort sequences by length in descending order
sorted_lengths, sorted_indices = lengths.sort(descending=True)
sorted_padded_sequences = padded_sequences[sorted_indices]

# Pack padded sequences
packed_sequences = pack_padded_sequence(sorted_padded_sequences, sorted_lengths, batch_first=True)

# Use the packed_sequences as input to your RNN, LSTM, or GRU layer


In [14]:
sorted_padded_sequences

tensor([[6, 7, 8, 9],
        [1, 2, 3, 0],
        [4, 5, 0, 0]])

In [7]:
packed_sequences


PackedSequence(data=tensor([6, 1, 4, 7, 2, 5, 8, 3, 9]), batch_sizes=tensor([3, 3, 2, 1]), sorted_indices=None, unsorted_indices=None)

In [17]:
import torch
from torch.nn import EmbeddingBag

# Define the input data (indices for the embeddings)
seq1 = torch.tensor([1, 2, 3])
seq2 = torch.tensor([4, 5])
seq3 = torch.tensor([6, 7, 8, 9])

# List of sequences
sequences = [seq1, seq2, seq3]

# Create offsets to indicate the start of each sequence in the concatenated tensor
offsets = torch.tensor([0, len(seq1), len(seq1) + len(seq2)])

# Concatenate the sequences
concatenated_sequences = torch.cat(sequences)

# Define the EmbeddingBag layer
vocab_size = 10
embedding_dim = 4
embedding_bag = EmbeddingBag(vocab_size, embedding_dim, mode='mean')

# Compute the output (fixed-size representation for each sequence)
output = embedding_bag(concatenated_sequences, offsets)

# Output shape: (3, 4), where 3 is the batch size and 4 is the embedding dimension


In [18]:
output

tensor([[-0.3504, -0.1769, -0.0509, -0.8649],
        [-1.7669,  0.0832,  0.1542,  0.5274],
        [-0.8994, -0.1463,  0.4700,  1.0038]], grad_fn=<EmbeddingBagBackward0>)

In [24]:
import torch
from torch.nn import Embedding

# Variable-length sequences
seq1 = torch.tensor([1, 2, 3])
seq2 = torch.tensor([4, 5])
seq3 = torch.tensor([6, 7, 8, 9])

# List of sequences
sequences = [seq1, seq2, seq3]

# Define the Embedding layer
vocab_size = 10
embedding_dim = 4
embedding_layer = Embedding(vocab_size, embedding_dim)

# Compute the output (embeddings) for each sequence separately
output_seq1 = embedding_layer(seq1)  # Shape: (3, 4)
output_seq2 = embedding_layer(seq2)  # Shape: (2, 4)
output_seq3 = embedding_layer(seq3)  # Shape: (4, 4)





In [29]:
print(output_seq1)
print(output_seq2)
print(output_seq3)

tensor([[-1.7210e+00, -6.2778e-01, -4.1730e-01,  2.0924e-01],
        [ 5.1436e-01,  1.1150e-01,  6.5939e-01,  1.6387e-03],
        [-1.4139e+00,  8.8464e-02, -1.0473e+00,  2.1602e-01]],
       grad_fn=<EmbeddingBackward0>)
tensor([[-0.0276, -1.0623, -0.5539, -1.6288],
        [ 0.2982,  0.8436, -0.3670,  1.3668]], grad_fn=<EmbeddingBackward0>)
tensor([[ 0.1783, -0.3751,  1.4580,  0.7141],
        [ 0.8377, -0.0947,  0.2056,  0.5486],
        [ 1.0757,  0.0524,  0.3090,  1.8480],
        [-1.2072, -0.4872,  0.9894, -0.9489]], grad_fn=<EmbeddingBackward0>)


In [26]:
from torch.nn.utils.rnn import pad_sequence

# Pad sequences
padded_sequences = pad_sequence(sequences, batch_first=True, padding_value=0)
# tensor([[1, 2, 3, 0],
#         [4, 5, 0, 0],
#         [6, 7, 8, 9]])

# Compute the output (embeddings) for the padded sequences
output_padded = embedding_layer(padded_sequences)  # Shape: (3, 4, 4)

In [28]:
output_padded


tensor([[[-1.7210e+00, -6.2778e-01, -4.1730e-01,  2.0924e-01],
         [ 5.1436e-01,  1.1150e-01,  6.5939e-01,  1.6387e-03],
         [-1.4139e+00,  8.8464e-02, -1.0473e+00,  2.1602e-01],
         [-3.9161e-01, -6.5614e-01, -4.1197e-01, -8.8819e-01]],

        [[-2.7625e-02, -1.0623e+00, -5.5389e-01, -1.6288e+00],
         [ 2.9815e-01,  8.4363e-01, -3.6700e-01,  1.3668e+00],
         [-3.9161e-01, -6.5614e-01, -4.1197e-01, -8.8819e-01],
         [-3.9161e-01, -6.5614e-01, -4.1197e-01, -8.8819e-01]],

        [[ 1.7830e-01, -3.7514e-01,  1.4580e+00,  7.1413e-01],
         [ 8.3771e-01, -9.4696e-02,  2.0559e-01,  5.4863e-01],
         [ 1.0757e+00,  5.2444e-02,  3.0903e-01,  1.8480e+00],
         [-1.2072e+00, -4.8723e-01,  9.8937e-01, -9.4894e-01]]],
       grad_fn=<EmbeddingBackward0>)