<a href="https://colab.research.google.com/github/paulxiong/tinyTF/blob/main/multi_words_mask_matrix.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This code randomly selects a percentage of tokens to mask, replaces them with [MASK], and then generates a masked word activity matrix using a learned feature creation matrix. The resulting matrix represents all possible pairs of words that occur with each token in the input sequence and is used during self-attention to ensure that each token can only attend to previous non-masked tokens in the sequence.

In [6]:
import torch

# Define the input sequence and the percentage of tokens to mask
input_sequence = "the quick brown fox jumps over the lazy dog"
mask_percentage = 0.15

# Split the input sequence into individual tokens
tokens = input_sequence.split()

# Determine how many tokens to mask based on the mask percentage
num_to_mask = int(len(tokens) * mask_percentage)

# Randomly select which tokens to mask
mask_indices = torch.randperm(len(tokens))[:num_to_mask]

# Replace the selected tokens with [MASK]
for i in mask_indices:
    tokens[i] = "[MASK]"

# Define a feature creation matrix that maps each word to a vector representation
feature_creation_matrix = torch.randn((len(tokens), 2))


In [7]:
tokens

['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', '[MASK]']

In [8]:
feature_creation_matrix

tensor([[ 1.3031,  3.8370],
        [-0.9417, -0.1686],
        [-0.4303,  0.3668],
        [-0.1556, -1.4749],
        [ 0.4633, -1.5158],
        [-0.8226, -0.7534],
        [ 1.3214,  0.6392],
        [-0.3986, -1.6292],
        [-1.5081, -1.8666]])

In [9]:
# Initialize an empty masked word activity matrix
masked_word_activity_matrix = torch.zeros((len(tokens), len(tokens)))
masked_word_activity_matrix

tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0.]])

In [12]:


# Iterate over each token in the sequence and generate features for all possible pairs of words that occur with that token
for i, token in enumerate(tokens):
    # Convert the token to its corresponding vector representation using the feature creation matrix
    token_vector = feature_creation_matrix[i]
    print(f"token_vector= {token_vector}\ntoken_vector.unsequeeze= {token_vector.unsqueeze(1)}\ntoken_vector.sequeeze= {token_vector.unsqueeze(1).squeeze()}")

    # Multiply the token vector by the feature creation matrix to obtain a new vector that represents all possible pairs of words that occur with that word
    pair_vectors = torch.matmul(feature_creation_matrix, token_vector.unsqueeze(1)).squeeze()
    print(f"pair_vectors= {pair_vectors} \ntorch.matmul= {torch.matmul(feature_creation_matrix, token_vector.unsqueeze(1))}")
    # Set any entries corresponding to [MASK] tokens or future tokens to -infinity so they are not considered during self-attention
    pair_vectors[:i] = float('-inf')
    pair_vectors[i+1:] = float('-inf')
    print(f"pair_vectors=: {pair_vectors}")

    # Apply a softmax function to obtain a probability distribution over all possible pairs of words for this token
    attention_scores = torch.softmax(pair_vectors, dim=0)
    print(f'attention_scores= {attention_scores}')

    # Store the attention scores in the m asked word activity matrix
    masked_word_activity_matrix[i] = attention_scores
    print(f'masked_word_activity_matrix= {masked_word_activity_matrix}')

# Print the resulting masked word activity matrix
print(masked_word_activity_matrix)

token_vector= tensor([1.3031, 3.8370])
token_vector.unsequeeze= tensor([[1.3031],
        [3.8370]])
token_vector.sequeeze= tensor([1.3031, 3.8370])
pair_vectors= tensor([16.4209, -1.8742,  0.8468, -5.8622, -5.2125, -3.9628,  4.1747, -6.7708,
        -9.1272]) 
torch.matmul= tensor([[16.4209],
        [-1.8742],
        [ 0.8468],
        [-5.8622],
        [-5.2125],
        [-3.9628],
        [ 4.1747],
        [-6.7708],
        [-9.1272]])
pair_vectors=: tensor([16.4209,    -inf,    -inf,    -inf,    -inf,    -inf,    -inf,    -inf,
           -inf])
attention_scores= tensor([1., 0., 0., 0., 0., 0., 0., 0., 0.])
masked_word_activity_matrix= tensor([[1., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 1., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 1., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 1., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 1., 0., 0.],
        [0., 0., 0., 0., 