In [1]:
import random
import string

# Seed Python's RNG so the generated corpus and the mask positions are reproducible
random.seed(52)

# Candidate repeat counts per letter. Every entry is 2, so each letter is
# emitted twice as space-separated tokens (e.g., 'a a', 'b b', 'c c').
repeats_range = [2] * 4

# Function to generate `text8` with a specified length and then apply random masks
def generate_text8_with_and_without_masks(target_length, mask_token="?", mask_ratio=0.2, repeat_choices=None):
    """Build a synthetic alphabet corpus and a randomly masked copy of it.

    The corpus cycles through the lowercase alphabet; each letter is repeated
    a randomly chosen number of times as space-separated tokens
    ('a a b b c c ...'). The joined text is cut to exactly ``target_length``
    characters, split into tokens, and a fraction of the tokens is replaced
    by ``mask_token``.

    Args:
        target_length: Number of characters in the unmasked string. Values
            <= 0 yield two empty strings.
        mask_token: Token written in place of each masked token.
        mask_ratio: Fraction of tokens to mask, between 0 and 1 inclusive.
            The number of masked tokens is ``int(len(tokens) * mask_ratio)``.
        repeat_choices: Sequence of repeat counts to draw from for each
            letter. Defaults to the module-level ``repeats_range``.

    Returns:
        A tuple ``(unmasked_text, masked_text)``. ``masked_text`` is re-joined
        from tokens, so it has no trailing space even when ``unmasked_text``
        was cut right after a separator.

    Raises:
        ValueError: If ``mask_ratio`` is outside [0, 1].
    """
    if not 0 <= mask_ratio <= 1:
        raise ValueError("mask_ratio must be between 0 and 1")
    if repeat_choices is None:
        # Resolved at call time so the notebook-level setting stays the default
        repeat_choices = repeats_range

    alphabet = string.ascii_lowercase
    text8 = []  # One entry per letter run, e.g. 'a a'
    # Length of ' '.join(text8), tracked incrementally so the loop stays
    # linear instead of re-joining the whole list on every iteration.
    current_length = 0

    # Generate text sequence without any masks
    while current_length < target_length:
        char = alphabet[len(text8) % len(alphabet)]  # cycle a..z, a..z, ...
        repeat_count = random.choice(repeat_choices)
        sequence = ' '.join([char] * repeat_count)
        if text8:
            current_length += 1  # separator between consecutive runs
        current_length += len(sequence)
        text8.append(sequence)

    # Join and trim to the exact target length
    unmasked_text8_str = ' '.join(text8)[:target_length]

    # Split the sequence into tokens and randomly mask the requested fraction
    tokens = unmasked_text8_str.split()
    num_masks = int(len(tokens) * mask_ratio)
    mask_indices = random.sample(range(len(tokens)), num_masks)

    # Mask on a copy so the unmasked tokens stay untouched
    masked_tokens = tokens[:]
    for idx in mask_indices:
        masked_tokens[idx] = mask_token

    # Join tokens back into a string
    masked_text8_str = ' '.join(masked_tokens)

    return unmasked_text8_str, masked_text8_str

# Build the 50,000-character corpus together with its masked counterpart
unmasked_text8, masked_text8 = generate_text8_with_and_without_masks(
    target_length=50_000,
)

In [20]:
# Display the unmasked corpus as the cell output.
# NOTE(review): this echoes the entire string; a slice such as
# unmasked_text8[:200] would keep the saved output short.
unmasked_text8

'a a b b c c d d e e f f g g h h i i j j k k l l m m n n o o p p q q r r s s t t u u v v w w x x y y z z a a b b c c d d e e f f g g h h i i j j k k l l m m n n o o p p q q r r s s t t u u v v w w x x y y z z a a b b c c d d e e f f g g h h i i j j k k l l m m n n o o p p q q r r s s t t u u v v w w x x y y z z a a b b c c d d e e f f g g h h i i j j k k l l m m n n o o p p q q r r s s t t u u v v w w x x y y z z a a b b c c d d e e f f g g h h i i j j k k l l m m n n o o p p q q r r s s t t u u v v w w x x y y z z a a b b c c d d e e f f g g h h i i j j k k l l m m n n o o p p q q r r s s t t u u v v w w x x y y z z a a b b c c d d e e f f g g h h i i j j k k l l m m n n o o p p q q r r s s t t u u v v w w x x y y z z a a b b c c d d e e f f g g h h i i j j k k l l m m n n o o p p q q r r s s t t u u v v w w x x y y z z a a b b c c d d e e f f g g h h i i j j k k l l m m n n o o p p q q r r s s t t u u v v w w x x y y z z a a b b c c d d e e f f g g h h i i j j k k l l m m n n o o p p

# Model 

In [None]:
import torch

class MultiHeadAttentionLayer(torch.nn.Module):
    """Multi-head scaled dot-product self-attention followed by a feedforward block.

    Args:
        emb_dim: Size of the token embeddings; must be divisible by ``num_heads``.
        num_heads: Number of attention heads.
        hidden_dim_ff: Width of the hidden layer of the feedforward block.
            Defaults to ``emb_dim``, so the layer can be built from
            ``(emb_dim, num_heads)`` alone.
    """

    def __init__(self, emb_dim, num_heads, hidden_dim_ff=None):
        super().__init__()
        assert emb_dim % num_heads == 0, "Embedding dimension must be divisible by the number of heads"
        if hidden_dim_ff is None:
            hidden_dim_ff = emb_dim

        self.emb_dim = emb_dim
        self.num_heads = num_heads
        self.head_dim = emb_dim // num_heads  # Dimension per head

        self.linear_q = torch.nn.Linear(emb_dim, emb_dim)
        self.linear_k = torch.nn.Linear(emb_dim, emb_dim)
        self.linear_v = torch.nn.Linear(emb_dim, emb_dim)

        # Learnable bias added to the merged attention output (zero at init)
        self.attn_embedding_bias = torch.nn.Parameter(torch.zeros(emb_dim))

        # Feedforward block (two linear layers with ReLU in between)
        self.feedforward = torch.nn.Sequential(
            torch.nn.Linear(emb_dim, hidden_dim_ff),
            torch.nn.ReLU(),
            torch.nn.Linear(hidden_dim_ff, emb_dim)
        )

    def forward(self, emb, causal=True):
        """Run self-attention and the feedforward block over a sequence.

        Args:
            emb: Embeddings of shape ``[seq_len, emb_dim]`` or
                ``[batch_size, seq_len, emb_dim]``.
            causal: When True, each position attends only to itself and
                earlier positions.

        Returns:
            Tensor with the same shape as ``emb``.
        """
        # TODO: positional information (e.g. sinusoidal embeddings) is not added yet.

        # Treat an unbatched sequence as a batch of one
        unbatched = emb.dim() == 2
        if unbatched:
            emb = emb.unsqueeze(0)
        batch_size, seq_len, _ = emb.shape

        def split_heads(projected):
            # [batch, seq, emb_dim] -> [batch, heads, seq, head_dim]
            return projected.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)

        query = split_heads(self.linear_q(emb))
        key = split_heads(self.linear_k(emb))
        value = split_heads(self.linear_v(emb))

        # Scaled dot-product scores: [batch, heads, seq, seq]
        scaling_factor = self.head_dim ** 0.5
        similarity_matrix = torch.matmul(query, key.transpose(-2, -1)) / scaling_factor

        if causal:
            # Mask the scores (not the output) so future positions get zero
            # weight after the softmax; the diagonal stays visible, so every
            # row keeps at least one finite score.
            future = torch.triu(
                torch.ones(seq_len, seq_len, dtype=torch.bool, device=emb.device),
                diagonal=1,
            )
            similarity_matrix = similarity_matrix.masked_fill(future, float("-inf"))

        soft_matrix = torch.softmax(similarity_matrix, dim=-1)

        # Apply attention weights to values and merge heads back to emb_dim
        attention = torch.matmul(soft_matrix, value).transpose(1, 2).contiguous()
        attention = attention.view(batch_size, seq_len, self.emb_dim)
        attention = attention + self.attn_embedding_bias

        # Apply feedforward block
        output = self.feedforward(attention)

        return output.squeeze(0) if unbatched else output

ModuleNotFoundError: No module named 'torch'

In [None]:
class StackedAttentionModel(torch.nn.Module):
    """Token embedding, a stack of attention layers, and a vocabulary projection.

    Args:
        voc_size: Number of distinct token ids (rows of the embedding table
            and width of the output distribution).
        emb_dim: Embedding size used throughout the stack.
        num_heads: Number of attention heads in every layer.
        num_layers: Number of stacked ``MultiHeadAttentionLayer`` modules.
    """

    def __init__(self, voc_size, emb_dim, num_heads, num_layers):
        super().__init__()

        self.num_layers = num_layers
        self.emb = torch.nn.Embedding(num_embeddings=voc_size, embedding_dim=emb_dim)
        self.ffw = torch.nn.Linear(emb_dim, voc_size, bias=False)
        # Normalise over the vocabulary (last) axis regardless of input rank
        self.softmax = torch.nn.Softmax(dim=-1)

        # Create a list of attention layers; the feedforward width is passed
        # explicitly (equal to emb_dim) because the layer's constructor takes
        # it as its third argument.
        self.attn_layers = torch.nn.ModuleList(
            [MultiHeadAttentionLayer(emb_dim, num_heads, emb_dim) for _ in range(num_layers)]
        )

    def forward(self, inpt):
        """Map token ids to a probability distribution over the vocabulary.

        Args:
            inpt: Integer tensor of token ids; the embedded tensor gains a
                trailing ``emb_dim`` axis and must have a shape the attention
                layers accept.

        Returns:
            Tensor shaped like ``inpt`` plus a trailing ``voc_size`` axis whose
            entries sum to 1 along that axis.
        """
        emb = self.emb(inpt)  # Shape: [..., emb_dim]

        # Pass through the stacked attention layers
        for attn_layer in self.attn_layers:
            emb = attn_layer(emb)  # Update embeddings with the output of the attention layer

        # Project to vocabulary scores and normalise them into probabilities
        out = self.ffw(emb)  # Shape: [..., voc_size]
        out = self.softmax(out)
        return out

In [18]:
# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Positional constructor arguments: (vocab size, embedding size, num heads, num layers)
# NOTE(review): the vocab size of 50000 equals the corpus target length, while the
# generated corpus only uses lowercase letters plus the mask token — confirm intended.
args = (50000, 64, 8, 12)

# Build the model and place it on the selected device in one step
mFoo = StackedAttentionModel(*args).to(device)

# Report the total number of parameters
print('mFoo', sum(param.numel() for param in mFoo.parameters()))

# Adam optimizer over all model parameters
opFoo = torch.optim.Adam(mFoo.parameters(), lr=0.003)

mFoo 6650368
