In [4]:
import torch

# Fix the global RNG seed so the random toy batch below is reproducible.
torch.manual_seed(1337)

# Toy batch dimensions: B sequences, T positions per sequence, C channels.
B = 4
T = 8
C = 2
x = torch.randn((B, T, C))

In [51]:
# Reference implementation: for every sequence b and position t, store the
# per-channel average of positions 0..t (inclusive) of x[b].
xbow_og = torch.zeros(B, T, C)

for b in range(B):
    for t in range(T):
        xprev = x[b, : t + 1, :]           # rows 0..t of sequence b -> (t + 1, C)
        xbow_og[b, t] = xprev.mean(dim=0)  # average over those rows -> (C,)

In [None]:
# Same running average as the double loop, expressed as one matrix product:
# row t of the lower-triangular ones matrix sums positions 0..t, and dividing
# by the row sum (t + 1) turns that sum into a mean.
wei = torch.ones(T, T).tril()

xbow = (wei @ x) / wei.sum(dim=1, keepdim=True)
torch.allclose(xbow, xbow_og)

In [None]:
import torch.nn.functional as F

# Build the averaging weights through softmax instead of an explicit division:
# entries above the diagonal become -inf, so softmax gives them zero weight and
# spreads each row's mass evenly over the remaining (equal-valued) entries.
lower = torch.tril(torch.ones(T, T))
wei = lower.masked_fill(lower == 0, float("-inf"))
print(wei.shape)
F.softmax(wei, dim=-1)

# Self Attention

In [133]:
text =  """
    Manual implementation of batch normalization for 1D case
    x: input tensor 
    - For 2D input (N, C): normalize across batch dimension N
    - For 3D input (N, C, L): normalize across batch N and length L dimensions
    gamma: learnable scale parameter
    beta: learnable shift parameter  
    eps: small constant for numerical stability
    """

# Character-level vocabulary. The characters are sorted before ids are assigned:
# iterating a bare set of strings follows hash order, which can change from one
# interpreter session to the next, so the ids would not be reproducible.
chars = sorted(set(text))

# stoi: character -> integer id; itos: the inverse lookup.
stoi = {ch: idx for idx, ch in enumerate(chars)}
itos = {idx: ch for ch, idx in stoi.items()}

In [None]:
# Re-seed the global RNG so random tensors created after this point are reproducible.
torch.manual_seed(42)

def _per_channel_view(param, ndim):
    """Reshape a 1-D per-channel tensor to (1, C, 1) when the input is 3-D.

    Scalars and tensors that are not 1-D are returned unchanged so callers can
    still pass anything that already broadcasts against the input.
    """
    if ndim == 3 and isinstance(param, torch.Tensor) and param.dim() == 1:
        return param.reshape(1, -1, 1)
    return param


def manual_batch_norm_1d(x, gamma=None, beta=None, eps=1e-5):
    """
    Manual implementation of batch normalization for the 1D case.

    Args:
        x: input tensor.
            - 2D input (N, C): statistics are taken over the batch dimension N.
            - 3D input (N, C, L): statistics are taken over batch N and length L.
        gamma: optional scale. A 1-D tensor of shape (C,) is applied per
            channel; a scalar or an already-broadcastable tensor is used as is.
        beta: optional shift, with the same shape rules as ``gamma``.
        eps: small constant added to the variance for numerical stability.

    Returns:
        Tuple ``(x_normalized, mean, var)``. ``mean`` and ``var`` are the biased
        batch statistics with the reduced dimensions kept (``keepdim=True``).

    Raises:
        ValueError: if ``x`` is neither 2D nor 3D.
    """
    if x.dim() == 2:
        reduce_dims = 0
    elif x.dim() == 3:
        reduce_dims = (0, 2)
    else:
        raise ValueError(f"Expected 2D or 3D input, got {x.dim()}D")

    mean = torch.mean(x, dim=reduce_dims, keepdim=True)
    var = torch.var(x, dim=reduce_dims, keepdim=True, unbiased=False)

    # Normalize: (x - mean) / sqrt(var + eps)
    x_normalized = (x - mean) / torch.sqrt(var + eps)

    # Scale and shift (if parameters provided). For (N, C, L) input a plain
    # (C,) vector would broadcast against the trailing L axis, so it is first
    # reshaped to (1, C, 1) to line up with the channel axis.
    if gamma is not None:
        x_normalized = x_normalized * _per_channel_view(gamma, x.dim())
    if beta is not None:
        x_normalized = x_normalized + _per_channel_view(beta, x.dim())

    return x_normalized, mean, var

print("=== Problem with original implementation ===")
# Small hand-checkable (N=2, C=2, L=2) input holding the values 1..8.
x = torch.tensor(
    [
        [[1.0, 2.0], [3.0, 4.0]],
        [[5.0, 6.0], [7.0, 8.0]],
    ],
    dtype=torch.float32,
)
print(f"Input shape: {x.shape}")
print(f"Input tensor:\n{x}")

# Per-channel statistics: reduce over the batch (dim 0) and length (dim 2)
# axes while keeping the channel axis, giving tensors of shape (1, C, 1).
mean = x.mean(dim=(0, 2), keepdim=True)
print(mean)

# NOTE: the name `x_wrong` is historical. The value computed here normalizes
# over dims (0, 2), the same reduction the 3D branch of manual_batch_norm_1d uses.
var = x.var(dim=(0, 2), keepdim=True, unbiased=False)
x_wrong = (x - mean) / (var + 1e-5).sqrt()
print(x_wrong)

# Reference layer for comparison; the last expression is shown as the cell output.
bn = torch.nn.BatchNorm1d(2, track_running_stats=False)
bn(x)


In [None]:
# Simple Batch Norm Test Example
import torch
import torch.nn as nn

def test_batch_norm():
    """Compare a hand-written batch norm against ``nn.BatchNorm1d`` on 2D data.

    Seeds the global RNG with 42, draws an (8, 4) batch, normalizes it both
    manually and with a non-affine ``nn.BatchNorm1d`` in training mode, and
    prints the per-feature statistics of each result.

    Returns:
        Tuple ``(x, x_manual, x_pytorch)``: the raw batch, the manually
        normalized batch and the PyTorch-normalized batch.
    """

    # Create test data: (batch_size, features)
    torch.manual_seed(42)
    batch_size, num_features = 8, 4
    x = torch.randn(batch_size, num_features) * 3 + 2  # Random data with mean~2, std~3

    print("Original data:")
    print(f"Shape: {x.shape}")
    print(f"Mean per feature: {x.mean(dim=0)}")
    print(f"Std per feature: {x.std(dim=0, unbiased=False)}")
    print()

    def manual_batch_norm(x, eps=1e-5):
        """Normalize each feature column with the batch mean and biased variance."""
        mean = x.mean(dim=0, keepdim=True)
        var = x.var(dim=0, keepdim=True, unbiased=False)
        x_norm = (x - mean) / torch.sqrt(var + eps)
        return x_norm, mean, var

    # Only the normalized tensor is needed here; the statistics are discarded.
    x_manual, _, _ = manual_batch_norm(x)

    print("After manual batch norm:")
    print(f"Mean per feature: {x_manual.mean(dim=0)}")
    print(f"Std per feature: {x_manual.std(dim=0, unbiased=False)}")
    print()

    # PyTorch's BatchNorm1d without learnable params, for a like-for-like
    # comparison. Training mode makes the layer normalize with the statistics
    # of the current batch; eval mode would use the running statistics instead.
    bn_layer = nn.BatchNorm1d(num_features, affine=False)
    bn_layer.train()

    x_pytorch = bn_layer(x)

    print("PyTorch BatchNorm1d:")
    print(f"Mean per feature: {x_pytorch.mean(dim=0)}")
    print(f"Std per feature: {x_pytorch.std(dim=0, unbiased=False)}")
    print()

    # Check if they're close
    print(f"Manual and PyTorch results are close: {torch.allclose(x_manual, x_pytorch, atol=1e-6)}")

    return x, x_manual, x_pytorch

# Run the test
x_orig, x_manual, x_pytorch = test_batch_norm()


In [None]:
# Test Batch Norm with Learnable Parameters (Affine Transform)
def test_batch_norm_with_params():
    """Check a manual affine batch norm against ``nn.BatchNorm1d(affine=True)``.

    Seeds the global RNG with 123, draws a (6, 3) batch, applies batch norm
    with fixed scale/shift vectors both by hand and through PyTorch (with the
    same vectors copied into the layer), prints per-feature statistics, and
    returns ``(x, x_manual, x_pytorch)``.
    """

    def affine_batch_norm(inputs, scale, shift, eps=1e-5):
        """Standardize each column of ``inputs``, then apply ``scale`` and ``shift``."""
        batch_mean = inputs.mean(dim=0, keepdim=True)
        batch_var = inputs.var(dim=0, keepdim=True, unbiased=False)
        standardized = (inputs - batch_mean) / torch.sqrt(batch_var + eps)
        return scale * standardized + shift

    torch.manual_seed(123)
    batch_size, num_features = 6, 3
    x = torch.randn(batch_size, num_features) * 2 + 1

    # Fixed per-feature scale (gamma) and shift (beta) used by both versions.
    gamma = torch.tensor([2.0, 0.5, 1.5])
    beta = torch.tensor([1.0, -1.0, 0.0])

    print("Testing Batch Norm with learnable parameters:")
    print(f"Input shape: {x.shape}")
    print(f"Input mean per feature: {x.mean(dim=0)}")
    print(f"Input std per feature: {x.std(dim=0, unbiased=False)}")
    print()

    # Hand-written version.
    x_manual = affine_batch_norm(x, gamma, beta)

    print("After manual batch norm with learnable params:")
    print(f"Output mean per feature: {x_manual.mean(dim=0)}")
    print(f"Output std per feature: {x_manual.std(dim=0, unbiased=False)}")
    print(f"Expected std (gamma values): {gamma}")
    print(f"Expected mean (beta values): {beta}")
    print()

    # PyTorch version in training mode, loaded with the same gamma/beta.
    reference_bn = nn.BatchNorm1d(num_features, affine=True)
    reference_bn.train()
    with torch.no_grad():
        for parameter, value in ((reference_bn.weight, gamma), (reference_bn.bias, beta)):
            parameter.copy_(value)

    x_pytorch = reference_bn(x)

    print("PyTorch BatchNorm1d with same parameters:")
    print(f"Output mean per feature: {x_pytorch.mean(dim=0)}")
    print(f"Output std per feature: {x_pytorch.std(dim=0, unbiased=False)}")
    print()

    print(f"Results match: {torch.allclose(x_manual, x_pytorch, atol=1e-6)}")

    return x, x_manual, x_pytorch

# Run the test
test_batch_norm_with_params()


In [None]:
# Complete BatchNorm class implementation
class ManualBatchNorm1d(torch.nn.Module):
    """Hand-written batch normalization for (N, C) and (N, C, L) inputs.

    In training mode the input is normalized with the biased statistics of the
    current batch and the running statistics are updated; in eval mode the
    stored running statistics are used instead.

    Args:
        num_features: number of channels C.
        eps: constant added to the variance for numerical stability.
        momentum: weight given to the current batch when updating the running
            statistics.
        affine: if True, learn a per-channel scale ``gamma`` and shift ``beta``.
    """

    def __init__(self, num_features, eps=1e-5, momentum=0.1, affine=True):
        super().__init__()
        self.num_features = num_features
        self.eps = eps
        self.momentum = momentum
        self.affine = affine

        # Learnable parameters (if affine=True)
        if self.affine:
            self.gamma = torch.nn.Parameter(torch.ones(num_features))
            self.beta = torch.nn.Parameter(torch.zeros(num_features))
        else:
            self.register_parameter('gamma', None)
            self.register_parameter('beta', None)

        # Running statistics (for inference)
        self.register_buffer('running_mean', torch.zeros(num_features))
        self.register_buffer('running_var', torch.ones(num_features))
        self.register_buffer('num_batches_tracked', torch.tensor(0, dtype=torch.long))

    def forward(self, x):
        """Normalize ``x`` per channel.

        Args:
            x: tensor of shape (N, C) or (N, C, L).

        Returns:
            Tensor with the same shape as ``x``.

        Raises:
            ValueError: if ``x`` is neither 2D nor 3D.
        """
        if x.dim() == 2:
            reduce_dims = (0,)
            stat_shape = (1, -1)
            values_per_channel = x.size(0)
        elif x.dim() == 3:
            reduce_dims = (0, 2)
            stat_shape = (1, -1, 1)
            values_per_channel = x.size(0) * x.size(2)
        else:
            raise ValueError(f"Expected 2D or 3D input, got {x.dim()}D")

        if self.training:
            # Training mode: normalize with the (biased) batch statistics.
            mean = x.mean(reduce_dims)
            var = x.var(reduce_dims, unbiased=False)

            # Update running statistics in place so the registered buffers keep
            # their identity. The stored variance uses the unbiased estimate,
            # matching what torch.nn.BatchNorm1d keeps in running_var.
            with torch.no_grad():
                if values_per_channel > 1:
                    sample_var = var * (values_per_channel / (values_per_channel - 1))
                else:
                    sample_var = var
                keep = 1 - self.momentum
                self.running_mean.copy_(keep * self.running_mean + self.momentum * mean)
                self.running_var.copy_(keep * self.running_var + self.momentum * sample_var)
                self.num_batches_tracked.add_(1)
        else:
            # Inference mode: use running statistics
            mean = self.running_mean
            var = self.running_var

        # The statistics have shape (C,). Reshape them to (1, C) or (1, C, 1)
        # so they broadcast along the channel axis rather than the last axis.
        mean = mean.reshape(stat_shape)
        var = var.reshape(stat_shape)
        x_normalized = (x - mean) / torch.sqrt(var + self.eps)

        # Scale and shift
        if self.affine:
            x_normalized = self.gamma.reshape(stat_shape) * x_normalized + self.beta.reshape(stat_shape)

        return x_normalized

# Test our custom BatchNorm
print("Testing custom BatchNorm class:")
x_test = torch.randn(4, 3, 5)  # (N, C, L)
print(f"Input shape: {x_test.shape}")

# Initialize our custom BatchNorm
custom_bn = ManualBatchNorm1d(3)
custom_bn.train()

# Forward pass
output = custom_bn(x_test)
print(f"Output shape: {output.shape}")
print(f"Output mean per channel: {output.mean((0, 2))}")
print(f"Output std per channel: {output.std((0, 2), unbiased=False)}")

# Compare with PyTorch's BatchNorm1d
pytorch_bn = torch.nn.BatchNorm1d(3)
pytorch_output = pytorch_bn(x_test)
print(f"PyTorch BatchNorm mean per channel: {pytorch_output.mean((0, 2))}")
print(f"PyTorch BatchNorm std per channel: {pytorch_output.std((0, 2), unbiased=False)}")

# Fail loudly if the two implementations ever diverge.
assert torch.allclose(output, pytorch_output, atol=1e-5), "ManualBatchNorm1d disagrees with nn.BatchNorm1d"


In [64]:
# Dimensions for the next experiment: batch size, sequence length, channels.
B = 4
T = 10
C = 2

# Two independent random (B, T, C) tensors (presumably queries and keys for the
# self-attention section; confirm against the cells that consume them).
q = torch.randn(B, T, C)
k = torch.randn(B, T, C)

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# Worked example of cross-entropy loss on a single 3-class prediction.
# Class indices used throughout: 0 = cat, 1 = dog, 2 = bird.

# nn.CrossEntropyLoss takes raw logits of shape (batch_size, num_classes) and
# integer class indices of shape (batch_size,). It applies log-softmax
# internally, so the logits must NOT be passed through softmax first.
loss_function = nn.CrossEntropyLoss()

# --- Scenario 1: model is right ---
# The image really is a dog, so the target is class index 1.
true_label = torch.tensor([1])
print(f"True Label (index): {true_label}")

# Raw, unnormalized scores: cat 0.5, dog 2.0, bird 0.1. A higher score means
# the model leans more towards that class. The batch size here is 1.
model_predictions_logits = torch.tensor([[0.5, 2.0, 0.1]])
print(f"Model Predictions (logits): {model_predictions_logits}")

# Softmax is computed only to look at the probabilities; the loss does not use it.
# The dog probability comes out highest (about 0.73), matching the true label.
probabilities = F.softmax(model_predictions_logits, dim=1)
print(f"Model Predictions (probabilities after softmax): {probabilities}")

loss = loss_function(model_predictions_logits, true_label)

print(f"\nCalculated Cross-Entropy Loss: {loss.item():.4f}")

# --- Scenario 2: same target, but the model strongly prefers cat (index 0) ---
print("\n--- Scenario 2: Model is wrong but confident ---")
true_label_wrong_scenario = torch.tensor([1])
model_predictions_wrong_logits = torch.tensor([[5.0, 0.1, 0.1]])  # large cat score
print(f"Model Predictions (wrong logits): {model_predictions_wrong_logits}")

probabilities_wrong = F.softmax(model_predictions_wrong_logits, dim=1)
print(f"Model Predictions (probabilities after softmax, wrong): {probabilities_wrong}")

loss_wrong = loss_function(model_predictions_wrong_logits, true_label_wrong_scenario)
print(f"Calculated Cross-Entropy Loss (model confidently wrong): {loss_wrong.item():.4f}")

# --- Scenario 3: same target, model still prefers cat but by a smaller margin ---
print("\n--- Scenario 3: Model is wrong but less confident ---")
true_label_less_confident_wrong_scenario = torch.tensor([1])
model_predictions_less_confident_wrong_logits = torch.tensor([[1.0, 0.1, 0.1]])  # smaller cat score
print(f"Model Predictions (less confident wrong logits): {model_predictions_less_confident_wrong_logits}")

probabilities_less_confident_wrong = F.softmax(model_predictions_less_confident_wrong_logits, dim=1)
print(f"Model Predictions (probabilities after softmax, less confident wrong): {probabilities_less_confident_wrong}")

loss_less_confident_wrong = loss_function(
    model_predictions_less_confident_wrong_logits,
    true_label_less_confident_wrong_scenario,
)
print(f"Calculated Cross-Entropy Loss (model less confidently wrong): {loss_less_confident_wrong.item():.4f}")