# Lab 0.4: NumPy Vectorization as GPU Training

**Chapter 0: The Parallel Mindset**

NumPy's vectorized operations are excellent training for GPU thinking. This lab bridges the gap between loop-based Python and GPU kernel programming.

## Learning Objectives
- Understand why vectorized code is faster than loops
- Practice eliminating loops with array operations
- See how NumPy thinking translates to GPU kernels

In [None]:
import numpy as np
import time

## Part 1: Loops vs Vectorization

Python loops are slow. NumPy operations are fast. Why?

In [None]:
# Test data: two independent vectors of one million uniform [0, 1) samples
n = 1_000_000
a, b = np.random.rand(n), np.random.rand(n)

# Method 1: Python loop (SLOW)
def add_loop(a, b):
    """Add ``a`` and ``b`` one element at a time (deliberately slow baseline).

    Every iteration goes through Python-level indexing and dispatch, which
    is exactly the per-element cost that vectorization removes.

    Returns a new array shaped and typed like ``a``.
    """
    total = np.empty_like(a)
    for idx, left in enumerate(a):
        total[idx] = left + b[idx]
    return total

# Method 2: NumPy vectorized (FAST)
def add_vectorized(a, b):
    """Add ``a`` and ``b`` with a single whole-array expression.

    The element loop runs inside NumPy's compiled code instead of the
    Python interpreter.
    """
    total = a + b
    return total

# Benchmark
# time.perf_counter() is the high-resolution clock intended for measuring
# short intervals; time.time() is wall-clock time and can tick too coarsely
# to register the vectorized call at all.
start = time.perf_counter()
result_loop = add_loop(a, b)
loop_time = time.perf_counter() - start

start = time.perf_counter()
result_vec = add_vectorized(a, b)
vec_time = time.perf_counter() - start

# Guard the ratio so a zero reading can never raise ZeroDivisionError
speedup = loop_time / vec_time if vec_time > 0 else float("inf")

print(f"Loop time: {loop_time:.3f}s")
print(f"Vectorized time: {vec_time:.6f}s")
print(f"Speedup: {speedup:.0f}x")
print(f"\nResults match: {np.allclose(result_loop, result_vec)}")

### Why is vectorization faster?

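1. **No Python overhead per element**: The interpreter's per-iteration costs (loop bookkeeping, dynamic type checks, object boxing) are paid once per array operation instead of once per element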
2. **SIMD instructions**: CPU processes multiple elements per instruction
3. **Better cache usage**: Predictable memory access patterns
4. **Compiled C code**: NumPy operations run optimized C, not Python

## Part 2: Common Vectorization Patterns

Learn to recognize and apply these patterns.

In [None]:
# Pattern 1: Element-wise operations
# Shared input for both sigmoid versions: 1000 uniform samples from [0, 1)
x = np.random.rand(1000)

# Bad (loop)
def sigmoid_loop(x):
    """Apply the logistic sigmoid 1 / (1 + e^-v) to each element in turn.

    Loop-based baseline; returns a new array shaped and typed like ``x``.
    """
    out = np.empty_like(x)
    for idx, value in enumerate(x):
        out[idx] = 1 / (1 + np.exp(-value))
    return out

# Good (vectorized)
def sigmoid_vec(x):
    """Apply the logistic sigmoid to every element of ``x`` at once."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator

# Sanity check: the loop and vectorized sigmoids must agree on the same input
print("Pattern 1: Element-wise")
print(f"Results match: {np.allclose(sigmoid_loop(x), sigmoid_vec(x))}")

In [None]:
# Pattern 2: Conditional operations with np.where / np.maximum
# Standard-normal samples, so the input mixes negative and positive values
x = np.random.randn(1000)

# Bad (loop with if)
def relu_loop(x):
    """ReLU via an explicit per-element branch (loop-based baseline).

    Positive values are kept; everything else becomes 0. Returns a new
    array shaped and typed like ``x``.
    """
    out = np.empty_like(x)
    for idx, value in enumerate(x):
        out[idx] = value if value > 0 else 0
    return out

# Good (vectorized)
def relu_vec(x):
    """ReLU as a single whole-array operation: element-wise max with 0."""
    # Alternative spelling of the same conditional: np.where(x > 0, x, 0)
    floor = 0
    return np.maximum(floor, x)

# Sanity check: the branching loop and np.maximum must give the same ReLU
print("Pattern 2: Conditionals -> np.where / np.maximum")
print(f"Results match: {np.allclose(relu_loop(x), relu_vec(x))}")

In [None]:
# Pattern 3: Reductions
# Shared input for both mean versions: 1000 uniform samples from [0, 1)
x = np.random.rand(1000)

# Bad (loop)
def mean_loop(x):
    """Arithmetic mean of ``x`` via a running sum (loop-based baseline).

    Accumulates the elements one at a time, then divides by ``len(x)``.
    """
    running_sum = 0
    for element in x:
        running_sum = running_sum + element
    count = len(x)
    return running_sum / count

# Good (vectorized)
def mean_vec(x):
    """Arithmetic mean of ``x`` using the array's built-in reduction."""
    average = x.mean()  # np.mean(x) is the function-style spelling
    return average

# Sanity check: both means are scalars, so compare them with np.isclose
print("Pattern 3: Reductions -> .sum(), .mean(), .max(), etc.")
print(f"Results match: {np.isclose(mean_loop(x), mean_vec(x))}")

In [None]:
# Pattern 4: Broadcasting
# Add a bias to each row of a matrix
# bias has one entry per feature, matching the length of X's last axis
X = np.random.rand(100, 50)  # 100 samples, 50 features
bias = np.random.rand(50)    # 50-element bias vector

# Bad (loop)
def add_bias_loop(X, bias):
    """Add ``bias[j]`` to every entry in column ``j`` of ``X`` (nested-loop baseline).

    Returns a new array shaped and typed like ``X``.
    """
    out = np.empty_like(X)
    for r in range(X.shape[0]):
        # Row views: writes to ``dst`` land directly in ``out``
        src, dst = X[r], out[r]
        for c in range(X.shape[1]):
            dst[c] = src[c] + bias[c]
    return out

# Good (broadcasting)
def add_bias_vec(X, bias):
    """Add ``bias`` to every row of ``X`` by letting NumPy broadcast it."""
    # bias is stretched across the rows of X without an explicit loop
    shifted = X + bias
    return shifted

# Show the shapes involved, then confirm the nested loop and the
# broadcast expression produce the same matrix
print("Pattern 4: Broadcasting")
print(f"X shape: {X.shape}, bias shape: {bias.shape}")
print(f"Result shape: {add_bias_vec(X, bias).shape}")
print(f"Results match: {np.allclose(add_bias_loop(X, bias), add_bias_vec(X, bias))}")

## Part 3: From NumPy to GPU Thinking

NumPy vectorization teaches the right mental model for GPUs.

In [None]:
# NumPy way: Think about the whole array operation
def softmax_numpy(x):
    """Compute softmax along the last axis of ``x``.

    The maximum along that axis is subtracted before exponentiating so
    large inputs do not overflow; this does not change the result.
    """
    row_max = x.max(axis=-1, keepdims=True)
    exps = np.exp(x - row_max)
    denom = exps.sum(axis=-1, keepdims=True)
    return exps / denom

# GPU kernel way: Think about what ONE thread does
# (Pseudocode - actual Triton kernel would look similar)
def softmax_gpu_pseudocode():
    """Pseudocode for a per-row softmax GPU kernel; nothing here executes.

    Each thread block handles one row:

        row = program_id                   # Which row am I processing?
        x = load(input[row, :])            # Load row into shared memory
        max_val = block_reduce_max(x)      # Find max (reduction within block)
        exp_x = exp(x - max_val)           # Shift and exponentiate
        sum_exp = block_reduce_sum(exp_x)  # Sum (reduction within block)
        output[row, :] = exp_x / sum_exp   # Normalize and store
    """
    return None

# Test softmax
# Softmax runs over the last axis, so each of the 32 rows is normalized
# independently across its 128 entries
x = np.random.randn(32, 128)  # Batch of 32, vocab size 128
result = softmax_numpy(x)

print(f"Input shape: {x.shape}")
print(f"Output shape: {result.shape}")
# Each row should sum to 1 after normalization
print(f"Each row sums to 1: {np.allclose(result.sum(axis=-1), 1)}")
print(f"\nNumPy and GPU kernel do the same operations,")
print(f"just organized differently (whole array vs per-thread).")

## Exercise: Vectorize These Functions

Convert the loop-based implementations to vectorized NumPy.

In [None]:
# Exercise 1: L2 Normalization
def l2_normalize_loop(x):
    """Scale each row of ``x`` to unit Euclidean length (loop-based baseline).

    For every row: accumulate the sum of squares, take its square root,
    then divide each entry by that length. Returns a new array shaped and
    typed like ``x``.
    """
    out = np.empty_like(x)
    for r in range(x.shape[0]):
        # First pass over the row: squared length
        sq_sum = 0
        for c in range(x.shape[1]):
            sq_sum = sq_sum + x[r, c] ** 2
        length = np.sqrt(sq_sum)
        # Second pass over the row: rescale
        for c in range(x.shape[1]):
            out[r, c] = x[r, c] / length
    return out

# Your vectorized solution:
def l2_normalize_vec(x):
    """Exercise stub: normalize each row of ``x`` to unit length without loops.

    Returns None until it is implemented.
    """
    # TODO: Implement without loops
    return None

In [None]:
# Solution:
def l2_normalize_vec(x):
    """Scale ``x`` to unit Euclidean length along its last axis, without loops."""
    # keepdims=True keeps the norms broadcastable against x for the division
    return x / np.linalg.norm(x, axis=-1, keepdims=True)

# Test
# Compare the loop baseline and the vectorized solution on random data
x = np.random.randn(100, 50)
print(f"Results match: {np.allclose(l2_normalize_loop(x), l2_normalize_vec(x))}")

In [None]:
# Exercise 2: Pairwise distances
def pairwise_dist_loop(X, Y):
    """Euclidean distance between every row of ``X`` and every row of ``Y``.

    Loop-based baseline. Returns an array of shape
    ``(X.shape[0], Y.shape[0])`` whose ``[i, j]`` entry is the distance
    between ``X[i]`` and ``Y[j]``.
    """
    m, n = X.shape[0], Y.shape[0]
    out = np.empty((m, n))
    for i, x_pt in enumerate(X):
        for j, y_pt in enumerate(Y):
            delta = x_pt - y_pt
            out[i, j] = np.sqrt(np.sum(delta ** 2))
    return out

# Your vectorized solution:
def pairwise_dist_vec(X, Y):
    """Exercise stub: pairwise distances between rows of ``X`` and ``Y``, no loops.

    Returns None until it is implemented.
    """
    # TODO: Implement without loops
    # Hint: Use broadcasting and the identity ||a-b||^2 = ||a||^2 + ||b||^2 - 2*a.b
    return None

In [None]:
# Solution:
def pairwise_dist_vec(X, Y):
    """Euclidean distance between every row of ``X`` and every row of ``Y``.

    Uses the expansion ||x - y||^2 = ||x||^2 + ||y||^2 - 2 * x.y, so the
    whole computation is one matrix product plus broadcasting. Returns an
    ``(m, n)`` array for ``X`` with ``m`` rows and ``Y`` with ``n`` rows.
    """
    x_norms = (X ** 2).sum(axis=1)[:, np.newaxis]  # (m, 1)
    y_norms = (Y ** 2).sum(axis=1)[np.newaxis, :]  # (1, n)
    inner = X @ Y.T                                # (m, n)
    sq_dists = x_norms + y_norms - 2 * inner
    # Rounding can push a true zero slightly negative; clamp before the sqrt
    return np.sqrt(np.maximum(sq_dists, 0))

# Test
# Compare the loop baseline and the vectorized solution on random points
X = np.random.randn(50, 10)
Y = np.random.randn(30, 10)
print(f"Results match: {np.allclose(pairwise_dist_loop(X, Y), pairwise_dist_vec(X, Y))}")

# Benchmark
# The vectorized call on inputs this small finishes in microseconds, so it
# needs time.perf_counter(); time.time() is wall-clock time and can report
# an elapsed time of 0.0 for it.
start = time.perf_counter()
_ = pairwise_dist_loop(X, Y)
loop_time = time.perf_counter() - start

start = time.perf_counter()
_ = pairwise_dist_vec(X, Y)
vec_time = time.perf_counter() - start

# Guard the ratio so a zero reading can never raise ZeroDivisionError
speedup = loop_time / vec_time if vec_time > 0 else float("inf")
print(f"Speedup: {speedup:.1f}x")

## Key Takeaways

1. **Vectorization eliminates Python overhead**: 10-100x speedup is common
2. **Think in arrays, not elements**: "What happens to all elements?" not "What happens to element i?"
3. **Broadcasting is powerful**: Avoid explicit loops for dimension-expanding ops
4. **NumPy -> GPU**: Same mental model, just replace array ops with kernel launches
5. **Practice recognizing patterns**: Element-wise, reduction, broadcasting - they all have GPU equivalents