# Introduction to Deep Learning - Week 1 Exercises
## Neural Network Basics with PyTorch

In this notebook, you will:
- Get familiar with PyTorch tensors
- Implement and visualize activation functions
- Understand forward propagation
- Build a simple neural network from scratch

In [None]:
# Import required libraries
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt

# Set random seed for reproducibility
# (this seeds PyTorch's random number generator; NumPy is imported above
# but its generator is not seeded in this cell)
torch.manual_seed(42)

# Report the environment so outputs can be compared across machines
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

## 1. PyTorch Basics (15-20 minutes)

### Exercise 1.1: Creating and Manipulating Tensors

In [None]:
# (a) Create a tensor from a list
# TODO: Create a tensor from [1, 2, 3, 4, 5]
# Hint: torch.tensor() builds a tensor from a Python list
tensor_a = None  # Replace with your code

# Print shape and dtype
# (these lines raise AttributeError while tensor_a is still None)
print(f"Shape: {tensor_a.shape}")
print(f"Data type: {tensor_a.dtype}")

In [None]:
# (b) Create a 3x4 tensor with random values from standard normal distribution
# TODO: Use torch.randn()
# Hint: pass the desired sizes as arguments, one per dimension
tensor_b = None  # Replace with your code

# tensor_b is reused by the reshape exercise in part (d)
print(f"Tensor shape: {tensor_b.shape}")
print(tensor_b)

In [None]:
# (c) Create tensors of ones and zeros
# TODO: Create 2x3 tensors
# Hint: torch.ones() and torch.zeros() take the sizes as arguments
ones_tensor = None  # Replace with your code
zeros_tensor = None  # Replace with your code

# Until the placeholders are replaced, these simply print "None"
print("Ones tensor:")
print(ones_tensor)
print("\nZeros tensor:")
print(zeros_tensor)

In [None]:
# (d) Reshape tensors
# TODO: Reshape tensor_b to (2, 6) then to (12, 1)
# Hint: a reshape must keep the number of elements unchanged
# (3 * 4 = 2 * 6 = 12 * 1 = 12)
reshaped_1 = None  # tensor_b reshaped to (2, 6)
reshaped_2 = None  # tensor_b reshaped to (12, 1)

# Requires tensor_b from part (b) to be defined first
print(f"Original shape: {tensor_b.shape}")
print(f"Reshaped to (2, 6): {reshaped_1.shape}")
print(f"Reshaped to (12, 1): {reshaped_2.shape}")

### Exercise 1.2: Tensor Operations

In [None]:
# (a) Dot product
# x and w are 1-D tensors of the same length (3), so their dot product
# is a single scalar: sum_i x[i] * w[i]
x = torch.tensor([1.0, 2.0, 3.0])
w = torch.tensor([0.5, -0.3, 0.8])

# TODO: Compute dot product
# Hint: torch.dot() works on two 1-D tensors
dot_product = None  # Replace with your code

print(f"Dot product: {dot_product}")

In [None]:
# (b) Matrix-vector multiplication
# W is (3, 2) and x is (2,), so W @ x has shape (3,)
# NOTE: this rebinds x, replacing the vector defined in part (a)
W = torch.randn(3, 2)
x = torch.randn(2)

# TODO: Compute W @ x using torch.matmul() or @ operator
result = None  # Replace with your code

# result.shape raises AttributeError while result is still None
print(f"W shape: {W.shape}")
print(f"x shape: {x.shape}")
print(f"Result shape: {result.shape}")
print(f"Result: {result}")

In [None]:
# (c) Broadcasting
# A and B share the shape (2, 3); C has shape (3,), which matches the
# trailing dimension of A, so C is added to every row of A
A = torch.randn(2, 3)
B = torch.randn(2, 3)
C = torch.randn(3)

# TODO: Element-wise addition
sum_AB = None  # A + B
sum_AC = None  # A + C (broadcasting)

# Both sums should report shape (2, 3) once implemented
print(f"A + B shape: {sum_AB.shape}")
print(f"A + C shape: {sum_AC.shape}")
print("\nBroadcasting example:")
print(f"A shape: {A.shape}, C shape: {C.shape}")
print("C is broadcast to match A's shape!")

## 2. Activation Functions (20 minutes)

### Exercise 2.1: Implement Activation Functions

In [None]:
def sigmoid(z):
    """
    Sigmoid activation function: σ(z) = 1 / (1 + exp(-z))

    Maps every element of z into the open interval (0, 1).

    Args:
        z: Input tensor
    Returns:
        Output tensor with sigmoid applied element-wise
    """
    # TODO: Implement sigmoid
    # Hint: torch.exp() computes the element-wise exponential
    # NOTE: until implemented, this stub returns None
    pass

def tanh(z):
    """
    Tanh activation function: tanh(z) = (exp(z) - exp(-z)) / (exp(z) + exp(-z))

    Maps every element of z into the open interval (-1, 1).

    Args:
        z: Input tensor
    Returns:
        Output tensor with tanh applied element-wise
    """
    # TODO: Implement tanh
    # Hint: torch.exp() computes the element-wise exponential
    # NOTE: evaluating the formula literally can overflow for very large |z|;
    # the inputs used in this notebook (|z| <= 10) are well within range
    # NOTE: until implemented, this stub returns None
    pass

def relu(z):
    """
    ReLU activation function: ReLU(z) = max(0, z)

    Negative elements become 0; non-negative elements pass through unchanged.

    Args:
        z: Input tensor
    Returns:
        Output tensor with ReLU applied element-wise
    """
    # TODO: Implement ReLU
    # Hint: Use torch.maximum() or torch.clamp()
    # (torch.maximum() compares two tensors, e.g. z against torch.zeros_like(z);
    #  torch.clamp() takes a scalar lower bound via min=)
    # NOTE: until implemented, this stub returns None
    pass

# Test your implementations
# The input covers negative, zero and positive values so each function's
# behaviour on both sides of 0 is visible.
# While the functions above are still stubs, each line prints "None".
test_input = torch.tensor([-2.0, -1.0, 0.0, 1.0, 2.0])
print(f"Input: {test_input}")
print(f"Sigmoid: {sigmoid(test_input)}")
print(f"Tanh: {tanh(test_input)}")
print(f"ReLU: {relu(test_input)}")

### Exercise 2.2: Visualize Activation Functions

In [None]:
# Create input range: 100 evenly spaced points from -5 to 5
z = torch.linspace(-5, 5, 100)

# TODO: Apply activation functions
# (the plotting calls below fail with AttributeError while these are None)
sig_output = None  # sigmoid(z)
tanh_output = None  # tanh(z)
relu_output = None  # relu(z)

# Plot all three activations on one set of axes for comparison
plt.figure(figsize=(10, 6))
plt.plot(z.numpy(), sig_output.numpy(), label='Sigmoid', linewidth=2)
plt.plot(z.numpy(), tanh_output.numpy(), label='Tanh', linewidth=2)
plt.plot(z.numpy(), relu_output.numpy(), label='ReLU', linewidth=2)
plt.xlabel('z', fontsize=12)
plt.ylabel('Activation', fontsize=12)
plt.title('Activation Functions', fontsize=14)
plt.legend(fontsize=12)
plt.grid(True, alpha=0.3)
# Thin black lines through the origin act as the x and y axes
plt.axhline(y=0, color='k', linewidth=0.5)
plt.axvline(x=0, color='k', linewidth=0.5)
plt.show()

In [None]:
# Compare with PyTorch built-in functions
# Each custom/built-in pair should print matching values (up to
# floating-point rounding) once the custom functions are implemented.
print("Comparing custom implementations with PyTorch built-ins:")
test_z = torch.tensor([0.0, 1.0, -1.0])

print(f"\nSigmoid - Custom: {sigmoid(test_z)}")
print(f"Sigmoid - PyTorch: {torch.sigmoid(test_z)}")

print(f"\nTanh - Custom: {tanh(test_z)}")
print(f"Tanh - PyTorch: {torch.tanh(test_z)}")

print(f"\nReLU - Custom: {relu(test_z)}")
print(f"ReLU - PyTorch: {torch.relu(test_z)}")

### Exercise 2.3: Derivatives of Activation Functions

In [None]:
def sigmoid_derivative(z):
    """
    Derivative of sigmoid: σ'(z) = σ(z) * (1 - σ(z))

    Args:
        z: Input tensor
    Returns:
        Tensor of the same shape as z holding the element-wise derivative
    """
    # TODO: Implement
    # Hint: reuse the sigmoid() function defined in Exercise 2.1
    # NOTE: until implemented, this stub returns None
    pass

def tanh_derivative(z):
    """
    Derivative of tanh: tanh'(z) = 1 - tanh²(z)

    Args:
        z: Input tensor
    Returns:
        Tensor of the same shape as z holding the element-wise derivative
    """
    # TODO: Implement
    # Hint: reuse the tanh() function defined in Exercise 2.1
    # NOTE: until implemented, this stub returns None
    pass

def relu_derivative(z):
    """
    Derivative of ReLU: ReLU'(z) = 1 if z > 0, else 0

    The derivative is not defined mathematically at z = 0; the formula
    above assigns 0 there.

    Args:
        z: Input tensor
    Returns:
        Float tensor of the same shape as z holding the element-wise derivative
    """
    # TODO: Implement
    # Hint: Use (z > 0).float() to convert boolean to float
    # NOTE: until implemented, this stub returns None
    pass

In [None]:
# Plot activation functions and their derivatives
# Each panel shows one activation (solid line) and its derivative (dashed line).
z = torch.linspace(-5, 5, 100)

fig, axes = plt.subplots(1, 3, figsize=(15, 4))

# One entry per panel:
# (title, activation, derivative, activation label, derivative label)
panels = [
    ('Sigmoid', sigmoid, sigmoid_derivative, 'σ(z)', "σ'(z)"),
    ('Tanh', tanh, tanh_derivative, 'tanh(z)', "tanh'(z)"),
    ('ReLU', relu, relu_derivative, 'ReLU(z)', "ReLU'(z)"),
]

for ax, (title, activation, derivative, act_label, deriv_label) in zip(axes, panels):
    ax.plot(z.numpy(), activation(z).numpy(), label=act_label, linewidth=2)
    ax.plot(z.numpy(), derivative(z).numpy(), label=deriv_label, linewidth=2, linestyle='--')
    ax.set_title(title, fontsize=12)
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

### Exercise 2.4: Vanishing Gradient Problem

In [None]:
# Compute sigmoid derivative at different points
# The points range from strongly negative to strongly positive so the
# shrinking gradient at both extremes is visible.
z_values = torch.tensor([-10.0, -5.0, 0.0, 5.0, 10.0])

# TODO: Compute sigmoid derivatives
# (the loop below raises TypeError while derivatives is still None)
derivatives = None  # sigmoid_derivative(z_values)

print("Sigmoid derivative at different z values:")
# A dedicated loop-variable name keeps the notebook-level `z` tensor
# (used by the plotting cells) from being overwritten.
for z_val, deriv in zip(z_values, derivatives):
    print(f"z = {z_val:6.1f}, σ'(z) = {deriv:.6f}")

print("\nObservation: When |z| is large, the gradient is very small (close to 0).")
print("This is the vanishing gradient problem!")

**Discussion Questions:**
1. What happens to the gradient when |z| is large?
2. Why is this a problem for training deep networks?
3. How does ReLU help with this problem?

## 3. Forward Propagation by Hand (25 minutes)

### Exercise 3.1: Manual Computation

Network structure:
- Input: x = [1, 2]
- Hidden layer: 2 neurons, ReLU activation
- Output layer: 1 neuron, no activation

In [None]:
# Define network parameters
# NOTE: this rebinds x, replacing the tensor defined in Exercise 1.2
x = torch.tensor([1.0, 2.0])

# Hidden layer weights and bias
# W1 is (2, 2): the forward pass computes W1 @ x, so each row of W1 holds
# the weights of one hidden neuron; b1 has one entry per hidden neuron
W1 = torch.tensor([[0.5, -0.3],
                   [0.8, 0.2]])
b1 = torch.tensor([0.1, -0.2])

# Output layer weights and bias
# w2 has one weight per hidden activation; b2 is a one-element tensor
w2 = torch.tensor([1.0, -0.5])
b2 = torch.tensor([0.3])

print("Network parameters:")
print(f"Input x: {x}")
print(f"\nHidden layer:")
print(f"W1 shape: {W1.shape}")
print(f"W1:\n{W1}")
print(f"b1: {b1}")
print(f"\nOutput layer:")
print(f"w2: {w2}")
print(f"b2: {b2}")

In [None]:
# TODO: Compute forward pass step by step
# Uses x, W1, b1, w2, b2 from the previous cell and relu() from Exercise 2.1.
# NOTE: floating-point rounding can make a value that is exactly 0 by hand
# show up as a tiny nonzero number in the printed tensors.

# Step 1: Pre-activation of hidden layer
z1 = None  # W1 @ x + b1
print(f"z1 (pre-activation hidden): {z1}")

# Step 2: Activation of hidden layer (ReLU)
a1 = None  # relu(z1)
print(f"a1 (activation hidden): {a1}")

# Step 3: Pre-activation of output layer
z2 = None  # w2 @ a1 + b2
print(f"z2 (pre-activation output): {z2}")

# Step 4: Output (no activation)
# The output layer has no activation, so the output is z2 itself
output = z2
print(f"\nFinal output: {output}")

**Verify your calculation by hand:**

1. $z_1 = W^{(1)} x + b^{(1)} = \begin{bmatrix} 0.5 & -0.3 \\ 0.8 & 0.2 \end{bmatrix} \begin{bmatrix} 1 \\ 2 \end{bmatrix} + \begin{bmatrix} 0.1 \\ -0.2 \end{bmatrix} = ?$

2. $a_1 = \text{ReLU}(z_1) = ?$

3. $z_2 = w^{(2)T} a_1 + b^{(2)} = ?$

## 4. Building a Simple Neural Network (30 minutes)

### Exercise 4.1: Implement a 2-Layer MLP

In [None]:
class SimpleMLP(nn.Module):
    """Two-layer MLP scaffold: Linear -> ReLU -> Linear (layers left as TODOs)."""

    def __init__(self, input_dim, hidden_dim, output_dim):
        """
        Set up the two linear layers of the MLP.

        Args:
            input_dim: Number of input features
            hidden_dim: Number of units in the hidden layer
            output_dim: Number of output values
        """
        super().__init__()

        # TODO: Initialize layers
        # Hint: Use nn.Linear(in_features, out_features)
        self.fc1 = None  # First linear layer
        self.fc2 = None  # Second linear layer

    def forward(self, x):
        """
        Run the forward pass.

        Args:
            x: Input tensor of shape (batch_size, input_dim)
        Returns:
            Output tensor of shape (batch_size, output_dim)
            (None until the TODO below is implemented)
        """
        # TODO: Implement forward pass
        # Layer 1: Linear -> ReLU
        # Layer 2: Linear (no activation)

        pass

# Create model
# 2 inputs (the two XOR bits used below), 4 hidden units, 1 output
model = SimpleMLP(input_dim=2, hidden_dim=4, output_dim=1)
print(model)
# Sums the element counts of every registered parameter tensor; this stays
# at 0 until fc1 and fc2 are replaced with real layers
print(f"\nTotal parameters: {sum(p.numel() for p in model.parameters())}")

### Exercise 4.2: Test on XOR Problem

In [None]:
# Create XOR dataset
# X holds the four possible 2-bit inputs (one per row); y holds the XOR of
# each row as a (4, 1) column so it lines up with the model output.
X = torch.tensor([[0, 0], [0, 1], [1, 0], [1, 1]], dtype=torch.float32)
y = torch.tensor([[0], [1], [1], [0]], dtype=torch.float32)

print("XOR Dataset:")
# Iterating a 2-D tensor yields its rows, so zip pairs each input with its target
for inputs, target in zip(X, y):
    print(f"Input: {inputs.numpy()}, Target: {target.item()}")

# Forward pass through untrained network
# no_grad: this is inference only, so gradient tracking is switched off
with torch.no_grad():
    predictions = model(X)

print("\nUntrained network predictions:")
for inputs, prediction, target in zip(X, predictions, y):
    print(f"Input: {inputs.numpy()}, Prediction: {prediction.item():.4f}, Target: {target.item()}")

print("\nNote: Predictions are random because the network is untrained!")

### Exercise 4.3: Visualize Decision Boundary

In [None]:
def plot_decision_boundary(model, X, y):
    """
    Plot decision boundary of the model

    Evaluates the model on a dense grid covering [-0.5, 1.5) in both input
    dimensions, draws the outputs as filled contours, and overlays the
    data points coloured by their targets.

    Args:
        model: Callable model applied to a (n_points, 2) tensor of grid points
        X: Data points; columns 0 and 1 are used as the plot coordinates
        y: Targets used to colour the data points (squeezed before plotting)
    Returns:
        None; the figure is displayed with plt.show()
    """
    # Create a mesh grid
    # Step 0.01 over a range of width 2 gives roughly 200 x 200 grid points
    x_min, x_max = -0.5, 1.5
    y_min, y_max = -0.5, 1.5
    h = 0.01

    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))

    # TODO: Create input tensor from mesh grid
    # Hint: Stack xx.ravel() and yy.ravel(), then convert to torch tensor
    # Hint: np.arange with float arguments yields float64 values; convert to
    # torch.float32 so the dtype matches X and the model's default parameters
    grid_points = None  # Shape should be (n_points, 2)

    # Get predictions
    with torch.no_grad():
        Z = model(grid_points)

    # Back to the grid layout for contour plotting; this requires the model
    # to emit exactly one value per grid point
    Z = Z.reshape(xx.shape)

    # Plot
    plt.figure(figsize=(8, 6))
    plt.contourf(xx, yy, Z.numpy(), levels=20, cmap='RdYlBu', alpha=0.8)
    plt.colorbar(label='Network Output')

    # Plot data points
    plt.scatter(X[:, 0], X[:, 1], c=y.squeeze(), cmap='RdYlBu', 
                edgecolors='black', s=200, linewidths=2)

    plt.xlabel('x1', fontsize=12)
    plt.ylabel('x2', fontsize=12)
    plt.title('Decision Boundary (Untrained Network)', fontsize=14)
    plt.grid(True, alpha=0.3)
    plt.show()

# Plot decision boundary
# Uses the untrained model from Exercise 4.1 and the XOR data from Exercise 4.2
plot_decision_boundary(model, X, y)

### Exercise 4.4: Count Parameters

In [None]:
# Manual calculation
# Same dimensions as the model created in Exercise 4.1
input_dim = 2
hidden_dim = 4
output_dim = 1

# TODO: Calculate number of parameters manually
# Layer 1: input_dim * hidden_dim + hidden_dim (weights + biases)
# Layer 2: hidden_dim * output_dim + output_dim (weights + biases)

params_layer1 = None  # Replace with calculation
params_layer2 = None  # Replace with calculation
total_params_manual = None  # Replace with calculation

print(f"Layer 1 parameters: {params_layer1}")
print(f"Layer 2 parameters: {params_layer2}")
print(f"Total parameters (manual): {total_params_manual}")

# Verify with PyTorch
# The manual total should equal this count once SimpleMLP is implemented
total_params_pytorch = sum(p.numel() for p in model.parameters())
print(f"Total parameters (PyTorch): {total_params_pytorch}")

# Detailed breakdown
# Lists each registered parameter tensor by name with its shape and size
print("\nDetailed parameter breakdown:")
for name, param in model.named_parameters():
    print(f"{name}: {param.shape} -> {param.numel()} parameters")

## 5. Wrap-up Questions

Answer these questions based on what you've learned:

1. **Why can't we use a linear activation function in hidden layers?**
   - Your answer: 

2. **Why is ReLU more popular than sigmoid for hidden layers?**
   - Your answer: 

3. **How many parameters would a 3-layer MLP have with dimensions [10, 50, 50, 5]?**
   - Your answer: 

4. **Can a single-layer perceptron solve XOR? Why or why not?**
   - Your answer: 

In [None]:
# Question 3: Calculate parameters for a 3-layer MLP
# dims lists the layer widths in order; each consecutive pair
# (dims[i], dims[i+1]) describes one linear layer
dims = [10, 50, 50, 5]

# TODO: Calculate total parameters
# Hint: reuse the weights + biases formula from Exercise 4.4 for each layer
total = 0
for i in range(len(dims) - 1):
    # `total += layer_params` raises TypeError while layer_params is still None
    layer_params = None  # Calculate params for layer i
    total += layer_params
    print(f"Layer {i+1}: {dims[i]} -> {dims[i+1]}, Parameters: {layer_params}")

print(f"\nTotal parameters: {total}")

## Next Week Preview

Next week we'll learn:
- How to compute gradients efficiently using **backpropagation**
- How to actually **train** these networks
- Different **optimization algorithms** (SGD, Adam, etc.)

Make sure you understand:
- The chain rule from calculus
- Matrix multiplication and dimensions
- How to compute gradients of simple functions

Great work! 🎉