# Introduction to Deep Learning - Week 1 Exercises SOLUTIONS
## Neural Network Basics with PyTorch

This notebook contains complete solutions to all Week 1 exercises.

In [None]:
# Import required libraries
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt

# Seed PyTorch's global RNG so the random tensors created below are repeatable
torch.manual_seed(42)

# Report the library version and whether a CUDA device is visible
print(
    f"PyTorch version: {torch.__version__}",
    f"CUDA available: {torch.cuda.is_available()}",
    sep="\n",
)

## 1. PyTorch Basics

### Exercise 1.1: Creating and Manipulating Tensors

In [None]:
# (a) Build a 1-D tensor directly from a Python list of integers
tensor_a = torch.tensor([1, 2, 3, 4, 5])

# Integer literals give an integer dtype; the shape is the list length
print(
    f"Shape: {tensor_a.shape}",
    f"Data type: {tensor_a.dtype}",
    sep="\n",
)

In [None]:
# (b) Draw a 3x4 tensor of samples from the standard normal distribution
tensor_b = torch.randn((3, 4))

print(f"Tensor shape: {tensor_b.shape}", tensor_b, sep="\n")

In [None]:
# (c) Constant-filled tensors of shape (2, 3): all ones and all zeros
ones_tensor = torch.ones((2, 3))
zeros_tensor = torch.zeros((2, 3))

print("Ones tensor:", ones_tensor, "\nZeros tensor:", zeros_tensor, sep="\n")

In [None]:
# (d) Rearrange the 12 elements of tensor_b into two other layouts
# Alternative: tensor_b.view(2, 6) or tensor_b.view(12, 1)
reshaped_1 = tensor_b.reshape((2, 6))
reshaped_2 = tensor_b.reshape((12, 1))

print(
    f"Original shape: {tensor_b.shape}",
    f"Reshaped to (2, 6): {reshaped_1.shape}",
    f"Reshaped to (12, 1): {reshaped_2.shape}",
    sep="\n",
)

### Exercise 1.2: Tensor Operations

In [None]:
# (a) Dot product of an input vector with a weight vector
x = torch.tensor([1.0, 2.0, 3.0])
w = torch.tensor([0.5, -0.3, 0.8])

dot_product = x.dot(w)  # method form of torch.dot(x, w)

print(f"Dot product: {dot_product}")
# The same sum written out in plain Python floats as a cross-check
print(f"Manual verification: 1*0.5 + 2*(-0.3) + 3*0.8 = {1*0.5 + 2*(-0.3) + 3*0.8}")

In [None]:
# (b) Matrix-vector product: a (3, 2) matrix times a length-2 vector
W = torch.randn((3, 2))
x = torch.randn((2,))

result = W @ x
# Alternative: result = torch.matmul(W, x)

print(
    f"W shape: {W.shape}",
    f"x shape: {x.shape}",
    f"Result shape: {result.shape}",
    f"Result: {result}",
    sep="\n",
)

In [None]:
# (c) Broadcasting
A = torch.randn((2, 3))
B = torch.randn((2, 3))
C = torch.randn((3,))

sum_AB = torch.add(A, B)  # identical shapes: plain element-wise addition
sum_AC = torch.add(A, C)  # C has shape (3,), so it is broadcast to (2, 3)

print(
    f"A + B shape: {sum_AB.shape}",
    f"A + C shape: {sum_AC.shape}",
    "\nBroadcasting example:",
    f"A shape: {A.shape}, C shape: {C.shape}",
    "C is broadcast to match A's shape!",
    f"\nA:\n{A}",
    f"\nC: {C}",
    f"\nA + C:\n{sum_AC}",
    sep="\n",
)

## 2. Activation Functions

### Exercise 2.1: Implement Activation Functions

In [None]:
def sigmoid(z):
    """Logistic sigmoid, applied element-wise: sigma(z) = 1 / (1 + exp(-z))."""
    denominator = 1 + torch.exp(-z)
    return 1 / denominator

def tanh(z):
    """
    Tanh activation, applied element-wise.

    Definition: tanh(z) = (exp(z) - exp(-z)) / (exp(z) + exp(-z)).

    The definition is not evaluated literally: exp(z) overflows to inf once
    |z| is large (around 89 in float32), and inf / inf is NaN. Dividing the
    numerator and denominator by exp(z) gives the equivalent form

        tanh(z) = (1 - exp(-2z)) / (1 + exp(-2z)) = 2 / (1 + exp(-2z)) - 1

    which saturates cleanly to -1 and +1 instead.

    Args:
        z: Input tensor.
    Returns:
        Tensor of the same shape as z with values in [-1, 1].
    """
    # exp(-2z) -> inf gives 2/inf - 1 = -1; exp(-2z) -> 0 gives 2/1 - 1 = +1
    return 2 / (1 + torch.exp(-2 * z)) - 1

def relu(z):
    """
    ReLU activation, applied element-wise: ReLU(z) = max(0, z).

    Implemented with torch.where rather than torch.maximum(torch.tensor(0.0), z):
    - torch.maximum splits the gradient evenly at ties, so autograd would report
      a slope of 0.5 at z = 0; selecting the zero branch for z <= 0 gives slope 0
      there, matching the convention ReLU'(z) = 1 if z > 0 else 0.
    - zeros_like(z) keeps z's dtype and device, so integer inputs are not
      promoted to float and no extra scalar tensor is created per call.

    Args:
        z: Input tensor.
    Returns:
        Tensor with the same shape and dtype as z, negative entries replaced by 0.
    """
    # NaN fails the `z <= 0` test, so NaN inputs are passed through unchanged
    return torch.where(z <= 0, torch.zeros_like(z), z)
    # Alternative: torch.clamp(z, min=0)

# Try the three implementations on a few hand-picked inputs
test_input = torch.tensor([-2.0, -1.0, 0.0, 1.0, 2.0])
print(
    f"Input: {test_input}",
    f"Sigmoid: {sigmoid(test_input)}",
    f"Tanh: {tanh(test_input)}",
    f"ReLU: {relu(test_input)}",
    sep="\n",
)

### Exercise 2.2: Visualize Activation Functions

In [None]:
# 100 evenly spaced inputs on [-5, 5]
z = torch.linspace(-5, 5, 100)

# Evaluate every activation on the same grid
sig_output = sigmoid(z)
tanh_output = tanh(z)
relu_output = relu(z)

# Draw the three curves on a single figure
plt.figure(figsize=(10, 6))
curve_style = dict(linewidth=2)
plt.plot(z.numpy(), sig_output.numpy(), label='Sigmoid', **curve_style)
plt.plot(z.numpy(), tanh_output.numpy(), label='Tanh', **curve_style)
plt.plot(z.numpy(), relu_output.numpy(), label='ReLU', **curve_style)

# Labels, legend and a light grid
plt.xlabel('z', fontsize=12)
plt.ylabel('Activation', fontsize=12)
plt.title('Activation Functions', fontsize=14)
plt.legend(fontsize=12)
plt.grid(True, alpha=0.3)

# Thin black lines marking the two coordinate axes
plt.axhline(y=0, color='k', linewidth=0.5)
plt.axvline(x=0, color='k', linewidth=0.5)
plt.show()

In [None]:
# Check each hand-written activation against its PyTorch counterpart
print("Comparing custom implementations with PyTorch built-ins:")
test_z = torch.tensor([0.0, 1.0, -1.0])

for name, custom_fn, builtin_fn in (
    ("Sigmoid", sigmoid, torch.sigmoid),
    ("Tanh", tanh, torch.tanh),
    ("ReLU", relu, torch.relu),
):
    custom_out = custom_fn(test_z)
    builtin_out = builtin_fn(test_z)
    print(f"\n{name} - Custom: {custom_out}")
    print(f"{name} - PyTorch: {builtin_out}")
    print(f"Match: {torch.allclose(custom_out, builtin_out)}")

### Exercise 2.3: Derivatives of Activation Functions

In [None]:
def sigmoid_derivative(z):
    """Derivative of the sigmoid via the identity sigma'(z) = sigma(z) * (1 - sigma(z))."""
    activation = sigmoid(z)
    complement = 1 - activation
    return activation * complement

def tanh_derivative(z):
    """Derivative of tanh via the identity tanh'(z) = 1 - tanh(z)^2."""
    return 1 - tanh(z) ** 2

def relu_derivative(z):
    """Derivative of ReLU: 1 where z > 0 and 0 elsewhere (including at z = 0)."""
    positive_mask = torch.gt(z, 0)
    # Convert the boolean mask into 0.0 / 1.0 float32 values
    return positive_mask.to(torch.float32)

In [None]:
# Plot each activation function next to its derivative, one panel per function
z = torch.linspace(-5, 5, 100)

# One row per panel:
# (panel title, activation, derivative, activation legend label, derivative legend label)
# The sigma labels use matplotlib mathtext so they do not depend on source encoding.
panels = [
    ('Sigmoid', sigmoid, sigmoid_derivative, r'$\sigma(z)$', r"$\sigma'(z)$"),
    ('Tanh', tanh, tanh_derivative, 'tanh(z)', "tanh'(z)"),
    ('ReLU', relu, relu_derivative, 'ReLU(z)', "ReLU'(z)"),
]

fig, axes = plt.subplots(1, len(panels), figsize=(15, 4))

for ax, (title, activation, derivative, act_label, deriv_label) in zip(axes, panels):
    # Solid line: the function itself; dashed line: its derivative
    ax.plot(z.numpy(), activation(z).numpy(), label=act_label, linewidth=2)
    ax.plot(z.numpy(), derivative(z).numpy(), label=deriv_label, linewidth=2, linestyle='--')
    ax.set_title(title, fontsize=12)
    ax.set_xlabel('z')
    ax.legend()
    ax.grid(True, alpha=0.3)
    # Thin black lines marking the two coordinate axes
    ax.axhline(y=0, color='k', linewidth=0.5)
    ax.axvline(x=0, color='k', linewidth=0.5)

plt.tight_layout()
plt.show()

### Exercise 2.4: Vanishing Gradient Problem

In [None]:
# Evaluate the sigmoid derivative in the saturated tails and at the origin
z_values = torch.tensor([-10.0, -5.0, 0.0, 5.0, 10.0])
derivatives = sigmoid_derivative(z_values)

print("Sigmoid derivative at different z values:")
# Loop over plain floats under a distinct name, so the module-level `z` grid
# used by the plotting cells is not overwritten by the loop variable
for z_val, deriv in zip(z_values.tolist(), derivatives.tolist()):
    print(f"z = {z_val:6.1f}, σ'(z) = {deriv:.6f}")

print("\n" + "="*60)
print("OBSERVATION: Vanishing Gradient Problem")
print("="*60)
print("When |z| is large (e.g., z = ±10), the gradient is very small (~0.000045).")
print("Maximum gradient occurs at z = 0, where σ'(0) = 0.25")
print("\nWhy is this a problem?")
print("- In deep networks, gradients are multiplied across layers (chain rule)")
print("- Small gradients (< 0.25) multiplied many times → vanishingly small")
print("- Early layers receive almost zero gradient → don't learn!")
print("\nReLU helps because:")
print("- ReLU'(z) = 1 for z > 0 (no saturation)")
print("- Gradients don't vanish for positive activations")

## 3. Forward Propagation by Hand

### Exercise 3.1 & 3.2: Manual Computation and Verification

In [None]:
# Input and parameters of the small network evaluated by hand below
x = torch.tensor([1.0, 2.0])

# Hidden layer: 2x2 weight matrix and length-2 bias
W1 = torch.tensor([
    [0.5, -0.3],
    [0.8, 0.2],
])
b1 = torch.tensor([0.1, -0.2])

# Output layer: length-2 weight vector and a single bias
w2 = torch.tensor([1.0, -0.5])
b2 = torch.tensor([0.3])

print(
    "Network parameters:",
    f"Input x: {x}",
    "\nHidden layer:",
    f"W1:\n{W1}",
    f"b1: {b1}",
    "\nOutput layer:",
    f"w2: {w2}",
    f"b2: {b2}",
    sep="\n",
)

In [None]:
# Forward pass through the two layers, printing every intermediate value
print("\n" + "=" * 60)
print("FORWARD PASS COMPUTATION")
print("=" * 60)

# Step 1: hidden-layer pre-activation (affine map of the input)
z1 = torch.matmul(W1, x) + b1
print("\nStep 1: z1 = W1 @ x + b1")
print(f"z1 = {z1}")
print("\nManual calculation:")
print(f"z1[0] = 0.5*1 + (-0.3)*2 + 0.1 = {0.5*1 + (-0.3)*2 + 0.1}")
print(f"z1[1] = 0.8*1 + 0.2*2 + (-0.2) = {0.8*1 + 0.2*2 + (-0.2)}")

# Step 2: hidden-layer activation (ReLU zeroes the negative entries)
a1 = relu(z1)
print("\nStep 2: a1 = ReLU(z1)")
print(f"a1 = {a1}")

# Step 3: output-layer pre-activation
z2 = torch.matmul(w2, a1) + b2
print("\nStep 3: z2 = w2 @ a1 + b2")
print(f"z2 = {z2}")
print("\nManual calculation:")
print(f"z2 = 1.0*{a1[0].item()} + (-0.5)*{a1[1].item()} + 0.3 = {1.0*a1[0].item() + (-0.5)*a1[1].item() + 0.3}")

# Step 4: the output layer has no activation, so the output is z2 itself
output = z2
print("\nStep 4: output = z2 (no activation)")
print(f"Final output: {output}")

## 4. Building a Simple Neural Network

### Exercise 4.1: Implement a 2-Layer MLP

In [None]:
class SimpleMLP(nn.Module):
    """Two-layer perceptron: Linear -> ReLU -> Linear."""

    def __init__(self, input_dim, hidden_dim, output_dim):
        """
        Create the two linear layers.

        Args:
            input_dim: Number of input features
            hidden_dim: Number of hidden units
            output_dim: Number of outputs
        """
        super().__init__()

        # fc1 maps input -> hidden, fc2 maps hidden -> output
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """
        Run the forward pass.

        Args:
            x: Input tensor of shape (batch_size, input_dim)
        Returns:
            Output tensor of shape (batch_size, output_dim)
        """
        # Hidden representation: affine map followed by ReLU
        hidden = F.relu(self.fc1(x))

        # Output layer is purely affine (no activation)
        return self.fc2(hidden)

# Instantiate the network: 2 inputs, 4 hidden units, 1 output
model = SimpleMLP(input_dim=2, hidden_dim=4, output_dim=1)
param_count = sum(param.numel() for param in model.parameters())
print(model, f"\nTotal parameters: {param_count}", sep="\n")

### Exercise 4.2: Test on XOR Problem

In [None]:
# XOR truth table as float tensors: four 2-D inputs and their targets
X = torch.tensor([[0, 0], [0, 1], [1, 0], [1, 1]], dtype=torch.float32)
y = torch.tensor([[0], [1], [1], [0]], dtype=torch.float32)

print("XOR Dataset:")
print("Input  | Target")
print("-" * 20)
for inputs, target in zip(X, y):
    print(f"{inputs.numpy()} | {target.item():.0f}")

# Evaluate the untrained model; gradients are not needed for inference
with torch.no_grad():
    predictions = model(X)

print("\nUntrained Network Predictions:")
print("Input  | Prediction | Target | Error")
print("-" * 50)
for inputs, prediction, target in zip(X, predictions, y):
    # Absolute difference between the network output and the target
    error = abs(prediction.item() - target.item())
    print(f"{inputs.numpy()} | {prediction.item():10.4f} | {target.item():6.0f} | {error:.4f}")

banner = "=" * 50
print("\n" + banner)
print("Note: Predictions are random because weights are randomly initialized!")
print("Next week, we'll learn how to train the network to solve XOR.")
print(banner)

### Exercise 4.3: Visualize Decision Boundary

In [None]:
def plot_decision_boundary(model, X, y, title='Decision Boundary (Untrained Network)'):
    """
    Plot the model's output over the square [-0.5, 1.5] x [-0.5, 1.5]
    as filled contours, with the data points drawn on top.

    Args:
        model: Callable taking a float32 tensor of shape (N, 2). Its output is
            reshaped to the grid shape, so it must hold one value per grid point.
        X: Data inputs; columns 0 and 1 are used as the plot coordinates.
        y: Targets, one per row of X; used to colour and label the points.
        title: Figure title. Defaults to the caption for the untrained network,
            so the same function can be reused with a different caption.
    """
    # Create a mesh grid covering the plotting window
    x_min, x_max = -0.5, 1.5
    y_min, y_max = -0.5, 1.5
    h = 0.01  # grid spacing

    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))

    # Flatten the grid into a list of (x1, x2) points for the model
    grid_points = torch.tensor(
        np.c_[xx.ravel(), yy.ravel()],
        dtype=torch.float32
    )

    # Evaluate the model on every grid point; no gradients are needed
    with torch.no_grad():
        Z = model(grid_points)

    # Back to the 2-D grid layout expected by contourf
    Z = Z.reshape(xx.shape)

    # Plot the output surface
    plt.figure(figsize=(8, 6))
    contour = plt.contourf(xx, yy, Z.numpy(), levels=20, cmap='RdYlBu', alpha=0.8)
    plt.colorbar(contour, label='Network Output')

    # Plot data points on top of the contours (zorder keeps them visible)
    plt.scatter(X[:, 0], X[:, 1], c=y.squeeze(), cmap='RdYlBu',
                edgecolors='black', s=200, linewidths=2, zorder=10)

    # Label every point with its target, slightly above the marker
    for i in range(len(X)):
        plt.text(X[i, 0], X[i, 1] + 0.1, f'y={int(y[i].item())}',
                 ha='center', fontsize=10, fontweight='bold')

    # Mathtext subscripts, so the labels do not depend on source encoding
    plt.xlabel('$x_1$', fontsize=12)
    plt.ylabel('$x_2$', fontsize=12)
    plt.title(title, fontsize=14)
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()

# Draw the output surface of the untrained model over the XOR points
plot_decision_boundary(model, X, y)

print(
    "\nObservations:",
    "- The decision boundary is random (untrained weights)",
    "- The network cannot solve XOR yet",
    "- With training, we can learn the correct non-linear boundary",
    sep="\n",
)

### Exercise 4.4: Count Parameters

In [None]:
# Hand count for the 2 -> 4 -> 1 network
input_dim = 2
hidden_dim = 4
output_dim = 1

# Layer 1: one weight per (input, hidden) pair plus one bias per hidden unit
params_layer1 = input_dim * hidden_dim + hidden_dim
print("Layer 1 (fc1):")
print(f"  Weights: {input_dim} × {hidden_dim} = {input_dim * hidden_dim}")
print(f"  Biases: {hidden_dim}")
print(f"  Total: {params_layer1}")

# Layer 2: one weight per (hidden, output) pair plus one bias per output unit
params_layer2 = hidden_dim * output_dim + output_dim
print("\nLayer 2 (fc2):")
print(f"  Weights: {hidden_dim} × {output_dim} = {hidden_dim * output_dim}")
print(f"  Biases: {output_dim}")
print(f"  Total: {params_layer2}")

total_params_manual = params_layer1 + params_layer2
print(f"\nTotal parameters (manual): {total_params_manual}")

# Verify with PyTorch: numel() is the number of entries in each parameter tensor
total_params_pytorch = sum(p.numel() for p in model.parameters())
print(f"Total parameters (PyTorch): {total_params_pytorch}")
print(f"\nMatch: {total_params_manual == total_params_pytorch}")

# Detailed breakdown, one line per named parameter tensor
print("\n" + "="*60)
print("Detailed parameter breakdown:")
print("="*60)
for name, param in model.named_parameters():
    print(f"{name:15s}: {str(param.shape):15s} -> {param.numel():4d} parameters")

## 5. Wrap-up Questions - Solutions

### Question 1: Why can't we use a linear activation function in hidden layers?

**Answer:** If we use linear activations in hidden layers, the entire network becomes equivalent to a single linear transformation, regardless of depth. This is because:
- Composition of linear functions is linear: $f_2(f_1(x)) = W_2(W_1 x + b_1) + b_2 = (W_2 W_1)x + (W_2 b_1 + b_2)$
- We lose the ability to learn non-linear decision boundaries
- A deep network with linear activations = shallow linear model
- Cannot solve problems like XOR that require non-linear boundaries

### Question 2: Why is ReLU more popular than sigmoid for hidden layers?

**Answer:** ReLU has several advantages over sigmoid:
1. **No vanishing gradient**: For z > 0, ReLU'(z) = 1 (constant), while sigmoid'(z) ≤ 0.25
2. **Computational efficiency**: ReLU is just max(0, z) - much faster than exponentials
3. **Sparse activation**: About 50% of neurons are zero, leading to sparse representations
4. **Empirically better performance**: Trains faster and achieves better results in practice

Main disadvantage: "Dead neurons" when z ≤ 0 always (can be addressed with Leaky ReLU)

### Question 3: Parameters for 3-layer MLP with dimensions [10, 50, 50, 5]

In [None]:
# Layer widths of the 3-layer MLP: input, two hidden layers, output
dims = [10, 50, 50, 5]

total = 0
print("Layer-wise parameter calculation:")
print("="*60)
# Each consecutive pair (dims[i], dims[i+1]) is one fully connected layer
for i in range(len(dims) - 1):
    weights = dims[i] * dims[i+1]  # one weight per input/output pair
    biases = dims[i+1]             # one bias per output unit
    layer_params = weights + biases
    total += layer_params
    print(f"Layer {i+1}: {dims[i]:3d} → {dims[i+1]:3d}")
    print(f"  Weights: {dims[i]:3d} × {dims[i+1]:3d} = {weights:5d}")
    print(f"  Biases:  {biases:5d}")
    print(f"  Total:   {layer_params:5d}")
    print()

print("="*60)
print(f"Total parameters: {total}")

# Verify with PyTorch, using a model with the same [10, 50, 50, 5] layout
class ThreeLayerMLP(nn.Module):
    """MLP with two ReLU hidden layers followed by a linear output layer."""

    def __init__(self, dims=(10, 50, 50, 5)):
        """
        Args:
            dims: Four layer widths (input, hidden 1, hidden 2, output).
                Defaults to the [10, 50, 50, 5] architecture of Question 3.
        Raises:
            ValueError: If dims does not contain exactly four entries.
        """
        super().__init__()
        if len(dims) != 4:
            raise ValueError(f"dims must have exactly 4 entries, got {len(dims)}")
        in_dim, hidden1_dim, hidden2_dim, out_dim = dims
        self.fc1 = nn.Linear(in_dim, hidden1_dim)
        self.fc2 = nn.Linear(hidden1_dim, hidden2_dim)
        self.fc3 = nn.Linear(hidden2_dim, out_dim)

    def forward(self, x):
        """Apply Linear -> ReLU twice, then the final linear layer (no activation)."""
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

# Cross-check the hand count against the number of entries PyTorch reports
verify_model = ThreeLayerMLP()
pytorch_total = sum(param.numel() for param in verify_model.parameters())
print(
    f"\nVerification with PyTorch: {pytorch_total}",
    f"Match: {total == pytorch_total}",
    sep="\n",
)

### Question 4: Can a single-layer perceptron solve XOR?

**Answer:** No, a single-layer perceptron cannot solve XOR. Here's why:

1. **Linear decision boundary**: A single-layer perceptron learns: $f(x) = w_1 x_1 + w_2 x_2 + b$
   - This defines a line (hyperplane) in the input space
   - Decision boundary is where $w_1 x_1 + w_2 x_2 + b = 0$

2. **XOR is not linearly separable**: 
   - Points (0,0) and (1,1) should be on one side (output 0)
   - Points (0,1) and (1,0) should be on the other side (output 1)
   - No single line can separate these two groups

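3. **Proof by contradiction**: Suppose some $f$ classified all four points correctly, with $f < 0$ for output 0 and $f > 0$ for output 1. Then $b < 0$ and $w_1 + w_2 + b < 0$ (from (0,0) and (1,1)), while $w_2 + b > 0$ and $w_1 + b > 0$ (from (0,1) and (1,0)). Adding the last two gives $w_1 + w_2 + 2b > 0$, so $w_1 + w_2 + b > -b > 0$, which contradicts $w_1 + w_2 + b < 0$. Hence every line misclassifies at least one point.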

4. **Solution**: Need at least one hidden layer with non-linear activation to create non-linear decision boundaries

In [None]:
# Demonstration: Single-layer perceptron on XOR
class SingleLayerPerceptron(nn.Module):
    """A single linear layer from 2 inputs to 1 output, with no hidden layer or activation."""

    def __init__(self):
        super().__init__()
        self.fc = nn.Linear(2, 1)

    def forward(self, x):
        """Return the output of the linear layer for input x."""
        output = self.fc(x)
        return output

single_layer = SingleLayerPerceptron()

print("Single-layer perceptron on XOR:")
print("\nNo matter what weights we set, we cannot solve XOR.")
print("The decision boundary is always a line, and XOR requires")
print("a non-linear boundary to separate the classes.")

# Visualize the four XOR points, coloured by target
plt.figure(figsize=(8, 6))
plt.scatter(X[:, 0], X[:, 1], c=y.squeeze(), cmap='RdYlBu',
            edgecolors='black', s=200, linewidths=2)

# Try to draw some lines - none work!
x_line = np.linspace(-0.5, 1.5, 100)
plt.plot(x_line, x_line, 'k--', label='Diagonal', linewidth=2)
plt.plot(x_line, -x_line + 1, 'r--', label='Anti-diagonal', linewidth=2)
plt.plot([0.5, 0.5], [-0.5, 1.5], 'g--', label='Vertical', linewidth=2)
plt.plot([-0.5, 1.5], [0.5, 0.5], 'b--', label='Horizontal', linewidth=2)

# Label every point with its target, slightly above the marker
for i in range(len(X)):
    plt.text(X[i, 0], X[i, 1] + 0.15, f'y={int(y[i].item())}',
             ha='center', fontsize=10, fontweight='bold')

# Mathtext subscripts, so the labels do not depend on source encoding
plt.xlabel('$x_1$', fontsize=12)
plt.ylabel('$x_2$', fontsize=12)
plt.title('No Single Line Can Separate XOR Classes', fontsize=14)
plt.legend()
plt.grid(True, alpha=0.3)
plt.xlim(-0.5, 1.5)
plt.ylim(-0.5, 1.5)
plt.tight_layout()
plt.show()

## Summary

### Key Takeaways from Week 1:

1. **PyTorch Basics**
   - Tensors are the fundamental data structure
   - Operations like matmul, broadcasting work similarly to NumPy
   - Can move tensors to GPU and track gradients

2. **Activation Functions**
   - Non-linear activations are essential for learning complex patterns
   - ReLU is the most popular for hidden layers (no vanishing gradient)
   - Sigmoid/tanh have vanishing gradient problems

3. **Neural Networks**
   - MLPs stack linear layers with non-linear activations
   - Forward pass: repeatedly apply Linear → Activation
   - Depth allows learning hierarchical representations

4. **Loss Functions**
   - MSE for regression, Cross-entropy for classification
   - Have probabilistic interpretations (maximum likelihood)

5. **Next Steps**
   - Next week: Backpropagation - how to compute gradients efficiently
   - Then: Training algorithms to actually optimize the network

Great work! 🎉