# Assignment 2 (Logical AND Gate and Initial Parameters):

- Useful functions:

In [None]:
import numpy as np
import matplotlib.pyplot as plt

def sigmoid(x):
    """Return the logistic sigmoid 1 / (1 + exp(-x)) of the input, element-wise.

    The value is evaluated as exp(-log(1 + exp(-x))) through ``np.logaddexp``
    so that large-magnitude negative inputs do not overflow ``np.exp``, which
    the direct formula does (emitting a RuntimeWarning).

    Args:
        x (numpy.ndarray): numpy array (or scalar) of real numbers

    Returns:
        numpy.ndarray: sigmoid function value for each element in x
    """
    # logaddexp(0, -x) == log(exp(0) + exp(-x)) == log(1 + exp(-x)),
    # computed without ever forming exp(-x) on its own.
    return np.exp(-np.logaddexp(0, -x))

def dsigmoid(x):
    """Evaluate the first derivative of the sigmoid at x, element-wise.

    Relies on the identity sigmoid'(x) = sigmoid(x) * (1 - sigmoid(x)), so
    the sigmoid itself is evaluated only once.

    Args:
        x (numpy.ndarray): numpy array of real numbers

    Returns:
        numpy.ndarray: derivative of the sigmoid function for each element in x
    """
    activation = sigmoid(x)
    complement = 1 - activation
    return activation * complement

def update_sgd(weights, x, y, learning_rate):
    """Performs a single epoch of stochastic gradient descent.

    The samples are visited once, in order, and the weights are updated
    after every sample. Each sample's gradient is evaluated at the weights
    produced by the previous sample's update, not at the weights the epoch
    started with.

    Args:
        weights (numpy.ndarray): current weights (left unmodified)
        x (numpy.ndarray): input data, one sample per row
        y (numpy.ndarray): target values, one per sample
        learning_rate (float): learning rate for the update

    Returns:
        numpy.ndarray: updated weights
    """
    # Work on a float copy: the caller's array stays untouched, and an
    # integer weight array does not break the in-place float update below.
    new_weights = np.array(weights, dtype=float)
    for x_k, y_k in zip(x, y):                # Visit each (sample, target) pair
        z = np.dot(new_weights, x_k)          # Linear combination with the current weights
        error = y_k - sigmoid(z)              # Prediction error for this sample
        gradient = dsigmoid(z) * error        # Error scaled by the sigmoid slope at z
        new_weights += learning_rate * gradient * x_k  # Per-sample update
    return new_weights

def update_batched_gd(weights, x, y, learning_rate):
    """Apply one full-batch gradient descent step.

    Every sample contributes a gradient evaluated at the same incoming
    weights; the contributions are averaged over the number of samples
    before being applied.

    Args:
        weights (numpy.ndarray): current weights
        x (numpy.ndarray): input data
        y (numpy.ndarray): target values
        learning_rate (float): learning rate for the update

    Returns:
        numpy.ndarray: updated weights
    """
    n_samples = x.shape[0]
    pre_activation = np.dot(x, weights)
    residuals = y - sigmoid(pre_activation)
    deltas = dsigmoid(pre_activation) * residuals
    # Sum of per-sample contributions, scaled by the learning rate and
    # averaged over the batch.
    step = learning_rate * np.dot(x.T, deltas) / n_samples
    return weights + step

def train_sgd(weights, x, y, epochs, learning_rate):
    """Run stochastic gradient descent for a fixed number of epochs.

    After each epoch the mean squared error over the whole data set is
    recorded, so the returned history has one entry per epoch.

    Args:
        weights (numpy.ndarray): initial weights
        x (numpy.ndarray): input data
        y (numpy.ndarray): target values
        epochs (int): number of epochs to train
        learning_rate (float): learning rate for the update

    Returns:
        numpy.ndarray: final weights after training
        numpy.ndarray: error history
    """
    mse_per_epoch = np.zeros(epochs)
    current = np.copy(weights)  # keep the caller's array intact
    for step in range(epochs):
        current = update_sgd(current, x, y, learning_rate)
        # Evaluate the freshly updated weights on the full data set.
        residuals = y - sigmoid(np.dot(x, current))
        mse_per_epoch[step] = np.mean(residuals ** 2)
    return current, mse_per_epoch

def train_batched_gd(weights, x, y, epochs, learning_rate):
    """Run batched gradient descent for a fixed number of epochs.

    One batched update is applied per epoch; afterwards the mean squared
    error over the whole data set is stored in the history.

    Args:
        weights (numpy.ndarray): initial weights
        x (numpy.ndarray): input data
        y (numpy.ndarray): target values
        epochs (int): number of epochs to train
        learning_rate (float): learning rate for the update

    Returns:
        numpy.ndarray: final weights after training
        numpy.ndarray: error history
    """
    mse_per_epoch = np.zeros(epochs)
    current = np.copy(weights)  # keep the caller's array intact
    for step in range(epochs):
        current = update_batched_gd(current, x, y, learning_rate)
        # Evaluate the freshly updated weights on the full data set.
        pre_activation = np.dot(x, current)
        residuals = y - sigmoid(pre_activation)
        mse_per_epoch[step] = np.mean(residuals ** 2)
    return current, mse_per_epoch

def inference(weights, x):
    """Compute the model output sigmoid(x . weights) for the given input.

    Args:
        weights (numpy.ndarray): trained weights
        x (numpy.ndarray): input data

    Returns:
        numpy.ndarray: predicted values
    """
    return sigmoid(np.dot(x, weights))

def plot_error_historys(error_history_sgd, error_history_gd):
    """Draw both error curves on one labelled figure and display it.

    Args:
        error_history_sgd (numpy.ndarray): error history for the first method
        error_history_gd (numpy.ndarray): error history for the second method
    """
    curves = (
        (error_history_sgd, 'Stochastic Gradient Descent'),
        (error_history_gd, 'Batched Gradient Descent'),
    )
    for history, name in curves:
        plt.plot(history, label=name)

    plt.title('Error History Comparison')
    plt.xlabel('Epochs')
    plt.ylabel('Mean Squared Error')
    plt.legend()
    plt.tight_layout()
    plt.show()
    
def plot_surface(func, x_range, y_range, title='Surface plot of the function'):
    """Plots the surface of a function over a specified range.

    The function is sampled on a 100 x 100 grid spanning the two ranges and
    the sampled values are drawn as a 3D surface.

    Args:
        func (Callable[[float, float], float]): function to plot f:R^2 -> R
        x_range (tuple): range for x-axis (min, max)
        y_range (tuple): range for y-axis (min, max)
        title (str): title of the plot
    """
    x = np.linspace(x_range[0], x_range[1], 100)
    y = np.linspace(y_range[0], y_range[1], 100)
    grid_x, grid_y = np.meshgrid(x, y)
    # Fix the output dtype explicitly. Without otypes, np.vectorize infers it
    # from the first evaluated point (an integer result there would truncate
    # every later value) and calls func one extra time to do so.
    grid_z = np.vectorize(func, otypes=[float])(grid_x, grid_y)

    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    ax.plot_surface(grid_x, grid_y, grid_z, cmap='viridis')

    ax.set_xlabel('X axis')
    ax.set_ylabel('Y axis')
    ax.set_zlabel('function value')
    plt.title(title)
    plt.tight_layout()
    plt.show()

- Data for AND gate problem:

In [None]:
# Truth table of a two-input gate, one row per input combination.
# The constant third column of ones is the bias input.
X = np.array([[a, b, 1] for a in (0, 1) for b in (0, 1)])
# AND gate targets: 1 only where both gate inputs are 1.
Y = X[:, 0] & X[:, 1]

2-1) [10 pts] Run the given code without any modification.

In [None]:
# Settings for this run. The initial weights have very large magnitude
# (up to about 3.3e6), so X @ initial_weight is far from zero for every row of X.
initial_weight = np.array([3332322.1, -323256.6, 772213.0])
epochs = 100
learning_rate = 0.01

# Train both variants from the same starting point. Each trainer works on its
# own copy of the weights, so initial_weight itself is not modified.
trained_weights_sgd, error_history_sgd = train_sgd(initial_weight, X, Y, epochs, learning_rate)
trained_weights_gd, error_history_gd = train_batched_gd(initial_weight, X, Y, epochs, learning_rate)

# Compare the two trained models row by row on the truth table
# (only the two gate inputs are printed, not the bias column).
for x, y in zip(X, Y):
    prediction_sgd = inference(trained_weights_sgd, x)
    prediction_gd = inference(trained_weights_gd, x)
    print(f"Input: [{x[0].item()}, {x[1].item()}], Target: {y.item()}, SGD Prediction: {prediction_sgd}, GD Prediction: {prediction_gd}")
    
# Plot the per-epoch mean squared error of both runs on one figure.
plot_error_historys(error_history_sgd, error_history_gd)

2-2) [10 pts] Change the initial weights to `[0.68, 0.01, 0.73]` and run the code again.

In [None]:
# copy and paste from 2-1 and modify

# TODO

2-3) Change the learning rate to `lr = 0.9` and run the code. Then change the
learning rate to `lr = 100.0` and run the code again. Use the initial weights from 2-2) in both runs.

In [None]:
# lr = 0.9

# TODO


In [None]:
# lr = 100.0

# TODO


2-4) Run the code after changing the number of epochs to `100000`.

In [None]:
# epochs = 100000

# TODO


2-5) Run the code using the parameters below. Now let’s compare batched gradient descent and stochastic gradient descent. Which one converges faster? Which one reaches the lower final error?
```python
initial_weight = np.array([0.68, 0.01, 0.73])
epochs = 1000
learning_rate = 0.9
```

In [None]:

# TODO


2-6) Modify the given code and plot the loss for several fixed bias values, $w_3 = 0, -5, -20, -100, -500$. When changing $w_3$, it is recommended to set both the $x$ and $y$ ranges to $(-5,\ 1.5 \times (-w_3) + 5)$, as in the code provided. What is the difference between our homework and a typical optimization problem?

In [None]:
# Print the weight vectors currently stored in trained_weights_sgd and
# trained_weights_gd, i.e. the results of the most recently executed training cell.
print("Trained weights (SGD):", trained_weights_sgd)
print("Trained weights (GD):", trained_weights_gd)

In [None]:
def loss(w1, w2, w3):
    """Mean squared error of the sigmoid unit on the module-level data (X, Y).

    Args:
        w1 (float): weight applied to the first column of X
        w2 (float): weight applied to the second column of X
        w3 (float): weight applied to the third (bias) column of X

    Returns:
        float: mean of the squared differences between Y and sigmoid(X . w)
    """
    w = np.array([w1, w2, w3])
    predictions = sigmoid(np.dot(X, w))
    residuals = Y - predictions
    return np.mean(residuals ** 2)

# Hold the bias weight fixed and draw the loss as a function of (w1, w2) only.
w3 = -5
# Both axes share the same span; its upper end grows with the magnitude of w3.
axis_span = (-5, 1.5 * (-w3) + 5)
plot_surface(lambda w1, w2: loss(w1, w2, w3), axis_span, axis_span)