### Write a function to implement logistic regression from scratch using gradient descent.

Key changes made from linear regression to logistic regression:

- Added sigmoid activation function
- Changed the cost function from MSE to Binary Cross-Entropy
- Modified the forward pass to use sigmoid activation
- Updated the example to use binary labels (0 and 1)
- Added probability and class predictions at the end
- Clipped the predicted probabilities to [eps, 1 - eps] (eps = 1e-15) to avoid log(0) in the cost calculation

The main differences in the math are:

- Linear regression predicts continuous values, while logistic regression predicts probabilities between 0 and 1
- The cost function is now binary cross-entropy instead of mean squared error
- The predictions go through a sigmoid function to squash them between 0 and 1
- The final predictions can be either probabilities or binary classes (using 0.5 as threshold)

`Binary Classification:`

In [None]:
import numpy as np

def sigmoid(z):
    """Return the logistic sigmoid 1 / (1 + exp(-z)), element-wise.

    The exponential is only ever taken of a non-positive number, so large
    negative inputs cannot overflow ``np.exp`` (the naive formula
    ``1 / (1 + np.exp(-z))`` emits an overflow RuntimeWarning there).

    Args:
        z: Real scalar or array-like of real numbers.

    Returns:
        Sigmoid of ``z`` with the same shape as ``z``; a NumPy scalar when
        ``z`` is a scalar.
    """
    z = np.asarray(z)
    if not np.issubdtype(z.dtype, np.floating):
        # Integer/bool inputs are promoted so negation and exp behave as for floats.
        z = z.astype(float)
    # exp(-|z|) lies in (0, 1], so neither branch below can overflow.
    decay = np.exp(-np.abs(z))
    # z >= 0: 1 / (1 + e^-z);  z < 0: e^z / (1 + e^z) -- algebraically equal forms.
    result = np.where(z >= 0, 1 / (1 + decay), decay / (1 + decay))
    # Indexing with () unwraps a 0-d array to a scalar and leaves n-d arrays as arrays.
    return result[()]

def logistic_regression(X, y, learning_rate=0.01, num_iterations=1000, tolerance=1e-6):
    """Fit a binary logistic regression model with batch gradient descent.

    Args:
        X: 2-D feature matrix; rows are samples, columns are features.
        y: Binary labels (0 or 1) aligned with the rows of ``X``.
        learning_rate: Step size applied to every gradient update.
        num_iterations: Maximum number of gradient steps.
        tolerance: Training stops early once the cost changes by less than
            this amount between two consecutive iterations.

    Returns:
        Tuple ``(weights, bias, costs)``: the learned weight vector, the
        learned intercept, and the cost recorded at each iteration.
    """
    sample_count, feature_count = X.shape
    clip_eps = 1e-15  # keeps the probabilities away from 0 and 1 so log() stays finite

    # Start from an all-zero model.
    weights = np.zeros(feature_count)
    bias = 0
    costs = []

    for step in range(num_iterations):
        # Forward pass: linear score -> sigmoid -> clipped probability.
        probs = np.clip(sigmoid(np.dot(X, weights) + bias), clip_eps, 1 - clip_eps)

        # Binary cross-entropy, averaged over the samples.
        positive_term = y * np.log(probs)
        negative_term = (1 - y) * np.log(1 - probs)
        costs.append(-(1 / sample_count) * np.sum(positive_term + negative_term))

        # Both gradients are driven by the prediction error.
        residual = probs - y
        weights -= learning_rate * ((1 / sample_count) * np.dot(X.T, residual))
        bias -= learning_rate * ((1 / sample_count) * np.sum(residual))

        # Early stop when the cost has effectively stopped moving.
        if len(costs) >= 2 and abs(costs[-1] - costs[-2]) < tolerance:
            print(f"Converged at iteration {step}")
            break

    return weights, bias, costs

# Demo on a tiny data set: four samples with two features each.
# Binary logistic regression expects the labels to be 0 or 1.
X = np.array([[1, 2], [2, 3], [3, 4], [4, 5]])
y = np.array([0, 0, 1, 1])

# Standardise each feature column: zero mean, unit standard deviation.
X_normalized = (X - np.mean(X, axis=0)) / np.std(X, axis=0)

# Fit the model on the standardised features.
weights, bias, costs = logistic_regression(X_normalized, y)

# Report the learned parameters and the last recorded cost.
print("Weights:", weights)
print("Bias:", bias)
print("Final cost:", costs[-1])

# Score the training samples: probabilities first, then hard 0/1 classes
# obtained by thresholding the probabilities at 0.5.
z = X_normalized.dot(weights) + bias
y_pred_prob = sigmoid(z)
y_pred_class = (y_pred_prob >= 0.5).astype(int)
print("Probability predictions:", y_pred_prob)
print("Class predictions:", y_pred_class)

`Multi-Class Classification:`

In [None]:
import numpy as np

def softmax(z):
    """Row-wise softmax: turn each row of scores into probabilities.

    Each row's maximum is subtracted before exponentiating; this leaves the
    result unchanged mathematically but keeps ``np.exp`` from overflowing.
    """
    row_max = np.max(z, axis=1, keepdims=True)
    shifted_exp = np.exp(z - row_max)
    row_totals = np.sum(shifted_exp, axis=1, keepdims=True)
    return shifted_exp / row_totals

def one_hot_encode(y, num_classes):
    """One-hot encode integer class labels.

    Args:
        y: Integer class labels (scalar or array-like), each expected to lie
            in ``[0, num_classes)``.
        num_classes: Total number of classes, i.e. the width of the encoding.

    Returns:
        Float array of shape ``np.shape(y) + (num_classes,)`` whose last
        axis holds a single 1.0 at the position of the label.

    Raises:
        ValueError: If any integer label is negative.
        IndexError: If any label is >= ``num_classes`` (raised by NumPy
            indexing).
    """
    labels = np.asarray(y)
    # Negative integers are valid NumPy indices that count from the end, so
    # without this check a label of -1 would be silently encoded as the last class.
    if labels.dtype.kind == "i" and labels.size and labels.min() < 0:
        raise ValueError("class labels must be non-negative integers")
    # Row k of the identity matrix is exactly the one-hot vector for class k.
    return np.eye(num_classes)[y]

def logistic_regression(X, y, learning_rate=0.01, num_iterations=1000, tolerance=1e-6):
    """Fit a multi-class (softmax) logistic regression model with batch gradient descent.

    Args:
        X: 2-D feature matrix; rows are samples, columns are features.
        y: Integer class labels (0, 1, 2, ...) aligned with the rows of ``X``.
        learning_rate: Step size applied to every gradient update.
        num_iterations: Maximum number of gradient steps.
        tolerance: Training stops early once the cost changes by less than
            this amount between two consecutive iterations.

    Returns:
        Tuple ``(weights, bias, costs)``: a weight matrix with one column per
        class, a bias vector with one entry per class, and the cost recorded
        at each iteration.
    """
    sample_count, feature_count = X.shape
    # The number of classes is taken to be the largest label plus one.
    class_count = np.max(y) + 1
    clip_eps = 1e-15  # keeps the probabilities away from 0 so log() stays finite

    # Multi-class specifics: one-hot targets, a weight column and a bias entry per class.
    targets = one_hot_encode(y, class_count)
    weights = np.zeros((feature_count, class_count))
    bias = np.zeros(class_count)
    costs = []

    for step in range(num_iterations):
        # Forward pass: linear scores -> softmax -> clipped class probabilities.
        probs = np.clip(softmax(np.dot(X, weights) + bias), clip_eps, 1 - clip_eps)

        # Categorical cross-entropy: only the true class of each sample contributes.
        costs.append(-(1 / sample_count) * np.sum(targets * np.log(probs)))

        # Both gradients are driven by the per-class prediction error.
        residual = probs - targets
        weights -= learning_rate * ((1 / sample_count) * np.dot(X.T, residual))
        bias -= learning_rate * ((1 / sample_count) * np.sum(residual, axis=0))

        # Early stop when the cost has effectively stopped moving.
        if len(costs) >= 2 and abs(costs[-1] - costs[-2]) < tolerance:
            print(f"Converged at iteration {step}")
            break

    return weights, bias, costs

# Demo on a tiny data set: six samples with two features each, two per class.
# Multi-class logistic regression expects integer labels (0, 1, 2, ...).
X = np.array([[1, 2], [2, 3], [3, 4], [4, 5], [5, 6], [6, 7]])
y = np.array([0, 0, 1, 1, 2, 2])

# Standardise each feature column: zero mean, unit standard deviation.
X_normalized = (X - np.mean(X, axis=0)) / np.std(X, axis=0)

# Fit the model on the standardised features.
weights, bias, costs = logistic_regression(X_normalized, y)

# Report the learned parameters and the last recorded cost.
print("Weights:", weights)
print("Bias:", bias)
print("Final cost:", costs[-1])

# Score the training samples: per-class probabilities first, then the most
# probable class for each row.
z = X_normalized.dot(weights) + bias
y_pred_prob = softmax(z)
y_pred_class = y_pred_prob.argmax(axis=1)
print("Probability predictions:", y_pred_prob)
print("Class predictions:", y_pred_class)