## PCA & Autoencoder

### Import required libraries

In [1]:
import pandas as pd
import numpy as np

### Data Loading

In [2]:
# Location of the raw WDBC (breast-cancer-wisconsin) data file.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data"

# Column names are supplied by hand: an ID, the diagnosis label, then 30
# features that only get generic names here.
column_names = ['ID', 'Diagnosis', *(f'feature_{i}' for i in range(1, 31))]

df = pd.read_csv(url, names=column_names)

# Feature matrix: every column after 'ID' and 'Diagnosis'.
X = df[column_names[2:]].to_numpy()

# Binary target: 1 where the label is 'M' (presumably malignant), else 0.
y = np.where(df['Diagnosis'].to_numpy() == 'M', 1, 0)

#### Checking for null values

In [3]:
# Count the NaN entries in the feature matrix (0 means nothing is missing).
np.sum(np.isnan(X))

np.int64(0)

#### Standardizing Data

In [4]:
def standardize_data(X):
    """Z-score ``X`` along axis 0.

    Every feature (column) is shifted by its mean and divided by its
    population standard deviation (``np.std`` default, ``ddof=0``). Features
    with zero spread are divided by 1 instead, so they come out as zeros
    rather than NaN/inf.

    Args:
        X: Numeric array. Usually 2-D ``(n_samples, n_features)``; a 1-D
            vector is treated as a single feature.

    Returns:
        A new array with the same shape as ``X``; the input is not modified.
    """
    col_mean = np.mean(X, axis=0)
    col_std = np.std(X, axis=0)

    # np.where (rather than masked assignment) also works when np.std
    # returns a scalar, which happens for 1-D input.
    safe_std = np.where(col_std == 0, 1.0, col_std)

    return (X - col_mean) / safe_std

In [5]:
# Replace the raw feature matrix with its column-wise standardized version.
X = standardize_data(X)

In [6]:
# Sanity check: after column-wise standardization the overall mean should be
# ~0 and the overall std ~1 (exactly 1 when no column is constant).
overall_mean = np.mean(X)
overall_std = np.std(X)

print("Data shape: ", X.shape)
print("mean: ", overall_mean)
print("std: ", overall_std)

Data shape:  (569, 30)
mean:  -6.826538293184326e-17
std:  1.0


## 1. PCA implementation from scratch

In [7]:
class PCA:
    """Principal component analysis via eigendecomposition of the covariance matrix.

    Args:
        n_components: Number of leading principal components to keep.
            ``None`` keeps all of them.

    Raises:
        ValueError: If ``n_components`` is negative.
    """

    def __init__(self, n_components):
        # A negative count would silently turn the column slice in fit() into
        # "all but the last k" components, so reject it up front.
        if n_components is not None and n_components < 0:
            raise ValueError("n_components must be non-negative")

        self.n_components = n_components
        self.components = None  # kept eigenvectors, one principal axis per column
        self.mean = None  # per-feature mean learned in fit()
        self.explained_variance = None  # ALL eigenvalues, sorted descending
        self.explained_variance_ratio = None  # variance share of the kept components

    def _check_is_fitted(self, operation):
        """Raise ``RuntimeError`` if ``fit`` has not been called yet."""
        if self.components is None or self.mean is None:
            raise RuntimeError(f"PCA must be fit before {operation}")

    def fit(self, X):
        """Learn the principal axes of ``X``.

        Args:
            X: 2-D array of shape ``(n_samples, n_features)``.

        Returns:
            ``self``, so calls can be chained.
        """
        # 1. Per-feature mean, kept so transform() can center new data
        self.mean = np.mean(X, axis=0)

        # 2. Center the data
        X_centered = X - self.mean

        # 3. Covariance between features (columns are the variables)
        cov_matrix = np.cov(X_centered, rowvar=False)

        # 4. Eigendecomposition; eigh is the solver for symmetric matrices
        eigenvalues, eigenvectors = np.linalg.eigh(cov_matrix)

        # 5. eigh returns ascending eigenvalues; reorder to descending
        order = np.argsort(eigenvalues)[::-1]
        self.explained_variance = eigenvalues[order]
        sorted_eigenvectors = eigenvectors[:, order]

        # 6. Keep the leading n_components directions of maximum variance
        self.components = sorted_eigenvectors[:, :self.n_components]

        # 7. Share of the total variance captured by each kept component
        total_variance = np.sum(self.explained_variance)
        self.explained_variance_ratio = (
            self.explained_variance[:self.n_components] / total_variance
        )

        return self

    def transform(self, X):
        """Project ``X`` onto the kept principal components.

        Raises:
            RuntimeError: If called before ``fit``.
        """
        self._check_is_fitted("transform")
        return np.dot(X - self.mean, self.components)

    def inverse_transform(self, X):
        """Map projected data back into the original feature space.

        Args:
            X: Array of shape ``(n_samples, n_components)`` as returned by
                ``transform``.

        Raises:
            RuntimeError: If called before ``fit``.
        """
        self._check_is_fitted("inverse_transform")
        return np.dot(X, self.components.T) + self.mean

    def get_reconstruction_error(self, X):
        """Return the reconstruction error of ``X`` after a round trip.

        The squared error is summed over the features of each sample and then
        averaged over samples, so it is a per-sample error rather than a
        per-element one.

        Raises:
            RuntimeError: If called before ``fit``.
        """
        X_reconstructed = self.inverse_transform(self.transform(X))
        per_sample_error = np.sum((X - X_reconstructed) ** 2, axis=1)
        return np.mean(per_sample_error)


### PCA Class Validation

In [8]:
# Smoke test of the from-scratch PCA: reduce the data to two components.
pca = PCA(n_components=2)
X_pca = pca.fit(X).transform(X)  # fit() returns the estimator, so the calls chain
error = pca.get_reconstruction_error(X)

for report_line in (
    f"Original Data Shape: {X.shape}",
    f"Reduced Data Shape:  {X_pca.shape}",
    f"Explained Variance Ratio: {pca.explained_variance_ratio}",
    f"Reconstruction Error (MSE): {error:.6f}",
):
    print(report_line)


Original Data Shape: (569, 30)
Reduced Data Shape:  (569, 2)
Explained Variance Ratio: [0.44272026 0.18971182]
Reconstruction Error (MSE): 11.027038


## 2. Autoencoder

#### Activation functions

In [9]:
def sigmoid(x):
    """Logistic function ``1 / (1 + exp(-x))``, applied element-wise.

    The input is clamped to [-500, 500] before exponentiation so that
    ``np.exp`` cannot overflow for extreme values.
    """
    bounded = np.clip(x, -500, 500)
    denominator = 1 + np.exp(-bounded)
    return 1 / denominator

def sigmoid_derivative(x):
    """Derivative of the logistic function at pre-activation ``x``.

    Uses the identity ``sigmoid'(x) = sigmoid(x) * (1 - sigmoid(x))``.
    """
    activation = sigmoid(x)
    complement = 1 - activation
    return activation * complement

def tanh(x):
  """Hyperbolic tangent, delegated to ``np.tanh`` (element-wise for arrays)."""
  return np.tanh(x)

def tanh_deravative(x):
    """Derivative of tanh at pre-activation ``x``: ``1 - tanh(x)**2``.

    The misspelled name is kept on purpose: the Autoencoder class looks this
    function up under exactly this name.
    """
    activation = np.tanh(x)
    return 1 - activation**2

def relu(x):
  """Rectified linear unit: the element-wise maximum of 0 and ``x``."""
  return np.maximum(0, x)

def relu_derivative(x):
    """Derivative of ReLU at pre-activation ``x``.

    Returns 1.0 where ``x > 0`` and 0.0 elsewhere (including at exactly 0).

    ``np.greater`` is used instead of the bare ``>`` operator so that plain
    Python scalars and lists are accepted too, like the other activation
    helpers; a Python ``bool`` has no ``astype`` method.
    """
    return np.greater(x, 0).astype(float)

In [10]:
from re import L
class Autoencoder:
    """Symmetric fully connected autoencoder trained with mini-batch gradient descent.

    Layer widths are ``input_dim -> hidden_layers -> bottleneck_dim ->
    reversed(hidden_layers) -> input_dim``. Every layer except the last
    applies the chosen activation; the output layer is linear.

    Args:
        input_dim: Number of input (and reconstructed) features.
        hidden_layers: Widths of the encoder's hidden layers; the decoder
            mirrors them. Any sequence of ints is accepted.
        bottleneck_dim: Width of the central (code) layer.
        activation: ``'relu'``, ``'sigmoid'`` or ``'tanh'``.
        learning_rate: Initial step size; ``fit`` decays it over time.
        lambda_l2: L2 weight-decay coefficient (biases are not penalized).

    Raises:
        ValueError: If ``activation`` is not one of the supported names.
    """

    def __init__(self, input_dim, hidden_layers, bottleneck_dim, activation='relu', learning_rate=0.01, lambda_l2=0.01):
        self.learning_rate = learning_rate
        self.initial_lr = learning_rate  # kept for reference; fit() mutates learning_rate
        self.lambda_l2 = lambda_l2

        if activation == 'relu':
            self.act = relu
            self.act_derivative = relu_derivative
        elif activation == 'sigmoid':
            self.act = sigmoid
            self.act_derivative = sigmoid_derivative
        elif activation == 'tanh':
            self.act = tanh
            self.act_derivative = tanh_deravative
        else:
            # ValueError is still an Exception, so existing handlers keep working.
            raise ValueError("Unsupported activation")

        # list(...) lets callers pass a tuple or other sequence as well.
        encoder_dims = list(hidden_layers)
        layer_dims = [input_dim] + encoder_dims + [bottleneck_dim] + encoder_dims[::-1] + [input_dim]

        self.weights = []
        self.biases = []

        for n_in, n_out in zip(layer_dims[:-1], layer_dims[1:]):
            # Weights are scaled by sqrt(2 / fan_in); biases start at zero.
            scale = np.sqrt(2.0 / n_in)
            self.weights.append(np.random.randn(n_in, n_out) * scale)
            self.biases.append(np.zeros((1, n_out)))

    def forward(self, X):
        """Run a forward pass and cache intermediates for ``backward``.

        Args:
            X: Batch of shape ``(n_samples, input_dim)``.

        Returns:
            The linear output of the last layer (the reconstruction).
        """
        self.Z_store = []  # pre-activations, one per layer
        self.A_store = [X]  # activations; index 0 is the input itself

        A = X
        last_idx = len(self.weights) - 1

        for i in range(last_idx):
            Z = np.dot(A, self.weights[i]) + self.biases[i]
            A = self.act(Z)
            self.Z_store.append(Z)
            self.A_store.append(A)

        # Output layer: no activation, so reconstructions are unbounded.
        Z_final = np.dot(A, self.weights[last_idx]) + self.biases[last_idx]
        self.Z_store.append(Z_final)
        self.A_store.append(Z_final)

        return Z_final

    def backward(self, X, Y_pred):
        """Backpropagate the reconstruction error of the latest ``forward`` call.

        NOTE: the output delta ``(Y_pred - X) / m`` is the gradient of
        ``sum((Y_pred - X)**2) / (2 * m)``, while ``fit`` reports
        ``np.mean((X - Y_pred)**2)``. The two differ by the constant factor
        ``2 / n_features``, so the reported loss is proportional to, not equal
        to, the data term being minimized. Left as is because changing it
        would alter the effective learning rate.

        Args:
            X: The batch that was passed to ``forward`` (the target).
            Y_pred: The output ``forward`` returned for that batch.

        Returns:
            ``(grads_W, grads_b)``: lists aligned with ``self.weights`` and
            ``self.biases``. Weight gradients include the L2 term.
        """
        m = X.shape[0]
        num_layers = len(self.weights)

        grads_W = [None] * num_layers
        grads_b = [None] * num_layers

        dZ = (Y_pred - X) / m

        for i in reversed(range(num_layers)):
            W = self.weights[i]

            grads_W[i] = np.dot(self.A_store[i].T, dZ) + (self.lambda_l2 * W)
            grads_b[i] = np.sum(dZ, axis=0, keepdims=True)

            if i > 0:
                # Push the error through W, then through the previous
                # layer's activation.
                dZ = np.dot(dZ, W.T) * self.act_derivative(self.Z_store[i - 1])

        return grads_W, grads_b

    def update_parameters(self, grads_W, grads_b):
        """Apply one plain gradient-descent step in place."""
        for i in range(len(self.weights)):
            self.weights[i] -= self.learning_rate * grads_W[i]
            self.biases[i] -= self.learning_rate * grads_b[i]

    def get_bottleneck(self, X):
        """Encode ``X`` with the first half of the network.

        Returns:
            The activated output of the bottleneck layer, shape
            ``(n_samples, bottleneck_dim)``.
        """
        bottleneck_layer_idx = len(self.weights) // 2

        A = X
        for i in range(bottleneck_layer_idx):
            A = self.act(np.dot(A, self.weights[i]) + self.biases[i])

        return A

    def fit(self, X, epochs=100, batch_size=32):
        """Train with mini-batch gradient descent and step learning-rate decay.

        The data is reshuffled every epoch. The learning rate is multiplied by
        0.95 every 10 epochs (not at epoch 0), and progress is printed every
        10 epochs.

        Args:
            X: Training data of shape ``(n_samples, input_dim)``.
            epochs: Number of passes over the data.
            batch_size: Samples per mini-batch; the last batch may be smaller.

        Returns:
            List with the mean per-batch loss (MSE + L2 penalty) of each epoch.

        Raises:
            ValueError: If ``X`` has no samples or ``batch_size`` is below 1.
        """
        n_samples = X.shape[0]
        if n_samples == 0:
            raise ValueError("X must contain at least one sample")
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")

        loss_history = []

        for epoch in range(epochs):
            # Shuffle data
            indices = np.arange(n_samples)
            np.random.shuffle(indices)
            X_shuffled = X[indices]

            epoch_loss = 0
            n_batches = 0

            # Mini-batch loop
            for start_idx in range(0, n_samples, batch_size):
                batch_X = X_shuffled[start_idx:start_idx + batch_size]

                # 1. Forward
                Y_pred = self.forward(batch_X)

                # 2. Compute Loss (MSE + L2)
                mse_loss = np.mean((batch_X - Y_pred) ** 2)
                l2_loss = 0
                for W in self.weights:
                    l2_loss += np.sum(W ** 2)

                epoch_loss += mse_loss + (0.5 * self.lambda_l2 * l2_loss)
                n_batches += 1

                # 3. Backward
                grads_W, grads_b = self.backward(batch_X, Y_pred)

                # 4. Update
                self.update_parameters(grads_W, grads_b)

            # Average over the batches actually processed. Dividing by
            # n_samples / batch_size is wrong when the last batch is partial.
            avg_loss = epoch_loss / n_batches
            loss_history.append(avg_loss)

            # Learning Rate Scheduling: decay by 5% every 10 epochs
            if epoch % 10 == 0 and epoch > 0:
                self.learning_rate *= 0.95

            if epoch % 10 == 0:
                print(f"Epoch {epoch}/{epochs} - Loss: {avg_loss:.4f}")

        return loss_history


In [11]:
# 1. Architecture
# With 30 input features, hidden widths [20, 10] and a bottleneck of 5 the
# layers are: 30 -> 20 -> 10 -> 5 -> 10 -> 20 -> 30
ae_settings = dict(
    input_dim=X.shape[1],
    hidden_layers=[20, 10],
    bottleneck_dim=5,
    activation='relu',
    learning_rate=0.01,
)
ae = Autoencoder(**ae_settings)

# 2. Training
print("Training Autoencoder...")
loss_curve = ae.fit(X, epochs=50, batch_size=32)

# 3. Compressed representation taken from the bottleneck layer
X_encoded = ae.get_bottleneck(X)

print(f"Encoded Shape: {X_encoded.shape}") # Should be (569, 5)
print(f"Final Loss: {loss_curve[-1]:.4f}")

Training Autoencoder...
Epoch 0/50 - Loss: 2.0776
Epoch 10/50 - Loss: 1.4532
Epoch 20/50 - Loss: 1.3072
Epoch 30/50 - Loss: 1.1765
Epoch 40/50 - Loss: 1.1078
Encoded Shape: (569, 5)
Final Loss: 1.0928
