In [None]:
import numpy as np
from utils import forward, random_init, backward
from numpy import linalg as LA
from keras.datasets import mnist
from utils import onehot, crossentropy
import copy

class GradCheck:
    """Compare backprop gradients with central-difference estimates.

    All parameters are flattened into one vector laid out as
    W1, b1, W2, b2, ... (each array in row-major order). Every entry is
    nudged by +/- epsilon, the loss is re-evaluated with ``forward`` and
    ``crossentropy``, and the resulting slope is compared with the gradient
    reported by ``backward``.
    """

    def __init__(self, X, Y, layers_dim, epsilon):
        """Store the data and settings used by the check.

        Args:
            X: Inputs passed unchanged to ``forward``.
            Y: Targets passed unchanged to ``crossentropy`` and ``backward``.
            layers_dim: Layer sizes; W{l} has shape
                (layers_dim[l], layers_dim[l-1]) and b{l} has shape
                (layers_dim[l], 1).
            epsilon: Step size of the finite difference.
        """
        self.layers_dim = layers_dim
        self.X = X
        self.Y = Y
        self.epsilon = epsilon

    def initialize_params(self):
        """Return freshly initialised parameters for ``self.layers_dim``."""
        # Use the instance's layer sizes, not a module-level ``layers_dim``
        # that may be missing or describe a different network.
        return random_init(self.layers_dim)

    @staticmethod
    def _layout_rank(key):
        """Return (layer, 0 for W / 1 for b) for W{l}/b{l} keys, else None.

        A leading "d" is ignored so that gradient-style names such as
        "dW2"/"db2" (presumably what ``backward`` returns; verify against
        utils) sort like their parameters.
        """
        if not isinstance(key, str):
            return None
        name = key[1:] if key.startswith("d") else key
        if len(name) < 2 or name[0] not in "Wb" or not name[1:].isdecimal():
            return None
        return int(name[1:]), 0 if name[0] == "W" else 1

    @staticmethod
    def _relative_difference(gradapprox, grad_true):
        """Return ||a - b|| / (||a|| + ||b||), guarding the all-zero case."""
        numerator = LA.norm(gradapprox - grad_true, ord=2)
        denominator = LA.norm(gradapprox, ord=2) + LA.norm(grad_true, ord=2)
        if denominator == 0:
            return 0.0 if numerator == 0 else float("inf")
        return numerator / denominator

    def dictionary_to_vectors(self, parameters):
        """Flatten a dictionary of arrays into one 1-D vector.

        When every key looks like W{l}/b{l} (optionally prefixed with "d"),
        the arrays are emitted as W1, b1, W2, b2, ... so the result matches
        the layout ``vectors_to_dictionary`` expects regardless of the
        dictionary's insertion order. Otherwise insertion order is kept.
        """
        keys = list(parameters)
        ranks = [self._layout_rank(key) for key in keys]
        if all(rank is not None for rank in ranks):
            ordered = sorted(zip(ranks, keys), key=lambda pair: pair[0])
            keys = [key for _, key in ordered]
        return np.concatenate([np.asarray(parameters[key]).ravel() for key in keys])

    def vectors_to_dictionary(self, vector):
        """Rebuild the W{l}/b{l} dictionary from a flat parameter vector.

        The returned arrays are reshaped slices of ``vector``.
        """
        parameters = {}
        start = 0
        for l in range(1, len(self.layers_dim)):
            W_shape = (self.layers_dim[l], self.layers_dim[l - 1])
            W_size = W_shape[0] * W_shape[1]
            b_shape = (self.layers_dim[l], 1)
            b_size = b_shape[0]

            parameters[f"W{l}"] = vector[start:start + W_size].reshape(W_shape)
            start += W_size
            parameters[f"b{l}"] = vector[start:start + b_size].reshape(b_shape)
            start += b_size

        return parameters

    def gradients_zero_like(self, grad_true, parameter_vectorized, epsilon, verbose=True):
        """Estimate the gradient numerically with central differences.

        Args:
            grad_true: Flattened backprop gradient; its length decides how
                many leading parameters are checked.
            parameter_vectorized: Flattened parameters (not modified).
            epsilon: Finite-difference step.
            verbose: When truthy, print running progress every 100 entries.

        Returns:
            Array shaped like ``grad_true`` holding the numerical estimates.
        """
        gradapprox = np.zeros_like(grad_true)
        total_params = parameter_vectorized.shape[0]
        # One working copy that is perturbed and restored in place, instead
        # of copying the whole vector twice per parameter.
        # NOTE(review): assumes forward() does not modify the arrays it is given.
        theta = parameter_vectorized.copy()
        count = 0
        count_down = max(1, grad_true.shape[0] // 100)

        for i in range(gradapprox.shape[0]):
            original_value = theta[i]

            # J(theta + epsilon)
            theta[i] = original_value + epsilon
            y_pred, _ = forward(self.X, self.vectors_to_dictionary(theta))
            J_plus = crossentropy(y_pred=y_pred, y_true=self.Y)

            # J(theta - epsilon)
            theta[i] = original_value - epsilon
            y_pred, _ = forward(self.X, self.vectors_to_dictionary(theta))
            J_minus = crossentropy(y_pred=y_pred, y_true=self.Y)

            # Restore the entry before moving on to the next one.
            theta[i] = original_value

            gradapprox[i] = (J_plus - J_minus) / (2 * epsilon)

            if verbose and i % 100 == 0:
                count += 1
                count_down -= 1
                # Running difference over the entries computed so far.
                diff = self._relative_difference(gradapprox[:i + 1], grad_true[:i + 1])

                print(f"Gradient difference at iteration {i}: {diff:.6e}")
                print(f"Processed {i} / {total_params} parameters")
                print("#" * count + "_" * count_down)

        return gradapprox

    def gradient_checker(self):
        """Run the full check and return the relative gradient difference.

        Raises:
            ValueError: If the flattened backprop gradient does not have the
                same shape as the flattened parameters.
        """
        parameters = self.initialize_params()
        parameters_vectorized = self.dictionary_to_vectors(parameters)

        # Forward pass on the round-tripped parameters to obtain the cache
        # that backward() needs.
        parameters_original = self.vectors_to_dictionary(parameters_vectorized)
        _, cache = forward(self.X, parameters_original)

        grad = backward(y_true=self.Y, cache=cache, params=parameters)
        grad_true = self.dictionary_to_vectors(grad)
        if grad_true.shape != parameters_vectorized.shape:
            raise ValueError(
                f"gradient vector has shape {grad_true.shape} but parameter "
                f"vector has shape {parameters_vectorized.shape}"
            )

        gradapprox = self.gradients_zero_like(
            grad_true=grad_true,
            parameter_vectorized=parameters_vectorized,
            epsilon=self.epsilon,
        )
        return self._relative_difference(gradapprox, grad_true)
    
# Load and preprocess data
(X_train, y_train), (X_test, y_test) = mnist.load_data()

# Flatten each image into one column and scale the pixel values by 1/255
X_train = X_train.reshape(X_train.shape[0], -1).T / 255.0
X_test = X_test.reshape(X_test.shape[0], -1).T / 255.0

# Parameters
layers_dim = [784, 128, 64, 10]
y_test_oh = onehot(y_test, 10)

# Gradient checking runs two forward passes per parameter, so check on a
# handful of samples rather than the whole test split.
X_subset = X_test[:, :5]  # first 5 samples (one sample per column)
y_subset = y_test_oh[:, :5]  # assumes onehot returns one column per sample; TODO confirm

gradcheck = GradCheck(X_subset, y_subset, layers_dim, epsilon=1e-5)
diff = gradcheck.gradient_checker()

In [None]:
import numpy as np
from utils import forward, random_init, backward
from numpy import linalg as LA
from keras.datasets import mnist
from utils import onehot, crossentropy
import copy

class GradCheck:
    """Compare backprop gradients with central-difference estimates.

    All parameters are flattened into one vector laid out as
    W1, b1, W2, b2, ... (each array in row-major order). Every entry is
    nudged by +/- epsilon, the loss is re-evaluated with ``forward`` and
    ``crossentropy``, and the resulting slope is compared with the gradient
    reported by ``backward``.
    """

    def __init__(self, X, Y, layers_dim, epsilon):
        """Store the data and settings used by the check.

        Args:
            X: Inputs passed unchanged to ``forward``.
            Y: Targets passed unchanged to ``crossentropy`` and ``backward``.
            layers_dim: Layer sizes; W{l} has shape
                (layers_dim[l], layers_dim[l-1]) and b{l} has shape
                (layers_dim[l], 1).
            epsilon: Step size of the finite difference.
        """
        self.layers_dim = layers_dim
        self.X = X
        self.Y = Y
        self.epsilon = epsilon

    def initialize_params(self):
        """Return freshly initialised parameters for ``self.layers_dim``."""
        return random_init(self.layers_dim)

    @staticmethod
    def _layout_rank(key):
        """Return (layer, 0 for W / 1 for b) for W{l}/b{l} keys, else None.

        A leading "d" is ignored so that gradient-style names such as
        "dW2"/"db2" (presumably what ``backward`` returns; verify against
        utils) sort like their parameters.
        """
        if not isinstance(key, str):
            return None
        name = key[1:] if key.startswith("d") else key
        if len(name) < 2 or name[0] not in "Wb" or not name[1:].isdecimal():
            return None
        return int(name[1:]), 0 if name[0] == "W" else 1

    @staticmethod
    def _relative_difference(gradapprox, grad_true):
        """Return ||a - b|| / (||a|| + ||b||), guarding the all-zero case."""
        numerator = LA.norm(gradapprox - grad_true, ord=2)
        denominator = LA.norm(gradapprox, ord=2) + LA.norm(grad_true, ord=2)
        if denominator == 0:
            return 0 if numerator == 0 else float('inf')
        return numerator / denominator

    def dictionary_to_vectors(self, parameters):
        """Flatten a dictionary of arrays into one 1-D vector.

        When every key looks like W{l}/b{l} (optionally prefixed with "d"),
        the arrays are emitted as W1, b1, W2, b2, ... so the result matches
        the layout ``vectors_to_dictionary`` expects regardless of the
        dictionary's insertion order. Otherwise insertion order is kept.
        """
        keys = list(parameters)
        ranks = [self._layout_rank(key) for key in keys]
        if all(rank is not None for rank in ranks):
            ordered = sorted(zip(ranks, keys), key=lambda pair: pair[0])
            keys = [key for _, key in ordered]
        return np.concatenate([np.asarray(parameters[key]).ravel() for key in keys])

    def vectors_to_dictionary(self, vector):
        """Rebuild the W{l}/b{l} dictionary from a flat parameter vector.

        The returned arrays are reshaped slices of ``vector``.
        """
        parameters = {}
        start = 0
        for l in range(1, len(self.layers_dim)):
            W_shape = (self.layers_dim[l], self.layers_dim[l - 1])
            W_size = W_shape[0] * W_shape[1]
            b_shape = (self.layers_dim[l], 1)
            b_size = b_shape[0]

            parameters[f"W{l}"] = vector[start:start + W_size].reshape(W_shape)
            start += W_size
            parameters[f"b{l}"] = vector[start:start + b_size].reshape(b_shape)
            start += b_size

        return parameters

    def gradients_zero_like(self, grad_true, parameter_vectorized, epsilon, verbose=True):
        """Estimate the gradient numerically with central differences.

        Args:
            grad_true: Flattened backprop gradient; its length decides how
                many leading parameters are checked.
            parameter_vectorized: Flattened parameters (not modified).
            epsilon: Finite-difference step.
            verbose: When truthy, print a progress report every 20 entries.

        Returns:
            Array shaped like ``grad_true`` holding the numerical estimates.
        """
        gradapprox = np.zeros_like(grad_true)
        total_params = parameter_vectorized.shape[0]
        # One working copy that is perturbed and restored in place, instead
        # of copying the whole vector twice per parameter.
        # NOTE(review): assumes forward() does not modify the arrays it is given.
        theta = parameter_vectorized.copy()

        for i in range(gradapprox.shape[0]):
            original_value = theta[i]

            # J(theta + epsilon)
            theta[i] = original_value + epsilon
            y_pred, _ = forward(self.X, self.vectors_to_dictionary(theta))
            J_plus = crossentropy(y_pred=y_pred, y_true=self.Y)

            # J(theta - epsilon)
            theta[i] = original_value - epsilon
            y_pred, _ = forward(self.X, self.vectors_to_dictionary(theta))
            J_minus = crossentropy(y_pred=y_pred, y_true=self.Y)

            # Restore the entry before moving on to the next one.
            theta[i] = original_value

            gradapprox[i] = (J_plus - J_minus) / (2 * epsilon)

            if verbose and i % 20 == 0:
                # Running difference over the entries computed so far.
                diff = self._relative_difference(gradapprox[:i + 1], grad_true[:i + 1])
                filled = 20 * (i + 1) // total_params

                print(f'Grad Approx {i} = {gradapprox[i]:.6e}')
                print(f'Backprop grad {i} = {grad_true[i]:.6e}')
                print(f"Gradient difference at iteration {i}: {diff:.6e}")
                print(f"Processed {i+1} / {total_params} parameters")
                progress_bar = "#" * filled + "_" * (20 - filled)
                print(f"[{progress_bar}]")
                print()

        return gradapprox

    def gradient_checker(self):
        """Run the full check, print a verdict and return the difference.

        Raises:
            ValueError: If the flattened backprop gradient does not have the
                same shape as the flattened parameters.
        """
        parameters = self.initialize_params()
        parameters_vectorized = self.dictionary_to_vectors(parameters)

        # Forward pass on the round-tripped parameters to obtain the cache
        # that backward() needs.
        parameters_original = self.vectors_to_dictionary(parameters_vectorized)
        _, cache = forward(self.X, parameters_original)

        grad = backward(y_true=self.Y, cache=cache, params=parameters)
        grad_true = self.dictionary_to_vectors(grad)
        if grad_true.shape != parameters_vectorized.shape:
            raise ValueError(
                f"gradient vector has shape {grad_true.shape} but parameter "
                f"vector has shape {parameters_vectorized.shape}"
            )

        gradapprox = self.gradients_zero_like(
            grad_true=grad_true,
            parameter_vectorized=parameters_vectorized,
            epsilon=self.epsilon,
        )

        # Final comparison over the full vectors.
        diff = self._relative_difference(gradapprox, grad_true)

        print(f"\nFINAL GRADIENT CHECK RESULT: {diff:.6e}")

        # Interpretation
        if diff < 1e-7:
            print("✅ Excellent! Gradients are very close.")
        elif diff < 1e-5:
            print("✅ Good! Gradients are reasonably close.")
        elif diff < 1e-3:
            print("⚠️  Warning: Gradients have some discrepancy.")
        else:
            print("❌ Error: Gradients are significantly different.")

        return diff

# Fetch the MNIST train and test splits
(X_train, y_train), (X_test, y_test) = mnist.load_data()

# Flatten every image into one column and divide the pixel values by 255
X_train = X_train.reshape(len(X_train), -1).T / 255.0
X_test = X_test.reshape(len(X_test), -1).T / 255.0

# Network layout and one-hot training labels
layers_dim = [784, 4, 3, 10]
y_train_oh = onehot(y_train, 10)

# The check is computationally expensive, so keep only the first 5 samples
X_subset, y_subset = X_train[:, :5], y_train_oh[:, :5]

gradcheck = GradCheck(X=X_subset, Y=y_subset, layers_dim=layers_dim, epsilon=1e-4)
diff = gradcheck.gradient_checker()

In [None]:
# Display the value of `diff`, the result assigned from gradient_checker() in an earlier cell
diff

In [None]:
# Fetch the MNIST train and test splits
(X_train, y_train), (X_test, y_test) = mnist.load_data()

# Flatten every image into one column and divide the pixel values by 255
X_train = X_train.reshape(len(X_train), -1).T / 255.0
X_test = X_test.reshape(len(X_test), -1).T / 255.0

# Network layout and one-hot training labels
layers_dim = [784, 4, 3, 10]
y_train_oh = onehot(y_train, 10)

gradcheck = GradCheck(X=X_train, Y=y_train_oh, layers_dim=layers_dim, epsilon=1e-4)

# Print every freshly initialised parameter array
parameters = gradcheck.initialize_params()
for initial_value in parameters.values():
    print(initial_value)

# Flatten the dictionary, then rebuild it from the flat vector
parameters_vectorized = gradcheck.dictionary_to_vectors(parameters)
print("------------------------------------------------")
params_reconstructed = gradcheck.vectors_to_dictionary(parameters_vectorized)

# Print the rebuilt arrays in the original key order for a visual comparison
for name in parameters:
    print(params_reconstructed[name])