In [6]:
import numpy as np
import tensorflow as tf
class NeuralNetwork:
    def __init__(self, input_size, hidden_size, output_size):
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        
        # Initialize weights and biases for hidden layer
        self.weights_hidden = np.random.randn(self.input_size, self.hidden_size)
        self.bias_hidden = np.zeros((1, self.hidden_size))
        
        # Initialize weights and biases for output layer
        self.weights_output = np.random.randn(self.hidden_size, self.output_size)
        self.bias_output = np.zeros((1, self.output_size))
        
    def relu(self, z):
        # ReLU activation function
        return np.maximum(0, z)
    
    def softmax(self, z):
        # Softmax activation function
        exp_z = np.exp(z - np.max(z, axis=1, keepdims=True))
        return exp_z / np.sum(exp_z, axis=1, keepdims=True)
    
    def forward(self, X):
        # Forward pass through the network
        z_hidden = np.dot(X, self.weights_hidden) + self.bias_hidden
        a_hidden = self.relu(z_hidden)
        z_output = np.dot(a_hidden, self.weights_output) + self.bias_output
        a_output = self.softmax(z_output)
        
        return a_output
    
    def loss(self, X, y):
        # Compute the categorical cross-entropy loss
        a_output = self.forward(X)
        m = y.shape[0]
        
        loss = -np.sum(np.log(a_output[np.arange(m), y])) / m
        
        return loss
    
    def train(self, X, y, learning_rate, epochs, batch_size):
        # Train the network using mini-batch gradient descent
        m = X.shape[0]
        for epoch in range(epochs):
            # Shuffle the data
            permutation = np.random.permutation(m)
            X_shuffled = X[permutation]
            y_shuffled = y[permutation]
            
            # Mini-batch gradient descent
            for i in range(0, m, batch_size):
                X_batch = X_shuffled[i:i+batch_size]
                y_batch = y_shuffled[i:i+batch_size]
                
                # Forward pass
                # a_output = self.forward(X_batch)
                z_hidden = np.dot(X_batch, self.weights_hidden) + self.bias_hidden
                a_hidden = self.relu(z_hidden)
                z_output = np.dot(a_hidden, self.weights_output) + self.bias_output
                a_output = self.softmax(z_output)
                
                # Backward pass
                dz_output = a_output - (y_batch == np.arange(self.output_size))
                dw_output = np.dot(a_hidden.T, dz_output)
                db_output = np.sum(dz_output, axis=0, keepdims=True)
                
                da_hidden = np.dot(dz_output, self.weights_output.T)
                dz_hidden = da_hidden * (z_hidden > 0)
                dw_hidden = np.dot(X_batch.T, dz_hidden)
                db_hidden = np.sum(dz_hidden, axis=0, keepdims=True)
                
                # Update weights and biases
                self.weights_output -= learning_rate * dw_output
                self.bias_output -= learning_rate * db_output
                self.weights_hidden -= learning_rate * dw_hidden
                self.bias_hidden -= learning_rate * db_hidden
                
            # Compute the loss on the entire dataset
            # loss = self.loss(X, y)
            loss = 0
            print(f"Epoch {epoch+1}/{epochs}, loss = {loss:.4f}")

In [15]:
mnist = tf.keras.datasets.mnist

In [3]:
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()
X_train = x_train/255
X_test = x_test/255

flattened_images = [image.flatten() for image in X_train]
X_train = np.stack(flattened_images, axis=0)

flattened_images = [image.flatten() for image in X_test]
X_test = np.stack(flattened_images, axis=0)

In [4]:
y_train_onehot[0]

NameError: name 'y_train_onehot' is not defined

In [17]:
# Load the training data
# X_train = np.load("X_train.npy") # shape (num_examples, input_size)
# y_train = np.load("y_train.npy") # shape (num_examples,)
num_classes = 10
# Convert the labels to one-hot encoding
y_train_int = [int(label) for label in y_train]
y_train_onehot = np.eye(num_classes)[y_train_int]

# y_train_onehot = np.eye(num_classes)[y_train] # shape (num_examples, output_size)

# Create the neural network
input_size = X_train.shape[1]
hidden_size = 128
output_size = num_classes
learning_rate = 0.2
epochs = 10
batch_size = 32
model = NeuralNetwork(input_size, hidden_size, output_size)

# Train the neural network
model.train(X_train, y_train_onehot, learning_rate, epochs, batch_size)

Epoch 1/10, loss = 0.0000
Epoch 2/10, loss = 0.0000
Epoch 3/10, loss = 0.0000
Epoch 4/10, loss = 0.0000
Epoch 5/10, loss = 0.0000
Epoch 6/10, loss = 0.0000
Epoch 7/10, loss = 0.0000
Epoch 8/10, loss = 0.0000
Epoch 9/10, loss = 0.0000
Epoch 10/10, loss = 0.0000


In [24]:
model.forward(X_train[400])
# y_train[2]

array([[0.84597564, 0.15402436, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ]])