To do:
- computational graph?
- dimensions

Learned:
- numpy arrays passed into ML models, not df's
- standard dimensions are W (n_out, n_in) or (neurons,features), so transpose X, typical
- Use dictionaries to cache values and access activation/cost functions their and grads
- 

In [9]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score

cost functions

In [10]:
def MSE(y_pred,Y):
    return ((y_pred - Y)**2)/Y.shape[0]

def MSE_grad(y_pred,Y):
    return y_pred - Y

def BCE(y_pred, Y):
    y_pred = np.clip(y_pred, 1e-15, 1 - 1e-15)
    return - (1/Y.shape[0]) * np.sum(Y * np.log(y_pred) + (1 - Y) * np.log(1 - y_pred))

def BCE_grad(y_pred, Y):
    return y_pred - Y

def CCE(y_pred,Y):
    y_pred = np.clip(y_pred, 1e-15, 1 - 1e-15)
    num_classes = y_pred.shape[1]
    Y_labels = np.eye(num_classes)[Y]
    return -(1 / Y.shape[0]) * np.sum(Y_labels * np.log(y_pred))

def CCE_grad(y_pred,Y):
    num_classes = y_pred.shape[1]
    Y_labels = np.eye(num_classes)[Y]
    return y_pred - Y_labels

cost_grad = {MSE: MSE_grad, BCE: BCE_grad, CCE: CCE_grad}

activation functions and grad

In [11]:
def ReLU(z):
    return np.maximum(0,z)

def ReLU_grad(z):
    return np.where(z > 0, 1, 0)

def sigmoid(z):
    return 1 / (1 + np.exp(-z))

def sigmoid_grad(z):
    return sigmoid(z)*(1-sigmoid(z))

def Tanh(z):
    return (2 / (1 + np.exp(-2*z))) - 1

def Tanh_grad(z):
    return 1-(Tanh(z))**2

activation_grad = {ReLU: ReLU_grad, sigmoid: sigmoid_grad, Tanh: Tanh_grad}

Xavier initialization

In [12]:
def xavier_unif(n_out, n_in):
    limit = np.sqrt(6 / (n_out + n_in))
    weights = np.random.uniform(-limit, limit, size=(n_out, n_in))
    return weights

def xavier_norm(n_out, n_in):
    stddev = np.sqrt(2 / (n_out + n_in))
    weights = np.random.normal(0, stddev, size=(n_out, n_in))
    return weights

HE initialization

In [13]:
def HE_unif(n_out, n_in):
    limit = np.sqrt(6 / (n_in))
    weights = np.random.uniform(-limit, limit, size=(n_out, n_in))
    return weights

def HE_norm(n_out, n_in):
    stddev = 2/n_in
    weights = np.random.normal(0, stddev, size=(n_out,n_in))
    return weights

In [14]:
class MLP:
    def __init__(self,learning_rate = .0001, max_epochs = 500, batch_size = 64, convergence_level = 1e-5):
        self.learning_rate = learning_rate
        self.max_epochs = max_epochs
        self.batch_size = batch_size
        self.convergence_level = convergence_level
        self.cache = None
        self.cost_storage = []
        self.layers = []
    
    def fit(self, X_train, Y_train, layer_sizes, activation = ReLU, init_method = HE_norm, cost_function = CCE):
        self.cache = {}
        self.activation = activation
        self.init = init_method
        self.cost_function = cost_function
        
        for i in range(len(layer_sizes)-1):
            size_in = layer_sizes[i]
            size_out = layer_sizes[i+1]
            new_layer = self.layer(size_in,size_out)
            new_layer.init_W(self.init)
            new_layer.init_b()
            self.layers.append(new_layer)
        for epoch in range(self.max_epochs):
            print("epoch",epoch)
            shuffled_indices = np.random.permutation(len(X_train))
            X_shuffled = X_train[shuffled_indices]
            Y_shuffled = Y_train[shuffled_indices]
            
            epoch_cost = 0
            for i in range(0, X_train.shape[0], self.batch_size):
                X_batch = X_shuffled[i:i+self.batch_size,:]
                Y_batch = Y_shuffled[i:i+self.batch_size]
                y_pred_batch = self.forward_pass(X_batch)
                
                batch_cost = np.sum(self.cost_function(y_pred_batch,Y_batch), axis = 0)
                epoch_cost += batch_cost
                
                self.back_prop(y_pred_batch,Y_batch)
            
            epoch_cost /= X_train.shape[0]
            
            if epoch % 5 == 0:
                print(float(epoch_cost))
                self.cost_storage.append(float(epoch_cost))
            
            if len(self.cost_storage) >= 2:
                cost_prev = self.cost_storage[-2]
                cost_curr = self.cost_storage[-1]
                if abs(cost_prev - cost_curr) < self.convergence_level:
                    return
    
    def predict(self, X_test):
        y_pred = self.forward_pass(X_test)
        return np.argmax(y_pred, axis = 1)
            
    def forward_pass(self,X_batch):
        X_batch = X_batch.T
        A_prev = X_batch
        for i in range(len(self.layers)):
            layer = self.layers[i]
            W = layer.W
            b = layer.b
            Z = (W @ A_prev) + b
            A = Z
            if i != len(self.layers) - 1:
                A = self.activation(Z)
            self.cache[f"layer_{i+1}"] = {"A": A, "Z": Z, "A_prev": A_prev}
            A_prev = A
        y_pred = A.T
        return y_pred

    def back_prop(self, y_pred, Y):
        batch_size = Y.shape[0]
        
        l_grad = cost_grad[self.cost_function]
        a_grad = activation_grad[self.activation]
        
        for i in range(len(self.layers)):
            index = len(self.layers)- i
            Z = self.cache[f"layer_{index}"]['Z']
            A_prev = self.cache[f"layer_{index}"]["A_prev"]
            layer = self.layers[index-1]
            
            if i == 0:
                loss = l_grad(y_pred, Y).T
                dZ = (loss) * a_grad(Z)
            else:
                dZ = (W_prev.T @ dZ) * a_grad(Z)
                
            W, b = layer.W, layer.b
            W_prev = W
            
            dW = (dZ @ A_prev.T) / batch_size
            db = np.sum(dZ, axis=1) / batch_size
            db = db.reshape(db.shape[0],1)      
            
            W -= self.learning_rate * dW
            b -= self.learning_rate * db
            layer.W, layer.b = W, b
            
    def plot_cost(self):
        plt.plot(self.cost_storage)
        plt.xlabel('Epoch')
        plt.ylabel('Cost')
        plt.title('Cost over Epochs')
        plt.grid(True)
        plt.show()
            
    class layer:
        def __init__(self,n_in,n_out):
            self.W = None
            self.b = None
            self.n_in = n_in
            self.n_out = n_out
        
        def init_W(self, init_method):
            self.W = init_method(self.n_out,self.n_in)       
            
        def init_b(self):
            self.b = np.zeros((self.n_out,1))    

test

In [15]:
df_train = pd.read_csv("mnist_train.csv")
df_test = pd.read_csv("mnist_test.csv")
X_train, Y_train = df_train.drop(columns='label'), df_train['label']
X_test, Y_test = df_test.drop(columns='label'), df_test['label']

X_train, Y_train = X_train.to_numpy(), Y_train.to_numpy()
X_test, Y_test = X_test.to_numpy(), Y_test.to_numpy()


my_MLP = MLP(max_epochs = 5, learning_rate = .0001)
layers = [784, 512, 256, 10]
my_MLP.fit(X_train, Y_train, layers)

print(my_MLP.cost_storage)

y_pred = my_MLP.predict(X_test)
accuracy = accuracy_score(Y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

my_MLP.plot_cost

5
epoch 0
0.07043457093766678
epoch 1
epoch 2
epoch 3
epoch 4
[0.07043457093766678]
Accuracy: 0.90


<bound method MLP.plot_cost of <__main__.MLP object at 0x11b58d590>>

PyTorch

In [16]:
import torch.nn as nn
import torch.optim as optim

In [17]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# Reshape and normalize images, convert labels to tensors
X_train = torch.tensor(X_train.reshape(-1, 28*28) / 255.0, dtype=torch.float32)
Y_train = torch.tensor(Y_train, dtype=torch.long)
X_test = torch.tensor(X_test.reshape(-1, 28*28) / 255.0, dtype=torch.float32)
Y_test = torch.tensor(Y_test, dtype=torch.long)

# Create TensorDataset and DataLoader
train_dataset = TensorDataset(X_train, Y_train)
test_dataset = TensorDataset(X_test, Y_test)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

# Define the MLP Model
class MLP(nn.Module):
    def __init__(self, input_size, hidden1_size, hidden2_size, output_size):
        super(MLP, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden1_size)
        self.fc2 = nn.Linear(hidden1_size, hidden2_size)
        self.fc3 = nn.Linear(hidden2_size, output_size)
        self.relu = nn.ReLU()
    
    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        x = self.relu(x)
        x = self.fc3(x)
        return x

# Hyperparameters
input_size = 28 * 28
hidden1_size = 512
hidden2_size = 256
output_size = 10
learning_rate = 0.001
num_epochs = 5

# Initialize model, loss function, and optimizer
model = MLP(input_size, hidden1_size, hidden2_size, output_size)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Training the model
for epoch in range(num_epochs):
    running_loss = 0.0
    for batch_idx, (images, labels) in enumerate(train_loader):
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        if (batch_idx + 1) % 100 == 0:
            print(f'Epoch [{epoch+1}/{num_epochs}], Step [{batch_idx+1}/{len(train_loader)}], Loss: {running_loss/100:.4f}')
            running_loss = 0.0

print('Finished Training')

# Testing the model
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for images, labels in test_loader:
        outputs = model(images)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy = 100 * correct / total
print(f'Test Accuracy of the model on the 10,000 test images: {accuracy:.2f}%')


Epoch [1/5], Step [100/938], Loss: 0.6926
Epoch [1/5], Step [200/938], Loss: 0.2916
Epoch [1/5], Step [300/938], Loss: 0.2346
Epoch [1/5], Step [400/938], Loss: 0.2016
Epoch [1/5], Step [500/938], Loss: 0.1916
Epoch [1/5], Step [600/938], Loss: 0.1443
Epoch [1/5], Step [700/938], Loss: 0.1509
Epoch [1/5], Step [800/938], Loss: 0.1278
Epoch [1/5], Step [900/938], Loss: 0.1237
Epoch [2/5], Step [100/938], Loss: 0.0903
Epoch [2/5], Step [200/938], Loss: 0.0960
Epoch [2/5], Step [300/938], Loss: 0.0891
Epoch [2/5], Step [400/938], Loss: 0.0809
Epoch [2/5], Step [500/938], Loss: 0.0919
Epoch [2/5], Step [600/938], Loss: 0.0767
Epoch [2/5], Step [700/938], Loss: 0.0892
Epoch [2/5], Step [800/938], Loss: 0.0854
Epoch [2/5], Step [900/938], Loss: 0.0828
Epoch [3/5], Step [100/938], Loss: 0.0484
Epoch [3/5], Step [200/938], Loss: 0.0614
Epoch [3/5], Step [300/938], Loss: 0.0655
Epoch [3/5], Step [400/938], Loss: 0.0527
Epoch [3/5], Step [500/938], Loss: 0.0547
Epoch [3/5], Step [600/938], Loss: