In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

In [171]:
def generate_temp_data(n):
    """Draw ``n`` random Fahrenheit readings and convert them to Celsius.

    The readings come straight from ``np.random.rand``, so they are uniform
    on [0, 1).

    Returns:
        ``(fahrenheit, celsius)`` -- two column arrays of shape ``(n, 1)``.
    """
    fahrenheit = np.random.rand(n)[:, np.newaxis]
    ratio = 5/9
    celsius = (fahrenheit - 32) * ratio
    return fahrenheit, celsius

# Build the toy Fahrenheit -> Celsius data set and preview the first ten rows of each.
X, Y = generate_temp_data(5000)
print(X[:10], Y[:10], sep="\n")

[[0.31597654]
 [0.27385159]
 [0.65352828]
 [0.98682435]
 [0.8852986 ]
 [0.09748676]
 [0.145739  ]
 [0.78769725]
 [0.83536539]
 [0.9619359 ]]
[[-17.60223525]
 [-17.62563801]
 [-17.41470651]
 [-17.22954203]
 [-17.28594522]
 [-17.72361847]
 [-17.69681167]
 [-17.34016819]
 [-17.31368589]
 [-17.24336895]]


In [172]:
class FFNN:
    """Fully connected feed-forward network trained with (mini-)batch gradient descent.

    Data is laid out column-wise: inputs are ``(n_features, n_samples)`` and every
    layer's activations are ``(layer_size, n_samples)``. An L1 penalty scaled by
    ``lambda_val`` is added to the reported loss and to the weight gradients;
    biases are not regularized.
    """

    def __init__(self, layer_sizes,
                 activation_fn, activation_fn_deriv,
                 output_activation_fn, output_activation_fn_deriv,
                 loss_fn, loss_fn_deriv, task_type='classification', lambda_val = 0):
        """Store the configuration and initialize the parameters.

        Args:
            layer_sizes: Sizes of every layer, input first and output last.
            activation_fn / activation_fn_deriv: Hidden-layer activation and its
                element-wise derivative with respect to the pre-activation.
            output_activation_fn / output_activation_fn_deriv: Same, for the
                output layer.
            loss_fn: ``loss_fn(predictions, targets)`` returning the data loss.
            loss_fn_deriv: Derivative of the loss with respect to the predictions.
            task_type: Stored on the instance; not consulted by this class.
            lambda_val: L1 regularization strength (0 disables the penalty).

        Raises:
            ValueError: If fewer than two layer sizes are supplied.
        """
        if len(layer_sizes) < 2:
            raise ValueError("layer_sizes must contain at least an input and an output size")

        self.num_layers = len(layer_sizes)
        self.layer_sizes = layer_sizes
        self.activation_fn = activation_fn
        self.activation_fn_deriv = activation_fn_deriv
        self.output_activation_fn = output_activation_fn
        self.output_activation_fn_deriv = output_activation_fn_deriv
        self.loss_fn = loss_fn
        self.loss_fn_deriv = loss_fn_deriv
        self.task_type = task_type
        self.lambda_val = lambda_val

        # Weights: He initialization (standard normal scaled by sqrt(2 / fan_in)).
        self.W = [np.random.randn(layer_sizes[i+1], layer_sizes[i]) * np.sqrt(2./layer_sizes[i])
                  for i in range(self.num_layers-1)]
        # Biases start at exactly zero; no random draws are needed for that.
        self.b = [np.zeros((layer_sizes[i+1], 1)) for i in range(self.num_layers-1)]

    @staticmethod
    def _align_targets(t, predictions):
        """Transpose ``t`` when its leading dimension differs from ``predictions``'."""
        if t.shape[0] != predictions.shape[0]:
            t = t.T
        return t

    @staticmethod
    def _slice_targets(labels, start, stop, m):
        """Return the targets of samples ``start:stop`` for either label orientation.

        Labels with samples along axis 0 (``(m,)`` or ``(m, k)``) are sliced by row.
        Labels laid out like the network output (``(k, m)`` with ``k != m``) are
        sliced by column, so mini-batch training accepts the same label shapes as
        full-batch training.
        """
        if labels.ndim == 2 and labels.shape[0] != m and labels.shape[1] == m:
            return labels[:, start:stop]
        return labels[start:stop]

    def forward(self, x):
        """Run a forward pass.

        Args:
            x: Input array of shape ``(layer_sizes[0], n_samples)``.

        Returns:
            ``(a_out, a, z)`` where ``a_out`` is the output activation, ``a`` is the
            list of activations of every layer (``a[0]`` is ``x``) and ``z`` is the
            list of pre-activations of every non-input layer.
        """
        a = [x]
        z = []

        # Hidden layers
        for i in range(self.num_layers-2):
            z_i = self.W[i].dot(a[i]) + self.b[i]
            a_i = self.activation_fn(z_i)
            z.append(z_i)
            a.append(a_i)

        # Output layer
        z_out = self.W[-1].dot(a[-1]) + self.b[-1]
        a_out = self.output_activation_fn(z_out)
        z.append(z_out)
        a.append(a_out)

        return a_out, a, z

    def compute_loss(self, predictions, t):
        """Return the data loss plus ``lambda_val`` times the L1 norm of all weights."""
        t = self._align_targets(t, predictions)

        # Original loss without regularization
        loss_without_reg = self.loss_fn(predictions, t)

        # L1 regularization penalty (weights only, biases excluded)
        l1_penalty = sum(np.abs(w).sum() for w in self.W)

        return loss_without_reg + self.lambda_val * l1_penalty

    def _one_hot(self, inputs):
        """One-hot encode integer class ids into an ``(n_classes, n_samples)`` array.

        ``n_classes`` is ``max(inputs) + 1`` (matching the module-level ``one_hot``
        helper), so ids that do not occur in ``inputs`` still get a row instead of
        triggering an out-of-range index.
        """
        inputs = np.asarray(inputs)
        n_values = int(inputs.max()) + 1 if inputs.size else 0
        return np.eye(n_values)[:, inputs]

    def backward(self, x, t, a, z):
        """Back-propagate the loss through the network.

        Args:
            x: Batch inputs. Unused (``a[0]`` already holds them); kept so the
                signature stays compatible with existing callers.
            t: Targets for the batch; transposed if needed to match ``a[-1]``.
            a, z: Activations and pre-activations returned by ``forward``.

        Returns:
            ``(dL_dw, dL_db)`` -- lists of gradients aligned with ``self.W`` and
            ``self.b``. Gradients are summed (not averaged) over the batch and do
            not include the L1 term, which ``update_weights`` adds.
        """
        dL_dw = [np.zeros_like(w) for w in self.W]
        dL_db = [np.zeros_like(b) for b in self.b]

        t = self._align_targets(t, a[-1])

        # Output layer
        dL_da = self.loss_fn_deriv(a[-1], t)
        dL_dz = dL_da * self.output_activation_fn_deriv(z[-1])

        dL_dw[-1] = dL_dz.dot(a[-2].T)
        dL_db[-1] = np.sum(dL_dz, axis=1, keepdims=True)

        # Hidden layers, last to first
        for i in range(self.num_layers-3, -1, -1):
            dL_da = self.W[i+1].T.dot(dL_dz)
            dL_dz = dL_da * self.activation_fn_deriv(z[i])
            dL_dw[i] = dL_dz.dot(a[i].T)
            dL_db[i] = np.sum(dL_dz, axis=1, keepdims=True)

        return dL_dw, dL_db

    def update_weights(self, dL_dw, dL_db, learning_rate): #with L1 regularization
        """Take one gradient-descent step, adding the L1 subgradient to the weights."""
        # Adjust the gradient for L1 regularization
        dL_dw_regularized = [dw + self.lambda_val * np.sign(w) for w, dw in zip(self.W, dL_dw)]

        # Update weights and biases using the adjusted gradient
        self.W = [w - learning_rate * dw_reg for w, dw_reg in zip(self.W, dL_dw_regularized)]
        self.b = [b - learning_rate * db for b, db in zip(self.b, dL_db)]

    def train(self, X, labels, learning_rate=0.01, epochs=1000, batch_size=None):
        """Trains the neural network using the given training data and labels.

        Args:
            X: Inputs of shape ``(n_features, m)``.
            labels: Targets with samples along axis 0 (``(m,)`` / ``(m, k)``) or
                laid out like the output (``(k, m)``).
            learning_rate: Step size for every update.
            epochs: Number of passes over the data.
            batch_size: Mini-batch size. ``None`` (or 0) means full-batch descent.
                The final batch of an epoch may be smaller, so every sample is
                used even when ``m`` is not a multiple of ``batch_size``.

        Raises:
            ValueError: If ``batch_size`` is negative.
        """
        m = X.shape[1]
        if batch_size is not None and batch_size < 0:
            raise ValueError("batch_size must be a positive integer or None")

        for epoch in range(epochs):
            if batch_size:  # If batch size is specified, use mini-batch gradient descent
                # Step by batch_size so the trailing partial batch is not dropped.
                for start in range(0, m, batch_size):
                    stop = start + batch_size
                    X_batch = X[:, start:stop]
                    labels_batch = self._slice_targets(labels, start, stop, m)

                    # Forward pass
                    _, a_batch, z_batch = self.forward(X_batch)

                    # Backward pass
                    dL_dw, dL_db = self.backward(X_batch, labels_batch, a_batch, z_batch)

                    # Update weights and biases
                    self.update_weights(dL_dw, dL_db, learning_rate)
            else:  # Otherwise, use batch gradient descent
                # Forward pass
                _, a_full, z_full = self.forward(X)

                # Backward pass
                dL_dw, dL_db = self.backward(X, labels, a_full, z_full)

                # Update weights and biases
                self.update_weights(dL_dw, dL_db, learning_rate)

            # Print loss at the end of each 100 epochs:
            if epoch % 100 == 0:
                # Evaluate on the entire dataset, not just the last batch
                predictions, _, _ = self.forward(X)
                loss = self.compute_loss(predictions, labels)
                print(f"Epoch {epoch+1}/{epochs} - Loss: {np.mean(loss)}")

In [173]:
''' Helper functions '''

def ReLU(inputs):
    """Rectified linear unit: element-wise maximum of ``inputs`` and 0."""
    floor = 0
    rectified = np.maximum(inputs, floor)
    return rectified
def deriv_ReLU(Z):
    """Derivative of ReLU as a boolean mask: True where ``Z`` is strictly positive."""
    is_positive = Z > 0
    return is_positive

def Linear(inputs):
    """Identity activation: hands back ``inputs`` itself, unchanged and uncopied."""
    outputs = inputs
    return outputs

def deriv_Linear(inputs):
    """Derivative of the identity activation: ones shaped like ``inputs``.

    Built arithmetically from ``inputs`` (zero times the input, plus one), so the
    result keeps the input's shape.
    """
    zeroed = 0 * inputs
    return 1 + zeroed

def L2(outputs, targets):
    """Half the sum of squared errors between ``outputs`` and ``targets``.

    Both arguments are flattened before subtracting, so a ``(1, n)`` prediction
    and an ``(n, 1)`` target are compared element by element instead of
    broadcasting into an ``(n, n)`` matrix.

    Args:
        outputs: Predicted values (array-like).
        targets: Reference values with the same number of elements.

    Returns:
        ``0.5 * sum((outputs - targets) ** 2)`` as a scalar.
    """
    residuals = np.ravel(outputs) - np.ravel(targets)
    # np.sum reduces in compiled code; the builtin sum would loop in Python
    # over every element of the array.
    return 0.5 * np.sum(residuals ** 2)

def deriv_L2(outputs, targets):
    """Gradient of the half-sum-of-squares loss w.r.t. ``outputs``: the residual."""
    residual = outputs - targets
    return residual

def one_hot(inputs):
    """One-hot encode integer class ids, one column per sample.

    The number of rows is ``max(inputs) + 1``; column ``j`` has a single 1 in
    row ``inputs[j]``.
    """
    identity = np.eye(np.max(inputs) + 1)
    return identity[:, inputs]

In [174]:
# 1 input -> 2 hidden units -> 1 output, linear throughout, with a light L1 penalty.
layer_sizes = [1, 2, 1]

ffnn = FFNN(
    layer_sizes,
    Linear, deriv_Linear,      # hidden activation and its derivative
    Linear, deriv_Linear,      # output activation and its derivative
    L2, deriv_L2,              # loss and its derivative
    task_type='regression',
    lambda_val=0.001,
)

# Inputs are passed as (features, samples); targets stay as (samples, 1).
ffnn.train(X.T, Y, learning_rate=1e-6, epochs=8000, batch_size=700)

Epoch 1/20000 - Loss: 693194.3364847661
Epoch 101/20000 - Loss: 10617.22514558033
Epoch 201/20000 - Loss: 3668.0836079598935
Epoch 301/20000 - Loss: 1166.7492341661632
Epoch 401/20000 - Loss: 343.3090315443343
Epoch 501/20000 - Loss: 95.74325269306856
Epoch 601/20000 - Loss: 25.86441559559123
Epoch 701/20000 - Loss: 6.868780831522536
Epoch 801/20000 - Loss: 1.8113914478503192
Epoch 901/20000 - Loss: 0.47971768537575515
Epoch 1001/20000 - Loss: 0.13108179614017662
Epoch 1101/20000 - Loss: 0.04007646058116789
Epoch 1201/20000 - Loss: 0.016356096892773724
Epoch 1301/20000 - Loss: 0.010177645171066271
Epoch 1401/20000 - Loss: 0.008568674287240279
Epoch 1501/20000 - Loss: 0.008149599250340568
Epoch 1601/20000 - Loss: 0.008040376737096877
Epoch 1701/20000 - Loss: 0.008011870048017919
Epoch 1801/20000 - Loss: 0.008004408252624627
Epoch 1901/20000 - Loss: 0.008002443432691552
Epoch 2001/20000 - Loss: 0.008001919580181073
Epoch 2101/20000 - Loss: 0.008001776083232534
Epoch 2201/20000 - Loss: 0.

Epoch 17801/20000 - Loss: 0.008001387078006949
Epoch 17901/20000 - Loss: 0.008001384967457663
Epoch 18001/20000 - Loss: 0.008001382856908514
Epoch 18101/20000 - Loss: 0.008001380746359654
Epoch 18201/20000 - Loss: 0.008001378635810899
Epoch 18301/20000 - Loss: 0.008001376525262368
Epoch 18401/20000 - Loss: 0.008001374414714104
Epoch 18501/20000 - Loss: 0.008001372304165994
Epoch 18601/20000 - Loss: 0.00800137019361801
Epoch 18701/20000 - Loss: 0.008001368083070236
Epoch 18801/20000 - Loss: 0.008001365972522741
Epoch 18901/20000 - Loss: 0.008001363861975388
Epoch 19001/20000 - Loss: 0.008001361751428187
Epoch 19101/20000 - Loss: 0.008001359640881178
Epoch 19201/20000 - Loss: 0.008001357530334442
Epoch 19301/20000 - Loss: 0.008001355419787826
Epoch 19401/20000 - Loss: 0.00800135330924131
Epoch 19501/20000 - Loss: 0.008001351198694991
Epoch 19601/20000 - Loss: 0.008001349088148915
Epoch 19701/20000 - Loss: 0.00800134697760296
Epoch 19801/20000 - Loss: 0.008001344867057262
Epoch 19901/2000

In [175]:
'''check the conversion capabilities'''
# Closed-form conversion of the training inputs versus the network's prediction.
cel_real = (5/9) * (X.T - 32)
cel_pred, _, _ = ffnn.forward(X.T)

print(L2(cel_pred.T, cel_real.T))

1.1486846122167238e-09


In [176]:
''' print weights and biases to begin grokking the network '''
print(ffnn.W, ffnn.b, sep="\n")

# Pull the individual scalars out of the 1-2-1 network.
# Hidden layer: one weight and one bias per hidden unit.
w11 = ffnn.W[0][0, 0]
w12 = ffnn.W[0][1, 0]
bh1 = ffnn.b[0][0, 0]
bh2 = ffnn.b[0][1, 0]
# Output layer: one weight per hidden unit plus a single bias.
w21 = ffnn.W[1][0, 0]
w22 = ffnn.W[1][0, 1]
bo = ffnn.b[1][0, 0]


[array([[1.24669922],
       [1.05507221]]), array([[-2.37117486,  3.32839323]])]
[array([[ 2.75109627],
       [-2.90859269]]), array([[-1.57350588]])]


In [177]:
def learned_conversion(f):
    """Evaluate the trained 1-2-1 linear network by hand for input ``f``.

    Uses the module-level scalars ``w11, w12, w21, w22, bh1, bh2, bo`` that were
    extracted from ``ffnn``.
    """
    hidden = (w11 * f + bh1, w12 * f + bh2)
    return w21 * hidden[0] + w22 * hidden[1] + bo

# Spot-check the hand-expanded network on a few inputs; 32 is included because
# the exact formula (f - 32) * 5/9 gives 0 there.
temps = [-5, 0, 5, 32]
for t in temps:
    print(learned_conversion(t))

-20.555543161614352
-17.777776424618622
-15.000009687622898
-6.930784598901951e-05


In [178]:
'''learned_conversion is equivalent to the following:
(w21*w11 + w22*w12)*f + (w21*bh1 + w22*bh2 + bo) '''

# Collapse the two linear layers into a single slope and intercept.
print("learned slope = ", w21*w11 + w22*w12)
print("learned intercept = ", w21*bh1 + w22*bh2 + bo)

'''actual formula is (f - 32) * 5/9 
aka, (5/9)*f - 32*(5/9)'''

# Reference values from the exact conversion formula.
print("actual slope = ", 5/9)
print("actual intercept = ", -32*(5/9))

learned slope =  0.5555533473991447
learned intercept =  -17.777776424618622
actual slope =  0.5555555555555556
actual intercept =  -17.77777777777778


In [284]:
''' as a study of L1 regularization, attempt to use a single network to 
    differentiate between different equations '''

def expr1(x, n1, n2, n3, n4):
    """Linear model ``n1*x + n2``; ``n3`` and ``n4`` are accepted but ignored.

    Returns ``(y, (n1, n2))`` -- the values and the coefficients actually used.
    """
    used = (n1, n2)
    y = n1*x + n2
    return y, used

def expr2(x, n1, n2, n3, n4):
    """Quadratic model ``n1*x**2 + n2*x + n3``; ``n4`` is accepted but ignored.

    Returns ``(y, (n1, n2, n3))`` -- the values and the coefficients actually used.
    """
    used = (n1, n2, n3)
    y = n1*x**2 + n2*x + n3
    return y, used

def expr3(x, n1, n2, n3, n4):
    """Model ``n1*sin(x) + n2*x**2``; ``n3`` and ``n4`` are accepted but ignored.

    Returns ``(y, (n1, n2))`` -- the values and the coefficients actually used.
    """
    used = (n1, n2)
    y = n1*np.sin(x) + n2*x**2
    return y, used

def expr4(x, n1, n2, n3, n4):
    """Model ``n1*sin(x) + n2*x**2 + n3*x + n4`` using all four coefficients.

    Returns ``(y, (n1, n2, n3, n4))``.
    """
    used = (n1, n2, n3, n4)
    y = n1*np.sin(x) + n2*x**2 + n3*x + n4
    return y, used

def generate_data(n, expression, low=-1.5, high=1.5):
    """Sample inputs and evaluate a randomly parameterized expression on them.

    Args:
        n: Number of samples to draw.
        expression: Callable ``expression(x, n1, n2, n3, n4)`` returning
            ``(y, params_used)``, e.g. one of ``expr1`` .. ``expr4``.
        low, high: Bounds of the uniform distribution the inputs are drawn
            from. The defaults reproduce the previously hard-coded interval.

    Returns:
        ``(X, Y, params)`` -- the sampled inputs, the expression values, and the
        coefficients the expression reports having used.
    """
    X = np.random.uniform(low=low, high=high, size=n)
    # Four standard-normal coefficients; each expression picks the ones it needs.
    n1, n2, n3, n4 = np.random.randn(4)
    Y, params = expression(X, n1, n2, n3, n4)
    return X, Y, params

# Draw a random instance of expr3 and show the coefficients it used.
X, Y, params = generate_data(5000, expr3)

print(params)

sin = np.sin(X)
square = X**2


'''add in extra features to capture squares and sins'''
X = np.column_stack((X, square, sin))

(0.20714478025367813, -0.10560242967165585)


In [285]:
'''train model'''
# Three engineered features -> one hidden unit -> one output, with a strong L1 penalty.
layer_sizes = [3, 1, 1]
ffnn2 = FFNN(
    layer_sizes,
    Linear, deriv_Linear,      # hidden activation and its derivative
    Linear, deriv_Linear,      # output activation and its derivative
    L2, deriv_L2,              # loss and its derivative
    task_type='regression',
    lambda_val=0.2,
)
ffnn2.train(X.T, Y, learning_rate=5e-5, epochs=12000, batch_size=700)

Epoch 1/12000 - Loss: 487.0904087675701
Epoch 101/12000 - Loss: 0.7825122745583735
Epoch 201/12000 - Loss: 0.31476867191643954
Epoch 301/12000 - Loss: 0.2770520559691102
Epoch 401/12000 - Loss: 0.2725081501388098
Epoch 501/12000 - Loss: 0.2705915149248239
Epoch 601/12000 - Loss: 0.26883076088705266
Epoch 701/12000 - Loss: 0.26711613918586025
Epoch 801/12000 - Loss: 0.26547573043488837
Epoch 901/12000 - Loss: 0.26392239072971985
Epoch 1001/12000 - Loss: 0.26245860745689936
Epoch 1101/12000 - Loss: 0.2610830322065726
Epoch 1201/12000 - Loss: 0.2597929616752977
Epoch 1301/12000 - Loss: 0.2584540009668695
Epoch 1401/12000 - Loss: 0.2577559558890432
Epoch 1501/12000 - Loss: 0.2570022425594326
Epoch 1601/12000 - Loss: 0.25618406293340895
Epoch 1701/12000 - Loss: 0.25535422807902886
Epoch 1801/12000 - Loss: 0.25454134597188904
Epoch 1901/12000 - Loss: 0.2537392459579349
Epoch 2001/12000 - Loss: 0.2529478827183936
Epoch 2101/12000 - Loss: 0.25218972064411627
Epoch 2201/12000 - Loss: 0.25142898

In [286]:
'''check conversion capabilities (expr3) '''
# Pad the reported coefficients with zeros so the expression always gets four.
args = (tuple(params) + (0, 0, 0))[:4]
# Column 0 of X holds the raw inputs; the other columns are derived features.
real, _ = expr3(X[:, 0], *args)

pred, _, _ = ffnn2.forward(X.T)

print(L2(pred, real))

0.0032652086556610653


In [287]:
# Show the learned weights, then the biases.
print(ffnn2.W, ffnn2.b, sep="\n")
# threshold = 1e-4  # for example

# for idx, w in enumerate(ffnn2.W):
#     ffnn2.W[idx] = np.where(abs(w) < threshold, 0, w)

[array([[-0.00646479,  0.20738773, -0.40131432]]), array([[-0.50332773]])]
[array([[-0.05760433]]), array([[-0.02989564]])]


In [245]:
# Fold the single hidden unit into the output layer: the product of the first
# input weight and the output weight, and the combined bias.
learned_param1 = ffnn2.W[0][0, 0] * ffnn2.W[1][0, 0]
learned_param2 = ffnn2.b[0][0, 0] * ffnn2.W[1][0, 0] + ffnn2.b[1][0, 0]

print(params)
print(learned_param1, learned_param2)

(-0.1463186304698898, -0.7180350821061807, 0.8901555607951912)
0.35199804019794356 -1.389567191087169


In [244]:
def generate_data(n, expression, low=0.0, high=1.5):
    """Sample inputs and evaluate a randomly parameterized expression on them.

    Args:
        n: Number of samples to draw.
        expression: Callable ``expression(x, n1, n2, n3, n4)`` returning
            ``(y, params_used)``.
        low, high: Bounds of the uniform distribution the inputs are drawn
            from. The defaults reproduce the previously hard-coded interval.

    Returns:
        ``(X, Y, params)`` -- the sampled inputs, the expression values, and the
        coefficients the expression reports having used.
    """
    X = np.random.uniform(low=low, high=high, size=n)
    # Four standard-normal coefficients; each expression picks the ones it needs.
    n1, n2, n3, n4 = np.random.randn(4)
    Y, params = expression(X, n1, n2, n3, n4)
    return X, Y, params
# Draw a random instance of expr2 on the sampled inputs.
X, Y, params = generate_data(5000, expr2)
sin = np.sin(X)
square = X**2

'''add in extra features to capture squares and sins'''
X = np.column_stack((X, square, sin))