In [52]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from PIL import Image

In [103]:
def generate_temp_data(n):
    farenheit = np.random.rand(n).reshape(n,1)
    celcius = (farenheit-32) * (5/9)
    return farenheit, celcius

X, Y = generate_temp_data(5000)
print(X[0:10])
print(Y[0:10])

[[0.4607624 ]
 [0.10815563]
 [0.5025832 ]
 [0.25980607]
 [0.82426736]
 [0.21169986]
 [0.25958015]
 [0.90772114]
 [0.98437963]
 [0.31629536]]
[[-17.52179867]
 [-17.71769132]
 [-17.49856489]
 [-17.63344107]
 [-17.31985147]
 [-17.66016675]
 [-17.63356658]
 [-17.27348826]
 [-17.23090021]
 [-17.60205813]]


In [3]:
def generate_class_data(n):
    dataset = []
    labels = []
    X = np.random.uniform(0,1,(n,2))
    for point in X:
        if point[1] < 0.5 and point[1] < (-1/0.5)*point[0] + 1:
            labels.append(0)
        else:
            labels.append(1)
            
    return X, np.array(labels).astype('int')

#X, labels = generate_class_data(5000)

In [68]:
class FFNN:
    def __init__(self, layer_sizes, 
                 activation_fn, activation_fn_deriv, 
                 output_activation_fn, output_activation_fn_deriv, 
                 loss_fn, loss_fn_deriv, task_type='classification', lambda_val = 0):
        
        self.num_layers = len(layer_sizes)
        self.layer_sizes = layer_sizes
        self.activation_fn = activation_fn
        self.activation_fn_deriv = activation_fn_deriv
        self.output_activation_fn = output_activation_fn
        self.output_activation_fn_deriv = output_activation_fn_deriv
        self.loss_fn = loss_fn
        self.loss_fn_deriv = loss_fn_deriv
        self.task_type = task_type
        self.lambda_val = lambda_val
        
        # Initialize weights and biases (He initialization)
        self.W = [np.random.randn(layer_sizes[i+1], layer_sizes[i]) * np.sqrt(2./layer_sizes[i]) 
                  for i in range(self.num_layers-1)]
        self.b = [np.random.randn(layer_sizes[i+1], 1)*0 for i in range(self.num_layers-1)]


    def forward(self, x):
        a = [x]
        z = []
        
        # Hidden layers
        for i in range(self.num_layers-2):
            z_i = self.W[i].dot(a[i]) + self.b[i]
            a_i = self.activation_fn(z_i)
            z.append(z_i)
            a.append(a_i)
        
        # Output layer
        z_out = self.W[-1].dot(a[-1]) + self.b[-1]
        a_out = self.output_activation_fn(z_out)
        z.append(z_out)
        a.append(a_out)
        
        return a_out, a, z
    
    def compute_loss(self, predictions, t):
        # Ensure the target shape matches the predictions
        if t.shape[0] != predictions.shape[0]:
            t = t.T

        # Original loss without regularization
        loss_without_reg = self.loss_fn(predictions, t)

        # L1 regularization penalty
        l1_penalty = sum([abs(w).sum() for w in self.W])

        # Total loss with regularization
        total_loss = loss_without_reg + self.lambda_val * l1_penalty

        return total_loss
    
    def _one_hot(self, inputs):
        n_values = len(np.unique(inputs))
        return np.eye(n_values)[:,inputs]
    
    def backward(self, x, t, a, z):
        m = x.shape[1]
        dL_dw = [np.zeros_like(w) for w in self.W]
        dL_db = [np.zeros_like(b) for b in self.b]
        
        # Ensure the target shape matches the predictions
        if t.shape[0] != a[-1].shape[0]:
            t = t.T
        
        dL_da = self.loss_fn_deriv(a[-1], t)
        dL_dz = dL_da * self.output_activation_fn_deriv(z[-1])
        
        dL_dw[-1] = dL_dz.dot(a[-2].T)
        dL_db[-1] = np.sum(dL_dz, axis=1, keepdims=True)
        
        for i in range(self.num_layers-3, -1, -1):
            dL_da = self.W[i+1].T.dot(dL_dz)
            dL_dz = dL_da * self.activation_fn_deriv(z[i])
            dL_dw[i] = dL_dz.dot(a[i].T)
            dL_db[i] = np.sum(dL_dz, axis=1, keepdims=True)
        
        return dL_dw, dL_db
    
    def update_weights(self, dL_dw, dL_db, learning_rate): #with L1 regularization
        # Adjust the gradient for L1 regularization
        dL_dw_regularized = [dw + self.lambda_val * np.sign(w) for w, dw in zip(self.W, dL_dw)]

        # Update weights and biases using the adjusted gradient
        self.W = [w - learning_rate * dw_reg for w, dw_reg in zip(self.W, dL_dw_regularized)]
        self.b = [b - learning_rate * db for b, db in zip(self.b, dL_db)]
        
    def train(self, X, labels, learning_rate=0.01, epochs=1000, batch_size=None):
        """Trains the neural network using the given training data and labels."""
        m = X.shape[1]

        for epoch in range(epochs):
            if batch_size:  # If batch size is specified, use mini-batch gradient descent
                num_batches = m // batch_size
                for batch in range(num_batches):
                    X_batch = X[:, batch*batch_size:(batch+1)*batch_size]
                    labels_batch = labels[batch*batch_size:(batch+1)*batch_size]

                    # Forward pass
                    predictions, a_batch, z_batch = self.forward(X_batch)
                    
                    # Backward pass
                    dL_dw, dL_db = self.backward(X_batch, labels_batch, a_batch, z_batch)

                    # Update weights and biases
                    self.update_weights(dL_dw, dL_db, learning_rate)
            else:  # Otherwise, use batch gradient descent
                # Forward pass
                predictions, a_full, z_full = self.forward(X)
                
                # Backward pass
                dL_dw, dL_db = self.backward(X, labels, a_full, z_full)

                # Update weights and biases
                self.update_weights(dL_dw, dL_db, learning_rate)

            # Print loss at the end of each 100 epochs:
            if epoch % 100 == 0:
                # Ensure activations are for the entire dataset
                predictions, _, _ = self.forward(X)
                loss = self.compute_loss(predictions, labels)
                print(f"Epoch {epoch+1}/{epochs} - Loss: {np.mean(loss)}")

In [69]:
def ReLU(inputs):
    return np.maximum(inputs, 0)
def deriv_ReLU(Z):
    return Z>0

def sigmoid(inputs, clip_value=200):
    inputs = np.clip(inputs, -clip_value, clip_value)
    return 1 / (1 + np.exp(-inputs))

def deriv_sigmoid(inputs):
    return(inputs*(1-inputs))

def Linear(inputs):
    return inputs

def deriv_Linear(inputs):
    return inputs*0 + 1

def clipped_ReLU(x, c=1):
    return np.minimum(np.maximum(0, x), c)

def deriv_clipped_ReLU(inputs):
    return (inputs>0) & (inputs<1)

def L2(outputs, targets):
    return 0.5*sum((outputs - targets)**2)

# L1 regularization
def L2_regularized(outputs, targets, weights, lambda_val):
    l1_penalty = sum([abs(w).sum() for w in weights])
    return L2(outputs, targets) + lambda_val * l1_penalty

def deriv_L2(outputs, targets):
    return outputs - targets

def one_hot(inputs):
    n_values = np.max(inputs) + 1
    return np.eye(n_values)[:,inputs]

def softmax(Z):
    shiftZ = Z - np.max(Z, axis=0)
    exps = np.exp(shiftZ)
    return exps / np.sum(exps, axis=0)

def dummy_deriv_softmax(Z):
    return Z

def categorical_cross_entropy(predictions, labels):
    m = labels.shape[1]
    return -np.sum(np.log(predictions) * labels) / m

def deriv_cat_cross_entropy(predictions, labels):
    return predictions - labels  




In [113]:
layer_sizes = [1,2,1]

ffnn = FFNN(layer_sizes = layer_sizes, 
                activation_fn = Linear, 
                activation_fn_deriv = deriv_Linear, 
                output_activation_fn = Linear, 
                output_activation_fn_deriv = deriv_Linear, 
                loss_fn = L2, 
                loss_fn_deriv = deriv_L2,
                task_type='regression',
                lambda_val=0.001)

ffnn.train(X.T, Y, learning_rate=0.000001, epochs = 20000, batch_size=700)

Epoch 1/20000 - Loss: 166.51873817407832
Epoch 101/20000 - Loss: 0.05284816859230499
Epoch 201/20000 - Loss: 0.022261671661424863
Epoch 301/20000 - Loss: 0.011102190067582316
Epoch 401/20000 - Loss: 0.007133921426455541
Epoch 501/20000 - Loss: 0.0057511664761877215
Epoch 601/20000 - Loss: 0.005276911565550089
Epoch 701/20000 - Loss: 0.005116768022213965
Epoch 801/20000 - Loss: 0.005063778810973689
Epoch 901/20000 - Loss: 0.005046813504791723
Epoch 1001/20000 - Loss: 0.005041710526051425
Epoch 1101/20000 - Loss: 0.005040378102920519
Epoch 1201/20000 - Loss: 0.005040164109215569
Epoch 1301/20000 - Loss: 0.0050402318681317315
Epoch 1401/20000 - Loss: 0.005040336944239601
Epoch 1501/20000 - Loss: 0.005040420236203965
Epoch 1601/20000 - Loss: 0.0050404758739295875
Epoch 1701/20000 - Loss: 0.0050405102395024055
Epoch 1801/20000 - Loss: 0.005040530421697264
Epoch 1901/20000 - Loss: 0.005040541713652651
Epoch 2001/20000 - Loss: 0.00504054760308402
Epoch 2101/20000 - Loss: 0.005040550263658572


Epoch 18001/20000 - Loss: 0.005040243058389683
Epoch 18101/20000 - Loss: 0.005040241084991562
Epoch 18201/20000 - Loss: 0.005040239111593907
Epoch 18301/20000 - Loss: 0.005040237138196696
Epoch 18401/20000 - Loss: 0.005040235164799925
Epoch 18501/20000 - Loss: 0.0050402331914036255
Epoch 18601/20000 - Loss: 0.0050402312180078054
Epoch 18701/20000 - Loss: 0.005040229244612472
Epoch 18801/20000 - Loss: 0.0050402272712176
Epoch 18901/20000 - Loss: 0.005040225297823174
Epoch 19001/20000 - Loss: 0.005040223324429211
Epoch 19101/20000 - Loss: 0.005040221351035676
Epoch 19201/20000 - Loss: 0.005040219377642624
Epoch 19301/20000 - Loss: 0.005040217404250051
Epoch 19401/20000 - Loss: 0.005040215430857908
Epoch 19501/20000 - Loss: 0.005040213457466189
Epoch 19601/20000 - Loss: 0.005040211484074948
Epoch 19701/20000 - Loss: 0.005040209510684195
Epoch 19801/20000 - Loss: 0.005040207537293899
Epoch 19901/20000 - Loss: 0.00504020556390405


In [114]:
'''check the conversion capabilities'''
far_set = X[11:21]
# -17.61557342222
cel_set,_,_ = ffnn.forward(far_set.T)
print(far_set.T)
print(cel_set)

[[0.29196784 0.30996519 0.61998636 0.66483295 0.70785362 0.76425623
  0.16574462 0.93456168 0.68778562 0.7586549 ]]
[[-17.61557229 -17.60557385 -17.43334125 -17.40842668 -17.3845265
  -17.35319197 -17.68569574 -17.25857858 -17.39567529 -17.35630379]]


In [115]:
''' print weights and biases to begin grokking the network '''
print(ffnn.W)
print(ffnn.b)
w11, w12, w21, w22 = (ffnn.W[0][0][0], ffnn.W[0][1][0], ffnn.W[1][0][0], ffnn.W[1][0][1])
bh1, bh2, bo = (ffnn.b[0][0][0], ffnn.b[0][1][0], ffnn.b[1][0][0])


[array([[0.22621464],
       [0.40600997]]), array([[ 3.70949574, -0.69848325]])]
[array([[-3.94901042],
       [ 0.59814947]]), array([[-2.7111406]])]


In [117]:
def learned_conversion(f):
    h1 = w11 * f + bh1
    h2 = w12 * f + bh2
    o = w21 * h1 + w22 * h2 + bo
    return o

temps = [-5, 0, 5, 32]
for t in temps:
    print(learned_conversion(t))

-20.555530784630438
-17.777775344444525
-15.000019904258611
-0.00014052725467905702


In [121]:
'''learned_conversion is equivalent to the following:
(w21*w11 + w22*w12)*f + (w21*bh1 + w22*bh2 + bo) '''

print("learned slope = ", (w21*w11 + w22*w12))
print("learned intercept = ", (w21*bh1 + w22*bh2 + bo))

'''actual formula is (f - 32) * 5/9 
aka, (5/9)*f - 32*(5/9)'''

print("actual slope = ", 5/9)
print("actual intercept = ", (5/9)*-32)

learned slope =  0.5555510880371828
learned intercept =  -17.777775344444525
actual slope =  0.5555555555555556
actual intercept =  -17.77777777777778
