In [None]:
!pip install ucimlrepo #for specific dataset
import numpy as np
import random

In [None]:
#assume that you're dealing with a 28x28 pixel image
#try using this as your dataset: https://www.nist.gov/srd/nist-special-database-19
#just code the skeleton first and then handle how you'll deal with the data.
#might have to work with letters instead of numbers since lecun took down the mnist dataset with handwritten digits... :(

#make sure you use classes and create test functions which test the code

#step 1: code the skeleton and see if it spits out random numbers correctly. this part is super-duper easy; include an activation function (ReLU or sigmoid squishification)

#step 2: implement gradient descent via backpropagation

#step 3: figure out how to interface data with the algorithm

#step 4: train the network by dividing data up into mini-batches (stochastic gradient descent)

#ok so coded the training function. next step is conditioning the data in the format that i specified in the training function


#training on the iris data set for simplicity.
#Fisher, R. (1936). Iris [Dataset]. UCI Machine Learning Repository. https://doi.org/10.24432/C56C76.


In [None]:
class myNN():
  """Fully connected feed-forward network with sigmoid activations.

  The cost for one example is the squared error sum((output - outcomes)**2).
  All activations, biases and gradients are handled as column vectors of
  shape (n, 1); weight matrices have shape (outputs, inputs).
  """

  def __init__(self, cfg):
    """Builds a randomly initialized network.

    cfg: sequence of layer sizes, e.g. [4, 4, 4, 3] -> 4 inputs, two hidden
         layers of 4 nodes each, 3 outputs.
    """
    self.cfg = cfg
    self.hidden = len(cfg) - 2 #entries in cfg minus the input and output layers, which gives the number of hidden layers
    self.input_dim = cfg[0]
    self.output_dim = cfg[-1]
    #(y,x) since each matrix maps x inputs (columns) onto y outputs (rows); entries are drawn uniformly from [0, 1)
    self.weight = [np.random.rand(y,x) for x,y in zip(cfg[:-1], cfg[1:])]
    #every non-input node gets its own bias
    self.bias = [np.random.rand(x,1) for x in cfg[1:]]

  def mod_weight(self, m: list, p: float):
    """Adds p*m[i] to the i-th weight matrix (pass a negative p to step against a gradient).

    Raises TypeError if m is not a list or p is not a float, and ValueError if
    m does not hold exactly one entry per weight matrix.
    """
    if not isinstance(m, list):
      raise TypeError("m must be a list!")
    if not isinstance(p, float):
      raise TypeError("p must be a float!")
    if len(m) != len(self.weight):
      #a shorter list would silently drop the trailing layers of the network
      raise ValueError(f"m must have one entry per weight matrix! got {len(m)}, expected {len(self.weight)}")
    self.weight = [w + p*x for w, x in zip(self.weight, m)]

  def mod_bias(self, b: list, p: float):
    """Adds p*b[i] to the i-th bias vector (pass a negative p to step against a gradient).

    Raises TypeError if b is not a list or p is not a float, and ValueError if
    b does not hold exactly one entry per bias vector.
    """
    if not isinstance(b, list):
      raise TypeError("b must be a list!")
    if not isinstance(p, float):
      raise TypeError("p must be a float!")
    if len(b) != len(self.bias):
      #a shorter list would silently drop the trailing layers of the network
      raise ValueError(f"b must have one entry per bias vector! got {len(b)}, expected {len(self.bias)}")
    self.bias = [old + p*x for old, x in zip(self.bias, b)]

  def activations(self, activ: np.array):
    """Forward pass: given the input activations, evaluates every layer.

    Returns (firing, change):
      firing[k] - activations of layer k, with firing[0] being the input
      change[k] - sigmoid derivative at the pre-activation of layer k for
                  k >= 1; change[0] is just the input, kept as a placeholder
                  so both lists share the same indexing
    Raises ValueError if activ does not hold input_dim values.
    """
    if activ.size != self.input_dim:
      raise ValueError(f"inconsistent input dimension! got {activ.size}, expected {(self.input_dim,1)}")
    #force a column vector: a flat (n,) input would otherwise broadcast against the (n,1) bias into an (n,n) matrix
    activ = np.reshape(activ, (self.input_dim, 1))
    firing = [activ] #activations - beginning with the input layer
    change = [activ] #derivative of the sigmoid function (index 0 is the placeholder)
    for w, b in zip(self.weight, self.bias):
      z = w @ firing[-1] + b
      #sigmoid written as exp(-log(1 + exp(-z))) so large |z| cannot overflow
      s = np.exp(-np.logaddexp(0.0, -z))
      firing.append(s)
      #sigmoid'(z) = s*(1 - s); the exp(-z)/(1 + exp(-z))**2 form evaluates to inf/inf = nan for very negative z
      change.append(s*(1.0 - s))
    return firing, change

  def backpropagate(self, activ: np.array, outcomes: np.array):
    """Computes the gradient of the squared-error cost for one example.

    Returns (dCw, dCb): lists ordered from the first layer to the output
    layer, where dCw[k] has the shape of weight[k] and dCb[k] the shape of
    bias[k].
    Raises ValueError if activ or outcomes have the wrong number of entries.
    """
    if activ.size != self.input_dim:
      raise ValueError(f"inconsistent input dimension! got {(activ.size,1)}, expected {(self.input_dim,1)}")
    if outcomes.size != self.output_dim:
      raise ValueError(f"inconsistent output dimension! got {(outcomes.size,1)}, expected {(self.output_dim,1)}")
    outcomes = np.reshape(outcomes, (self.output_dim, 1))
    firing, change = self.activations(activ)
    #delta = dC/dz at the output layer: derivative of the squared error times the sigmoid derivative
    delta = 2*(firing[-1] - outcomes)*change[-1]
    dCw = []
    dCb = []
    #walk from the output layer back to the first layer; weight[k] maps firing[k] onto firing[k+1]
    for k in reversed(range(len(self.weight))):
      dCw.append(delta @ firing[k].T) #delta is a column, firing[k].T a row -> matrix shaped like weight[k]
      dCb.append(delta) #dz/db = 1, so the bias gradient is exactly delta
      if k > 0:
        #push delta one layer back through the weights, then through that layer's sigmoid
        delta = (self.weight[k].T @ delta)*change[k]
    return dCw[::-1], dCb[::-1] #reversing since the first appended entry corresponds to the output layer

  def new(self, activ: np.array, outcomes: np.array, dxw: float, dxb):
    """Takes one gradient-descent step on a single example.

    dxw and dxb are the step sizes for the weights and biases; the step is
    taken along the NEGATIVE of the gradient computed by backpropagate.
    Raises TypeError if dxw or dxb is not a float.
    """
    if not isinstance(dxw, float):
      raise TypeError("descritization dxw must be a float!")
    if not isinstance(dxb, float):
      raise TypeError("descritization dxb must be a float!")
    w, b = self.backpropagate(activ, outcomes)
    self.mod_weight(w, -dxw)
    self.mod_bias(b, -dxb)

  def run(self, activ: np.array, outcomes: np.array):
    """Runs the network on one example.

    Returns (output, cost): the output-layer activations as a column vector
    and the squared-error cost against outcomes (an array with one entry).
    Raises ValueError if activ or outcomes have the wrong number of entries.
    """
    if outcomes.size != self.output_dim:
      raise ValueError(f"inconsistent input dimension! got {(outcomes.size,1)}, expected {(self.output_dim,1)}")
    outcomes = np.reshape(outcomes, (self.output_dim, 1))
    activation, _ = self.activations(activ)
    return activation[-1], sum((activation[-1] - outcomes)**2) #gives the cost function output as well

#batched training function
#runs a while loop which stops once the cost function is below some value and the change in the cost function is below some value
#the while loop re-initializes the network in a new position if the change in the cost function is below its threshold but the cost function itself isn't below the other value
#takes in data, batches it, averages the gradients over the batch, then steps the weights and biases along the negative of that average using the mod_weight and mod_bias methods of the neural network class

def training(NN_structure, dataset, answer_key, batch_size, batch_number, C_min, dC_min, dxw, dxb, max_restarts=None):
  """Trains a fresh myNN with mini-batch gradient descent and random restarts.

  NN_structure: list of layer sizes handed to myNN
  dataset:      list of vectors, where each (column) vector holds the input activations for one example
  answer_key:   list of vectors, where each (column) vector corresponds to the expected values for one example
  batch_size:   number of examples averaged into one gradient step
  batch_number: number of batches a single initialization may use before the out-of-budget check kicks in
  C_min:        training stops once the mean batch cost is no larger than this
  dC_min:       threshold on the change in mean batch cost produced by one step
  dxw, dxb:     step sizes for the weights and the biases
  max_restarts: optional cap on the number of re-initializations; None (the default) means unlimited

  Returns (network, cost, cost_change, before, after), where cost and
  cost_change refer to the last batch processed and before/after are the
  accuracies (in percent) over the whole dataset before and after training.
  Raises ValueError on an empty dataset, mismatched dataset/answer_key
  lengths, batch_size < 1 or a negative max_restarts.
  """
  if len(dataset) == 0:
    raise ValueError("dataset must contain at least one example!")
  if len(dataset) != len(answer_key):
    raise ValueError(f"dataset and answer_key must have the same length! got {len(dataset)} and {len(answer_key)}")
  if batch_size < 1:
    raise ValueError("batch_size must be at least 1!")
  if max_restarts is not None and max_restarts < 0:
    raise ValueError("max_restarts must be None or non-negative!")

  n_examples = len(dataset)
  network = myNN(NN_structure)

  #function for testing the accuracy of the algorithm
  def test_acc():
    #rounds every output entry to 0 or 1 and counts an example as correct only if the whole rounded vector equals its answer
    num_correct = 0 #number of correct answers
    for example, expected in zip(dataset, answer_key):
      output, _ = network.run(example, expected) #the cost returned by run is not needed here
      if (expected == np.round(output)).all():
        num_correct += 1
    accuracy = num_correct*100/n_examples
    print(f"accuracy: {accuracy}%")
    return accuracy

  def batch_cost(indices):
    #mean cost of the current network over the given examples
    return sum(network.run(dataset[j], answer_key[j])[1] for j in indices)/len(indices)

  #running the test before training
  before = test_acc()

  #training
  cost = C_min + 1 #sentinels which guarantee that the loop is entered
  cost_change = dC_min + 1
  i = 0 #batches processed since the last (re)initialization
  restarts = 0
  while cost > C_min:
    #batch i covers its own consecutive slice of the data and wraps around at the end of the dataset,
    #so batches neither overlap nor index past the last example
    indices = [(i*batch_size + k) % n_examples for k in range(batch_size)]
    old_cost = batch_cost(indices) #measured before the step so that cost_change compares like with like

    sum_dCw = None
    sum_dCb = None
    for j in indices:
      dCw, dCb = network.backpropagate(dataset[j], answer_key[j])
      if sum_dCw is None:
        sum_dCw, sum_dCb = dCw, dCb
      else:
        sum_dCw = [acc + g for acc, g in zip(sum_dCw, dCw)]
        sum_dCb = [acc + g for acc, g in zip(sum_dCb, dCb)]
    avg_dCw = [g/batch_size for g in sum_dCw] #taking the average
    avg_dCb = [g/batch_size for g in sum_dCb]
    network.mod_weight(avg_dCw, -float(dxw)) #taking negative since we need negative of the gradient
    network.mod_bias(avg_dCb, -float(dxb))

    cost = batch_cost(indices) #updating the cost
    cost_change = abs(cost - old_cost) #ie, new_cost - old_cost
    i += 1

    if i > batch_number: #we used up the batches allotted to this initialization
      if cost_change > dC_min:
        restart = True #still moving but out of budget -> try a new starting point
      else:
        break
    elif cost_change < dC_min:
      if cost > C_min:
        restart = True #stalled while the cost is still too high -> try a new starting point
      else:
        break
    else:
      continue

    if restart:
      if max_restarts is not None and restarts >= max_restarts:
        break #give up and keep the current network instead of restarting forever
      restarts += 1
      network = myNN(NN_structure) #ie, reinitializing
      i = 0
      cost_change = dC_min + 1
      cost = C_min + 1

  print(f"finished training after {i} batches")

  #running the test after training
  after = test_acc()
  return network, cost, cost_change, before, after

#write a testing function which measures how well the algorithm does. may need to move everything into a class because i'll need to keep track of the first
#instance of the neural network and compare it to the instance after it has been trained...
#or i can just write it as part of the training function... because that already has one defined 'network' that we can work with

Implementing training on the Iris dataset specifically

In [None]:
#pulling the dataset
from ucimlrepo import fetch_ucirepo

# download the Iris dataset (id 53 in the UCI repository)
iris = fetch_ucirepo(id=53)

# unpack the two pandas dataframes in one go:
# X holds the attributes based on which we will be determining the flower, y holds the answers
X, y = iris.data.features, iris.data.targets

# metadata
#print(iris.metadata)

# variable information
#print(iris.variables)

In [None]:
#conditioning the data
SEED = 0 #fixed seed so that the shuffle and the random weight initialization give the same result on every run
random.seed(SEED)
np.random.seed(SEED)

data = X.to_numpy()
data = [row.reshape(-1, 1) for row in data] #making every row a column
num = 3
vecs = np.eye(num) #column i of the identity is the one-hot answer vector for class i
answers = y.to_numpy()
#explicit label -> class index table; an unexpected label raises a KeyError instead of silently being counted as the last class
label_to_index = {'Iris-setosa': 0, 'Iris-versicolor': 1, 'Iris-virginica': 2}
ans_to_vec = [vecs[:, label_to_index[str(label)]].reshape(num,1) for label in answers.ravel()]

#pairing every example with its answer so that we can mix them together (then separate them after)
mixer = list(zip(data, ans_to_vec))
random.shuffle(mixer)

#now separating them (keeping the mixed set because it will be useful for later)
dataset = [pair[0] for pair in mixer]
answer_key = [pair[1] for pair in mixer]
NN_structure = [dataset[0].size, 4, 4, answer_key[0].size]
batch_size = 3
batch_number = 30
dxw = 0.7
dxb = 0.7
C_min = 0.1
dC_min = 0.1

#running the training function
network, cost, cost_change, before, after = training(NN_structure, dataset, answer_key, batch_size, batch_number, C_min, dC_min, dxw, dxb)

accuracy: 0.0%
finished training after 5 batches
accuracy: 33.333333333333336%
