In [216]:
import math
import random
random.seed(20251229)

class Value:
  """Scalar node of a computation graph with reverse-mode automatic differentiation.

  Every arithmetic operation on a Value returns a new Value that remembers its
  operands (`_prev`), the operation that produced it (`_op`) and a closure
  (`_backward`) that adds this node's gradient contribution to its operands.
  Calling `backward()` on a result node fills in `grad` for every node it
  depends on.

  Attributes:
    data: the wrapped number.
    grad: derivative of the node `backward()` was called on with respect to
      this node; gradients accumulate until they are reset to 0.
    label: optional name, shown in `repr`.
  """

  def __init__(self, data, _children=(), _op='', label=''):
    self.data = data
    self.grad = 0
    self.label = label
    self._backward = lambda: None  # leaf nodes have nothing to propagate
    self._prev = set(_children)
    self._op = _op

  def __add__(self, other): # self + other
    """Return self + other; plain numbers are wrapped in a Value."""
    other = other if isinstance(other, Value) else Value(other)
    out = Value(self.data + other.data, (self, other), '+')

    def _backward():
      self.grad += out.grad
      other.grad += out.grad
    out._backward = _backward

    return out

  def __mul__(self, other): # self * other
    """Return self * other; plain numbers are wrapped in a Value."""
    other = other if isinstance(other, Value) else Value(other)
    out = Value(self.data * other.data, (self, other), '*')

    def _backward():
      self.grad += other.data * out.grad
      other.grad += self.data * out.grad
    out._backward = _backward

    return out

  def __pow__(self, other): # self ** other
    """Return self raised to a constant int/float exponent."""
    assert isinstance(other, (int, float))
    out = Value(self.data ** other, (self,), f'**{other}')

    def _backward():
      self.grad += other * self.data ** (other - 1) * out.grad
    out._backward = _backward

    return out

  def relu(self):
    """Return max(0, self) as a new node."""
    out = Value(0 if self.data < 0 else self.data, (self,), 'ReLU')

    def _backward():
      # The boolean acts as a 0/1 mask on the incoming gradient.
      self.grad += (out.data > 0) * out.grad
    out._backward = _backward

    return out

  def tanh(self):
    """Return the hyperbolic tangent of self as a new node."""
    # math.tanh saturates to +/-1 for large inputs, whereas building tanh from
    # math.exp(2 * x) raises OverflowError once 2 * x exceeds roughly 709.
    t = math.tanh(self.data)
    out = Value(t, (self,), 'tanh')

    def _backward():
      self.grad += (1 - t**2) * out.grad
    out._backward = _backward

    return out

  def exp(self):
    """Return e ** self as a new node."""
    out = Value(math.exp(self.data), (self,), 'exp')

    def _backward():
      # d/dx exp(x) = exp(x), which is the value already stored in out.
      self.grad += out.data * out.grad
    out._backward = _backward

    return out

  def sig(self):
    """Return the logistic sigmoid 1 / (1 + e ** -self) as a new node."""
    x = self.data
    # Only ever exponentiate a non-positive number so math.exp cannot overflow:
    # the textbook form evaluates exp(-x), which raises OverflowError for
    # x below roughly -709.
    if x >= 0:
      sig = 1.0 / (1.0 + math.exp(-x))
    else:
      e = math.exp(x)
      sig = e / (1.0 + e)
    out = Value(sig, (self,), 'sig')

    def _backward():
      self.grad += sig * (1.0 - sig) * out.grad
    out._backward = _backward

    return out

  def log(self):
    """Return the natural logarithm of self as a new node.

    math.log raises ValueError when self.data is not positive.
    """
    out = Value(math.log(self.data), (self,), 'log')

    def _backward():
      self.grad += 1.0 / self.data * out.grad
    out._backward = _backward

    return out

  def __neg__(self): # -self
    return self * -1

  def __radd__(self, other): # other + self
    return self + other

  def __sub__(self, other): # self - other
    return self + (-other)

  def __rsub__(self, other): # other - self
    return other + (-self)

  def __rmul__(self, other): # other * self
    return self * other

  def __truediv__(self, other): # self / other
    return self * other**-1

  def __rtruediv__(self, other): # other / self
    return other * self**-1

  def backward(self):
    """Backpropagate from this node, accumulating `grad` on every ancestor.

    Sets this node's own gradient to 1 and then runs every node's `_backward`
    closure in reverse topological order, so a node's gradient is complete
    before it is pushed on to its operands.
    """
    # Post-order depth-first traversal with an explicit stack: long operation
    # chains (e.g. a sum over many terms) would otherwise exceed Python's
    # recursion limit.
    topo = []
    visited = {self}
    stack = [(self, iter(self._prev))]
    while stack:
      node, children = stack[-1]
      for child in children:
        if child not in visited:
          visited.add(child)
          stack.append((child, iter(child._prev)))
          break  # descend into this child first; resume the iterator later
      else:
        # Every operand of `node` has been emitted, so `node` can follow them.
        topo.append(node)
        stack.pop()

    # go one variable at a time and apply the chain rule to get its gradient
    self.grad = 1
    for v in reversed(topo):
      v._backward()

  def __repr__(self):
    if self.label:
      return f"Value(data={self.data}, grad={self.grad}, label={self.label})"
    else:
      return f"Value(data={self.data}, grad={self.grad})"

class Module:
  """Base class for objects that expose a list of trainable parameters."""

  def parameters(self):
    """Return the trainable parameters; the base class has none."""
    return list()

  def zero_grad(self):
    """Reset the gradient of every parameter to 0."""
    for param in self.parameters():
      param.grad = 0

class Neuron(Module):
  """Single neuron: weighted sum of the inputs plus a bias, optionally passed through a sigmoid."""

  def __init__(self, nin, nonlin=True):
    """Create `nin` weights drawn uniformly from [-1, 1] and a bias of 0."""
    self.w = []
    for _ in range(nin):
      self.w.append(Value(random.uniform(-1, 1)))
    self.b = Value(0)
    self.nonlin = nonlin

  def __call__(self, x):
    """Return the activation for the input sequence `x`."""
    # Start from the bias and add one weighted input at a time.
    act = self.b
    for wi, xi in zip(self.w, x):
      act = act + wi * xi
    if self.nonlin:
      return act.sig()
    return act

  def parameters(self):
    """Return the weights followed by the bias."""
    return [*self.w, self.b]

  def __repr__(self):
    kind = 'Sigmoid' if self.nonlin else 'Linear'
    return f"{kind}Neuron({len(self.w)})"

class Layer(Module):
  """A group of `nout` neurons that all receive the same `nin` inputs."""

  def __init__(self, nin, nout, **kwargs):
    """Build `nout` neurons; extra keyword arguments are forwarded to Neuron."""
    self.neurons = []
    for _ in range(nout):
      self.neurons.append(Neuron(nin, **kwargs))

  def __call__(self, x):
    """Evaluate every neuron on `x`; a single-neuron layer returns the bare output instead of a list."""
    outputs = [neuron(x) for neuron in self.neurons]
    if len(outputs) == 1:
      return outputs[0]
    return outputs

  def parameters(self):
    """Return the parameters of all neurons, neuron by neuron."""
    params = []
    for neuron in self.neurons:
      params.extend(neuron.parameters())
    return params

  def __repr__(self):
    return f"Layer of [{', '.join(map(str, self.neurons))}]"

class MLP(Module):
  """Multi-layer perceptron: a stack of layers applied one after another."""

  def __init__(self, nin, nouts):
    """Build one layer per entry of the list `nouts`; only the last layer is linear."""
    sz = [nin] + nouts
    last = len(nouts) - 1
    self.layers = [
      Layer(fan_in, fan_out, nonlin=(i != last))
      for i, (fan_in, fan_out) in enumerate(zip(sz, sz[1:]))
    ]

  def __call__(self, x):
    """Feed `x` through every layer in order and return the final output."""
    out = x
    for layer in self.layers:
      out = layer(out)
    return out

  def parameters(self):
    """Return the parameters of all layers, layer by layer."""
    params = []
    for layer in self.layers:
      params.extend(layer.parameters())
    return params

  def __repr__(self):
    return f"MLP of [{', '.join(map(str, self.layers))}]"

def mean_squared_error_loss(ys, ypred):
  """Return the mean of the squared differences between predictions and targets.

  Args:
    ys: sequence of target values.
    ypred: predictions in the same order as `ys` (numbers or Value nodes).

  Raises:
    ZeroDivisionError: if `ys` is empty.
  """
  # Divide by the number of samples so the result is a mean, as the name says.
  return sum((yout - ygt)**2 for ygt, yout in zip(ys, ypred)) / len(ys)
def binary_cross_entropy_loss(ys, ypred):
  """Return the mean binary cross-entropy of logits `ypred` against targets `ys`.

  Args:
    ys: sequence of target values (numbers).
    ypred: logits in the same order as `ys`; each must provide `sig()`,
      `log()`, negation and arithmetic with numbers (e.g. Value nodes).

  Raises:
    ZeroDivisionError: if `ys` is empty.
  """
  def sample_loss(ygt, yout):
    log_p = yout.sig().log()
    # 1 - sig(x) == sig(-x). Evaluating sig(-x) directly avoids the subtraction
    # rounding to exactly 0 for large positive logits, which would make log()
    # fail with a math domain error.
    log_not_p = (-yout).sig().log()
    return -ygt * log_p - (1.0 - ygt) * log_not_p

  return sum(sample_loss(ygt, yout) for ygt, yout in zip(ys, ypred)) / len(ys)

In [217]:
# Toy dataset: four samples with three input features each.
xs = [[2.0, 3.0, -1.0], [3.0, -1.0, 0.5], [0.5, 1.0, 1.0], [1.0, 1.0, -1.0]]
# Binary target for each row of xs, in the same order.
ys = [0.0, 1.0, 0.0, 1.0]

In [218]:
# Single example input with three features.
x = [2.0, 3.0, -1.0]
# 3 inputs -> two sigmoid layers of 4 neurons each -> 1 linear output.
n = MLP(nin=3, nouts=[4, 4, 1])
# Forward pass on the untrained network.
n(x)

Value(data=0.16677073565294676, grad=0)

In [219]:
# Hand-picked logits, one per target in ys.
ypred = [0.6543, -0.111, 0.222, 0.9999]
# Reference BCE computed on plain floats: squash each logit once, then average
# the negative log-likelihood over all samples.
loss = sum(
  -ygt * math.log(prob) - (1.0 - ygt) * math.log(1.0 - prob)
  for ygt, prob in zip(ys, (Value(yout).sig().data for yout in ypred))
) / len(ys)
loss

0.7366631831666669

In [220]:
# Same loss, this time built as a Value graph from the logits.
loss = binary_cross_entropy_loss(ys, list(map(Value, ypred)))
loss.data

0.736663183166667

In [221]:
import torch

# Cross-check the hand-written loss against PyTorch on the same data.
xs_t, ys_t, ypred_t = (torch.tensor(values) for values in (xs, ys, ypred))

# BCEWithLogitsLoss takes the logits directly.
criterion = torch.nn.BCEWithLogitsLoss()
criterion(input=ypred_t, target=ys_t).item()

0.7366632223129272

Der Output des Modells sind zunächst die Logits, die jeden reellen Wert annehmen können. Die Loss-Funktion wandelt die Logits durch Anwendung der Sigmoid-Funktion automatisch in Wahrscheinlichkeiten (also in den Wertebereich von 0 bis 1) um und berechnet im Fall von BCE den negativen Logarithmus dieser Wahrscheinlichkeiten. Der Grund für die Ausgabe von Logits ist, dass die Sigmoid-Funktion sättigt: Für Eingaben von 10, 100 oder noch höher liefert sie praktisch immer 1, sodass der Gradient dort nahezu verschwindet. Das stört das Lernen per Backpropagation. Daher ist es sinnvoll, wenn das Modell zunächst die ungequetschten Werte ausgibt.

In [223]:
steps = 1000
lr = 0.1

# Plain gradient descent on the whole dataset.
for _ in range(steps):
  # forward pass: one logit per sample, then the mean BCE loss
  ypred = list(map(n, xs))
  loss = binary_cross_entropy_loss(ys, ypred)

  # backward pass: clear old gradients before accumulating new ones
  n.zero_grad()
  loss.backward()

  # step every parameter against its gradient
  for p in n.parameters():
    p.data -= lr * p.grad

# Loss and predictions of the last forward pass (taken before the final update).
print('loss', loss.data)
print('actual\t pred\t prob')
# A logit <= 0 is reported as class 0.0; "prob" is the probability of the reported class.
print('\n'.join(
  f'{ygt} \t {0.0 if yout.data <= 0.0 else 1.0} \t {1 - yout.sig().data if yout.data <= 0 else yout.sig().data}'
  for ygt, yout in zip(ys, ypred)
))


loss 0.01363693008633678
actual	 pred	 prob
0.0 	 0.0 	 0.9842084533047868
1.0 	 1.0 	 0.9919085335388027
0.0 	 0.0 	 0.988911763108392
1.0 	 1.0 	 0.9808305079696609
