In [1]:
import torch
import numpy as np

In [None]:

class Tensor:
    def __init__(self, data, children=(), _op="", label="", grad=None,requires_grad=True):
        # ensure numpy array
        self.data = np.array(data, dtype=float)
        self.grad = None if not requires_grad else (np.zeros_like(self.data) if grad is None else grad)
        self.children = children
        self._op = _op
        self.label = label
        self._backward = lambda: None
        self.shape=self.data.shape
        self.requires_grad=requires_grad

    def __repr__(self):
        return f"Tensor(data={self.data}, grad={self.grad}, op={self._op}, label={self.label})"

    # --- elementwise ops ---
    def __add__(self, other):
        other = other if isinstance(other, Tensor) else Tensor(other)
        out = Tensor(self.data + other.data, (self, other), _op="+")
        if self.requires_grad or other.requires_grad:
            out.requires_grad=True
        def _backward():
            if self.requires_grad:
                if self.grad is None:
                    self.grad = np.zeros_like(self.data)
                # reduce grad shape if broadcasting happened
                grad_self = out.grad
                while grad_self.ndim > self.data.ndim:
                    grad_self = grad_self.sum(axis=0)
                for i, dim in enumerate(self.data.shape):
                    if dim == 1:
                        grad_self = grad_self.sum(axis=i, keepdims=True)
                self.grad += grad_self

            if other.requires_grad:
                if other.grad is None:
                    other.grad = np.zeros_like(other.data)
                grad_other = out.grad
                while grad_other.ndim > other.data.ndim:
                    grad_other = grad_other.sum(axis=0)

                for i, dim in enumerate(other.data.shape):
                    if dim == 1:
                        grad_other = grad_other.sum(axis=i, keepdims=True)
                other.grad += grad_other

        out._backward = _backward
        return out
    
    def __radd__(self, other):
        # Just reverse the order: addition is commutative
        return self + other
    
    
    def __sub__(self, other):
        other = other if isinstance(other, Tensor) else Tensor(other)
        out = Tensor(self.data - other.data, (self, other), _op="-")
        if self.requires_grad or other.requires_grad:
            out.requires_grad=True
        def _backward():
            if self.requires_grad:
                self.grad += out.grad
            if other.requires_grad:
                other.grad += -out.grad
        out._backward = _backward
        return out
    
    def __rsub__(self, other):
        # Just reverse the order: subtraction is commutative
        return self - other
    
    def __mul__(self, other):
        other = other if isinstance(other, Tensor) else Tensor(other)
        out = Tensor(self.data * other.data, (self, other), _op="*")
        if self.requires_grad or other.requires_grad:
            out.requires_grad=True

        def _backward():
            # gradient wrt self
            grad_self = other.data * out.grad
            while grad_self.ndim > self.data.ndim:
                grad_self = grad_self.sum(axis=0)
            for i, dim in enumerate(self.data.shape):
                if dim == 1:
                    grad_self = grad_self.sum(axis=i, keepdims=True)
            if self.requires_grad:
                self.grad += grad_self

            # gradient wrt other
            grad_other = self.data * out.grad
            while grad_other.ndim > other.data.ndim:
                grad_other = grad_other.sum(axis=0)
            for i, dim in enumerate(other.data.shape):
                if dim == 1:
                    grad_other = grad_other.sum(axis=i, keepdims=True)
            if other.requires_grad:
                other.grad += grad_other

        out._backward = _backward
        return out

    
    def __rmul__(self, other):
        # Just reverse the order: multiplication is commutative
        return self * other

    def __truediv__(self, other):
        other = other if isinstance(other, Tensor) else Tensor(other)
        out = Tensor(self.data / other.data, (self, other), _op="/")
        if self.requires_grad or other.requires_grad:
            out.requires_grad=True

        def _backward():
            # dX = 1/y * dZ
            if self.requires_grad:
                self.grad += (1 / other.data) * out.grad
            # dY = -x / y^2 * dZ
            if other.requires_grad:
                other.grad += (-self.data / (other.data ** 2)) * out.grad

        out._backward = _backward
        return out
    
    def __rtruediv__(self, other):
        # Just reverse the order: division is commutative
        return self / other


    # --- reductions ---
    def sum(self):
        out = Tensor(self.data.sum(), (self,), _op="sum")
        if self.requires_grad :
            out.requires_grad=True
        def _backward():
            if self.requires_grad:
                self.grad += np.ones_like(self.data) * out.grad
        out._backward = _backward
        return out

    def mean(self):
        out = Tensor(self.data.mean(), (self,), _op="mean")
        if self.requires_grad :
            out.requires_grad=True
        def _backward():
            if self.requires_grad:
                self.grad += np.ones_like(self.data) * out.grad / self.data.size
        out._backward = _backward
        return out

    # --- matrix multiplication ---
    def matmul(self, other):
        other = other if isinstance(other, Tensor) else Tensor(other)
        out = Tensor(self.data.dot(other.data), (self, other), _op="matmul")
        if self.requires_grad or other.requires_grad:
            out.requires_grad=True
        def _backward():
            if self.requires_grad:
                self.grad += out.grad.dot(other.data.T)
            if out.requires_grad:
                other.grad += self.data.T.dot(out.grad)
        out._backward = _backward
        return out
    def __matmul__(self, other):
        return self.matmul(other)

    def __rmatmul__(self, other):
        other = other if isinstance(other, Tensor) else Tensor(other)
        return other.matmul(self)
    
    def pow(self,other):
        if not isinstance(other, (int, float)):
            raise Exception("other must be int or float")
        if self.requires_grad :
            out.requires_grad=True

        out = Tensor(np.power(self.data,other), (self,), _op="pow")
        def _backward():
            if self.requires_grad:
                self.grad += (other*self.data**(other-1))*out.grad
        out._backward = _backward
        return out
    
    def sigmoid(self):
        out = Tensor(1/(1+np.exp(-self.data)), (self,), _op="sigmoid")
        if self.requires_grad :
            out.requires_grad=True
        def _backward():
            if self.requires_grad:
                self.grad += out.data*(1-out.data)*out.grad
        out._backward = _backward
        return out
    
    def relu(self):
        out = Tensor(np.maximum(0,self.data), (self,), _op="relu")
        if self.requires_grad :
            out.requires_grad=True
        def _backward():
            if self.requires_grad:
                self.grad += (self.data > 0).astype(float)*out.grad
        out._backward = _backward
        return out
    
    def tanh(self):
        out = Tensor(np.tanh(self.data), (self,), _op="tanh")
        if self.requires_grad :
            out.requires_grad=True
        def _backward():
            if self.requires_grad:
                self.grad += (1-out.data**2)*out.grad
        out._backward = _backward
        return out
    

    def exp(self):
        out  = Tensor(np.exp(self.data), (self,), _op="exp")
        if self.requires_grad :
            out.requires_grad=True
        def _backward():
            if self.requires_grad:
                self.grad += out.data*out.grad
        out._backward = _backward
        return out
    
    def cos(self):
        out  = Tensor(np.cos(self.data), (self,), _op="cos")
        if self.requires_grad :
            out.requires_grad=True
        def _backward():
            if self.requires_grad:
                self.grad += -np.sin(self.data)*out.grad
        out._backward = _backward
        return out
    
    def sin(self):
        out  = Tensor(np.sin(self.data), (self,), _op="sin")
        if self.requires_grad :
            out.requires_grad=True
        def _backward():
            if self.requires_grad:
                self.grad += np.cos(self.data)*out.grad
        out._backward = _backward
        return out
    
    def ln(self):
        out  = Tensor(np.log(self.data), (self,), _op="log")
        if self.requires_grad :
            out.requires_grad=True
        def _backward():
            if self.requires_grad:
                self.grad += 1/self.data*out.grad
        out._backward = _backward
        return out
    def log(self,base):
        if base is not isinstance(base,(int,float)):
            raise Exception("base must be int or float")
        out = Tensor(np.log(self.data) / np.log(base), (self,), _op=f"log_{base}")
        if self.requires_grad :
            out.requires_grad=True

        def _backward():
            if self.requires_grad:
                self.grad += (1 / (self.data * np.log(base))) * out.grad

        out._backward = _backward
        return out
    @staticmethod
    def _match_shape(grad, shape):
        """
        Reduce grad to match the given shape by summing over broadcasted axes.
        """
        while grad.ndim > len(shape):
            grad = grad.sum(axis=0)

        for i, dim in enumerate(shape):
            if dim == 1:
                grad = grad.sum(axis=i, keepdims=True)

        return grad



    def backward(self):
        topo = []
        visited = set()
        if not self.requires_grad:
                return print("no grad please set requires_grad=True")
        def build_topo(v):
            if v not in visited:
                visited.add(v)
                for child in v.children:
                    build_topo(child)
                topo.append(v)

        build_topo(self)
        self.grad = np.ones_like(self.data)

        for node in reversed(topo):
            node._backward()


In [574]:
a = Tensor([[1,2,3]],requires_grad=True)
c = a+5
print(c)

Tensor(data=[[6. 7. 8.]], grad=[[0. 0. 0.]], op=+, label=)


In [575]:
c.backward()
print(a.grad)

before summing grad_other:  2
other.data.ndim:  0
after summing grad_other:  1
before summing grad_other:  1
other.data.ndim:  0
after summing grad_other:  0
other.data.shape ()
[[1. 1. 1.]]


In [547]:
a= Tensor([[-1,2,3]],requires_grad=True)
b = Tensor([[1,2,3]])
c = a+b
print(c) #c


Tensor(data=[[0. 4. 6.]], grad=[[0. 0. 0.]], op=+, label=)


In [548]:
c.backward()

In [549]:
b

Tensor(data=[[1. 2. 3.]], grad=[[1. 1. 1.]], op=, label=)

In [550]:
a = Tensor([[1,2,3]],requires_grad=True)
b = Tensor([[1,2,3]],requires_grad=True)
c = a*b
d=c+5
e = d.sum()

In [551]:
e.backward()

In [555]:
d

Tensor(data=[[ 6.  9. 14.]], grad=[[1. 1. 1.]], op=+, label=)

In [379]:
import torch
import numpy as np

# --- My Tensor example ---
A = Tensor(np.array([[1.0, 2.0], [3.0, 4.0]]))
B = Tensor(np.array([[5.0, 6.0], [7.0, 8.0]]))

# Addition
C = A + B
# Subtraction
D = C - A
# Matrix multiplication
E = D.matmul(B)
# Sum to make scalar for backward
loss = E.sum()
loss.backward()

print("My Tensor grad A:\n", A.grad)
print("My Tensor grad B:\n", B.grad)

# --- PyTorch equivalent ---
A_torch = torch.tensor([[1.0,2.0],[3.0,4.0]], requires_grad=True)
B_torch = torch.tensor([[5.0,6.0],[7.0,8.0]], requires_grad=True)

C_torch = A_torch + B_torch
D_torch = C_torch - A_torch
E_torch = D_torch @ B_torch
loss_torch = E_torch.sum()
loss_torch.backward()

print("PyTorch grad A:\n", A_torch.grad)
print("PyTorch grad B:\n", B_torch.grad)


My Tensor grad A:
 [[0. 0.]
 [0. 0.]]
My Tensor grad B:
 [[23. 27.]
 [25. 29.]]
PyTorch grad A:
 tensor([[0., 0.],
        [0., 0.]])
PyTorch grad B:
 tensor([[23., 27.],
        [25., 29.]])


In [380]:
import torch
import numpy as np

# Helper to compare gradients
def compare_grad(tensor, torch_tensor, tol=1e-6):
    diff = np.abs(tensor.grad - torch_tensor.grad.numpy())
    print("Grad difference:\n", diff)
    assert np.allclose(tensor.grad, torch_tensor.grad.numpy(), atol=tol), "Gradients do not match!"

# -------------------
# 1️⃣ Matrix Addition + exp
print("=== Addition + exp ===")
A = Tensor(np.array([[1.0, 2.0], [3.0, 4.0]]))
B = Tensor(np.array([[5.0, 6.0], [7.0, 8.0]]))

C = (A + B).exp()
loss = C.sum()
loss.backward()
print("My Tensor grad A:\n", A.grad)
print("My Tensor grad B:\n", B.grad)

# PyTorch
A_t = torch.tensor([[1.0,2.0],[3.0,4.0]], requires_grad=True)
B_t = torch.tensor([[5.0,6.0],[7.0,8.0]], requires_grad=True)
C_t = torch.exp(A_t + B_t)
loss_t = C_t.sum()
loss_t.backward()
print("PyTorch grad A:\n", A_t.grad)
print("PyTorch grad B:\n", B_t.grad)
compare_grad(A, A_t)
compare_grad(B, B_t)

# -------------------
# 2️⃣ Matrix Subtraction + sigmoid
print("\n=== Subtraction + sigmoid ===")
A = Tensor(np.array([[1.0, 2.0], [3.0, 4.0]]))
B = Tensor(np.array([[5.0, 6.0], [7.0, 8.0]]))

C = (A - B).sigmoid()
loss = C.sum()
loss.backward()
print("My Tensor grad A:\n", A.grad)
print("My Tensor grad B:\n", B.grad)

# PyTorch
A_t = torch.tensor([[1.0,2.0],[3.0,4.0]], requires_grad=True)
B_t = torch.tensor([[5.0,6.0],[7.0,8.0]], requires_grad=True)
C_t = torch.sigmoid(A_t - B_t)
loss_t = C_t.sum()
loss_t.backward()
print("PyTorch grad A:\n", A_t.grad)
print("PyTorch grad B:\n", B_t.grad)
compare_grad(A, A_t)
compare_grad(B, B_t)

# -------------------
# 3️⃣ Matrix Multiplication + relu
print("\n=== Matmul + relu ===")
A = Tensor(np.array([[1.0, 2.0], [3.0, 4.0]]))
B = Tensor(np.array([[5.0, 6.0], [7.0, 8.0]]))

C = A.matmul(B).relu()
loss = C.sum()
loss.backward()
print("My Tensor grad A:\n", A.grad)
print("My Tensor grad B:\n", B.grad)

# PyTorch
A_t = torch.tensor([[1.0,2.0],[3.0,4.0]], requires_grad=True)
B_t = torch.tensor([[5.0,6.0],[7.0,8.0]], requires_grad=True)
C_t = torch.nn.functional.relu(A_t @ B_t)
loss_t = C_t.sum()
loss_t.backward()
print("PyTorch grad A:\n", A_t.grad)
print("PyTorch grad B:\n", B_t.grad)
compare_grad(A, A_t)
compare_grad(B, B_t)


=== Addition + exp ===
My Tensor grad A:
 [[   403.42879349   2980.95798704]
 [ 22026.46579481 162754.791419  ]]
My Tensor grad B:
 [[   403.42879349   2980.95798704]
 [ 22026.46579481 162754.791419  ]]
PyTorch grad A:
 tensor([[   403.4288,   2980.9580],
        [ 22026.4648, 162754.7969]])
PyTorch grad B:
 tensor([[   403.4288,   2980.9580],
        [ 22026.4648, 162754.7969]])
Grad difference:
 [[8.99749926e-06 2.07707717e-05]
 [9.51056718e-04 5.45599608e-03]]
Grad difference:
 [[8.99749926e-06 2.07707717e-05]
 [9.51056718e-04 5.45599608e-03]]

=== Subtraction + sigmoid ===
My Tensor grad A:
 [[0.01766271 0.01766271]
 [0.01766271 0.01766271]]
My Tensor grad B:
 [[-0.01766271 -0.01766271]
 [-0.01766271 -0.01766271]]
PyTorch grad A:
 tensor([[0.0177, 0.0177],
        [0.0177, 0.0177]])
PyTorch grad B:
 tensor([[-0.0177, -0.0177],
        [-0.0177, -0.0177]])
Grad difference:
 [[3.59709689e-10 3.59709689e-10]
 [3.59709689e-10 3.59709689e-10]]
Grad difference:
 [[3.59709689e-10 3.597096

In [381]:
x = Tensor(np.random.randn(2,3))
y = Tensor(np.random.randn(3,2))
print(x@y)

Tensor(data=[[-1.75973585  3.07053341]
 [ 1.27719987 -1.06032032]], grad=[[0. 0.]
 [0. 0.]], op=matmul, label=)


In [393]:
class Linear():
    def __init__(self, in_features, out_features):
        self.weight = Tensor(np.random.randn(in_features, out_features))
        self.bias = Tensor(np.random.randn(out_features))

    def forward(self, x):
        return x @ self.weight + self.bias
    def __call__(self, x):
        return self.forward(x)
    def parameters(self):
        return [self.weight, self.bias]

In [394]:
layer = Linear(3,2)
x = Tensor(np.random.randn(2,3))
y = layer(x)
print(y)

Tensor(data=[[-2.45803963  3.11235204]
 [-1.14434508 -2.96598593]], grad=[[0. 0.]
 [0. 0.]], op=+, label=)


In [396]:
layer.parameters()

[Tensor(data=[[-0.95314974 -0.16823208]
  [-0.31622076  0.4187029 ]
  [ 0.27073493 -2.23905362]], grad=[[0. 0.]
  [0. 0.]
  [0. 0.]], op=, label=),
 Tensor(data=[-1.58061987  0.06439794], grad=[0. 0.], op=, label=)]

In [None]:
class Network():
    def __init__(self,in_features, hidden_features, out_features):
        self.linear1 = Linear(in_features, out_features)
        self.linear2 = Linear(hidden_features, out_features)

    def forward(self, x):
        x = self.linear1(x)
        x = x.relu()
        x = self.linear2(x)
        x = x.sigmoid()
        return x
    def __call__(self, x):
        return self.forward(x)
    def parameters(self):
        return self.linear1.parameters() + self.linear2.parameters()
        

In [441]:
nn = Network(1,2,1)
x = Tensor(np.random.randn(1,1))
y = nn(x)

In [442]:
nn.parameters()

[Tensor(data=[[-0.75620544]], grad=[[0.]], op=, label=),
 Tensor(data=[1.20874395], grad=[0.], op=, label=)]

In [386]:
class MSE():
    def __init__(self):
        pass
    def __call__(self, y_true, y_pred):
        return (y_true - y_pred).pow(2).mean()


In [387]:
input = Tensor(data=[[6]])

w =  Tensor(data=[[2]])
target = input * w
target

Tensor(data=[[12.]], grad=[[0.]], op=*, label=)

In [420]:
nn   = Network(1,5,1)
predictions = nn(input)

In [421]:
criterion = MSE()
loss = criterion(predictions,target)

In [426]:
loss

Tensor(data=121.01510048403132, grad=1.0, op=mean, label=)

In [425]:
loss.backward()

In [424]:
parameters = nn.parameters()

In [410]:
parameters

[Tensor(data=[[ 0.44723624  1.82550884  0.65553088 -0.32617895 -0.27775964]], grad=[[ 2.99988889 -1.83103454  0.37204904  0.          0.        ]], op=, label=),
 Tensor(data=[-1.95929713  0.47129701 -1.69992979 -0.8358522  -0.34773144], grad=[ 0.49998148 -0.30517242  0.06200817  0.          0.        ], op=, label=),
 Tensor(data=[[-0.74673449]
  [ 0.45578243]
  [-0.09261071]
  [ 0.75263338]
  [ 0.94824092]], grad=[[-0.48483999]
  [-7.64925631]
  [-1.4952924 ]
  [ 0.        ]
  [ 0.        ]], op=, label=),
 Tensor(data=[-1.02810542], grad=[-0.66955724], op=, label=)]

In [417]:
for p in parameters:
    p.data -= p.grad * 0.01
    p.grad = np.zeros_like(p.grad)

In [418]:
parameters

[Tensor(data=[[ 0.38723846  1.86212954  0.6480899  -0.32617895 -0.27775964]], grad=[[0. 0. 0. 0. 0.]], op=, label=),
 Tensor(data=[-1.96929676  0.47740046 -1.70116996 -0.8358522  -0.34773144], grad=[0. 0. 0. 0. 0.], op=, label=),
 Tensor(data=[[-0.73703769]
  [ 0.60876755]
  [-0.06270486]
  [ 0.75263338]
  [ 0.94824092]], grad=[[0.]
  [0.]
  [0.]
  [0.]
  [0.]], op=, label=),
 Tensor(data=[-1.01471427], grad=[0.], op=, label=)]

## ALL IN ONE ##

In [447]:
class SmallNetwork():
    def __init__(self,in_features, out_features):
        self.linear1 = Linear(in_features, out_features)
        
    def forward(self, x):
        x = self.linear1(x)
        return x
    
    def __call__(self, x):
        return self.forward(x)
    def parameters(self):
        return self.linear1.parameters() 

In [465]:
nn   = SmallNetwork(1,1)
input = Tensor(data=[[6]])
target = input * w
criterion = MSE()

for i in range(200):

    predictions = nn(input)
    loss = criterion(predictions,target)
    loss.backward()
    print("loss: ",f"{loss.data:4f}","weights",f"{nn.parameters()[0].data.tolist()[0][0]:3f}","grad",f"{nn.parameters()[0].grad.tolist()[0][0]:3f}")
    for p in nn.parameters():
        p.data -= p.grad * 0.01
        p.grad = np.zeros_like(p.grad)

predictions = nn(input)

loss:  330.129313 weights -0.981568 grad -218.033532
loss:  22.316742 weights 1.198767 grad -56.688718
loss:  1.508612 weights 1.765654 grad -14.739067
loss:  0.101982 weights 1.913045 grad -3.832157
loss:  0.006894 weights 1.951366 grad -0.996361
loss:  0.000466 weights 1.961330 grad -0.259054
loss:  0.000032 weights 1.963920 grad -0.067354
loss:  0.000002 weights 1.964594 grad -0.017512
loss:  0.000000 weights 1.964769 grad -0.004553
loss:  0.000000 weights 1.964815 grad -0.001184
loss:  0.000000 weights 1.964826 grad -0.000308
loss:  0.000000 weights 1.964830 grad -0.000080
loss:  0.000000 weights 1.964830 grad -0.000021
loss:  0.000000 weights 1.964831 grad -0.000005
loss:  0.000000 weights 1.964831 grad -0.000001
loss:  0.000000 weights 1.964831 grad -0.000000
loss:  0.000000 weights 1.964831 grad -0.000000
loss:  0.000000 weights 1.964831 grad -0.000000
loss:  0.000000 weights 1.964831 grad -0.000000
loss:  0.000000 weights 1.964831 grad -0.000000
loss:  0.000000 weights 1.964831

In [461]:
nn.parameters()[0].grad.tolist()[0][0]

0.0