From 24bc12e5a2e9145577e501ba39c02a7c2ea0510b Mon Sep 17 00:00:00 2001 From: __mo_san__ <50895527+m0saan@users.noreply.github.com> Date: Fri, 2 Jun 2023 09:11:22 +0100 Subject: [PATCH] fix the data method of Tensor + support new Operators --- _docs/autograd.html | 270 +++- _docs/operators.html | 722 +++++++++- _docs/search.json | 74 +- _docs/sitemap.xml | 6 +- _proc/00_autograd.ipynb | 286 +++- _proc/01_operators.ipynb | 1072 +++++++++++++- minima/_modidx.py | 6 + minima/autograd.py | 178 ++- minima/operators.py | 15 +- nbs/00_autograd.ipynb | 2911 +++++++------------------------------- nbs/01_operators.ipynb | 915 +++++++++++- ops.ipynb | 2197 ++++++++++++++++++++++++++++ 12 files changed, 6097 insertions(+), 2555 deletions(-) create mode 100644 ops.ipynb diff --git a/_docs/autograd.html b/_docs/autograd.html index c274b41..e969fd0 100644 --- a/_docs/autograd.html +++ b/_docs/autograd.html @@ -655,7 +655,7 @@

Value

Attributes: - data (float): the scalar value associated with this node - grad (float): the gradient of the output of the computational graph w.r.t. this node’s value - label (str): a label for this node, used for debugging and visualization purposes - _op (str): a string representation of the operation that produced this node in the computational graph - _prev (set of Value objects): the set of nodes that contributed to the computation of this node - _backward (function): a function that computes the gradients of this node w.r.t. its inputs

Methods: - __init__(self, data, children=(), op='', label=''): Initializes a Value object with the given data, children, op, and label - __repr__(self): Returns a string representation of this Value object - __add__(self, other): Implements the addition operation between two Value objects - __mul__(self, other): Implements the multiplication operation between two Value objects - item(self): Returns the scalar value associated with this Value object - tanh(self): Applies the hyperbolic tangent function to this Value object and returns a new Value object


-

source

+

source

all_devices

@@ -664,7 +664,7 @@

all_devices

return a list of all available devices


-

source

+

source

cpu

@@ -673,7 +673,7 @@

cpu

Return cpu device


-

source

+

source

CPUDevice

@@ -682,7 +682,7 @@

CPUDevice

Represents data that sits in CPU


-

source

+

source

Device

@@ -691,7 +691,7 @@

Device

Indicates the device supporting an NDArray.


-

source

+

source

Operator

@@ -700,7 +700,7 @@

Operator

Initialize self. See help(type(self)) for accurate signature.


-

source

+

source

TensorOp

@@ -734,7 +734,7 @@

TensorOp

self.cached_data = cached_data self.requires_grad = requires_grad
-

source

+

source

Tensor

@@ -746,6 +746,262 @@

Tensor

Attributes: - data: The actual data of the tensor. It is computed lazily. - children: Other tensors that this tensor depends on for computing its value. - requires_grad: Whether this tensor needs to compute gradients.

Methods: - realize_data: Computes and returns the actual data for this tensor. - shape: Returns the shape of this tensor. - dtype: Returns the data type of this tensor.

Example: >>> t1 = Tensor([[1.0, 2.0], [3.0, 4.0]]) >>> print(t1.shape) (2, 2) >>> print(t1.dtype) float64

+
+
import numpy as np
+import unittest
+from minima.autograd import Tensor
+
+class TestTensor(unittest.TestCase):
+    
+    def test_create_tensor(self):
+        t1 = Tensor([1, 2, 3])
+        self.assertTrue(np.array_equal(t1.realize_data(), np.array([1, 2, 3])))
+        self.assertEqual(t1.shape, (3,))
+        self.assertEqual(t1.dtype, np.float64)
+        
+        t2 = Tensor([[1, 2], [3, 4]])
+        self.assertTrue(np.array_equal(t2.realize_data(), np.array([[1, 2], [3, 4]])))
+        self.assertEqual(t2.shape, (2, 2))
+        self.assertEqual(t2.dtype, np.float64)
+        
+        t3 = Tensor(np.array([1, 2, 3]), dtype=np.int32)
+        self.assertTrue(np.array_equal(t3.realize_data(), np.array([1, 2, 3], dtype=np.int32)))
+        self.assertEqual(t3.shape, (3,))
+        self.assertEqual(t3.dtype, np.int32)
+        
+    def test_create_tensor_from_tensor(self):
+        t1 = Tensor([1, 2, 3])
+        t2 = Tensor(t1)
+        self.assertTrue(np.array_equal(t2.realize_data(), np.array([1, 2, 3])))
+        self.assertEqual(t2.shape, (3,))
+        self.assertEqual(t2.dtype, np.float64)
+        
+        t3 = Tensor(np.array([1, 2, 3]), dtype=np.int32)
+        t4 = Tensor(t3)
+        self.assertTrue(np.array_equal(t4.realize_data(), np.array([1, 2, 3], dtype=np.int32)))
+        self.assertEqual(t4.shape, (3,))
+        self.assertEqual(t4.dtype, np.int32)
+        
+    def test_create_tensor_with_device(self):
+        t1 = Tensor([1, 2, 3], device='cpu')
+        self.assertEqual(t1.device, 'cpu')
+        
+        t2 = Tensor([1, 2, 3], device='cuda')
+        self.assertEqual(t2.device, 'cuda')
+        
+    def test_create_tensor_with_requires_grad(self):
+        t1 = Tensor([1, 2, 3], requires_grad=True)
+        self.assertTrue(t1.requires_grad)
+        
+        t2 = Tensor([1, 2, 3], requires_grad=False)
+        self.assertFalse(t2.requires_grad)
+        
+    def test_create_tensor_with_kwargs(self):
+        t1 = Tensor([1, 2, 3], device='cuda', dtype=np.float32, requires_grad=True)
+        self.assertEqual(t1.device, 'cuda')
+        self.assertEqual(t1.dtype, np.float32)
+        self.assertTrue(t1.requires_grad)
+        
+    def test_create_tensor_from_numpy(self):
+        np_array = np.array([1, 2, 3])
+        t1 = Tensor(np_array)
+        self.assertTrue(np.array_equal(t1.realize_data(), np_array))
+        self.assertEqual(t1.shape, (3,))
+        self.assertEqual(t1.dtype, np.float64)
+        
+        np_array = np.array([1, 2, 3], dtype=np.int32)
+        t2 = Tensor(np_array)
+        self.assertTrue(np.array_equal(t2.realize_data(), np_array))
+        self.assertEqual(t2.shape, (3,))
+        self.assertEqual(t2.dtype, np.int32)
+        
+    def test_create_tensor_from_numpy_with_device(self):
+        np_array = np.array([1, 2, 3])
+        t1 = Tensor(np_array, device='cuda')
+        self.assertEqual(t1.device, 'cuda')
+        
+        np_array = np.array([1, 2, 3], dtype=np.int32)
+        t2 = Tensor(np_array, device='cuda')
+        self.assertEqual(t2.device, 'cuda')
+        
+    def test_create_tensor_from_numpy_with_requires_grad(self):
+        np_array = np.array([1, 2, 3])
+        t1 = Tensor(np_array, requires_grad=True)
+        self.assertTrue(t1.requires_grad)
+        
+        np_array = np.array([1, 2, 3], dtype=np.int32)
+        t2 = Tensor(np_array, requires_grad=False)
+        self.assertFalse(t2.requires_grad)
+        
+    def test_create_tensor_from_numpy_with_kwargs(self):
+        np_array = np.array([1, 2, 3])
+        t1 = Tensor(np_array, device='cuda', dtype=np.float32, requires_grad=True)
+        self.assertEqual(t1.device, 'cuda')
+        self.assertEqual(t1.dtype, np.float32)
+        self.assertTrue(t1.requires_grad)
+        
+    def test_create_tensor_from_tensor_with_device(self):
+        t1 = Tensor([1, 2, 3], device='cpu')
+        t2 = Tensor(t1, device='cuda')
+        self.assertEqual(t2.device, 'cuda')
+        
+    def test_create_tensor_from_tensor_with_requires_grad(self):
+        t1 = Tensor([1, 2, 3], requires_grad=True)
+        t2 = Tensor(t1, requires_grad=False)
+        self.assertFalse(t2.requires_grad)
+        
+    def test_create_tensor_from_tensor_with_kwargs(self):
+        t1 = Tensor([1, 2, 3], device='cpu', dtype=np.float32, requires_grad=True)
+        t2 = Tensor(t1, device='cuda', dtype=np.float64, requires_grad=False)
+        self.assertEqual(t2.device, 'cuda')
+        self.assertEqual(t2.dtype, np.float64)
+        self.assertFalse(t2.requires_grad)
+        
+    def test_create_tensor_from_tensor_with_different_device_and_dtype(self):
+        t1 = Tensor([1, 2, 3], device='cpu', dtype=np.float32)
+        t2 = Tensor(t1, device='cuda', dtype=np.float64)
+        self.assertEqual(t2.device, 'cuda')
+        self.assertEqual(t2.dtype, np.float64)
+        self.assertTrue(np.array_equal(t2.realize_data(), np.array([1, 2, 3], dtype=np.float64)))
+        
+    def test_create_tensor_from_tensor_with_same_device_and_dtype(self):
+        t1 = Tensor([1, 2, 3], device='cpu', dtype=np.float32)
+        t2 = Tensor(t1, device='cpu', dtype=np.float32)
+        self.assertEqual(t2.device, 'cpu')
+        self.assertEqual(t2.dtype, np.float32)
+        self.assertTrue(np.array_equal(t2.realize_data(), np.array([1, 2, 3], dtype=np.float32)))
+        
+    def test_create_tensor_from_tensor_with_same_device_and_dtype_and_requires_grad(self):
+        t1 = Tensor([1, 2, 3], device='cpu', dtype=np.float32, requires_grad=True)
+        t2 = Tensor(t1, device='cpu', dtype=np.float32, requires_grad=True)
+        self.assertEqual(t2.device, 'cpu')
+        self.assertEqual(t2.dtype, np.float32)
+        self.assertTrue(np.array_equal(t2.realize_data(), np.array([1, 2, 3], dtype=np.float32)))
+        self.assertTrue(t2.requires_grad)
+        
+    def test_create_tensor_from_tensor_with_same_device_and_dtype_and_requires_grad_false(self):
+        t1 = Tensor([1, 2, 3], device='cpu', dtype=np.float32, requires_grad=False)
+        t2 = Tensor(t1, device='cpu', dtype=np.float32, requires_grad=False)
+        self.assertEqual(t2.device, 'cpu')
+        self.assertEqual(t2.dtype, np.float32)
+        self.assertTrue(np.array_equal(t2.realize_data(), np.array([1, 2, 3], dtype=np.float32)))
+        self.assertFalse(t2.requires_grad)
+        
+    def test_create_tensor_from_tensor_with_same_device_and_dtype_and_requires_grad_true_false(self):
+        t1 = Tensor([1, 2, 3], device='cpu', dtype=np.float32, requires_grad=True)
+        t2 = Tensor(t1, device='cpu', dtype=np.float32, requires_grad=False)
+        self.assertEqual(t2.device, 'cpu')
+        self.assertEqual(t2.dtype, np.float32)
+        self.assertTrue(np.array_equal(t2.realize_data(), np.array([1, 2, 3], dtype=np.float32)))
+        self.assertFalse(t2.requires_grad)
+        
+    def test_create_tensor_from_tensor_with_same_device_and_dtype_and_requires_grad_false_true(self):
+        t1 = Tensor([1, 2, 3], device='cpu', dtype=np.float32, requires_grad=False)
+        t2 = Tensor(t1, device='cpu', dtype=np.float32, requires_grad=True)
+        self.assertEqual(t2.device, 'cpu')
+        self.assertEqual(t2.dtype, np.float32)
+        self.assertTrue(np.array_equal(t2.realize_data(), np.array([1, 2, 3], dtype=np.float32)))
+        self.assertTrue(t2.requires_grad)
+        
+    def test_create_tensor_from_tensor_with_different_device_and_dtype_and_requires_grad(self):
+        t1 = Tensor([1, 2, 3], device='cpu', dtype=np.float32, requires_grad=True)
+        t2 = Tensor(t1, device='cuda', dtype=np.float64, requires_grad=False)
+        self.assertEqual(t2.device, 'cuda')
+        self.assertEqual(t2.dtype, np.float64)
+        self.assertTrue(np.array_equal(t2.realize_data(), np.array([1, 2, 3], dtype=np.float64)))
+        self.assertFalse(t2.requires_grad)
+        
+    def test_create_tensor_from_tensor_with_different_device_and_dtype_and_requires_grad_false(self):
+        t1 = Tensor([1, 2, 3], device='cpu', dtype=np.float32, requires_grad=False)
+        t2 = Tensor(t1, device='cuda', dtype=np.float64, requires_grad=False)
+        self.assertEqual(t2.device, 'cuda')
+        self.assertEqual(t2.dtype, np.float64)
+        self.assertTrue(np.array_equal(t2.realize_data(), np.array([1, 2, 3], dtype=np.float64)))
+        self.assertFalse(t2.requires_grad)
+        
+    def test_create_tensor_from_tensor_with_different_device_and_dtype_and_requires_grad_true_false(self):
+        t1 = Tensor([1, 2, 3], device='cpu', dtype=np.float32, requires_grad=True)
+        t2 = Tensor(t1, device='cuda', dtype=np.float64, requires_grad=False)
+        self.assertEqual(t2.device, 'cuda')
+        self.assertEqual(t2.dtype, np.float64)
+        self.assertTrue(np.array_equal(t2.realize_data(), np.array([1, 2, 3], dtype=np.float64)))
+        self.assertFalse(t2.requires_grad)
+        
+    def test_create_tensor_from_tensor_with_different_device_and_dtype_and_requires_grad_false_true(self):
+        t1 = Tensor([1, 2, 3], device='cpu', dtype=np.float32, requires_grad=False)
+        t2 = Tensor(t1, device='cuda', dtype=np.float64, requires_grad=True)
+        self.assertEqual(t2.device, 'cuda')
+        self.assertEqual(t2.dtype, np.float64)
+        self.assertTrue(np.array_equal(t2.realize_data(), np.array([1, 2, 3], dtype=np.float64)))
+        self.assertTrue(t2.requires_grad)
+        
+    def test_create_tensor_from_tensor_with_same_device_and_dtype_and_requires_grad_true(self):
+        t1 = Tensor([1, 2, 3], device='cpu', dtype=np.float32, requires_grad=True)
+        t2 = Tensor(t1, device='cpu', dtype=np.float32, requires_grad=True)
+        self.assertEqual(t2.device, 'cpu')
+        self.assertEqual(t2.dtype, np.float32)
+        self.assertTrue(np.array_equal(t2.realize_data(), np.array([1, 2, 3], dtype=np.float32)))
+        self.assertTrue(t2.requires_grad)
+        
+    def test_create_tensor_from_tensor_with_different_device_and_dtype_and_requires_grad_true(self):
+        t1 = Tensor([1, 2, 3], device='cpu', dtype=np.float32, requires_grad=True)
+        t2 = Tensor(t1, device='cuda', dtype=np.float64, requires_grad=True)
+        self.assertEqual(t2.device, 'cuda')
+        self.assertEqual(t2.dtype, np.float64)
+        self.assertTrue(np.array_equal(t2.realize_data(), np.array([1, 2, 3], dtype=np.float64)))
+        self.assertTrue(t2.requires_grad)
+        
+
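To run this suite outside of nbdev, the standard unittest entry point is enough; the argv/exit arguments in this sketch are only there so it also runs cleanly inside a notebook cell:
+
+if __name__ == '__main__':
+    unittest.main(argv=['ignored'], exit=False)  # discover and run the TestTensor cases
+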
+
+
import nbdev; nbdev.nbdev_export()
+
diff --git a/_docs/operators.html b/_docs/operators.html index 3e5c2b9..14444f6 100644 --- a/_docs/operators.html +++ b/_docs/operators.html @@ -21,6 +21,69 @@ margin: 0 0.8em 0.2em -1.6em; vertical-align: middle; } +pre > code.sourceCode { white-space: pre; position: relative; } +pre > code.sourceCode > span { display: inline-block; line-height: 1.25; } +pre > code.sourceCode > span:empty { height: 1.2em; } +.sourceCode { overflow: visible; } +code.sourceCode > span { color: inherit; text-decoration: inherit; } +div.sourceCode { margin: 1em 0; } +pre.sourceCode { margin: 0; } +@media screen { +div.sourceCode { overflow: auto; } +} +@media print { +pre > code.sourceCode { white-space: pre-wrap; } +pre > code.sourceCode > span { text-indent: -5em; padding-left: 5em; } +} +pre.numberSource code + { counter-reset: source-line 0; } +pre.numberSource code > span + { position: relative; left: -4em; counter-increment: source-line; } +pre.numberSource code > span > a:first-child::before + { content: counter(source-line); + position: relative; left: -1em; text-align: right; vertical-align: baseline; + border: none; display: inline-block; + -webkit-touch-callout: none; -webkit-user-select: none; + -khtml-user-select: none; -moz-user-select: none; + -ms-user-select: none; user-select: none; + padding: 0 4px; width: 4em; + color: #aaaaaa; + } +pre.numberSource { margin-left: 3em; border-left: 1px solid #aaaaaa; padding-left: 4px; } +div.sourceCode + { } +@media screen { +pre > code.sourceCode > span > a:first-child::before { text-decoration: underline; } +} +code span.al { color: #ff0000; font-weight: bold; } /* Alert */ +code span.an { color: #60a0b0; font-weight: bold; font-style: italic; } /* Annotation */ +code span.at { color: #7d9029; } /* Attribute */ +code span.bn { color: #40a070; } /* BaseN */ +code span.bu { color: #008000; } /* BuiltIn */ +code span.cf { color: #007020; font-weight: bold; } /* ControlFlow */ +code span.ch { color: #4070a0; } /* Char */ +code span.cn { color: #880000; } /* Constant */ +code span.co { color: #60a0b0; font-style: italic; } /* Comment */ +code span.cv { color: #60a0b0; font-weight: bold; font-style: italic; } /* CommentVar */ +code span.do { color: #ba2121; font-style: italic; } /* Documentation */ +code span.dt { color: #902000; } /* DataType */ +code span.dv { color: #40a070; } /* DecVal */ +code span.er { color: #ff0000; font-weight: bold; } /* Error */ +code span.ex { } /* Extension */ +code span.fl { color: #40a070; } /* Float */ +code span.fu { color: #06287e; } /* Function */ +code span.im { color: #008000; font-weight: bold; } /* Import */ +code span.in { color: #60a0b0; font-weight: bold; font-style: italic; } /* Information */ +code span.kw { color: #007020; font-weight: bold; } /* Keyword */ +code span.op { color: #666666; } /* Operator */ +code span.ot { color: #007020; } /* Other */ +code span.pp { color: #bc7a00; } /* Preprocessor */ +code span.sc { color: #4070a0; } /* SpecialChar */ +code span.ss { color: #bb6688; } /* SpecialString */ +code span.st { color: #4070a0; } /* String */ +code span.va { color: #19177c; } /* Variable */ +code span.vs { color: #4070a0; } /* VerbatimString */ +code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warning */ @@ -60,6 +123,7 @@ } } + @@ -123,14 +187,32 @@

operators

On this page

@@ -160,13 +242,43 @@

operators

+

The out_grad parameter refers to the gradient of the loss function with respect to the output of the node. Multiplying this with the local gradient gives the gradient of the loss with respect to the input to the node, according to the chain rule of calculus, which is the basis for backpropagation in neural networks.

+

The chain rule is a fundamental concept in calculus that provides a method to compute the derivative of composite functions. In simple terms, the chain rule states that the derivative of a composite function is the derivative of the outer function multiplied by the derivative of the inner function.

+

Given a composite function that is the composition of two functions, say, \(f(g(x))\), the chain rule can be stated as follows:

+

\[\frac{df}{dx} = \frac{df}{dg} \cdot \frac{dg}{dx}\]

+

Where:

- \(\frac{df}{dx}\) is the derivative of the composite function \(f(g(x))\) with respect to \(x\)
- \(\frac{df}{dg}\) is the derivative of the outer function \(f\) with respect to \(g\)
- \(\frac{dg}{dx}\) is the derivative of the inner function \(g\) with respect to \(x\)

The chain rule can be extended to the case where we have more than two composite functions.

+
+
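As a quick, framework-independent illustration (plain NumPy, not part of minima), the chain rule can be checked numerically on a concrete composite such as f(g(x)) with g(x) = x^2 and f(u) = sin(u):
+
+import numpy as np
+
+x, eps = 1.3, 1e-6
+g = lambda t: t ** 2             # inner function
+f = lambda u: np.sin(u)          # outer function
+
+analytic = np.cos(g(x)) * 2 * x  # df/dg * dg/dx = cos(x^2) * 2x
+numeric = (f(g(x + eps)) - f(g(x - eps))) / (2 * eps)
+print(analytic, numeric)         # the two values agree to roughly six decimals
+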

Element Wise Addition

+

Let’s walk through the step-by-step derivative calculation for the EWiseAdd operation:

+

We have the function f(a, b) = a + b, where a and b are tensors. Our goal is to compute the partial derivatives with respect to a and b.

+

Let’s start by calculating the derivative of f with respect to a, denoted as df/da:

+

Step 1: Compute the derivative of f with respect to a.

+

\[\frac{{\partial f}}{{\partial a}} = \frac{{\partial}}{{\partial a}} (a + b)\]

+

Since a is the variable we are differentiating with respect to, the derivative of a with respect to itself is 1:

+

\[\frac{{\partial f}}{{\partial a}} = 1\]

+

Therefore, \[\frac{{\partial f}}{{\partial a}} = 1.\]

+

Step 2: Compute the derivative of f with respect to b.

+

\[\frac{{\partial f}}{{\partial b}} = \frac{{\partial}}{{\partial b}} (a + b)\]

+

Again, since b is the variable we are differentiating with respect to, the derivative of b with respect to itself is 1:

+

\[\frac{{\partial f}}{{\partial b}} = 1\]

+

Therefore, \[\frac{{\partial f}}{{\partial b}} = 1\]

+

Hence, the partial derivatives of f(a, b) = a + b with respect to a and b are both equal to 1.


-

source

+

source

add

-
 add (a, b)
+
 add (a:minima.autograd.Tensor, b:minima.autograd.Tensor)
+

Adds two tensors element-wise.

+

Args: - a: The first tensor. - b: The second tensor.

+

Returns: The element-wise sum of a and b.


source

@@ -175,58 +287,626 @@

EWiseAdd

 EWiseAdd ()
-

Op class specialized to output tensors, will be alternate subclasses for other structures

-
-

source

+

Performs element-wise addition of two tensors.

+

Example: >>> a = Tensor([1, 2, 3]) >>> b = Tensor([4, 5, 6]) >>> op = EWiseAdd() >>> result = op.compute(a, b) >>> print(result) Tensor([5, 7, 9])

+
+
# Create two 1-D tensors
+a = Tensor([1, 2, 3])
+b = Tensor([4, 5, 6])
+
+# Create an EWiseAdd operation
+op = EWiseAdd()
+
+
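The cell above only constructs the operation; a natural continuation (a usage sketch, with the exact printed representation depending on Tensor's repr) is to apply it through the add wrapper documented above:
+
+result = add(a, b)  # element-wise sum of a and b
+print(result)       # values: [5, 7, 9]
+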
+
+

Scalar Addition

+

Explanation for the derivative of the AddScalar operator:

+

Let’s denote the scalar as c and a as the tensor to which the scalar is added. The operation can be described as f(a) = a + c.

+

The function for the backward pass (i.e., the gradient) is df/da = 1, which means the derivative of f(a) with respect to a is simply 1.

+

We are given a function \(f(a) = a + c\), where \(a\) is a tensor and \(c\) is a scalar. Our task is to find the derivative of this function with respect to \(a\).

+

By differentiating the function \(f(a)\) with respect to \(a\), we find:

+

\[\begin{align*} \frac{df}{da} &= \frac{d}{da} (a + c) \\ &= 1 \end{align*}\]

+

Therefore, the gradient of \(f(a)\) with respect to \(a\) is \(1\).

+

We start by defining the function f(a) = a + c. When we differentiate f(a) with respect to a, we find that the derivative is 1. This means that the gradient of f(a) with respect to a is 1, which matches the behavior of the AddScalar operator's gradient method.

+
+

source

add_scalar

-
 add_scalar (a, scalar)
+
 add_scalar (a:minima.autograd.Tensor, scalar:Union[int,float])
+

Adds a scalar to a tensor.

+

Args: - a: The tensor. - scalar: The scalar to add.

+

Returns: The sum of a and the scalar.


-

source

+

source

AddScalar

-
 AddScalar (scalar)
+
 AddScalar (scalar:Union[int,float])
-

Op class specialized to output tensors, will be alternate subclasses for other structures

-
-

source

+

Performs addition of a tensor and a scalar.

+

Example: >>> a = Tensor([1, 2, 3]) >>> op = AddScalar(5) >>> result = op.compute(a) >>> print(result) Tensor([6, 7, 8])

+
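A short usage sketch of the functional wrapper (the exact printed representation depends on Tensor's repr):
+
+a = Tensor([1, 2, 3])
+result = add_scalar(a, 5)  # add the scalar 5 to every element
+print(result)              # values: [6, 7, 8]
+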
+
+

Element Wise Multiplication

+

Explanation for the derivative of the EWiseMul (element-wise multiplication) operator:

+

Let’s denote the two input tensors as a and b. The operation can be described as f(a, b) = a * b, where * represents element-wise multiplication.

+

The function for the backward pass (i.e., the gradient) is df/da = b and df/db = a. This means that the derivative of f(a, b) with respect to a is b, and the derivative with respect to b is a.

+

We are given a function \(f(a, b) = a \odot b\), where \(a\) and \(b\) are tensors, and \(\odot\) represents element-wise multiplication. Our task is to find the derivatives of this function with respect to \(a\) and \(b\).

+

By differentiating the function \(f(a, b)\) with respect to \(a\), we find:

+

\[\begin{align*} \frac{df}{da} &= \frac{d}{da} (a \odot b) \\ &= b \end{align*}\]

+

Therefore, the gradient of \(f(a, b)\) with respect to \(a\) is \(b\).

+

Similarly, by differentiating the function \(f(a, b)\) with respect to \(b\), we find:

+

\[\begin{align*} \frac{df}{db} &= \frac{d}{db} (a \odot b) \\ &= a \end{align*}\]

+

Therefore, the gradient of \(f(a, b)\) with respect to \(b\) is \(a\).

+
+

source

multiply

-
 multiply (a, b)
+
 multiply (a:minima.autograd.Tensor, b:minima.autograd.Tensor)
+

Multiplies two tensors element-wise.

+

Args: - a: The first tensor. - b: The second tensor.

+

Returns: The element-wise product of a and b.


-

source

+

source

EWiseMul

 EWiseMul ()
-

Op class specialized to output tensors, will be alternate subclasses for other structures

-
-

source

+

Performs element-wise multiplication of two tensors.

+

Example: >>> a = Tensor([1, 2, 3]) >>> b = Tensor([4, 5, 6]) >>> op = EWiseMul() >>> result = op.compute(a, b) >>> print(result) Tensor([4, 10, 18])

+
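The claim df/da = b can be checked numerically with plain NumPy (independent of minima) by perturbing a while holding b fixed:
+
+import numpy as np
+
+a = np.array([1.0, 2.0, 3.0])
+b = np.array([4.0, 5.0, 6.0])
+eps = 1e-6
+
+numeric = ((a + eps) * b - (a - eps) * b) / (2 * eps)  # central difference in a
+print(numeric)  # approximately [4. 5. 6.], i.e. exactly b
+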
+
+

Scalar Multiplication

+

Let’s denote the scalar as c and a as the tensor being multiplied by the scalar. The operation can be described as f(a) = a * c.

+

The function for the backward pass (i.e., the gradient) is df/da = c, which means the derivative of f(a) with respect to a is c.

+

More formally:

+

We are given a function \(f(a) = a \cdot c\), where \(a\) is a tensor and \(c\) is a scalar. Our task is to find the derivative of this function with respect to \(a\).

+

By differentiating the function \(f(a)\) with respect to \(a\), we find:

+

\[\begin{align*} \frac{df}{da} &= \frac{d}{da} (a \cdot c) \\ &= c \end{align*}\]

+

Therefore, the gradient of \(f(a)\) with respect to \(a\) is \(c\).

+

We start by defining the function f(a) = a * c. When we differentiate f(a) with respect to a, we find that the derivative is c. This means that the gradient of f(a) with respect to a is c, which matches the behavior of the MulScalar operator's gradient method.

+
+

source

mul_scalar

-
 mul_scalar (a, scalar)
+
 mul_scalar (a:minima.autograd.Tensor, scalar:Union[int,float])
+

Multiplies a tensor by a scalar.

+

Args: - a: The tensor. - scalar: The scalar to multiply.

+

Returns: The product of a and the scalar.


-

source

+

source

MulScalar

-
 MulScalar (scalar)
+
 MulScalar (scalar:Union[int,float])
-

Op class specialized to output tensors, will be alternate subclasses for other structures

+

Performs multiplication of a tensor and a scalar.

+

Example: >>> a = Tensor([1, 2, 3]) >>> op = MulScalar(5) >>> result = op.compute(a) >>> print(result) Tensor([5, 10, 15])

+
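Because the functional wrappers take and return Tensors, the scalar ops compose directly with the element-wise ones; a small sketch (printed values only, the exact repr may differ):
+
+a = Tensor([1, 2, 3])
+b = Tensor([4, 5, 6])
+y = mul_scalar(add(a, b), 2)  # (a + b) * 2
+print(y)                      # values: [10, 14, 18]
+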
+
+
+

Negation

+

Let’s denote a as the tensor being negated. The operation can be described as f(a) = -a.

+

The function for the backward pass (i.e., the gradient) is df/da = -1.

+

We are given a function \(f(a) = -a\), where \(a\) is a tensor. Our task is to find the derivative of this function with respect to \(a\).

+

By differentiating the function \(f(a)\) with respect to \(a\), we find:

+

\[\begin{align*} \frac{df}{da} &= \frac{d}{da} (-a) \\ &= -1 \end{align*}\]

+

Therefore, the gradient of \(f(a)\) with respect to \(a\) is \(-1\).

+
+
class Negate(TensorOp):
+    """
+    Negates the given tensor.
+    
+    Example:
+    >>> a = Tensor([1, -2, 3])
+    >>> op = Negate()
+    >>> result = op.compute(a)
+    >>> print(result)
+    Tensor([-1, 2, -3])
+    """
+    
+    def compute(self, a: NDArray) -> NDArray:
+        """
+        Computes the negation of a tensor.
+
+        Args:
+        - a: The tensor to negate.
+
+        Returns:
+        The negation of a.
+        """
+        return -1 * a
+
+    def gradient(self, out_grad: Tensor, node: Tensor) -> Tuple[Tensor,]:
+        """
+        Computes the gradient of the negation operation.
+
+        Args:
+        - out_grad: The gradient of the output of the operation.
+        - node: The node in the computational graph where the operation was performed.
+
+        Returns:
+        The gradients with respect to the inputs.
+        """
+        return (negate(out_grad), )
+
+
+def negate(a: Tensor) -> Tensor:
+    """
+    Negates the given tensor.
+
+    Args:
+    - a: The tensor to negate.
+
+    Returns:
+    The negation of a.
+    
+    Example:
+    >>> a = Tensor([1, -2, 3])
+    >>> result = negate(a)
+    >>> print(result)
+    Tensor([-1, 2, -3])
+    """
+    return Negate()(a)
+
+
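A short usage sketch of the negate wrapper; applying it twice stacks two Negate nodes and recovers the original values (the exact printed representation depends on Tensor's repr):
+
+a = Tensor([1, -2, 3])
+print(negate(a))          # values: [-1, 2, -3]
+print(negate(negate(a)))  # double negation: back to [1, -2, 3]
+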
+
+

Exp

+

Explanation for the derivative of the Exp operator:

+

Let’s denote a as the tensor on which the exponential function is applied. The operation can be described as f(a) = exp(a), where exp represents the exponential function.

+

The function for the backward pass (i.e., the gradient) is df/da = exp(a).

+

We are given a function \(f(a) = \exp(a)\), where \(a\) is a tensor. Our task is to find the derivative of this function with respect to \(a\).

+

By differentiating the function \(f(a)\) with respect to \(a\), we find:

+

\[\begin{align*} \frac{df}{da} &= \frac{d}{da} (\exp(a)) \\ &= \exp(a) \end{align*}\]

+

Therefore, the gradient of \(f(a)\) with respect to \(a\) is \(\exp(a)\).

+
+
class Exp(TensorOp):
+    """
+    Calculates the exponential of the given tensor.
+    
+    Example:
+    >>> a = Tensor([1, 2, 3])
+    >>> op = Exp()
+    >>> result = op.compute(a)
+    >>> print(result)
+    Tensor([2.71828183, 7.3890561, 20.08553692])
+    """
+    
+    def compute(self, a: NDArray) -> NDArray:
+        """
+        Computes the exponential of a tensor.
+
+        Args:
+        - a: The tensor.
+
+        Returns:
+        The exponential of a.
+        """
+        self.out = array_api.exp(a)
+        return self.out
+
+    def gradient(self, out_grad: Tensor, node: Tensor) -> Tuple[Tensor,]:
+        """
+        Computes the gradient of the exponential operation.
+
+        Args:
+        - out_grad: The gradient of the output of the operation.
+        - node: The node in the computational graph where the operation was performed.
+
+        Returns:
+        The gradients with respect to the inputs.
+        """
+        return (out_grad * self.out, )
+
+def exp(a: Tensor) -> Tensor:
+    """
+    Calculates the exponential of the given tensor.
+
+    Args:
+    - a: The tensor.
+
+    Returns:
+    The exponential of a.
+    
+    Example:
+    >>> a = Tensor([1, 2, 3])
+    >>> result = exp(a)
+    >>> print(result)
+    Tensor([2.71828183, 7.3890561, 20.08553692])
+    """
+    return Exp()(a)
+
+
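Unlike the constant gradients above, the gradient of exp depends on the input itself; a quick NumPy check (independent of minima):
+
+import numpy as np
+
+a, eps = np.array([1.0, 2.0, 3.0]), 1e-6
+numeric = (np.exp(a + eps) - np.exp(a - eps)) / (2 * eps)  # central difference
+print(np.allclose(numeric, np.exp(a)))  # True: d exp(a)/da = exp(a)
+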
+
+

ReLU

+

The derivative of the ReLU (Rectified Linear Unit) operator:

+

Let’s denote a as the tensor on which the ReLU function is applied. The ReLU function is defined as follows:

+

\[ f(a) = \begin{cases} a, & \text{if } a \geq 0 \\ 0, & \text{if } a < 0 \end{cases} \]

+

The function for the backward pass (i.e., the gradient) is df/da = 1 if a >= 0, and df/da = 0 if a < 0.

+

We are given a function \(f(a) = \max(0, a)\), where \(a\) is a tensor. Our task is to find the derivative of this function with respect to \(a\).

+

By considering the definition of the ReLU function, we can write \(f(a)\) as:

+

\[ f(a) = \begin{cases} a, & \text{if } a \geq 0 \\ 0, & \text{if } a < 0 \end{cases} \]

+

Now, let’s differentiate \(f(a)\) with respect to \(a\):

+

\[ \frac{df}{da} = \begin{cases} 1, & \text{if } a \geq 0 \\ 0, & \text{if } a < 0 \end{cases} \]

+

Therefore, the gradient of \(f(a)\) with respect to \(a\) is \(1\) if \(a \geq 0\), and \(0\) if \(a < 0\).

+
+
class ReLU(TensorOp):
+    """
+    Applies the ReLU (Rectified Linear Unit) activation function to the given tensor.
+    
+    Example:
+    >>> a = Tensor([1, -2, 3])
+    >>> op = ReLU()
+    >>> result = op.compute(a)
+    >>> print(result)
+    Tensor([1, 0, 3])
+    """
+    
+    def compute(self, a: NDArray) -> NDArray:
+        """
+        Computes the ReLU activation function on a tensor.
+
+        Args:
+        - a: The tensor.
+
+        Returns:
+        The result of applying ReLU to a.
+        """
+        self.out = array_api.clip(a, a_min=0, a_max=None)
+        return self.out
+
+    def gradient(self, out_grad: Tensor, node: Tensor) -> Tuple[Tensor,]:
+        """
+        Computes the gradient of the ReLU operation.
+
+        Args:
+        - out_grad: The gradient of the output of the operation.
+        - node: The node in the computational graph where the operation was performed.
+
+        Returns:
+        The gradients with respect to the inputs.
+        """
+        return (out_grad * Tensor(node.children[0] >= 0), )
+
+def relu(a: Tensor) -> Tensor:
+    """
+    Applies the ReLU (Rectified Linear Unit) activation function to the given tensor.
+
+    Args:
+    - a: The tensor.
+
+    Returns:
+    The result of applying ReLU to a.
+    
+    Example:
+    >>> a = Tensor([1, -2, 3])
+    >>> result = relu(a)
+    >>> print(result)
+    Tensor([1, 0, 3])
+    """
+    return ReLU()(a)
+
+
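The same forward pass and gradient mask can be written directly in NumPy (a sketch for intuition, independent of minima):
+
+import numpy as np
+
+a = np.array([1.0, -2.0, 3.0])
+forward = np.maximum(a, 0)       # clip everything below zero
+mask = (a >= 0).astype(a.dtype)  # local gradient: 1 where a >= 0, else 0
+print(forward, mask)             # [1. 0. 3.] [1. 0. 1.]
+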
+
+

Power Scalar

+

The derivative of the PowerScalar operator:

+

Let’s denote the scalar as n and a as the tensor being raised to the power of the scalar. The operation can be described as f(a) = a^n.

+

The function for the backward pass (i.e., the gradient) is df/da = n * a^(n-1).

+

We are given a function \(f(a) = a^n\), where \(a\) is a tensor and \(n\) is a scalar. Our task is to find the derivative of this function with respect to \(a\).

+

By differentiating the function \(f(a)\) with respect to \(a\), we find:

+

\[\begin{align*} \frac{df}{da} &= \frac{d}{da} (a^n) \\ &= n \cdot a^{n-1} \end{align*}\]

+

Therefore, the gradient of \(f(a)\) with respect to \(a\) is \(n \cdot a^{n-1}\).

+
+
class PowerScalar(TensorOp):
+    """
+    The PowerScalar operation raises a tensor to an (integer) power.
+
+    Attributes:
+        scalar (int): The power to raise the tensor to.
+
+    Example:
+        >>> import numpy as np
+        >>> tensor = Tensor(np.array([1, 2, 3]))
+        >>> pow_scalar = PowerScalar(2)
+        >>> result = pow_scalar.compute(tensor.data)
+        >>> print(result)
+        array([1, 4, 9])
+
+    """
+
+    def __init__(self, scalar: int):
+        """
+        Constructs the PowerScalar operation.
+
+        Args:
+            scalar (int): The power to raise the tensor to.
+        """
+        self.scalar = scalar
+
+    def compute(self, a: NDArray) -> NDArray:
+        """
+        Computes the power operation on the input tensor.
+
+        Args:
+            a (NDArray): The input tensor.
+
+        Returns:
+            NDArray: The resulting tensor after the power operation.
+        """
+        return array_api.power(a, self.scalar)
+
+    def gradient(self, out_grad: Tensor, node: Tensor) -> Tuple[Tensor, ]:
+        """
+        Computes the gradient of the power operation.
+
+        Args:
+            out_grad (Tensor): The gradient of the output tensor.
+            node (Tensor): The node in the computational graph where the operation was performed.
+
+        Returns:
+            Tuple[Tensor, ]: The gradient with respect to the input tensor.
+        """
+        a = node.children[0]
+        return (self.scalar * power_scalar(a, self.scalar - 1) * out_grad, )
+
+
+def power_scalar(a: Tensor, scalar: int) -> Tensor:
+    """
+    Raises a tensor to a power.
+
+    Args:
+        a (Tensor): The input tensor.
+        scalar (int): The power to raise the tensor to.
+
+    Returns:
+        Tensor: The resulting tensor after the power operation.
+
+    Example:
+        >>> import numpy as np
+        >>> tensor = Tensor(np.array([1, 2, 3]))
+        >>> result = power_scalar(tensor, 2)
+        >>> print(result)
+        Tensor([1, 4, 9])
+    """
+    return PowerScalar(scalar)(a)
+
+
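For the values in the docstring example above (a = [1, 2, 3], n = 2), the gradient formula evaluates as follows (plain NumPy, independent of minima):
+
+import numpy as np
+
+a, n = np.array([1.0, 2.0, 3.0]), 2
+print(n * a ** (n - 1))  # [2. 4. 6.], i.e. n * a**(n-1)
+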
+
+

Element Wise Divide

+

The operation described here is an element-wise division of two tensors, a and b, where the operation can be described as f(a, b) = a / b.

+

We’ll compute the partial derivatives with respect to a and b:

+
1. The partial derivative of f(a, b) with respect to a (df/da) is 1/b.

2. The partial derivative of f(a, b) with respect to b (df/db) is -a / b^2.
+

We are given a function \(f(a, b) = \frac{a}{b}\), where \(a\) and \(b\) are tensors. Our task is to find the partial derivatives of this function with respect to \(a\) and \(b\).

+

Let’s start with \(\frac{\partial f}{\partial a}\):

+

\[\begin{align*} \frac{\partial f}{\partial a} &= \frac{\partial}{\partial a} \left(\frac{a}{b}\right) \\ &= \frac{1}{b} \end{align*}\]

+

Now, let’s compute \(\frac{\partial f}{\partial b}\):

+

\[\begin{align*} \frac{\partial f}{\partial b} &= \frac{\partial}{\partial b} \left(\frac{a}{b}\right) \\ &= - \frac{a}{b^{2}} \end{align*}\]

+

Here is a detailed derivation:

+

Given a function of the form \(y = \frac{u}{v}\), where both \(u\) and \(v\) are functions of \(x\), the quotient rule of differentiation states:

+

\[\frac{dy}{dx} = \frac{v \cdot \frac{du}{dx} - u \cdot \frac{dv}{dx}}{v^2}\]

+

In our case, we’re looking at the function \(y = \frac{a}{b}\), where \(a\) and \(b\) are tensors. We want to find the derivative with respect to \(b\) (instead of \(x\) in our general formula). So we have:

+

\[\frac{dy}{db} = \frac{b \cdot \frac{da}{db} - a \cdot \frac{db}{db}}{b^2}\]

+

Since \(a\) does not depend on \(b\), \(\frac{da}{db} = 0\), and since any variable is equal to itself, \(\frac{db}{db} = 1\).

+

So the derivative \(\frac{dy}{db}\) simplifies to:

+

\[\frac{dy}{db} = \frac{b \cdot 0 - a \cdot 1}{b^2}\]

+

Therefore, the derivative of \(y\) with respect to \(b\) is \(-\frac{a}{b^2}\).

+

Therefore, the gradient of \(f(a, b)\) with respect to \(a\) is \(\frac{1}{b}\), and the gradient of \(f(a, b)\) with respect to \(b\) is \(- \frac{a}{b^{2}}\).

+
+
class EWiseDiv(TensorOp):
+    """
+    The EWiseDiv operation divides two tensors element-wise.
+
+    Example:
+        >>> import numpy as np
+        >>> a = Tensor(np.array([1, 2, 3]))
+        >>> b = Tensor(np.array([4, 5, 6]))
+        >>> div = EWiseDiv()
+        >>> result = div.compute(a.data, b.data)
+        >>> print(result)
+        array([0.25, 0.4, 0.5])
+
+    """
+
+    def compute(self, a: NDArray, b: NDArray) -> NDArray:
+        """
+        Computes the element-wise division of two tensors.
+
+        Args:
+            a (NDArray): The dividend tensor.
+            b (NDArray): The divisor tensor.
+
+        Returns:
+            NDArray: The resulting tensor after element-wise division.
+        """
+        return a / b
+
+    def gradient(self, out_grad: Tensor, node: Tensor) -> Tuple[Tensor, Tensor]:
+        """
+        Computes the gradient of the element-wise division operation.
+
+        Args:
+            out_grad (Tensor): The gradient of the output tensor.
+            node (Tensor): The node in the computational graph where the operation was performed.
+
+        Returns:
+            Tuple[Tensor, Tensor]: The gradients with respect to the dividend and divisor tensors.
+        """
+        a, b = node.children
+        return divide(out_grad, b), out_grad * negate(divide(a, power_scalar(b, 2)))
+
+
+def divide(a: Tensor, b: Tensor) -> Tensor:
+    """
+    Divides two tensors element-wise.
+
+    Args:
+        a (Tensor): The dividend tensor.
+        b (Tensor): The divisor tensor.
+
+    Returns:
+        Tensor: The resulting tensor after element-wise division.
+
+    Example:
+        >>> import numpy as np
+        >>> a = Tensor(np.array([1, 2, 3]))
+        >>> b = Tensor(np.array([4, 5, 6]))
+        >>> result = divide(a, b)
+        >>> print(result)
+        Tensor([0.25, 0.4, 0.5])
+    """
+    return EWiseDiv()(a, b)
+
+
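Both partial derivatives can be verified numerically with plain NumPy (independent of minima) by perturbing one argument at a time:
+
+import numpy as np
+
+a = np.array([1.0, 2.0, 3.0])
+b = np.array([4.0, 5.0, 6.0])
+eps = 1e-6
+
+df_da = ((a + eps) / b - (a - eps) / b) / (2 * eps)
+df_db = (a / (b + eps) - a / (b - eps)) / (2 * eps)
+print(np.allclose(df_da, 1 / b), np.allclose(df_db, -a / b ** 2))  # True True
+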
+
+

Divide Scalar

+

Let’s denote the scalar as c, and a as the tensor being divided by the scalar. The operation can be described as f(a) = a / c.

+

The function for the backward pass (i.e., the gradient) is df/da = 1/c.

+

This is the derivative of f(a) with respect to a.

+

We are given a function \(f(a) = \frac{a}{c}\), where \(a\) is a tensor and \(c\) is a scalar. Our task is to find the derivative of this function with respect to \(a\).

+

Since dividing by \(c\) is the same as multiplying by the constant \(c^{-1}\), we can rewrite \(f(a)\) as \(f(a) = c^{-1}a\) and differentiate using the constant-multiple rule.

+

Now, we can differentiate this with respect to \(a\):

+

\[\begin{align*} \frac{df}{da} &= \frac{d}{da} (c^{-1}a) \\ &= c^{-1} \frac{d}{da} (a) \\ &= c^{-1} \\ &= \frac{1}{c} \end{align*}\]

+

Therefore, the gradient of \(f(a)\) with respect to \(a\) is \(\frac{1}{c}\).

+
+
class DivScalar(TensorOp):
+    """
+    The DivScalar operation divides a tensor by a scalar.
+
+    Example:
+        >>> import numpy as np
+        >>> a = Tensor(np.array([1, 2, 3]))
+        >>> scalar = 2
+        >>> div_scalar = DivScalar(scalar)
+        >>> result = div_scalar.compute(a.data)
+        >>> print(result)
+        array([0.5, 1.0, 1.5])
+
+    """
+
+    def __init__(self, scalar: Union[int, float]):
+        """
+        Initialize the DivScalar operation with the scalar to divide by.
+
+        Args:
+            scalar (int, float): The scalar to divide the tensor by.
+        """
+        self.scalar = scalar
+
+    def compute(self, a: NDArray) -> NDArray:
+        """
+        Divides the tensor by the scalar.
+
+        Args:
+            a (NDArray): The tensor to divide.
+
+        Returns:
+            NDArray: The resulting tensor after division.
+        """
+        return a / self.scalar
+
+    def gradient(self, out_grad: Tensor, node: Tensor) -> Tuple[Tensor, ...]:
+        """
+        Computes the gradient of the division operation.
+
+        Args:
+            out_grad (Tensor): The gradient of the output tensor.
+            node (Tensor): The node in the computational graph where the operation was performed.
+
+        Returns:
+            Tuple[Tensor, ...]: The gradient with respect to the tensor.
+        """
+        return (out_grad / self.scalar, )
+
+def divide_scalar(a: Tensor, scalar: Union[int, float]) -> Tensor:
+    """
+    Divides a tensor by a scalar.
+
+    Args:
+        a (Tensor): The tensor to divide.
+        scalar (int, float): The scalar to divide the tensor by.
+
+    Returns:
+        Tensor: The resulting tensor after division.
+
+    Example:
+        >>> import numpy as np
+        >>> a = Tensor(np.array([1, 2, 3]))
+        >>> scalar = 2
+        >>> result = divide_scalar(a, scalar)
+        >>> print(result)
+        Tensor([0.5, 1.0, 1.5])
+    """
+    return DivScalar(scalar)(a)
+
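And a matching usage sketch for the functional wrapper (the exact printed representation depends on Tensor's repr):
+
+a = Tensor([1, 2, 3])
+print(divide_scalar(a, 2))  # values: [0.5, 1.0, 1.5]
+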
+
+
import nbdev; nbdev.nbdev_export()
+
diff --git a/_docs/search.json b/_docs/search.json index 618c46b..36e9ed5 100644 --- a/_docs/search.json +++ b/_docs/search.json @@ -4,7 +4,77 @@ "href": "operators.html", "title": "operators", "section": "", - "text": "source\n\nadd\n\n add (a, b)\n\n\nsource\n\n\nEWiseAdd\n\n EWiseAdd ()\n\nOp class specialized to output tensors, will be alternate subclasses for other structures\n\nsource\n\n\nadd_scalar\n\n add_scalar (a, scalar)\n\n\nsource\n\n\nAddScalar\n\n AddScalar (scalar)\n\nOp class specialized to output tensors, will be alternate subclasses for other structures\n\nsource\n\n\nmultiply\n\n multiply (a, b)\n\n\nsource\n\n\nEWiseMul\n\n EWiseMul ()\n\nOp class specialized to output tensors, will be alternate subclasses for other structures\n\nsource\n\n\nmul_scalar\n\n mul_scalar (a, scalar)\n\n\nsource\n\n\nMulScalar\n\n MulScalar (scalar)\n\nOp class specialized to output tensors, will be alternate subclasses for other structures" + "text": "The out_grad parameter refers to the gradient of the loss function with respect to the output of the node. Multiplying this with the local gradient gives the gradient of the loss with respect to the input to the node, according to the chain rule of calculus, which is the basis for backpropagation in neural networks.\nThe chain rule is a fundamental concept in calculus that provides a method to compute the derivative of composite functions. In simple terms, the chain rule states that the derivative of a composite function is the derivative of the outer function multiplied by the derivative of the inner function.\nGiven a composite function that is the composition of two functions, say, \\(f(g(x))\\), the chain rule can be stated as follows:\n\\[\\frac{df}{dx} = \\frac{df}{dg} \\cdot \\frac{dg}{dx}\\]\nWhere:\nThe chain rule can be extended to the case where we have more than two composite functions." + }, + { + "objectID": "operators.html#element-wise-addition", + "href": "operators.html#element-wise-addition", + "title": "operators", + "section": "Element Wise Addition", + "text": "Element Wise Addition\nLet’s walk through the step-by-step derivative calculation for the EWiseAdd operation:\nWe have the function f(a, b) = a + b, where a and b are tensors. Our goal is to compute the partial derivatives with respect to a and b.\nLet’s start by calculating the derivative of f with respect to a, denoted as df/da:\nStep 1: Compute the derivative of f with respect to a.\n\\(\\frac{{\\partial f}}{{\\partial a}} = \\frac{{\\partial}}{{\\partial a}} (a + b)\\)\nSince a is the variable we are differentiating with respect to, the derivative of a with respect to itself is 1:\n\\[\\frac{{\\partial f}}{{\\partial a}} = 1\\]\nTherefore, \\[\\frac{{\\partial f}}{{\\partial a}} = 1.\\]\nStep 2: Compute the derivative of f with respect to b.\n\\[\\frac{{\\partial f}}{{\\partial b}} = \\frac{{\\partial}}{{\\partial b}} (a + b)\\]\nAgain, since b is the variable we are differentiating with respect to, the derivative of b with respect to itself is 1:\n\\[\\frac{{\\partial f}}{{\\partial b}} = 1\\]\nTherefore, \\[\\frac{{\\partial f}}{{\\partial b}} = 1\\]\nHence, the partial derivatives of f(a, b) = a + b with respect to a and b are both equal to 1.\n\nsource\n\nadd\n\n add (a:minima.autograd.Tensor, b:minima.autograd.Tensor)\n\nAdds two tensors element-wise.\nArgs: - a: The first tensor. 
- b: The second tensor.\nReturns: The element-wise sum of a and b.\n\nsource\n\n\nEWiseAdd\n\n EWiseAdd ()\n\nPerforms element-wise addition of two tensors.\nExample: >>> a = Tensor([1, 2, 3]) >>> b = Tensor([4, 5, 6]) >>> op = EWiseAdd() >>> result = op.compute(a, b) >>> print(result) Tensor([5, 7, 9])\n\n# Create two 1-D tensors\na = Tensor([1, 2, 3])\nb = Tensor([4, 5, 6])\n\n# Create an EWiseAdd operation\nop = EWiseAdd()" + }, + { + "objectID": "operators.html#scalar-addition", + "href": "operators.html#scalar-addition", + "title": "operators", + "section": "Scalar Addition", + "text": "Scalar Addition\nExplanation for the derivative of the AddScalar operator:\nLet’s denote the scalar as c and a as the tensor being added by the scalar. The operation can be described as f(a) = a + c.\nThe function for the backward pass (i.e., the gradient) is df/da = 1, which means the derivative of f(a) with respect to a is simply 1.\nWe are given a function \\(f(a) = a + c\\), where \\(a\\) is a tensor and \\(c\\) is a scalar. Our task is to find the derivative of this function with respect to \\(a\\).\nBy differentiating the function \\(f(a)\\) with respect to \\(a\\), we find:\n\\[\\begin{align*}\n\\frac{df}{da} &= \\frac{d}{da} (a + c) \\\\\n&= 1\n\\end{align*}\\]\nTherefore, the gradient of \\(f(a)\\) with respect to \\(a\\) is \\(1\\).\nWe starts by defining the function f(a) = a + c. It then explains that when we differentiate f(a) with respect to a, we find that the derivative is 1. This means that the gradient of f(a) with respect to a is 1, which matches the behavior of the AddScalar operator as provided in the gradient method.\n\nsource\n\nadd_scalar\n\n add_scalar (a:minima.autograd.Tensor, scalar:Union[int,float])\n\nAdds a scalar to a tensor.\nArgs: - a: The tensor. - scalar: The scalar to add.\nReturns: The sum of a and the scalar.\n\nsource\n\n\nAddScalar\n\n AddScalar (scalar:Union[int,float])\n\nPerforms addition of a tensor and a scalar.\nExample: >>> a = Tensor([1, 2, 3]) >>> op = AddScalar(5) >>> result = op.compute(a) >>> print(result) Tensor([6, 7, 8])" + }, + { + "objectID": "operators.html#element-wise-multiplication", + "href": "operators.html#element-wise-multiplication", + "title": "operators", + "section": "Element Wise Multiplication", + "text": "Element Wise Multiplication\nExplanation for the derivative of the EWiseMul (element-wise multiplication) operator:\nLet’s denote the two input tensors as a and b. The operation can be described as f(a, b) = a * b, where * represents element-wise multiplication.\nThe function for the backward pass (i.e., the gradient) is df/da = b and df/db = a. This means that the derivative of f(a, b) with respect to a is b, and the derivative with respect to b is a.\nWe are given a function \\(f(a, b) = a \\odot b\\), where \\(a\\) and \\(b\\) are tensors, and \\(\\odot\\) represents element-wise multiplication. 
Our task is to find the derivatives of this function with respect to \\(a\\) and \\(b\\).\nBy differentiating the function \\(f(a, b)\\) with respect to \\(a\\), we find:\n\\[\\begin{align*}\n\\frac{df}{da} &= \\frac{d}{da} (a \\odot b) \\\\\n&= b\n\\end{align*}\\]\nTherefore, the gradient of \\(f(a, b)\\) with respect to \\(a\\) is \\(b\\).\nSimilarly, by differentiating the function \\(f(a, b)\\) with respect to \\(b\\), we find:\n\\[\\begin{align*}\n\\frac{df}{db} &= \\frac{d}{db} (a \\odot b) \\\\\n&= a\n\\end{align*}\\]\nTherefore, the gradient of \\(f(a, b)\\) with respect to \\(b\\) is \\(a\\).\n\nsource\n\nmultiply\n\n multiply (a:minima.autograd.Tensor, b:minima.autograd.Tensor)\n\nMultiplies two tensors element-wise.\nArgs: - a: The first tensor. - b: The second tensor.\nReturns: The element-wise product of a and b.\n\nsource\n\n\nEWiseMul\n\n EWiseMul ()\n\nPerforms element-wise multiplication of two tensors.\nExample: >>> a = Tensor([1, 2, 3]) >>> b = Tensor([4, 5, 6]) >>> op = EWiseMul() >>> result = op.compute(a, b) >>> print(result) Tensor([4, 10, 18])" + }, + { + "objectID": "operators.html#scalar-multiplication", + "href": "operators.html#scalar-multiplication", + "title": "operators", + "section": "Scalar Multiplication", + "text": "Scalar Multiplication\nLet’s denote the scalar as c and a as the tensor being multiplied by the scalar. The operation can be described as f(a) = a * c.\nThe function for the backward pass (i.e., the gradient) is df/da = c, which means the derivative of f(a) with respect to a is c.\nThe LaTeX document will look as follows:\nWe are given a function \\(f(a) = a \\cdot c\\), where \\(a\\) is a tensor and \\(c\\) is a scalar. Our task is to find the derivative of this function with respect to \\(a\\).\nBy differentiating the function \\(f(a)\\) with respect to \\(a\\), we find:\n\\[\\begin{align*}\n\\frac{df}{da} &= \\frac{d}{da} (a \\cdot c) \\\\\n&= c\n\\end{align*}\\]\nTherefore, the gradient of \\(f(a)\\) with respect to \\(a\\) is \\(c\\).\nWe starts by defining the function f(a) = a * c. It then explains that when we differentiate f(a) with respect to a, we find that the derivative is c. This means that the gradient of f(a) with respect to a is c, which matches the behavior of the MulScalar operator as provided in the gradient method.\n\nsource\n\nmul_scalar\n\n mul_scalar (a:minima.autograd.Tensor, scalar:Union[int,float])\n\nMultiplies a tensor by a scalar.\nArgs: - a: The tensor. - scalar: The scalar to multiply.\nReturns: The product of a and the scalar.\n\nsource\n\n\nMulScalar\n\n MulScalar (scalar:Union[int,float])\n\nPerforms multiplication of a tensor and a scalar.\nExample: >>> a = Tensor([1, 2, 3]) >>> op = MulScalar(5) >>> result = op.compute(a) >>> print(result) Tensor([5, 10, 15])" + }, + { + "objectID": "operators.html#negation", + "href": "operators.html#negation", + "title": "operators", + "section": "Negation", + "text": "Negation\nLet’s denote a as the tensor being negated. The operation can be described as f(a) = -a.\nThe function for the backward pass (i.e., the gradient) is df/da = -1.\nWe are given a function \\(f(a) = -a\\), where \\(a\\) is a tensor. 
Our task is to find the derivative of this function with respect to \\(a\\).\nBy differentiating the function \\(f(a)\\) with respect to \\(a\\), we find:\n\\[\\begin{align*}\n\\frac{df}{da} &= \\frac{d}{da} (-a) \\\\\n&= -1\n\\end{align*}\\]\nTherefore, the gradient of \\(f(a)\\) with respect to \\(a\\) is \\(-1\\).\n\nclass Negate(TensorOp):\n \"\"\"\n Negates the given tensor.\n \n Example:\n >>> a = Tensor([1, -2, 3])\n >>> op = Negate()\n >>> result = op.compute(a)\n >>> print(result)\n Tensor([-1, 2, -3])\n \"\"\"\n \n def compute(self, a: NDArray) -> NDArray:\n \"\"\"\n Computes the negation of a tensor.\n\n Args:\n - a: The tensor to negate.\n\n Returns:\n The negation of a.\n \"\"\"\n return -1 * a\n\n def gradient(self, out_grad: Tensor, node: Tensor) -> Tuple[Tensor,]:\n \"\"\"\n Computes the gradient of the negation operation.\n\n Args:\n - out_grad: The gradient of the output of the operation.\n - node: The node in the computational graph where the operation was performed.\n\n Returns:\n The gradients with respect to the inputs.\n \"\"\"\n return (negate(out_grad), )\n\n\ndef negate(a: Tensor) -> Tensor:\n \"\"\"\n Negates the given tensor.\n\n Args:\n - a: The tensor to negate.\n\n Returns:\n The negation of a.\n \n Example:\n >>> a = Tensor([1, -2, 3])\n >>> result = negate(a)\n >>> print(result)\n Tensor([-1, 2, -3])\n \"\"\"\n return Negate()(a)" + }, + { + "objectID": "operators.html#exp", + "href": "operators.html#exp", + "title": "operators", + "section": "Exp", + "text": "Exp\nExplanation for the derivative of the Exp operator:\nLet’s denote a as the tensor on which the exponential function is applied. The operation can be described as f(a) = exp(a), where exp represents the exponential function.\nThe function for the backward pass (i.e., the gradient) is df/da = exp(a).\nWe are given a function \\(f(a) = \\exp(a)\\), where \\(a\\) is a tensor. 
Our task is to find the derivative of this function with respect to \\(a\\).\nBy differentiating the function \\(f(a)\\) with respect to \\(a\\), we find:\n\\[\\begin{align*}\n\\frac{df}{da} &= \\frac{d}{da} (\\exp(a)) \\\\\n&= \\exp(a)\n\\end{align*}\\]\nTherefore, the gradient of \\(f(a)\\) with respect to \\(a\\) is \\(\\exp(a)\\).\n\nclass Exp(TensorOp):\n \"\"\"\n Calculates the exponential of the given tensor.\n \n Example:\n >>> a = Tensor([1, 2, 3])\n >>> op = Exp()\n >>> result = op.compute(a)\n >>> print(result)\n Tensor([2.71828183, 7.3890561, 20.08553692])\n \"\"\"\n \n def compute(self, a: NDArray) -> NDArray:\n \"\"\"\n Computes the exponential of a tensor.\n\n Args:\n - a: The tensor.\n\n Returns:\n The exponential of a.\n \"\"\"\n self.out = array_api.exp(a)\n return self.out\n\n def gradient(self, out_grad: Tensor, node: Tensor) -> Tuple[Tensor,]:\n \"\"\"\n Computes the gradient of the exponential operation.\n\n Args:\n - out_grad: The gradient of the output of the operation.\n - node: The node in the computational graph where the operation was performed.\n\n Returns:\n The gradients with respect to the inputs.\n \"\"\"\n return (out_grad * self.out, )\n\ndef exp(a: Tensor) -> Tensor:\n \"\"\"\n Calculates the exponential of the given tensor.\n\n Args:\n - a: The tensor.\n\n Returns:\n The exponential of a.\n \n Example:\n >>> a = Tensor([1, 2, 3])\n >>> result = exp(a)\n >>> print(result)\n Tensor([2.71828183, 7.3890561, 20.08553692])\n \"\"\"\n return Exp()(a)" + }, + { + "objectID": "operators.html#relu", + "href": "operators.html#relu", + "title": "operators", + "section": "ReLU", + "text": "ReLU\nThe derivative of the ReLU (Rectified Linear Unit) operator:\nLet’s denote a as the tensor on which the ReLU function is applied. The ReLU function is defined as follows:\n\\[\nf(a) =\n\\begin{cases}\na, & \\text{if } a \\geq 0 \\\\\n0, & \\text{if } a < 0\n\\end{cases}\n\\]\nThe function for the backward pass (i.e., the gradient) is df/da = 1 if a >= 0, and df/da = 0 if a < 0.\nWe are given a function \\(f(a) = \\max(0, a)\\), where \\(a\\) is a tensor. 
Our task is to find the derivative of this function with respect to \\(a\\).\nBy considering the definition of the ReLU function, we can write \\(f(a)\\) as:\n\\[\nf(a) =\n\\begin{cases}\na, & \\text{if } a \\geq 0 \\\\\n0, & \\text{if } a < 0\n\\end{cases}\n\\]\nNow, let’s differentiate \\(f(a)\\) with respect to \\(a\\):\n\\[\n\\frac{df}{da} =\n\\begin{cases}\n1, & \\text{if } a \\geq 0 \\\\\n0, & \\text{if } a < 0\n\\end{cases}\n\\]\nTherefore, the gradient of \\(f(a)\\) with respect to \\(a\\) is \\(1\\) if \\(a \\geq 0\\), and \\(0\\) if \\(a < 0\\).\n\nclass ReLU(TensorOp):\n \"\"\"\n Applies the ReLU (Rectified Linear Unit) activation function to the given tensor.\n \n Example:\n >>> a = Tensor([1, -2, 3])\n >>> op = ReLU()\n >>> result = op.compute(a)\n >>> print(result)\n Tensor([1, 0, 3])\n \"\"\"\n \n def compute(self, a: NDArray) -> NDArray:\n \"\"\"\n Computes the ReLU activation function on a tensor.\n\n Args:\n - a: The tensor.\n\n Returns:\n The result of applying ReLU to a.\n \"\"\"\n self.out = array_api.clip(a, a_min=0, a_max=None)\n return self.out\n\n def gradient(self, out_grad: Tensor, node: Tensor) -> Tuple[Tensor,]:\n \"\"\"\n Computes the gradient of the ReLU operation.\n\n Args:\n - out_grad: The gradient of the output of the operation.\n - node: The node in the computational graph where the operation was performed.\n\n Returns:\n The gradients with respect to the inputs.\n \"\"\"\n return (out_grad * Tensor(node.children[0] >= 0), )\n\ndef relu(a: Tensor) -> Tensor:\n \"\"\"\n Applies the ReLU (Rectified Linear Unit) activation function to the given tensor.\n\n Args:\n - a: The tensor.\n\n Returns:\n The result of applying ReLU to a.\n \n Example:\n >>> a = Tensor([1, -2, 3])\n >>> result = relu(a)\n >>> print(result)\n Tensor([1, 0, 3])\n \"\"\"\n return ReLU()(a)"
  },
  {
    "objectID": "operators.html#power-scalar",
    "href": "operators.html#power-scalar",
    "title": "operators",
    "section": "Power Scalar",
    "text": "Power Scalar\nThe derivative of the PowerScalar operator:\nLet’s denote the scalar as n and a as the tensor being raised to the power of the scalar. The operation can be described as f(a) = a^n.\nThe function for the backward pass (i.e., the gradient) is df/da = n * a^(n-1).\nWe are given a function \\(f(a) = a^n\\), where \\(a\\) is a tensor and \\(n\\) is a scalar. 
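The stated power rule (df/da = n * a^(n-1)) also checks out numerically; a standalone NumPy sketch with arbitrary sample values, not tied to the minima classes.

```python
import numpy as np

a, n, eps = 1.5, 3, 1e-6
finite_diff = ((a + eps) ** n - (a - eps) ** n) / (2 * eps)
print(finite_diff, n * a ** (n - 1))  # both ~6.75
```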
Our task is to find the derivative of this function with respect to \\(a\\).\nBy differentiating the function \\(f(a)\\) with respect to \\(a\\), we find:\n\\[\\begin{align*}\n\\frac{df}{da} &= \\frac{d}{da} (a^n) \\\\\n&= n \\cdot a^{n-1}\n\\end{align*}\\]\nTherefore, the gradient of \\(f(a)\\) with respect to \\(a\\) is \\(n \\cdot a^{n-1}\\).\n\nclass PowerScalar(TensorOp):\n \"\"\"\n The PowerScalar operation raises a tensor to an (integer) power.\n\n Attributes:\n scalar (int): The power to raise the tensor to.\n\n Example:\n >>> import numpy as np\n >>> tensor = Tensor(np.array([1, 2, 3]))\n >>> pow_scalar = PowerScalar(2)\n >>> result = pow_scalar.compute(tensor.data)\n >>> print(result)\n array([1, 4, 9])\n\n \"\"\"\n\n def __init__(self, scalar: int):\n \"\"\"\n Constructs the PowerScalar operation.\n\n Args:\n scalar (int): The power to raise the tensor to.\n \"\"\"\n self.scalar = scalar\n\n def compute(self, a: NDArray) -> NDArray:\n \"\"\"\n Computes the power operation on the input tensor.\n\n Args:\n a (NDArray): The input tensor.\n\n Returns:\n NDArray: The resulting tensor after the power operation.\n \"\"\"\n return array_api.power(a, self.scalar)\n\n def gradient(self, out_grad: Tensor, node: Tensor) -> Tuple[Tensor, ]:\n \"\"\"\n Computes the gradient of the power operation.\n\n Args:\n out_grad (Tensor): The gradient of the output tensor.\n node (Tensor): The node in the computational graph where the operation was performed.\n\n Returns:\n Tuple[Tensor, ]: The gradient with respect to the input tensor.\n \"\"\"\n a = node.children[0]\n return (self.scalar * power_scalar(a, self.scalar - 1) * out_grad, )\n\n\ndef power_scalar(a: Tensor, scalar: int) -> Tensor:\n \"\"\"\n Raises a tensor to a power.\n\n Args:\n a (Tensor): The input tensor.\n scalar (int): The power to raise the tensor to.\n\n Returns:\n Tensor: The resulting tensor after the power operation.\n\n Example:\n >>> import numpy as np\n >>> tensor = Tensor(np.array([1, 2, 3]))\n >>> result = power_scalar(tensor, 2)\n >>> print(result)\n Tensor([1, 4, 9])\n \"\"\"\n return PowerScalar(scalar)(a)" + }, + { + "objectID": "operators.html#element-wise-divide", + "href": "operators.html#element-wise-divide", + "title": "operators", + "section": "Element Wise Divide", + "text": "Element Wise Divide\nThe operation described here is an element-wise division of two tensors, a and b, where the operation can be described as f(a, b) = a / b.\nWe’ll compute the partial derivatives with respect to a and b:\n\nThe partial derivative of f(a, b) with respect to a (df/da) is 1/b.\nThe partial derivative of f(a, b) with respect to b (df/db) is -a / b^2.\n\nWe are given a function \\(f(a, b) = \\frac{a}{b}\\), where \\(a\\) and \\(b\\) are tensors. 
Our task is to find the partial derivatives of this function with respect to \\(a\\) and \\(b\\).\nLet’s start with \\(\\frac{\\partial f}{\\partial a}\\):\n\\[\\begin{align*}\n\\frac{\\partial f}{\\partial a} &= \\frac{\\partial}{\\partial a} \\left(\\frac{a}{b}\\right) \\\\\n&= \\frac{1}{b}\n\\end{align*}\\]\nNow, let’s compute \\(\\frac{\\partial f}{\\partial b}\\):\n\\[\\begin{align*}\n\\frac{\\partial f}{\\partial b} &= \\frac{\\partial}{\\partial b} \\left(\\frac{a}{b}\\right) \\\\\n&= - \\frac{a}{b^{2}}\n\\end{align*}\\]\nHere is a detailed derivative:\nGiven a function of the form \\(y = \\frac{u}{v}\\), where both \\(u\\) and \\(v\\) are functions of \\(x\\), the quotient rule of differentiation states:\n\\[\\frac{dy}{dx} = \\frac{v \\cdot \\frac{du}{dx} - u \\cdot \\frac{dv}{dx}}{v^2}\\]\nIn our case, we’re looking at the function \\(y = \\frac{a}{b}\\), where \\(a\\) and \\(b\\) are tensors. We want to find the derivative with respect to \\(b\\) (instead of \\(x\\) in our general formula). So we have:\n\\[\\frac{dy}{db} = \\frac{b \\cdot \\frac{da}{db} - a \\cdot \\frac{db}{db}}{b^2}\\]\nSince \\(a\\) does not depend on \\(b\\), \\(\\frac{da}{db} = 0\\), and since any variable is equal to itself, \\(\\frac{db}{db} = 1\\).\nSo the derivative \\(\\frac{dy}{db}\\) simplifies to:\n\\[\\frac{dy}{db} = \\frac{b \\cdot 0 - a \\cdot 1}{b^2}\\]\nTherefore, the derivative of \\(y\\) with respect to \\(b\\) is \\(-\\frac{a}{b^2}\\).\nTherefore, the gradient of \\(f(a, b)\\) with respect to \\(a\\) is \\(\\frac{1}{b}\\), and the gradient of \\(f(a, b)\\) with respect to \\(b\\) is \\(- \\frac{a}{b^{2}}\\).\n\nclass EWiseDiv(TensorOp):\n \"\"\"\n The EWiseDiv operation divides two tensors element-wise.\n\n Example:\n >>> import numpy as np\n >>> a = Tensor(np.array([1, 2, 3]))\n >>> b = Tensor(np.array([4, 5, 6]))\n >>> div = EWiseDiv()\n >>> result = div.compute(a.data, b.data)\n >>> print(result)\n array([0.25, 0.4, 0.5])\n\n \"\"\"\n\n def compute(self, a: NDArray, b: NDArray) -> NDArray:\n \"\"\"\n Computes the element-wise division of two tensors.\n\n Args:\n a (NDArray): The dividend tensor.\n b (NDArray): The divisor tensor.\n\n Returns:\n NDArray: The resulting tensor after element-wise division.\n \"\"\"\n return a / b\n\n def gradient(self, out_grad: Tensor, node: Tensor) -> Tuple[Tensor, Tensor]:\n \"\"\"\n Computes the gradient of the element-wise division operation.\n\n Args:\n out_grad (Tensor): The gradient of the output tensor.\n node (Tensor): The node in the computational graph where the operation was performed.\n\n Returns:\n Tuple[Tensor, Tensor]: The gradients with respect to the dividend and divisor tensors.\n \"\"\"\n a, b = node.inputs\n return divide(out_grad, b), out_grad * negate(divide(a, power_scalar(b, 2)))\n\n\ndef divide(a: Tensor, b: Tensor) -> Tensor:\n \"\"\"\n Divides two tensors element-wise.\n\n Args:\n a (Tensor): The dividend tensor.\n b (Tensor): The divisor tensor.\n\n Returns:\n Tensor: The resulting tensor after element-wise division.\n\n Example:\n >>> import numpy as np\n >>> a = Tensor(np.array([1, 2, 3]))\n >>> b = Tensor(np.array([4, 5, 6]))\n >>> result = divide(a, b)\n >>> print(result)\n Tensor([0.25, 0.4, 0.5])\n \"\"\"\n return EWiseDiv()(a, b)" + }, + { + "objectID": "operators.html#divide-scalar", + "href": "operators.html#divide-scalar", + "title": "operators", + "section": "Divide Scalar", + "text": "Divide Scalar\nLet’s denote the scalar as c, and a as the tensor being divided by the scalar. 
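As a numerical preview of the derivation below: dividing by a fixed scalar c is linear in a, so the derivative should come out to exactly 1/c. A standalone NumPy sketch with arbitrary values.

```python
import numpy as np

a, c, eps = 3.0, 2.0, 1e-6
finite_diff = ((a + eps) / c - (a - eps) / c) / (2 * eps)
print(finite_diff, 1 / c)  # both 0.5
```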
The operation can be described as f(a) = a / c.\nThe function for the backward pass (i.e., the gradient) is df/da = 1/c.\nThis is the derivative of f(a) with respect to a.\nWe are given a function \\(f(a) = \\frac{a}{c}\\), where \\(a\\) is a tensor and \\(c\\) is a scalar. Our task is to find the derivative of this function with respect to \\(a\\).\nBy using the power rule of differentiation, where the derivative of \\(a^n\\) is \\(n \\cdot a^{n-1}\\), we can rewrite \\(f(a)\\) as \\(f(a) = c^{-1}a\\).\nNow, we can differentiate this with respect to \\(a\\):\n\\[\\begin{align*}\n\\frac{df}{da} &= \\frac{d}{da} (c^{-1}a) \\\\\n&= c^{-1} \\frac{d}{da} (a) \\\\\n&= c^{-1} \\\\\n&= \\frac{1}{c}\n\\end{align*}\\]\nTherefore, the gradient of \\(f(a)\\) with respect to \\(a\\) is \\(\\frac{1}{c}\\).\n\nclass DivScalar(TensorOp):\n \"\"\"\n The DivScalar operation divides a tensor by a scalar.\n\n Example:\n >>> import numpy as np\n >>> a = Tensor(np.array([1, 2, 3]))\n >>> scalar = 2\n >>> div_scalar = DivScalar(scalar)\n >>> result = div_scalar.compute(a.data)\n >>> print(result)\n array([0.5, 1.0, 1.5])\n\n \"\"\"\n\n def __init__(self, scalar: Union[int, float]):\n \"\"\"\n Initialize the DivScalar operation with the scalar to divide by.\n\n Args:\n scalar (int, float): The scalar to divide the tensor by.\n \"\"\"\n self.scalar = scalar\n\n def compute(self, a: NDArray) -> NDArray:\n \"\"\"\n Divides the tensor by the scalar.\n\n Args:\n a (NDArray): The tensor to divide.\n\n Returns:\n NDArray: The resulting tensor after division.\n \"\"\"\n return a / self.scalar\n\n def gradient(self, out_grad: Tensor, node: Tensor) -> Tuple[Tensor, ...]:\n \"\"\"\n Computes the gradient of the division operation.\n\n Args:\n out_grad (Tensor): The gradient of the output tensor.\n node (Tensor): The node in the computational graph where the operation was performed.\n\n Returns:\n Tuple[Tensor, ...]: The gradient with respect to the tensor.\n \"\"\"\n return (out_grad / self.scalar, )\n\ndef divide_scalar(a: Tensor, scalar: Union[int, float]) -> Tensor:\n \"\"\"\n Divides a tensor by a scalar.\n\n Args:\n a (Tensor): The tensor to divide.\n scalar (int, float): The scalar to divide the tensor by.\n\n Returns:\n Tensor: The resulting tensor after division.\n\n Example:\n >>> import numpy as np\n >>> a = Tensor(np.array([1, 2, 3]))\n >>> scalar = 2\n >>> result = divide_scalar(a, scalar)\n >>> print(result)\n Tensor([0.5, 1.0, 1.5])\n \"\"\"\n return DivScalar(scalar)(a)\n\n\nimport nbdev; nbdev.nbdev_export()" }, { "objectID": "autograd.html", @@ -32,7 +102,7 @@ "href": "autograd.html#manual-gradient", "title": "autograd", "section": "Manual gradient", - "text": "Manual gradient\n\nbase case (L grad)\n\ndef lol():\n h = 0.001\n \n a = Value(2.0, label='a')\n b = Value(-3.0, label='b')\n c = Value(10.0, label='c')\n e = a*b; e.label='e'\n d = e + c; d.label='d'\n f = Value(-2.0, label='f')\n L = d*f; L.label='L'\n \n L1 = L.data\n \n a = Value(2.0, label='a')\n b = Value(-3.0, label='b')\n c = Value(10.0, label='c')\n e = a*b; e.label='e'\n d = e + c; d.label='d'\n f = Value(-2.0, label='f')\n L = d*f; L.label='L' \n \n L2 = L.data + h\n \n print(f'grad: {(L2 - L1) / h}')\n\nlol()\n\nsure enough it’s 1\n\nL.grad = 1\n\n\nf\nHere is a generic version of lol\n\ndef lol(label):\n def foo(v, label):\n if v.label == label: v.data += h\n \n h = 0.001\n \n a = Value(2.0, label='a')\n b = Value(-3.0, label='b')\n c = Value(10.0, label='c')\n e = a*b; e.label='e'\n d = e + c; d.label='d'\n f = Value(-2.0, label='f')\n 
L = d*f; L.label='L'\n \n L1 = L.data\n \n a = Value(2.0, label='a'); foo(a, label)\n b = Value(-3.0, label='b'); foo(b, label)\n c = Value(10.0, label='c'); foo(c, label)\n e = a*b; e.label='e'; foo(e, label)\n d = e + c; d.label='d'; foo(d, label)\n f = Value(-2.0, label='f'); foo(f, label)\n L = d*f; L.label='L'; foo(L, label) \n \n L2 = L.data\n \n print(f'grad: {(L2 - L1) / h}')\n\nlol('f')\n\n\nf.grad = 4\n\n\nlol('d')\n\n\nd.grad = -2\n\nLet’s draw what we have up to this point\n\ndraw_dot(L)\n\nSure, here’s the step by step derivation for each of the variables:\n\nWith respect to a:\n\nGiven that L = (a*b + c) * f, we will apply the product rule for differentiation.\nThe derivative of a*b with respect to a is b, and the derivative of c with respect to a is 0. Therefore:\n\\[\n\\frac{dL}{da} = f \\cdot \\frac{d(a*b + c)}{da} = f \\cdot (b + 0) = b \\cdot f\n\\]\n\nWith respect to b:\n\nThe derivative of a*b with respect to b is a, and the derivative of c with respect to b is 0. Therefore:\n\\[\n\\frac{dL}{db} = f \\cdot \\frac{d(a*b + c)}{db} = f \\cdot (a + 0) = a \\cdot f\n\\]\n\nWith respect to c:\n\nThe derivative of a*b with respect to c is 0, and the derivative of c with respect to c is 1. Therefore:\n\\[\n\\frac{dL}{dc} = f \\cdot \\frac{d(a*b + c)}{dc} = f \\cdot (0 + 1) = f\n\\]\n\nWith respect to f:\n\nThe derivative of (a*b + c) with respect to f is 0, and f is just f, therefore:\n\\[\n\\frac{dL}{df} = (a*b + c) \\cdot \\frac{df}{df} = a*b + c\n\\]\n\nWith respect to e (where e = a*b):\n\nThe derivative of e + c with respect to e is 1. Therefore:\n\\[\n\\frac{dL}{de} = f \\cdot \\frac{d(e + c)}{de} = f \\cdot 1 = f\n\\]\n\nWith respect to d (where d = e + c):\n\nThe derivative of d with respect to d is 1. Therefore:\n\\[\n\\frac{dL}{dd} = f \\cdot \\frac{df}{df} = f\n\\]\n\nlol('e')\n\n\ne.grad = -2 # 1 * d.grad\n\n\nlol('c')\n\n\nc.grad = -2 # 1 * d.grad\n\n\ndraw_dot(L)\n\n\nlol('a')\n\n\na.grad = 6 # b * e.grad\n\n\nlol('b')\n\n\nb.grad = -4 # a * e.grad\n\n\ndraw_dot(L)\n\n\nsource\n\n\n\nValue\n\n Value (data, _children=(), _op='', label='')\n\nRepresents a node within a computational graph.\nThis class encapsulates a single value and its relationships in the graph, making it easy to track and manage the value’s dependencies, the operation that produced it, and whether it requires a gradient for backpropagation. 
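The hand-computed input gradients above (a.grad = 6, b.grad = -4, c.grad = -2, f.grad = 4) can also be cross-checked without the Value class at all; this minimal sketch nudges each input of L = (a*b + c) * f by h, in the same spirit as the lol helper.

```python
def L(a, b, c, f):
    return (a * b + c) * f

base = dict(a=2.0, b=-3.0, c=10.0, f=-2.0)
h = 1e-6
for name in base:
    bumped = {**base, name: base[name] + h}  # nudge one input at a time
    print(name, round((L(**bumped) - L(**base)) / h, 4))
# a 6.0, b -4.0, c -2.0, f 4.0 -- matching the chain-rule results above
```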
It’s central to the functioning of automatic differentiation within deep learning frameworks.\nAttributes: op (Operator) _prev (Set[‘Value’]) cached_data (NDArray) requires_grad (bool)\n\na = Value(2.0, label='a')\nb = Value(-3.0, label='b')\nc = Value(10.0, label='c')\ne = a*b; e.label='e'\nd = e + c; d.label='d'\nf = Value(-2.0, label='f')\nL = d*f; L.label='L' \n\ndraw_dot(L)\n\n\nL.grad = 1\n\n\nL._backward()\n\n\ndraw_dot(L)\n\n\nd._backward()\n\n\ndraw_dot(L)\n\n\nc._backward()\n\nWe expect that nothing will happen\n\ndraw_dot(L)\n\n\ne._backward()\n\n\ndraw_dot(L)\n\nsure enough, exactly as we did before\nWe can do thid process automatically using topo sort algorithms, which’s will give us the correct order on which to call _backward on\n\na = Value(2.0, label='a')\nb = Value(-3.0, label='b')\nc = Value(10.0, label='c')\ne = a*b; e.label='e'\nd = e + c; d.label='d'\nf = Value(-2.0, label='f')\nL = d*f; L.label='L' \n\ndraw_dot(L)\n\n\n# topological order all of the children in the graph\ntopo = []\nvisited = set()\ndef build_topo(v):\n if v not in visited:\n visited.add(v)\n for child in v._prev:\n build_topo(child)\n topo.append(v)\n\nbuild_topo(L)\n\n\ntopo\n\n\n# go one variable at a time and apply the chain rule to get its gradient\nL.grad = 1\nfor v in reversed(topo):\n v._backward()\n\n\ndraw_dot(L)\n\nSo let’s now update the Value class with this logic\n\nsource\n\n\nValue\n\n Value (data, _children=(), _op='', label='')\n\nRepresents a node within a computational graph.\nThis class encapsulates a single value and its relationships in the graph, making it easy to track and manage the value’s dependencies, the operation that produced it, and whether it requires a gradient for backpropagation. It’s central to the functioning of automatic differentiation within deep learning frameworks.\nAttributes: op (Operator) _prev (Set[‘Value’]) cached_data (NDArray) requires_grad (bool)\n\na = Value(2.0, label='a')\nb = Value(-3.0, label='b')\nc = Value(10.0, label='c')\ne = a*b; e.label='e'\nd = e + c; d.label='d'\nf = Value(-2.0, label='f')\nL = d*f; L.label='L' \n\ndraw_dot(L)\n\n\nL.backward()\n\n\ndraw_dot(L)\n\n\nsource\n\n\nValue\n\n Value (data, children=(), op='', label='')\n\nA class representing a scalar value and its gradient in a computational graph.\nAttributes: - data (float): the scalar value associated with this node - grad (float): the gradient of the output of the computational graph w.r.t. this node’s value - label (str): a label for this node, used for debugging and visualization purposes - _op (str): a string representation of the operation that produced this node in the computational graph - _prev (set of Value objects): the set of nodes that contributed to the computation of this node - _backward (function): a function that computes the gradients of this node w.r.t. 
its inputs\nMethods: - init(self, data, children=(), op=’‘, label=’’): Initializes a Value object with the given data, children, op, and label - repr(self): Returns a string representation of this Value object - add(self, other): Implements the addition operation between two Value objects - mul(self, other): Implements the multiplication operation between two Value objects - item(self): Returns the scalar value associated with this Value object - tanh(self): Applies the hyperbolic tangent function to this Value object and returns a new Value object\n\nsource\n\n\nall_devices\n\n all_devices ()\n\nreturn a list of all available devices\n\nsource\n\n\ncpu\n\n cpu ()\n\nReturn cpu device\n\nsource\n\n\nCPUDevice\n\n CPUDevice ()\n\nRepresents data that sits in CPU\n\nsource\n\n\nDevice\n\n Device ()\n\nIndicates the device supporting an NDArray.\n\nsource\n\n\nOperator\n\n Operator ()\n\nInitialize self. See help(type(self)) for accurate signature.\n\nsource\n\n\nTensorOp\n\n TensorOp ()\n\nOp class specialized to output tensors, will be alternate subclasses for other structures\n#| export\nclass Value:\n \"\"\"\n Represents a node within a computational graph.\n\n This class encapsulates a single value and its relationships in the graph, making it easy to track and manage the value's dependencies, \n the operation that produced it, and whether it requires a gradient for backpropagation. It's central to the functioning of automatic \n differentiation within deep learning frameworks.\n\n Attributes:\n op (Operator)\n _prev (Set['Value']) \n cached_data (NDArray)\n requires_grad (bool)\n \"\"\"\n def __init__(self,\n op: Operator, # The operator that produced this node. If the node was initialized from actual data, this is 'None'.\n prev: Set['Value'], # The set of values that this value directly depends on. It's the union of the `_next` sets of all the values in `args`.\n cached_data: NDArray, # The actual data for this value. It's `None` for values that aren't yet computed.\n requires_grad: bool): # Specifies whether this node requires a gradient. This is `False` for nodes that don't need gradients.\n \n self._op = op\n self._prev = op\n self.cached_data = cached_data\n self.requires_grad = requires_grad\n\nsource\n\n\nTensor\n\n Tensor (array, device:Optional[__main__.Device]=None, dtype=None,\n requires_grad=True, **kwargs)\n\nA Tensor represents a multidimensional array of values in a computational graph.\nAttributes: - data: The actual data of the tensor. It is computed lazily. - children: Other tensors that this tensor depends on for computing its value. - requires_grad: Whether this tensor needs to compute gradients.\nMethods: - realize_data: Computes and returns the actual data for this tensor. - shape: Returns the shape of this tensor. 
- dtype: Returns the data type of this tensor.\nExample: >>> t1 = Tensor([[1.0, 2.0], [3.0, 4.0]]) >>> print(t1.shape) (2, 2) >>> print(t1.dtype) float64" + "text": "Manual gradient\n\nbase case (L grad)\n\ndef lol():\n h = 0.001\n \n a = Value(2.0, label='a')\n b = Value(-3.0, label='b')\n c = Value(10.0, label='c')\n e = a*b; e.label='e'\n d = e + c; d.label='d'\n f = Value(-2.0, label='f')\n L = d*f; L.label='L'\n \n L1 = L.data\n \n a = Value(2.0, label='a')\n b = Value(-3.0, label='b')\n c = Value(10.0, label='c')\n e = a*b; e.label='e'\n d = e + c; d.label='d'\n f = Value(-2.0, label='f')\n L = d*f; L.label='L' \n \n L2 = L.data + h\n \n print(f'grad: {(L2 - L1) / h}')\n\nlol()\n\nsure enough it’s 1\n\nL.grad = 1\n\n\nf\nHere is a generic version of lol\n\ndef lol(label):\n def foo(v, label):\n if v.label == label: v.data += h\n \n h = 0.001\n \n a = Value(2.0, label='a')\n b = Value(-3.0, label='b')\n c = Value(10.0, label='c')\n e = a*b; e.label='e'\n d = e + c; d.label='d'\n f = Value(-2.0, label='f')\n L = d*f; L.label='L'\n \n L1 = L.data\n \n a = Value(2.0, label='a'); foo(a, label)\n b = Value(-3.0, label='b'); foo(b, label)\n c = Value(10.0, label='c'); foo(c, label)\n e = a*b; e.label='e'; foo(e, label)\n d = e + c; d.label='d'; foo(d, label)\n f = Value(-2.0, label='f'); foo(f, label)\n L = d*f; L.label='L'; foo(L, label) \n \n L2 = L.data\n \n print(f'grad: {(L2 - L1) / h}')\n\nlol('f')\n\n\nf.grad = 4\n\n\nlol('d')\n\n\nd.grad = -2\n\nLet’s draw what we have up to this point\n\ndraw_dot(L)\n\nSure, here’s the step by step derivation for each of the variables:\n\nWith respect to a:\n\nGiven that L = (a*b + c) * f, we will apply the product rule for differentiation.\nThe derivative of a*b with respect to a is b, and the derivative of c with respect to a is 0. Therefore:\n\\[\n\\frac{dL}{da} = f \\cdot \\frac{d(a*b + c)}{da} = f \\cdot (b + 0) = b \\cdot f\n\\]\n\nWith respect to b:\n\nThe derivative of a*b with respect to b is a, and the derivative of c with respect to b is 0. Therefore:\n\\[\n\\frac{dL}{db} = f \\cdot \\frac{d(a*b + c)}{db} = f \\cdot (a + 0) = a \\cdot f\n\\]\n\nWith respect to c:\n\nThe derivative of a*b with respect to c is 0, and the derivative of c with respect to c is 1. Therefore:\n\\[\n\\frac{dL}{dc} = f \\cdot \\frac{d(a*b + c)}{dc} = f \\cdot (0 + 1) = f\n\\]\n\nWith respect to f:\n\nThe derivative of (a*b + c) with respect to f is 0, and f is just f, therefore:\n\\[\n\\frac{dL}{df} = (a*b + c) \\cdot \\frac{df}{df} = a*b + c\n\\]\n\nWith respect to e (where e = a*b):\n\nThe derivative of e + c with respect to e is 1. Therefore:\n\\[\n\\frac{dL}{de} = f \\cdot \\frac{d(e + c)}{de} = f \\cdot 1 = f\n\\]\n\nWith respect to d (where d = e + c):\n\nThe derivative of d with respect to d is 1. Therefore:\n\\[\n\\frac{dL}{dd} = f \\cdot \\frac{df}{df} = f\n\\]\n\nlol('e')\n\n\ne.grad = -2 # 1 * d.grad\n\n\nlol('c')\n\n\nc.grad = -2 # 1 * d.grad\n\n\ndraw_dot(L)\n\n\nlol('a')\n\n\na.grad = 6 # b * e.grad\n\n\nlol('b')\n\n\nb.grad = -4 # a * e.grad\n\n\ndraw_dot(L)\n\n\nsource\n\n\n\nValue\n\n Value (data, _children=(), _op='', label='')\n\nRepresents a node within a computational graph.\nThis class encapsulates a single value and its relationships in the graph, making it easy to track and manage the value’s dependencies, the operation that produced it, and whether it requires a gradient for backpropagation. 
It’s central to the functioning of automatic differentiation within deep learning frameworks.\nAttributes: op (Operator) _prev (Set[‘Value’]) cached_data (NDArray) requires_grad (bool)\n\na = Value(2.0, label='a')\nb = Value(-3.0, label='b')\nc = Value(10.0, label='c')\ne = a*b; e.label='e'\nd = e + c; d.label='d'\nf = Value(-2.0, label='f')\nL = d*f; L.label='L' \n\ndraw_dot(L)\n\n\nL.grad = 1\n\n\nL._backward()\n\n\ndraw_dot(L)\n\n\nd._backward()\n\n\ndraw_dot(L)\n\n\nc._backward()\n\nWe expect that nothing will happen\n\ndraw_dot(L)\n\n\ne._backward()\n\n\ndraw_dot(L)\n\nsure enough, exactly as we did before\nWe can do thid process automatically using topo sort algorithms, which’s will give us the correct order on which to call _backward on\n\na = Value(2.0, label='a')\nb = Value(-3.0, label='b')\nc = Value(10.0, label='c')\ne = a*b; e.label='e'\nd = e + c; d.label='d'\nf = Value(-2.0, label='f')\nL = d*f; L.label='L' \n\ndraw_dot(L)\n\n\n# topological order all of the children in the graph\ntopo = []\nvisited = set()\ndef build_topo(v):\n if v not in visited:\n visited.add(v)\n for child in v._prev:\n build_topo(child)\n topo.append(v)\n\nbuild_topo(L)\n\n\ntopo\n\n\n# go one variable at a time and apply the chain rule to get its gradient\nL.grad = 1\nfor v in reversed(topo):\n v._backward()\n\n\ndraw_dot(L)\n\nSo let’s now update the Value class with this logic\n\nsource\n\n\nValue\n\n Value (data, _children=(), _op='', label='')\n\nRepresents a node within a computational graph.\nThis class encapsulates a single value and its relationships in the graph, making it easy to track and manage the value’s dependencies, the operation that produced it, and whether it requires a gradient for backpropagation. It’s central to the functioning of automatic differentiation within deep learning frameworks.\nAttributes: op (Operator) _prev (Set[‘Value’]) cached_data (NDArray) requires_grad (bool)\n\na = Value(2.0, label='a')\nb = Value(-3.0, label='b')\nc = Value(10.0, label='c')\ne = a*b; e.label='e'\nd = e + c; d.label='d'\nf = Value(-2.0, label='f')\nL = d*f; L.label='L' \n\ndraw_dot(L)\n\n\nL.backward()\n\n\ndraw_dot(L)\n\n\nsource\n\n\nValue\n\n Value (data, children=(), op='', label='')\n\nA class representing a scalar value and its gradient in a computational graph.\nAttributes: - data (float): the scalar value associated with this node - grad (float): the gradient of the output of the computational graph w.r.t. this node’s value - label (str): a label for this node, used for debugging and visualization purposes - _op (str): a string representation of the operation that produced this node in the computational graph - _prev (set of Value objects): the set of nodes that contributed to the computation of this node - _backward (function): a function that computes the gradients of this node w.r.t. 
its inputs\nMethods: - init(self, data, children=(), op=’‘, label=’’): Initializes a Value object with the given data, children, op, and label - repr(self): Returns a string representation of this Value object - add(self, other): Implements the addition operation between two Value objects - mul(self, other): Implements the multiplication operation between two Value objects - item(self): Returns the scalar value associated with this Value object - tanh(self): Applies the hyperbolic tangent function to this Value object and returns a new Value object\n\nsource\n\n\nall_devices\n\n all_devices ()\n\nreturn a list of all available devices\n\nsource\n\n\ncpu\n\n cpu ()\n\nReturn cpu device\n\nsource\n\n\nCPUDevice\n\n CPUDevice ()\n\nRepresents data that sits in CPU\n\nsource\n\n\nDevice\n\n Device ()\n\nIndicates the device supporting an NDArray.\n\nsource\n\n\nOperator\n\n Operator ()\n\nInitialize self. See help(type(self)) for accurate signature.\n\nsource\n\n\nTensorOp\n\n TensorOp ()\n\nOp class specialized to output tensors, will be alternate subclasses for other structures\n#| export\nclass Value:\n \"\"\"\n Represents a node within a computational graph.\n\n This class encapsulates a single value and its relationships in the graph, making it easy to track and manage the value's dependencies, \n the operation that produced it, and whether it requires a gradient for backpropagation. It's central to the functioning of automatic \n differentiation within deep learning frameworks.\n\n Attributes:\n op (Operator)\n _prev (Set['Value']) \n cached_data (NDArray)\n requires_grad (bool)\n \"\"\"\n def __init__(self,\n op: Operator, # The operator that produced this node. If the node was initialized from actual data, this is 'None'.\n prev: Set['Value'], # The set of values that this value directly depends on. It's the union of the `_next` sets of all the values in `args`.\n cached_data: NDArray, # The actual data for this value. It's `None` for values that aren't yet computed.\n requires_grad: bool): # Specifies whether this node requires a gradient. This is `False` for nodes that don't need gradients.\n \n self._op = op\n self._prev = op\n self.cached_data = cached_data\n self.requires_grad = requires_grad\n\nsource\n\n\nTensor\n\n Tensor (array, device:Optional[__main__.Device]=None, dtype=None,\n requires_grad=True, **kwargs)\n\nA Tensor represents a multidimensional array of values in a computational graph.\nAttributes: - data: The actual data of the tensor. It is computed lazily. - children: Other tensors that this tensor depends on for computing its value. - requires_grad: Whether this tensor needs to compute gradients.\nMethods: - realize_data: Computes and returns the actual data for this tensor. - shape: Returns the shape of this tensor. 
- dtype: Returns the data type of this tensor.\nExample: >>> t1 = Tensor([[1.0, 2.0], [3.0, 4.0]]) >>> print(t1.shape) (2, 2) >>> print(t1.dtype) float64\n\nimport numpy as np\nimport unittest\nfrom minima.autograd import Tensor\n\nclass TestTensor(unittest.TestCase):\n \n def test_create_tensor(self):\n t1 = Tensor([1, 2, 3])\n self.assertTrue(np.array_equal(t1.realize_data(), np.array([1, 2, 3])))\n self.assertEqual(t1.shape, (3,))\n self.assertEqual(t1.dtype, np.float64)\n \n t2 = Tensor([[1, 2], [3, 4]])\n self.assertTrue(np.array_equal(t2.realize_data(), np.array([[1, 2], [3, 4]])))\n self.assertEqual(t2.shape, (2, 2))\n self.assertEqual(t2.dtype, np.float64)\n \n t3 = Tensor(np.array([1, 2, 3]), dtype=np.int32)\n self.assertTrue(np.array_equal(t3.realize_data(), np.array([1, 2, 3], dtype=np.int32)))\n self.assertEqual(t3.shape, (3,))\n self.assertEqual(t3.dtype, np.int32)\n \n def test_create_tensor_from_tensor(self):\n t1 = Tensor([1, 2, 3])\n t2 = Tensor(t1)\n self.assertTrue(np.array_equal(t2.realize_data(), np.array([1, 2, 3])))\n self.assertEqual(t2.shape, (3,))\n self.assertEqual(t2.dtype, np.float64)\n \n t3 = Tensor(np.array([1, 2, 3]), dtype=np.int32)\n t4 = Tensor(t3)\n self.assertTrue(np.array_equal(t4.realize_data(), np.array([1, 2, 3], dtype=np.int32)))\n self.assertEqual(t4.shape, (3,))\n self.assertEqual(t4.dtype, np.int32)\n \n def test_create_tensor_with_device(self):\n t1 = Tensor([1, 2, 3], device='cpu')\n self.assertEqual(t1.device, 'cpu')\n \n t2 = Tensor([1, 2, 3], device='cuda')\n self.assertEqual(t2.device, 'cuda')\n \n def test_create_tensor_with_requires_grad(self):\n t1 = Tensor([1, 2, 3], requires_grad=True)\n self.assertTrue(t1.requires_grad)\n \n t2 = Tensor([1, 2, 3], requires_grad=False)\n self.assertFalse(t2.requires_grad)\n \n def test_create_tensor_with_kwargs(self):\n t1 = Tensor([1, 2, 3], device='cuda', dtype=np.float32, requires_grad=True)\n self.assertEqual(t1.device, 'cuda')\n self.assertEqual(t1.dtype, np.float32)\n self.assertTrue(t1.requires_grad)\n \n def test_create_tensor_from_numpy(self):\n np_array = np.array([1, 2, 3])\n t1 = Tensor(np_array)\n self.assertTrue(np.array_equal(t1.realize_data(), np_array))\n self.assertEqual(t1.shape, (3,))\n self.assertEqual(t1.dtype, np.float64)\n \n np_array = np.array([1, 2, 3], dtype=np.int32)\n t2 = Tensor(np_array)\n self.assertTrue(np.array_equal(t2.realize_data(), np_array))\n self.assertEqual(t2.shape, (3,))\n self.assertEqual(t2.dtype, np.int32)\n \n def test_create_tensor_from_numpy_with_device(self):\n np_array = np.array([1, 2, 3])\n t1 = Tensor(np_array, device='cuda')\n self.assertEqual(t1.device, 'cuda')\n \n np_array = np.array([1, 2, 3], dtype=np.int32)\n t2 = Tensor(np_array, device='cuda')\n self.assertEqual(t2.device, 'cuda')\n \n def test_create_tensor_from_numpy_with_requires_grad(self):\n np_array = np.array([1, 2, 3])\n t1 = Tensor(np_array, requires_grad=True)\n self.assertTrue(t1.requires_grad)\n \n np_array = np.array([1, 2, 3], dtype=np.int32)\n t2 = Tensor(np_array, requires_grad=False)\n self.assertFalse(t2.requires_grad)\n \n def test_create_tensor_from_numpy_with_kwargs(self):\n np_array = np.array([1, 2, 3])\n t1 = Tensor(np_array, device='cuda', dtype=np.float32, requires_grad=True)\n self.assertEqual(t1.device, 'cuda')\n self.assertEqual(t1.dtype, np.float32)\n self.assertTrue(t1.requires_grad)\n \n def test_create_tensor_from_tensor_with_device(self):\n t1 = Tensor([1, 2, 3], device='cpu')\n t2 = Tensor(t1, device='cuda')\n self.assertEqual(t2.device, 'cuda')\n \n 
def test_create_tensor_from_tensor_with_requires_grad(self):\n t1 = Tensor([1, 2, 3], requires_grad=True)\n t2 = Tensor(t1, requires_grad=False)\n self.assertFalse(t2.requires_grad)\n \n def test_create_tensor_from_tensor_with_kwargs(self):\n t1 = Tensor([1, 2, 3], device='cpu', dtype=np.float32, requires_grad=True)\n t2 = Tensor(t1, device='cuda', dtype=np.float64, requires_grad=False)\n self.assertEqual(t2.device, 'cuda')\n self.assertEqual(t2.dtype, np.float64)\n self.assertFalse(t2.requires_grad)\n \n def test_create_tensor_from_tensor_with_different_device_and_dtype(self):\n t1 = Tensor([1, 2, 3], device='cpu', dtype=np.float32)\n t2 = Tensor(t1, device='cuda', dtype=np.float64)\n self.assertEqual(t2.device, 'cuda')\n self.assertEqual(t2.dtype, np.float64)\n self.assertTrue(np.array_equal(t2.realize_data(), np.array([1, 2, 3], dtype=np.float64)))\n \n def test_create_tensor_from_tensor_with_same_device_and_dtype(self):\n t1 = Tensor([1, 2, 3], device='cpu', dtype=np.float32)\n t2 = Tensor(t1, device='cpu', dtype=np.float32)\n self.assertEqual(t2.device, 'cpu')\n self.assertEqual(t2.dtype, np.float32)\n self.assertTrue(np.array_equal(t2.realize_data(), np.array([1, 2, 3], dtype=np.float32)))\n \n def test_create_tensor_from_tensor_with_same_device_and_dtype_and_requires_grad(self):\n t1 = Tensor([1, 2, 3], device='cpu', dtype=np.float32, requires_grad=True)\n t2 = Tensor(t1, device='cpu', dtype=np.float32, requires_grad=True)\n self.assertEqual(t2.device, 'cpu')\n self.assertEqual(t2.dtype, np.float32)\n self.assertTrue(np.array_equal(t2.realize_data(), np.array([1, 2, 3], dtype=np.float32)))\n self.assertTrue(t2.requires_grad)\n \n def test_create_tensor_from_tensor_with_same_device_and_dtype_and_requires_grad_false(self):\n t1 = Tensor([1, 2, 3], device='cpu', dtype=np.float32, requires_grad=False)\n t2 = Tensor(t1, device='cpu', dtype=np.float32, requires_grad=False)\n self.assertEqual(t2.device, 'cpu')\n self.assertEqual(t2.dtype, np.float32)\n self.assertTrue(np.array_equal(t2.realize_data(), np.array([1, 2, 3], dtype=np.float32)))\n self.assertFalse(t2.requires_grad)\n \n def test_create_tensor_from_tensor_with_same_device_and_dtype_and_requires_grad_true_false(self):\n t1 = Tensor([1, 2, 3], device='cpu', dtype=np.float32, requires_grad=True)\n t2 = Tensor(t1, device='cpu', dtype=np.float32, requires_grad=False)\n self.assertEqual(t2.device, 'cpu')\n self.assertEqual(t2.dtype, np.float32)\n self.assertTrue(np.array_equal(t2.realize_data(), np.array([1, 2, 3], dtype=np.float32)))\n self.assertFalse(t2.requires_grad)\n \n def test_create_tensor_from_tensor_with_same_device_and_dtype_and_requires_grad_false_true(self):\n t1 = Tensor([1, 2, 3], device='cpu', dtype=np.float32, requires_grad=False)\n t2 = Tensor(t1, device='cpu', dtype=np.float32, requires_grad=True)\n self.assertEqual(t2.device, 'cpu')\n self.assertEqual(t2.dtype, np.float32)\n self.assertTrue(np.array_equal(t2.realize_data(), np.array([1, 2, 3], dtype=np.float32)))\n self.assertTrue(t2.requires_grad)\n \n def test_create_tensor_from_tensor_with_different_device_and_dtype_and_requires_grad(self):\n t1 = Tensor([1, 2, 3], device='cpu', dtype=np.float32, requires_grad=True)\n t2 = Tensor(t1, device='cuda', dtype=np.float64, requires_grad=False)\n self.assertEqual(t2.device, 'cuda')\n self.assertEqual(t2.dtype, np.float64)\n self.assertTrue(np.array_equal(t2.realize_data(), np.array([1, 2, 3], dtype=np.float64)))\n self.assertFalse(t2.requires_grad)\n \n def 
test_create_tensor_from_tensor_with_different_device_and_dtype_and_requires_grad_false(self):\n t1 = Tensor([1, 2, 3], device='cpu', dtype=np.float32, requires_grad=False)\n t2 = Tensor(t1, device='cuda', dtype=np.float64, requires_grad=False)\n self.assertEqual(t2.device, 'cuda')\n self.assertEqual(t2.dtype, np.float64)\n self.assertTrue(np.array_equal(t2.realize_data(), np.array([1, 2, 3], dtype=np.float64)))\n self.assertFalse(t2.requires_grad)\n \n def test_create_tensor_from_tensor_with_different_device_and_dtype_and_requires_grad_true_false(self):\n t1 = Tensor([1, 2, 3], device='cpu', dtype=np.float32, requires_grad=True)\n t2 = Tensor(t1, device='cuda', dtype=np.float64, requires_grad=False)\n self.assertEqual(t2.device, 'cuda')\n self.assertEqual(t2.dtype, np.float64)\n self.assertTrue(np.array_equal(t2.realize_data(), np.array([1, 2, 3], dtype=np.float64)))\n self.assertFalse(t2.requires_grad)\n \n def test_create_tensor_from_tensor_with_different_device_and_dtype_and_requires_grad_false_true(self):\n t1 = Tensor([1, 2, 3], device='cpu', dtype=np.float32, requires_grad=False)\n t2 = Tensor(t1, device='cuda', dtype=np.float64, requires_grad=True)\n self.assertEqual(t2.device, 'cuda')\n self.assertEqual(t2.dtype, np.float64)\n self.assertTrue(np.array_equal(t2.realize_data(), np.array([1, 2, 3], dtype=np.float64)))\n self.assertTrue(t2.requires_grad)\n \n def test_create_tensor_from_tensor_with_same_device_and_dtype_and_requires_grad_true(self):\n t1 = Tensor([1, 2, 3], device='cpu', dtype=np.float32, requires_grad=True)\n t2 = Tensor(t1, device='cpu', dtype=np.float32, requires_grad=True)\n self.assertEqual(t2.device, 'cpu')\n self.assertEqual(t2.dtype, np.float32)\n self.assertTrue(np.array_equal(t2.realize_data(), np.array([1, 2, 3], dtype=np.float32)))\n self.assertTrue(t2.requires_grad)\n \n def test_create_tensor_from_tensor_with_same_device_and_dtype_and_requires_grad_false(self):\n t1 = Tensor([1, 2, 3], device='cpu', dtype=np.float32, requires_grad=False)\n t2 = Tensor(t1, device='cpu', dtype=np.float32, requires_grad=False)\n self.assertEqual(t2.device, 'cpu')\n self.assertEqual(t2.dtype, np.float32)\n self.assertTrue(np.array_equal(t2.realize_data(), np.array([1, 2, 3], dtype=np.float32)))\n self.assertFalse(t2.requires_grad)\n \n def test_create_tensor_from_tensor_with_same_device_and_dtype_and_requires_grad_true_false(self):\n t1 = Tensor([1, 2, 3], device='cpu', dtype=np.float32, requires_grad=True)\n t2 = Tensor(t1, device='cpu', dtype=np.float32, requires_grad=False)\n self.assertEqual(t2.device, 'cpu')\n self.assertEqual(t2.dtype, np.float32)\n self.assertTrue(np.array_equal(t2.realize_data(), np.array([1, 2, 3], dtype=np.float32)))\n self.assertFalse(t2.requires_grad)\n \n def test_create_tensor_from_tensor_with_same_device_and_dtype_and_requires_grad_false_true(self):\n t1 = Tensor([1, 2, 3], device='cpu', dtype=np.float32, requires_grad=False)\n t2 = Tensor(t1, device='cpu', dtype=np.float32, requires_grad=True)\n self.assertEqual(t2.device, 'cpu')\n self.assertEqual(t2.dtype, np.float32)\n self.assertTrue(np.array_equal(t2.realize_data(), np.array([1, 2, 3], dtype=np.float32)))\n self.assertTrue(t2.requires_grad)\n \n def test_create_tensor_from_tensor_with_different_device_and_dtype_and_requires_grad_true(self):\n t1 = Tensor([1, 2, 3], device='cpu', dtype=np.float32, requires_grad=True)\n t2 = Tensor(t1, device='cuda', dtype=np.float64, requires_grad=True)\n self.assertEqual(t2.device, 'cuda')\n self.assertEqual(t2.dtype, np.float64)\n 
self.assertTrue(np.array_equal(t2.realize_data(), np.array([1, 2, 3], dtype=np.float64)))\n self.assertTrue(t2.requires_grad)\n \n def test_create_tensor_from_tensor_with_different_device_and_dtype_and_requires_grad_false(self):\n t1 = Tensor([1, 2, 3], device='cpu', dtype=np.float32, requires_grad=False)\n t2 = Tensor(t1, device='cuda', dtype=np.float64, requires_grad=False)\n self.assertEqual(t2.device, 'cuda')\n self.assertEqual(t2.dtype, np.float64)\n self.assertTrue(np.array_equal(t2.realize_data(), np.array([1, 2, 3], dtype=np.float64)))\n self.assertFalse(t2.requires_grad)\n \n def test_create_tensor_from_tensor_with_different_device_and_dtype_and_requires_grad_true_false(self):\n t1 = Tensor([1, 2, 3], device='cpu', dtype=np.float32, requires_grad=True)\n t2 = Tensor(t1, device='cuda', dtype=np.float64, requires_grad=False)\n self.assertEqual(t2.device, 'cuda')\n self.assertEqual(t2.dtype, np.float64)\n self.assertTrue(np.array_equal(t2.realize_data(), np.array([1, 2, 3], dtype=np.float64)))\n self.assertFalse(t2.requires_grad)\n \n def test_create_tensor_from_tensor_with_different_device_and_dtype_and_requires_grad_false_true(self):\n t1 = Tensor([1, 2, 3], device='cpu', dtype=np.float32, requires_grad=False)\n t2 = Tensor(t1, device='cuda', dtype=np.float64, requires_grad=True)\n self.assertEqual(t2.device, 'cuda')\n self.assertEqual(t2.dtype, np.float64)\n self.assertTrue(np.array_equal(t2.realize_data(), np.array([1, 2, 3], dtype=np.float64)))\n self.assertTrue(t2.requires_grad)\n\n\nimport nbdev; nbdev.nbdev_export()" }, { "objectID": "index.html", diff --git a/_docs/sitemap.xml b/_docs/sitemap.xml index 1626f56..8db8d64 100644 --- a/_docs/sitemap.xml +++ b/_docs/sitemap.xml @@ -2,14 +2,14 @@ https://m0saan.github.io/minima/operators.html - 2023-06-02T03:54:01.371Z + 2023-06-02T08:10:27.560Z https://m0saan.github.io/minima/autograd.html - 2023-06-02T03:54:02.465Z + 2023-06-02T08:10:28.519Z https://m0saan.github.io/minima/index.html - 2023-06-02T03:54:03.293Z + 2023-06-02T08:10:29.219Z diff --git a/_proc/00_autograd.ipynb b/_proc/00_autograd.ipynb index f5645f2..6dd7d68 100644 --- a/_proc/00_autograd.ipynb +++ b/_proc/00_autograd.ipynb @@ -1161,7 +1161,7 @@ "text/markdown": [ "---\n", "\n", - "[source](https://github.com/m0saan/minima/blob/main/minima/autograd.py#L484){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", + "[source](https://github.com/m0saan/minima/blob/main/minima/autograd.py#L508){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", "\n", "### all_devices\n", "\n", @@ -1172,7 +1172,7 @@ "text/plain": [ "---\n", "\n", - "[source](https://github.com/m0saan/minima/blob/main/minima/autograd.py#L484){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", + "[source](https://github.com/m0saan/minima/blob/main/minima/autograd.py#L508){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", "\n", "### all_devices\n", "\n", @@ -1202,7 +1202,7 @@ "text/markdown": [ "---\n", "\n", - "[source](https://github.com/m0saan/minima/blob/main/minima/autograd.py#L480){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", + "[source](https://github.com/m0saan/minima/blob/main/minima/autograd.py#L504){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", "\n", "### cpu\n", "\n", @@ -1213,7 +1213,7 @@ "text/plain": [ "---\n", "\n", - "[source](https://github.com/m0saan/minima/blob/main/minima/autograd.py#L480){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", + 
"[source](https://github.com/m0saan/minima/blob/main/minima/autograd.py#L504){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", "\n", "### cpu\n", "\n", @@ -1243,7 +1243,7 @@ "text/markdown": [ "---\n", "\n", - "[source](https://github.com/m0saan/minima/blob/main/minima/autograd.py#L465){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", + "[source](https://github.com/m0saan/minima/blob/main/minima/autograd.py#L489){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", "\n", "### CPUDevice\n", "\n", @@ -1254,7 +1254,7 @@ "text/plain": [ "---\n", "\n", - "[source](https://github.com/m0saan/minima/blob/main/minima/autograd.py#L465){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", + "[source](https://github.com/m0saan/minima/blob/main/minima/autograd.py#L489){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", "\n", "### CPUDevice\n", "\n", @@ -1284,7 +1284,7 @@ "text/markdown": [ "---\n", "\n", - "[source](https://github.com/m0saan/minima/blob/main/minima/autograd.py#L461){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", + "[source](https://github.com/m0saan/minima/blob/main/minima/autograd.py#L485){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", "\n", "### Device\n", "\n", @@ -1295,7 +1295,7 @@ "text/plain": [ "---\n", "\n", - "[source](https://github.com/m0saan/minima/blob/main/minima/autograd.py#L461){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", + "[source](https://github.com/m0saan/minima/blob/main/minima/autograd.py#L485){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", "\n", "### Device\n", "\n", @@ -1325,7 +1325,7 @@ "text/markdown": [ "---\n", "\n", - "[source](https://github.com/m0saan/minima/blob/main/minima/autograd.py#L489){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", + "[source](https://github.com/m0saan/minima/blob/main/minima/autograd.py#L513){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", "\n", "### Operator\n", "\n", @@ -1336,7 +1336,7 @@ "text/plain": [ "---\n", "\n", - "[source](https://github.com/m0saan/minima/blob/main/minima/autograd.py#L489){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", + "[source](https://github.com/m0saan/minima/blob/main/minima/autograd.py#L513){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", "\n", "### Operator\n", "\n", @@ -1366,7 +1366,7 @@ "text/markdown": [ "---\n", "\n", - "[source](https://github.com/m0saan/minima/blob/main/minima/autograd.py#L501){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", + "[source](https://github.com/m0saan/minima/blob/main/minima/autograd.py#L525){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", "\n", "### TensorOp\n", "\n", @@ -1377,7 +1377,7 @@ "text/plain": [ "---\n", "\n", - "[source](https://github.com/m0saan/minima/blob/main/minima/autograd.py#L501){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", + "[source](https://github.com/m0saan/minima/blob/main/minima/autograd.py#L525){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", "\n", "### TensorOp\n", "\n", @@ -1440,7 +1440,7 @@ "text/markdown": [ "---\n", "\n", - "[source](https://github.com/m0saan/minima/blob/main/minima/autograd.py#L508){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", + "[source](https://github.com/m0saan/minima/blob/main/minima/autograd.py#L532){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", "\n", "### Tensor\n", "\n", @@ -1469,7 +1469,7 @@ "text/plain": [ 
"---\n", "\n", - "[source](https://github.com/m0saan/minima/blob/main/minima/autograd.py#L508){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", + "[source](https://github.com/m0saan/minima/blob/main/minima/autograd.py#L532){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", "\n", "### Tensor\n", "\n", @@ -1514,7 +1514,259 @@ "language": "python" }, "outputs": [], - "source": [] + "source": [ + "import numpy as np\n", + "import unittest\n", + "from minima.autograd import Tensor\n", + "\n", + "class TestTensor(unittest.TestCase):\n", + " \n", + " def test_create_tensor(self):\n", + " t1 = Tensor([1, 2, 3])\n", + " self.assertTrue(np.array_equal(t1.realize_data(), np.array([1, 2, 3])))\n", + " self.assertEqual(t1.shape, (3,))\n", + " self.assertEqual(t1.dtype, np.float64)\n", + " \n", + " t2 = Tensor([[1, 2], [3, 4]])\n", + " self.assertTrue(np.array_equal(t2.realize_data(), np.array([[1, 2], [3, 4]])))\n", + " self.assertEqual(t2.shape, (2, 2))\n", + " self.assertEqual(t2.dtype, np.float64)\n", + " \n", + " t3 = Tensor(np.array([1, 2, 3]), dtype=np.int32)\n", + " self.assertTrue(np.array_equal(t3.realize_data(), np.array([1, 2, 3], dtype=np.int32)))\n", + " self.assertEqual(t3.shape, (3,))\n", + " self.assertEqual(t3.dtype, np.int32)\n", + " \n", + " def test_create_tensor_from_tensor(self):\n", + " t1 = Tensor([1, 2, 3])\n", + " t2 = Tensor(t1)\n", + " self.assertTrue(np.array_equal(t2.realize_data(), np.array([1, 2, 3])))\n", + " self.assertEqual(t2.shape, (3,))\n", + " self.assertEqual(t2.dtype, np.float64)\n", + " \n", + " t3 = Tensor(np.array([1, 2, 3]), dtype=np.int32)\n", + " t4 = Tensor(t3)\n", + " self.assertTrue(np.array_equal(t4.realize_data(), np.array([1, 2, 3], dtype=np.int32)))\n", + " self.assertEqual(t4.shape, (3,))\n", + " self.assertEqual(t4.dtype, np.int32)\n", + " \n", + " def test_create_tensor_with_device(self):\n", + " t1 = Tensor([1, 2, 3], device='cpu')\n", + " self.assertEqual(t1.device, 'cpu')\n", + " \n", + " t2 = Tensor([1, 2, 3], device='cuda')\n", + " self.assertEqual(t2.device, 'cuda')\n", + " \n", + " def test_create_tensor_with_requires_grad(self):\n", + " t1 = Tensor([1, 2, 3], requires_grad=True)\n", + " self.assertTrue(t1.requires_grad)\n", + " \n", + " t2 = Tensor([1, 2, 3], requires_grad=False)\n", + " self.assertFalse(t2.requires_grad)\n", + " \n", + " def test_create_tensor_with_kwargs(self):\n", + " t1 = Tensor([1, 2, 3], device='cuda', dtype=np.float32, requires_grad=True)\n", + " self.assertEqual(t1.device, 'cuda')\n", + " self.assertEqual(t1.dtype, np.float32)\n", + " self.assertTrue(t1.requires_grad)\n", + " \n", + " def test_create_tensor_from_numpy(self):\n", + " np_array = np.array([1, 2, 3])\n", + " t1 = Tensor(np_array)\n", + " self.assertTrue(np.array_equal(t1.realize_data(), np_array))\n", + " self.assertEqual(t1.shape, (3,))\n", + " self.assertEqual(t1.dtype, np.float64)\n", + " \n", + " np_array = np.array([1, 2, 3], dtype=np.int32)\n", + " t2 = Tensor(np_array)\n", + " self.assertTrue(np.array_equal(t2.realize_data(), np_array))\n", + " self.assertEqual(t2.shape, (3,))\n", + " self.assertEqual(t2.dtype, np.int32)\n", + " \n", + " def test_create_tensor_from_numpy_with_device(self):\n", + " np_array = np.array([1, 2, 3])\n", + " t1 = Tensor(np_array, device='cuda')\n", + " self.assertEqual(t1.device, 'cuda')\n", + " \n", + " np_array = np.array([1, 2, 3], dtype=np.int32)\n", + " t2 = Tensor(np_array, device='cuda')\n", + " self.assertEqual(t2.device, 'cuda')\n", + " \n", + " def 
test_create_tensor_from_numpy_with_requires_grad(self):\n", + " np_array = np.array([1, 2, 3])\n", + " t1 = Tensor(np_array, requires_grad=True)\n", + " self.assertTrue(t1.requires_grad)\n", + " \n", + " np_array = np.array([1, 2, 3], dtype=np.int32)\n", + " t2 = Tensor(np_array, requires_grad=False)\n", + " self.assertFalse(t2.requires_grad)\n", + " \n", + " def test_create_tensor_from_numpy_with_kwargs(self):\n", + " np_array = np.array([1, 2, 3])\n", + " t1 = Tensor(np_array, device='cuda', dtype=np.float32, requires_grad=True)\n", + " self.assertEqual(t1.device, 'cuda')\n", + " self.assertEqual(t1.dtype, np.float32)\n", + " self.assertTrue(t1.requires_grad)\n", + " \n", + " def test_create_tensor_from_tensor_with_device(self):\n", + " t1 = Tensor([1, 2, 3], device='cpu')\n", + " t2 = Tensor(t1, device='cuda')\n", + " self.assertEqual(t2.device, 'cuda')\n", + " \n", + " def test_create_tensor_from_tensor_with_requires_grad(self):\n", + " t1 = Tensor([1, 2, 3], requires_grad=True)\n", + " t2 = Tensor(t1, requires_grad=False)\n", + " self.assertFalse(t2.requires_grad)\n", + " \n", + " def test_create_tensor_from_tensor_with_kwargs(self):\n", + " t1 = Tensor([1, 2, 3], device='cpu', dtype=np.float32, requires_grad=True)\n", + " t2 = Tensor(t1, device='cuda', dtype=np.float64, requires_grad=False)\n", + " self.assertEqual(t2.device, 'cuda')\n", + " self.assertEqual(t2.dtype, np.float64)\n", + " self.assertFalse(t2.requires_grad)\n", + " \n", + " def test_create_tensor_from_tensor_with_different_device_and_dtype(self):\n", + " t1 = Tensor([1, 2, 3], device='cpu', dtype=np.float32)\n", + " t2 = Tensor(t1, device='cuda', dtype=np.float64)\n", + " self.assertEqual(t2.device, 'cuda')\n", + " self.assertEqual(t2.dtype, np.float64)\n", + " self.assertTrue(np.array_equal(t2.realize_data(), np.array([1, 2, 3], dtype=np.float64)))\n", + " \n", + " def test_create_tensor_from_tensor_with_same_device_and_dtype(self):\n", + " t1 = Tensor([1, 2, 3], device='cpu', dtype=np.float32)\n", + " t2 = Tensor(t1, device='cpu', dtype=np.float32)\n", + " self.assertEqual(t2.device, 'cpu')\n", + " self.assertEqual(t2.dtype, np.float32)\n", + " self.assertTrue(np.array_equal(t2.realize_data(), np.array([1, 2, 3], dtype=np.float32)))\n", + " \n", + " def test_create_tensor_from_tensor_with_same_device_and_dtype_and_requires_grad(self):\n", + " t1 = Tensor([1, 2, 3], device='cpu', dtype=np.float32, requires_grad=True)\n", + " t2 = Tensor(t1, device='cpu', dtype=np.float32, requires_grad=True)\n", + " self.assertEqual(t2.device, 'cpu')\n", + " self.assertEqual(t2.dtype, np.float32)\n", + " self.assertTrue(np.array_equal(t2.realize_data(), np.array([1, 2, 3], dtype=np.float32)))\n", + " self.assertTrue(t2.requires_grad)\n", + " \n", + " def test_create_tensor_from_tensor_with_same_device_and_dtype_and_requires_grad_false(self):\n", + " t1 = Tensor([1, 2, 3], device='cpu', dtype=np.float32, requires_grad=False)\n", + " t2 = Tensor(t1, device='cpu', dtype=np.float32, requires_grad=False)\n", + " self.assertEqual(t2.device, 'cpu')\n", + " self.assertEqual(t2.dtype, np.float32)\n", + " self.assertTrue(np.array_equal(t2.realize_data(), np.array([1, 2, 3], dtype=np.float32)))\n", + " self.assertFalse(t2.requires_grad)\n", + " \n", + " def test_create_tensor_from_tensor_with_same_device_and_dtype_and_requires_grad_true_false(self):\n", + " t1 = Tensor([1, 2, 3], device='cpu', dtype=np.float32, requires_grad=True)\n", + " t2 = Tensor(t1, device='cpu', dtype=np.float32, requires_grad=False)\n", + " self.assertEqual(t2.device, 
'cpu')\n", + " self.assertEqual(t2.dtype, np.float32)\n", + " self.assertTrue(np.array_equal(t2.realize_data(), np.array([1, 2, 3], dtype=np.float32)))\n", + " self.assertFalse(t2.requires_grad)\n", + " \n", + " def test_create_tensor_from_tensor_with_same_device_and_dtype_and_requires_grad_false_true(self):\n", + " t1 = Tensor([1, 2, 3], device='cpu', dtype=np.float32, requires_grad=False)\n", + " t2 = Tensor(t1, device='cpu', dtype=np.float32, requires_grad=True)\n", + " self.assertEqual(t2.device, 'cpu')\n", + " self.assertEqual(t2.dtype, np.float32)\n", + " self.assertTrue(np.array_equal(t2.realize_data(), np.array([1, 2, 3], dtype=np.float32)))\n", + " self.assertTrue(t2.requires_grad)\n", + " \n", + " def test_create_tensor_from_tensor_with_different_device_and_dtype_and_requires_grad(self):\n", + " t1 = Tensor([1, 2, 3], device='cpu', dtype=np.float32, requires_grad=True)\n", + " t2 = Tensor(t1, device='cuda', dtype=np.float64, requires_grad=False)\n", + " self.assertEqual(t2.device, 'cuda')\n", + " self.assertEqual(t2.dtype, np.float64)\n", + " self.assertTrue(np.array_equal(t2.realize_data(), np.array([1, 2, 3], dtype=np.float64)))\n", + " self.assertFalse(t2.requires_grad)\n", + " \n", + " def test_create_tensor_from_tensor_with_different_device_and_dtype_and_requires_grad_false(self):\n", + " t1 = Tensor([1, 2, 3], device='cpu', dtype=np.float32, requires_grad=False)\n", + " t2 = Tensor(t1, device='cuda', dtype=np.float64, requires_grad=False)\n", + " self.assertEqual(t2.device, 'cuda')\n", + " self.assertEqual(t2.dtype, np.float64)\n", + " self.assertTrue(np.array_equal(t2.realize_data(), np.array([1, 2, 3], dtype=np.float64)))\n", + " self.assertFalse(t2.requires_grad)\n", + " \n", + " def test_create_tensor_from_tensor_with_different_device_and_dtype_and_requires_grad_true_false(self):\n", + " t1 = Tensor([1, 2, 3], device='cpu', dtype=np.float32, requires_grad=True)\n", + " t2 = Tensor(t1, device='cuda', dtype=np.float64, requires_grad=False)\n", + " self.assertEqual(t2.device, 'cuda')\n", + " self.assertEqual(t2.dtype, np.float64)\n", + " self.assertTrue(np.array_equal(t2.realize_data(), np.array([1, 2, 3], dtype=np.float64)))\n", + " self.assertFalse(t2.requires_grad)\n", + " \n", + " def test_create_tensor_from_tensor_with_different_device_and_dtype_and_requires_grad_false_true(self):\n", + " t1 = Tensor([1, 2, 3], device='cpu', dtype=np.float32, requires_grad=False)\n", + " t2 = Tensor(t1, device='cuda', dtype=np.float64, requires_grad=True)\n", + " self.assertEqual(t2.device, 'cuda')\n", + " self.assertEqual(t2.dtype, np.float64)\n", + " self.assertTrue(np.array_equal(t2.realize_data(), np.array([1, 2, 3], dtype=np.float64)))\n", + " self.assertTrue(t2.requires_grad)\n", + " \n", + " def test_create_tensor_from_tensor_with_same_device_and_dtype_and_requires_grad_true(self):\n", + " t1 = Tensor([1, 2, 3], device='cpu', dtype=np.float32, requires_grad=True)\n", + " t2 = Tensor(t1, device='cpu', dtype=np.float32, requires_grad=True)\n", + " self.assertEqual(t2.device, 'cpu')\n", + " self.assertEqual(t2.dtype, np.float32)\n", + " self.assertTrue(np.array_equal(t2.realize_data(), np.array([1, 2, 3], dtype=np.float32)))\n", + " self.assertTrue(t2.requires_grad)\n", + " \n", + " def test_create_tensor_from_tensor_with_same_device_and_dtype_and_requires_grad_false(self):\n", + " t1 = Tensor([1, 2, 3], device='cpu', dtype=np.float32, requires_grad=False)\n", + " t2 = Tensor(t1, device='cpu', dtype=np.float32, requires_grad=False)\n", + " self.assertEqual(t2.device, 'cpu')\n", + 
" self.assertEqual(t2.dtype, np.float32)\n", + " self.assertTrue(np.array_equal(t2.realize_data(), np.array([1, 2, 3], dtype=np.float32)))\n", + " self.assertFalse(t2.requires_grad)\n", + " \n", + " def test_create_tensor_from_tensor_with_same_device_and_dtype_and_requires_grad_true_false(self):\n", + " t1 = Tensor([1, 2, 3], device='cpu', dtype=np.float32, requires_grad=True)\n", + " t2 = Tensor(t1, device='cpu', dtype=np.float32, requires_grad=False)\n", + " self.assertEqual(t2.device, 'cpu')\n", + " self.assertEqual(t2.dtype, np.float32)\n", + " self.assertTrue(np.array_equal(t2.realize_data(), np.array([1, 2, 3], dtype=np.float32)))\n", + " self.assertFalse(t2.requires_grad)\n", + " \n", + " def test_create_tensor_from_tensor_with_same_device_and_dtype_and_requires_grad_false_true(self):\n", + " t1 = Tensor([1, 2, 3], device='cpu', dtype=np.float32, requires_grad=False)\n", + " t2 = Tensor(t1, device='cpu', dtype=np.float32, requires_grad=True)\n", + " self.assertEqual(t2.device, 'cpu')\n", + " self.assertEqual(t2.dtype, np.float32)\n", + " self.assertTrue(np.array_equal(t2.realize_data(), np.array([1, 2, 3], dtype=np.float32)))\n", + " self.assertTrue(t2.requires_grad)\n", + " \n", + " def test_create_tensor_from_tensor_with_different_device_and_dtype_and_requires_grad_true(self):\n", + " t1 = Tensor([1, 2, 3], device='cpu', dtype=np.float32, requires_grad=True)\n", + " t2 = Tensor(t1, device='cuda', dtype=np.float64, requires_grad=True)\n", + " self.assertEqual(t2.device, 'cuda')\n", + " self.assertEqual(t2.dtype, np.float64)\n", + " self.assertTrue(np.array_equal(t2.realize_data(), np.array([1, 2, 3], dtype=np.float64)))\n", + " self.assertTrue(t2.requires_grad)\n", + " \n", + " def test_create_tensor_from_tensor_with_different_device_and_dtype_and_requires_grad_false(self):\n", + " t1 = Tensor([1, 2, 3], device='cpu', dtype=np.float32, requires_grad=False)\n", + " t2 = Tensor(t1, device='cuda', dtype=np.float64, requires_grad=False)\n", + " self.assertEqual(t2.device, 'cuda')\n", + " self.assertEqual(t2.dtype, np.float64)\n", + " self.assertTrue(np.array_equal(t2.realize_data(), np.array([1, 2, 3], dtype=np.float64)))\n", + " self.assertFalse(t2.requires_grad)\n", + " \n", + " def test_create_tensor_from_tensor_with_different_device_and_dtype_and_requires_grad_true_false(self):\n", + " t1 = Tensor([1, 2, 3], device='cpu', dtype=np.float32, requires_grad=True)\n", + " t2 = Tensor(t1, device='cuda', dtype=np.float64, requires_grad=False)\n", + " self.assertEqual(t2.device, 'cuda')\n", + " self.assertEqual(t2.dtype, np.float64)\n", + " self.assertTrue(np.array_equal(t2.realize_data(), np.array([1, 2, 3], dtype=np.float64)))\n", + " self.assertFalse(t2.requires_grad)\n", + " \n", + " def test_create_tensor_from_tensor_with_different_device_and_dtype_and_requires_grad_false_true(self):\n", + " t1 = Tensor([1, 2, 3], device='cpu', dtype=np.float32, requires_grad=False)\n", + " t2 = Tensor(t1, device='cuda', dtype=np.float64, requires_grad=True)\n", + " self.assertEqual(t2.device, 'cuda')\n", + " self.assertEqual(t2.dtype, np.float64)\n", + " self.assertTrue(np.array_equal(t2.realize_data(), np.array([1, 2, 3], dtype=np.float64)))\n", + " self.assertTrue(t2.requires_grad)" + ] }, { "cell_type": "code", @@ -1568,7 +1820,9 @@ "language": "python" }, "outputs": [], - "source": [] + "source": [ + "import nbdev; nbdev.nbdev_export()" + ] }, { "cell_type": "code", diff --git a/_proc/01_operators.ipynb b/_proc/01_operators.ipynb index 7c70adc..007a3aa 100644 --- a/_proc/01_operators.ipynb +++ 
b/_proc/01_operators.ipynb @@ -20,6 +20,76 @@ "" ] }, + { + "cell_type": "markdown", + "id": "8c9a4956-5575-4950-9365-560225c3a715", + "metadata": {}, + "source": [ + "The `out_grad` parameter refers to the gradient of the loss function with respect to the output of the node. Multiplying this with the local gradient gives the gradient of the loss with respect to the input to the node, according to the chain rule of calculus, which is the basis for backpropagation in neural networks." + ] + }, + { + "cell_type": "markdown", + "id": "40c0867e-7744-4c76-95b6-da3868dc8625", + "metadata": {}, + "source": [ + "The chain rule is a fundamental concept in calculus that provides a method to compute the derivative of composite functions. In simple terms, the chain rule states that the derivative of a composite function is the derivative of the outer function multiplied by the derivative of the inner function.\n", + "\n", + "Given a composite function that is the composition of two functions, say, $f(g(x))$, the chain rule can be stated as follows:\n", + "\n", + "$$\\frac{df}{dx} = \\frac{df}{dg} \\cdot \\frac{dg}{dx}$$\n", + "\n", + "Where:\n", + "\n", + "- $\\frac{df}{dx}$ is the derivative of the composite function $f(g(x))$ with respect to $x$,\n", + "- $\\frac{df}{dg}$ is the derivative of the outer function $f$ with respect to its argument $g(x)$, and\n", + "- $\\frac{dg}{dx}$ is the derivative of the inner function $g(x)$ with respect to $x$.\n", + "\n", + "The chain rule can be extended to the case where we have more than two composite functions." + ] + }, + { + "cell_type": "markdown", + "id": "83cb2320-c471-426d-ad48-0197b1daecaa", + "metadata": {}, + "source": [ + "## Element Wise Addition" + ] + }, + { + "cell_type": "markdown", + "id": "feadcd15-44d4-4a3d-aef9-39b3b7f6fcd1", + "metadata": {}, + "source": [ + "Let's walk through the step-by-step derivative calculation for the [`EWiseAdd`](https://m0saan.github.io/minima/operators.html#ewiseadd) operation:\n", + "\n", + "We have the function `f(a, b) = a + b`, where `a` and `b` are tensors. Our goal is to compute the partial derivatives with respect to `a` and `b`.\n", + "\n", + "Let's start by calculating the derivative of `f` with respect to `a`, denoted as `df/da`:\n", + "\n", + "Step 1: Compute the derivative of `f` with respect to `a`.\n", + "\n", + "$\\frac{{\\partial f}}{{\\partial a}} = \\frac{{\\partial}}{{\\partial a}} (a + b)$\n", + "\n", + "Since `a` is the variable we are differentiating with respect to, the derivative of `a` with respect to itself is 1:\n", + "\n", + "$$\\frac{{\\partial f}}{{\\partial a}} = 1$$\n", + "\n", + "Therefore, $$\\frac{{\\partial f}}{{\\partial a}} = 1.$$\n", + "\n", + "Step 2: Compute the derivative of `f` with respect to `b`.\n", + "\n", + "$$\\frac{{\\partial f}}{{\\partial b}} = \\frac{{\\partial}}{{\\partial b}} (a + b)$$\n", + "\n", + "Again, since `b` is the variable we are differentiating with respect to, the derivative of `b` with respect to itself is 1:\n", + "\n", + "$$\\frac{{\\partial f}}{{\\partial b}} = 1$$\n", + "\n", + "Therefore, $$\\frac{{\\partial f}}{{\\partial b}} = 1$$\n", + "\n", + "Hence, the partial derivatives of `f(a, b) = a + b` with respect to `a` and `b` are both equal to 1." 
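To make the chain-rule argument above concrete, here is a small standalone NumPy check. It is not part of minima; the `numerical_grad` helper is a hypothetical utility written only for this illustration. It confirms numerically that, for a sum-reduced loss, the gradient of `a + b` with respect to each input is a tensor of ones.

```python
import numpy as np

def numerical_grad(f, x, eps=1e-6):
    # Central finite-difference estimate of d f(x) / d x for a scalar-valued f.
    grad = np.zeros_like(x, dtype=float)
    for i in range(x.size):
        step = np.zeros_like(x, dtype=float)
        step.flat[i] = eps
        grad.flat[i] = (f(x + step) - f(x - step)) / (2 * eps)
    return grad

a = np.array([1.0, 2.0, 3.0])
b = np.array([4.0, 5.0, 6.0])

# d/da sum(a + b) and d/db sum(a + b) are both all-ones vectors.
print(numerical_grad(lambda x: np.sum(x + b), a))  # ~[1. 1. 1.]
print(numerical_grad(lambda x: np.sum(a + x), b))  # ~[1. 1. 1.]
```

With an upstream gradient `out_grad`, the chain rule then gives `out_grad * 1 = out_grad` for each input, which is consistent with `EWiseAdd.gradient` returning `(out_grad, out_grad)` later in this patch.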
+ ] + }, { "cell_type": "code", "execution_count": 1, @@ -30,20 +100,38 @@ "text/markdown": [ "---\n", "\n", - "[source](https://github.com/m0saan/minima/blob/main/minima/operators.py#L30){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", + "[source](https://github.com/m0saan/minima/blob/main/minima/operators.py#L61){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", "\n", "### add\n", "\n", - "> add (a, b)" + "> add (a:minima.autograd.Tensor, b:minima.autograd.Tensor)\n", + "\n", + "Adds two tensors element-wise.\n", + "\n", + "Args:\n", + "- a: The first tensor.\n", + "- b: The second tensor.\n", + "\n", + "Returns:\n", + "The element-wise sum of a and b." ], "text/plain": [ "---\n", "\n", - "[source](https://github.com/m0saan/minima/blob/main/minima/operators.py#L30){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", + "[source](https://github.com/m0saan/minima/blob/main/minima/operators.py#L61){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", "\n", "### add\n", "\n", - "> add (a, b)" + "> add (a:minima.autograd.Tensor, b:minima.autograd.Tensor)\n", + "\n", + "Adds two tensors element-wise.\n", + "\n", + "Args:\n", + "- a: The first tensor.\n", + "- b: The second tensor.\n", + "\n", + "Returns:\n", + "The element-wise sum of a and b." ] }, "execution_count": 1, @@ -73,7 +161,15 @@ "\n", "> EWiseAdd ()\n", "\n", - "Op class specialized to output tensors, will be alternate subclasses for other structures" + "Performs element-wise addition of two tensors.\n", + "\n", + "Example:\n", + ">>> a = Tensor([1, 2, 3])\n", + ">>> b = Tensor([4, 5, 6])\n", + ">>> op = EWiseAdd()\n", + ">>> result = op.compute(a, b)\n", + ">>> print(result)\n", + "Tensor([5, 7, 9])" ], "text/plain": [ "---\n", @@ -84,7 +180,15 @@ "\n", "> EWiseAdd ()\n", "\n", - "Op class specialized to output tensors, will be alternate subclasses for other structures" + "Performs element-wise addition of two tensors.\n", + "\n", + "Example:\n", + ">>> a = Tensor([1, 2, 3])\n", + ">>> b = Tensor([4, 5, 6])\n", + ">>> op = EWiseAdd()\n", + ">>> result = op.compute(a, b)\n", + ">>> print(result)\n", + "Tensor([5, 7, 9])" ] }, "execution_count": 2, @@ -98,6 +202,57 @@ "show_doc(EWiseAdd)" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "358fcf64-9c44-4d44-8374-a0ef11668d6e", + "metadata": { + "language": "python" + }, + "outputs": [], + "source": [ + "# Create two 1-D tensors\n", + "a = Tensor([1, 2, 3])\n", + "b = Tensor([4, 5, 6])\n", + "\n", + "# Create an EWiseAdd operation\n", + "op = EWiseAdd()" + ] + }, + { + "cell_type": "markdown", + "id": "bd371e81-b6e4-43da-9987-30057c5c038a", + "metadata": {}, + "source": [ + "## Scalar Addition" + ] + }, + { + "cell_type": "markdown", + "id": "b2dbc8dc-25ae-4793-9cc8-cdbef59a8400", + "metadata": {}, + "source": [ + "Explanation for the derivative of the [`AddScalar`](https://m0saan.github.io/minima/operators.html#addscalar) operator:\n", + "\n", + "Let's denote the scalar as `c` and `a` as the tensor being added by the scalar. The operation can be described as `f(a) = a + c`.\n", + "\n", + "The function for the backward pass (i.e., the gradient) is `df/da = 1`, which means the derivative of `f(a)` with respect to `a` is simply `1`.\n", + "\n", + "We are given a function $f(a) = a + c$, where $a$ is a tensor and $c$ is a scalar. 
Our task is to find the derivative of this function with respect to $a$.\n", + "\n", + "By differentiating the function $f(a)$ with respect to $a$, we find:\n", + "\n", + "\\begin{align*}\n", + "\\frac{df}{da} &= \\frac{d}{da} (a + c) \\\\\n", + "&= 1\n", + "\\end{align*}\n", + "\n", + "Therefore, the gradient of $f(a)$ with respect to $a$ is $1$.\n", + "\n", + "\n", + "We starts by defining the function `f(a) = a + c`. It then explains that when we differentiate `f(a)` with respect to `a`, we find that the derivative is `1`. This means that the gradient of `f(a)` with respect to `a` is `1`, which matches the behavior of the [`AddScalar`](https://m0saan.github.io/minima/operators.html#addscalar) operator as provided in the `gradient` method." + ] + }, { "cell_type": "code", "execution_count": 3, @@ -108,20 +263,38 @@ "text/markdown": [ "---\n", "\n", - "[source](https://github.com/m0saan/minima/blob/main/minima/operators.py#L45){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", + "[source](https://github.com/m0saan/minima/blob/main/minima/operators.py#L120){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", "\n", "### add_scalar\n", "\n", - "> add_scalar (a, scalar)" + "> add_scalar (a:minima.autograd.Tensor, scalar:Union[int,float])\n", + "\n", + "Adds a scalar to a tensor.\n", + "\n", + "Args:\n", + "- a: The tensor.\n", + "- scalar: The scalar to add.\n", + "\n", + "Returns:\n", + "The sum of a and the scalar." ], "text/plain": [ "---\n", "\n", - "[source](https://github.com/m0saan/minima/blob/main/minima/operators.py#L45){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", + "[source](https://github.com/m0saan/minima/blob/main/minima/operators.py#L120){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", "\n", "### add_scalar\n", "\n", - "> add_scalar (a, scalar)" + "> add_scalar (a:minima.autograd.Tensor, scalar:Union[int,float])\n", + "\n", + "Adds a scalar to a tensor.\n", + "\n", + "Args:\n", + "- a: The tensor.\n", + "- scalar: The scalar to add.\n", + "\n", + "Returns:\n", + "The sum of a and the scalar." 
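As a quick illustration of this result (plain NumPy, independent of minima): because `d(a + c)/da = 1`, whatever upstream gradient reaches the scalar-addition node is forwarded to `a` unchanged.

```python
import numpy as np

a = np.array([1.0, 2.0, 3.0])
c = 5.0
out = a + c                             # forward pass: add the scalar
out_grad = np.array([0.1, -0.2, 0.3])   # upstream dL/d(a + c)

grad_a = out_grad * 1.0                 # local derivative d(a + c)/da = 1
assert np.allclose(grad_a, out_grad)    # the gradient flows through unchanged
print(grad_a)
```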
] }, "execution_count": 3, @@ -145,24 +318,38 @@ "text/markdown": [ "---\n", "\n", - "[source](https://github.com/m0saan/minima/blob/main/minima/operators.py#L34){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", + "[source](https://github.com/m0saan/minima/blob/main/minima/operators.py#L75){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", "\n", "### AddScalar\n", "\n", - "> AddScalar (scalar)\n", + "> AddScalar (scalar:Union[int,float])\n", + "\n", + "Performs addition of a tensor and a scalar.\n", "\n", - "Op class specialized to output tensors, will be alternate subclasses for other structures" + "Example:\n", + ">>> a = Tensor([1, 2, 3])\n", + ">>> op = AddScalar(5)\n", + ">>> result = op.compute(a)\n", + ">>> print(result)\n", + "Tensor([6, 7, 8])" ], "text/plain": [ "---\n", "\n", - "[source](https://github.com/m0saan/minima/blob/main/minima/operators.py#L34){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", + "[source](https://github.com/m0saan/minima/blob/main/minima/operators.py#L75){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", "\n", "### AddScalar\n", "\n", - "> AddScalar (scalar)\n", + "> AddScalar (scalar:Union[int,float])\n", "\n", - "Op class specialized to output tensors, will be alternate subclasses for other structures" + "Performs addition of a tensor and a scalar.\n", + "\n", + "Example:\n", + ">>> a = Tensor([1, 2, 3])\n", + ">>> op = AddScalar(5)\n", + ">>> result = op.compute(a)\n", + ">>> print(result)\n", + "Tensor([6, 7, 8])" ] }, "execution_count": 4, @@ -176,6 +363,47 @@ "show_doc(AddScalar)" ] }, + { + "cell_type": "markdown", + "id": "f26ed99f-b3f2-4df1-9918-ff23fc99be74", + "metadata": {}, + "source": [ + "## Element Wise Multiplication" + ] + }, + { + "cell_type": "markdown", + "id": "1bf5a7d0-8e8a-47cd-a4b5-cdc12a03649c", + "metadata": {}, + "source": [ + "Explanation for the derivative of the [`EWiseMul`](https://m0saan.github.io/minima/operators.html#ewisemul) (element-wise multiplication) operator:\n", + "\n", + "Let's denote the two input tensors as `a` and `b`. The operation can be described as `f(a, b) = a * b`, where `*` represents element-wise multiplication.\n", + "\n", + "The function for the backward pass (i.e., the gradient) is `df/da = b` and `df/db = a`. This means that the derivative of `f(a, b)` with respect to `a` is `b`, and the derivative with respect to `b` is `a`.\n", + "\n", + "\n", + "We are given a function $f(a, b) = a \\odot b$, where $a$ and $b$ are tensors, and $\\odot$ represents element-wise multiplication. Our task is to find the derivatives of this function with respect to $a$ and $b$.\n", + "\n", + "By differentiating the function $f(a, b)$ with respect to $a$, we find:\n", + "\n", + "\\begin{align*}\n", + "\\frac{df}{da} &= \\frac{d}{da} (a \\odot b) \\\\\n", + "&= b\n", + "\\end{align*}\n", + "\n", + "Therefore, the gradient of $f(a, b)$ with respect to $a$ is $b$.\n", + "\n", + "Similarly, by differentiating the function $f(a, b)$ with respect to $b$, we find:\n", + "\n", + "\\begin{align*}\n", + "\\frac{df}{db} &= \\frac{d}{db} (a \\odot b) \\\\\n", + "&= a\n", + "\\end{align*}\n", + "\n", + "Therefore, the gradient of $f(a, b)$ with respect to $b$ is $a$." 
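A standalone NumPy sketch of how these two partials are used in practice (illustrative only, not minima code): the chain rule scales the upstream gradient by the other operand, and a one-sided finite difference on `a[0]` agrees with the analytic value.

```python
import numpy as np

a = np.array([1.0, 2.0, 3.0])
b = np.array([4.0, 5.0, 6.0])
out_grad = np.ones_like(a)        # upstream dL/d(a*b) for L = sum(a * b)

grad_a = out_grad * b             # chain rule: dL/da = out_grad * b
grad_b = out_grad * a             # chain rule: dL/db = out_grad * a

eps = 1e-6
base = np.sum(a * b)
a_perturbed = a.copy()
a_perturbed[0] += eps
print((np.sum(a_perturbed * b) - base) / eps, grad_a[0])  # both ~4.0
```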
+ ] + }, { "cell_type": "code", "execution_count": 5, @@ -186,20 +414,38 @@ "text/markdown": [ "---\n", "\n", - "[source](https://github.com/m0saan/minima/blob/main/minima/operators.py#L58){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", + "[source](https://github.com/m0saan/minima/blob/main/minima/operators.py#L173){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", "\n", "### multiply\n", "\n", - "> multiply (a, b)" + "> multiply (a:minima.autograd.Tensor, b:minima.autograd.Tensor)\n", + "\n", + "Multiplies two tensors element-wise.\n", + "\n", + "Args:\n", + "- a: The first tensor.\n", + "- b: The second tensor.\n", + "\n", + "Returns:\n", + "The element-wise product of a and b." ], "text/plain": [ "---\n", "\n", - "[source](https://github.com/m0saan/minima/blob/main/minima/operators.py#L58){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", + "[source](https://github.com/m0saan/minima/blob/main/minima/operators.py#L173){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", "\n", "### multiply\n", "\n", - "> multiply (a, b)" + "> multiply (a:minima.autograd.Tensor, b:minima.autograd.Tensor)\n", + "\n", + "Multiplies two tensors element-wise.\n", + "\n", + "Args:\n", + "- a: The first tensor.\n", + "- b: The second tensor.\n", + "\n", + "Returns:\n", + "The element-wise product of a and b." ] }, "execution_count": 5, @@ -223,24 +469,40 @@ "text/markdown": [ "---\n", "\n", - "[source](https://github.com/m0saan/minima/blob/main/minima/operators.py#L49){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", + "[source](https://github.com/m0saan/minima/blob/main/minima/operators.py#L134){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", "\n", "### EWiseMul\n", "\n", "> EWiseMul ()\n", "\n", - "Op class specialized to output tensors, will be alternate subclasses for other structures" + "Performs element-wise multiplication of two tensors.\n", + "\n", + "Example:\n", + ">>> a = Tensor([1, 2, 3])\n", + ">>> b = Tensor([4, 5, 6])\n", + ">>> op = EWiseMul()\n", + ">>> result = op.compute(a, b)\n", + ">>> print(result)\n", + "Tensor([4, 10, 18])" ], "text/plain": [ "---\n", "\n", - "[source](https://github.com/m0saan/minima/blob/main/minima/operators.py#L49){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", + "[source](https://github.com/m0saan/minima/blob/main/minima/operators.py#L134){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", "\n", "### EWiseMul\n", "\n", "> EWiseMul ()\n", "\n", - "Op class specialized to output tensors, will be alternate subclasses for other structures" + "Performs element-wise multiplication of two tensors.\n", + "\n", + "Example:\n", + ">>> a = Tensor([1, 2, 3])\n", + ">>> b = Tensor([4, 5, 6])\n", + ">>> op = EWiseMul()\n", + ">>> result = op.compute(a, b)\n", + ">>> print(result)\n", + "Tensor([4, 10, 18])" ] }, "execution_count": 6, @@ -254,6 +516,39 @@ "show_doc(EWiseMul)" ] }, + { + "cell_type": "markdown", + "id": "cdb531c5-f22c-40c9-901e-e373b837a846", + "metadata": {}, + "source": [ + "## Scalar Multiplication" + ] + }, + { + "cell_type": "markdown", + "id": "5d0246b9-ce8f-4fab-991d-7ec43745c2ea", + "metadata": {}, + "source": [ + "Let's denote the scalar as `c` and `a` as the tensor being multiplied by the scalar. 
The operation can be described as `f(a) = a * c`.\n", + "\n", + "The function for the backward pass (i.e., the gradient) is `df/da = c`, which means the derivative of `f(a)` with respect to `a` is `c`.\n", + "\n", + "The LaTeX document will look as follows:\n", + "\n", + "We are given a function $f(a) = a \\cdot c$, where $a$ is a tensor and $c$ is a scalar. Our task is to find the derivative of this function with respect to $a$.\n", + "\n", + "By differentiating the function $f(a)$ with respect to $a$, we find:\n", + "\n", + "\\begin{align*}\n", + "\\frac{df}{da} &= \\frac{d}{da} (a \\cdot c) \\\\\n", + "&= c\n", + "\\end{align*}\n", + "\n", + "Therefore, the gradient of $f(a)$ with respect to $a$ is $c$.\n", + "\n", + "We starts by defining the function `f(a) = a * c`. It then explains that when we differentiate `f(a)` with respect to `a`, we find that the derivative is `c`. This means that the gradient of `f(a)` with respect to `a` is `c`, which matches the behavior of the [`MulScalar`](https://m0saan.github.io/minima/operators.html#mulscalar) operator as provided in the `gradient` method." + ] + }, { "cell_type": "code", "execution_count": 7, @@ -264,20 +559,38 @@ "text/markdown": [ "---\n", "\n", - "[source](https://github.com/m0saan/minima/blob/main/minima/operators.py#L72){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", + "[source](https://github.com/m0saan/minima/blob/main/minima/operators.py#L232){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", "\n", "### mul_scalar\n", "\n", - "> mul_scalar (a, scalar)" + "> mul_scalar (a:minima.autograd.Tensor, scalar:Union[int,float])\n", + "\n", + "Multiplies a tensor by a scalar.\n", + "\n", + "Args:\n", + "- a: The tensor.\n", + "- scalar: The scalar to multiply.\n", + "\n", + "Returns:\n", + "The product of a and the scalar." ], "text/plain": [ "---\n", "\n", - "[source](https://github.com/m0saan/minima/blob/main/minima/operators.py#L72){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", + "[source](https://github.com/m0saan/minima/blob/main/minima/operators.py#L232){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", "\n", "### mul_scalar\n", "\n", - "> mul_scalar (a, scalar)" + "> mul_scalar (a:minima.autograd.Tensor, scalar:Union[int,float])\n", + "\n", + "Multiplies a tensor by a scalar.\n", + "\n", + "Args:\n", + "- a: The tensor.\n", + "- scalar: The scalar to multiply.\n", + "\n", + "Returns:\n", + "The product of a and the scalar." 
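For completeness, a tiny NumPy-only illustration of the scalar-multiplication backward pass described above (not library code): the upstream gradient is simply scaled by the constant `c`.

```python
import numpy as np

a = np.array([1.0, 2.0, 3.0])
c = 5.0
out = a * c                           # forward pass
out_grad = np.array([0.5, 0.5, 0.5])  # upstream dL/d(a*c)

grad_a = out_grad * c                 # local derivative is the scalar c
print(grad_a)                         # [2.5 2.5 2.5]
```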
] }, "execution_count": 7, @@ -301,24 +614,38 @@ "text/markdown": [ "---\n", "\n", - "[source](https://github.com/m0saan/minima/blob/main/minima/operators.py#L62){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", + "[source](https://github.com/m0saan/minima/blob/main/minima/operators.py#L187){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", "\n", "### MulScalar\n", "\n", - "> MulScalar (scalar)\n", + "> MulScalar (scalar:Union[int,float])\n", "\n", - "Op class specialized to output tensors, will be alternate subclasses for other structures" + "Performs multiplication of a tensor and a scalar.\n", + "\n", + "Example:\n", + ">>> a = Tensor([1, 2, 3])\n", + ">>> op = MulScalar(5)\n", + ">>> result = op.compute(a)\n", + ">>> print(result)\n", + "Tensor([5, 10, 15])" ], "text/plain": [ "---\n", "\n", - "[source](https://github.com/m0saan/minima/blob/main/minima/operators.py#L62){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", + "[source](https://github.com/m0saan/minima/blob/main/minima/operators.py#L187){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", "\n", "### MulScalar\n", "\n", - "> MulScalar (scalar)\n", + "> MulScalar (scalar:Union[int,float])\n", + "\n", + "Performs multiplication of a tensor and a scalar.\n", "\n", - "Op class specialized to output tensors, will be alternate subclasses for other structures" + "Example:\n", + ">>> a = Tensor([1, 2, 3])\n", + ">>> op = MulScalar(5)\n", + ">>> result = op.compute(a)\n", + ">>> print(result)\n", + "Tensor([5, 10, 15])" ] }, "execution_count": 8, @@ -332,15 +659,688 @@ "show_doc(MulScalar)" ] }, + { + "cell_type": "markdown", + "id": "6497d8ab-3003-4784-a330-ad5f862b9ca5", + "metadata": {}, + "source": [ + "## Negation" + ] + }, + { + "cell_type": "markdown", + "id": "04085053-1446-4343-b720-17e04a1c4ee1", + "metadata": {}, + "source": [ + "Let's denote `a` as the tensor being negated. The operation can be described as `f(a) = -a`.\n", + "\n", + "The function for the backward pass (i.e., the gradient) is `df/da = -1`.\n", + "\n", + "We are given a function $f(a) = -a$, where $a$ is a tensor. Our task is to find the derivative of this function with respect to $a$.\n", + "\n", + "By differentiating the function $f(a)$ with respect to $a$, we find:\n", + "\n", + "\\begin{align*}\n", + "\\frac{df}{da} &= \\frac{d}{da} (-a) \\\\\n", + "&= -1\n", + "\\end{align*}\n", + "\n", + "Therefore, the gradient of $f(a)$ with respect to $a$ is $-1$." 
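Before the implementation in the next cell, a quick numerical sanity check of this constant `-1` derivative, written in plain NumPy (illustrative only, not part of minima):

```python
import numpy as np

a = np.array([1.0, -2.0, 3.0])
eps = 1e-6
# Perturb a[0] and watch how sum(-a) changes: the slope should be about -1.
fd = (np.sum(-(a + np.array([eps, 0.0, 0.0]))) - np.sum(-a)) / eps
print(fd)   # ~ -1.0
```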
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "defd870b-d2e6-4212-bd4b-b3333d271c9e", + "metadata": { + "language": "python" + }, + "outputs": [], + "source": [ + "class Negate(TensorOp):\n", + " \"\"\"\n", + " Negates the given tensor.\n", + " \n", + " Example:\n", + " >>> a = Tensor([1, -2, 3])\n", + " >>> op = Negate()\n", + " >>> result = op.compute(a)\n", + " >>> print(result)\n", + " Tensor([-1, 2, -3])\n", + " \"\"\"\n", + " \n", + " def compute(self, a: NDArray) -> NDArray:\n", + " \"\"\"\n", + " Computes the negation of a tensor.\n", + "\n", + " Args:\n", + " - a: The tensor to negate.\n", + "\n", + " Returns:\n", + " The negation of a.\n", + " \"\"\"\n", + " return -1 * a\n", + "\n", + " def gradient(self, out_grad: Tensor, node: Tensor) -> Tuple[Tensor,]:\n", + " \"\"\"\n", + " Computes the gradient of the negation operation.\n", + "\n", + " Args:\n", + " - out_grad: The gradient of the output of the operation.\n", + " - node: The node in the computational graph where the operation was performed.\n", + "\n", + " Returns:\n", + " The gradients with respect to the inputs.\n", + " \"\"\"\n", + " return (negate(out_grad), )\n", + "\n", + "\n", + "def negate(a: Tensor) -> Tensor:\n", + " \"\"\"\n", + " Negates the given tensor.\n", + "\n", + " Args:\n", + " - a: The tensor to negate.\n", + "\n", + " Returns:\n", + " The negation of a.\n", + " \n", + " Example:\n", + " >>> a = Tensor([1, -2, 3])\n", + " >>> result = negate(a)\n", + " >>> print(result)\n", + " Tensor([-1, 2, -3])\n", + " \"\"\"\n", + " return Negate()(a)" + ] + }, + { + "cell_type": "markdown", + "id": "2e28e521-a653-45e0-a567-46c7b800d281", + "metadata": {}, + "source": [ + "## Exp" + ] + }, + { + "cell_type": "markdown", + "id": "af8bf43a-9a2a-4077-a54f-54df0c6c955d", + "metadata": {}, + "source": [ + "Explanation for the derivative of the `Exp` operator:\n", + "\n", + "Let's denote `a` as the tensor on which the exponential function is applied. The operation can be described as `f(a) = exp(a)`, where `exp` represents the exponential function.\n", + "\n", + "The function for the backward pass (i.e., the gradient) is `df/da = exp(a)`.\n", + "\n", + "We are given a function $f(a) = \\exp(a)$, where $a$ is a tensor. Our task is to find the derivative of this function with respect to $a$.\n", + "\n", + "By differentiating the function $f(a)$ with respect to $a$, we find:\n", + "\n", + "\\begin{align*}\n", + "\\frac{df}{da} &= \\frac{d}{da} (\\exp(a)) \\\\\n", + "&= \\exp(a)\n", + "\\end{align*}\n", + "\n", + "Therefore, the gradient of $f(a)$ with respect to $a$ is $\\exp(a)$." 
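The next cell implements this operator; as an aside, the identity `d exp(a)/da = exp(a)` is easy to confirm element-wise with a central difference in plain NumPy (illustrative only):

```python
import numpy as np

a = np.array([0.5, 1.0, 1.5])
eps = 1e-6
fd = (np.exp(a + eps) - np.exp(a - eps)) / (2 * eps)  # element-wise finite difference
print(np.allclose(fd, np.exp(a), atol=1e-4))          # True
```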
+ ] + }, { "cell_type": "code", "execution_count": null, - "id": "a1cb206c-0218-4701-a3bf-eda4ae77788b", + "id": "a810bf4e-1d48-412b-bf89-2a6fde789ae4", "metadata": { "language": "python" }, "outputs": [], - "source": [] + "source": [ + "class Exp(TensorOp):\n", + " \"\"\"\n", + " Calculates the exponential of the given tensor.\n", + " \n", + " Example:\n", + " >>> a = Tensor([1, 2, 3])\n", + " >>> op = Exp()\n", + " >>> result = op.compute(a)\n", + " >>> print(result)\n", + " Tensor([2.71828183, 7.3890561, 20.08553692])\n", + " \"\"\"\n", + " \n", + " def compute(self, a: NDArray) -> NDArray:\n", + " \"\"\"\n", + " Computes the exponential of a tensor.\n", + "\n", + " Args:\n", + " - a: The tensor.\n", + "\n", + " Returns:\n", + " The exponential of a.\n", + " \"\"\"\n", + " self.out = array_api.exp(a)\n", + " return self.out\n", + "\n", + " def gradient(self, out_grad: Tensor, node: Tensor) -> Tuple[Tensor,]:\n", + " \"\"\"\n", + " Computes the gradient of the exponential operation.\n", + "\n", + " Args:\n", + " - out_grad: The gradient of the output of the operation.\n", + " - node: The node in the computational graph where the operation was performed.\n", + "\n", + " Returns:\n", + " The gradients with respect to the inputs.\n", + " \"\"\"\n", + " return (out_grad * self.out, )\n", + "\n", + "def exp(a: Tensor) -> Tensor:\n", + " \"\"\"\n", + " Calculates the exponential of the given tensor.\n", + "\n", + " Args:\n", + " - a: The tensor.\n", + "\n", + " Returns:\n", + " The exponential of a.\n", + " \n", + " Example:\n", + " >>> a = Tensor([1, 2, 3])\n", + " >>> result = exp(a)\n", + " >>> print(result)\n", + " Tensor([2.71828183, 7.3890561, 20.08553692])\n", + " \"\"\"\n", + " return Exp()(a)" + ] + }, + { + "cell_type": "markdown", + "id": "45592e37-9a6d-42ec-8458-167a94394cc1", + "metadata": {}, + "source": [ + "## ReLU" + ] + }, + { + "cell_type": "markdown", + "id": "2c8aefd6-23e2-4df7-8627-f74813b8f0bc", + "metadata": {}, + "source": [ + "The derivative of the `ReLU` (Rectified Linear Unit) operator:\n", + "\n", + "Let's denote `a` as the tensor on which the ReLU function is applied. The ReLU function is defined as follows: \n", + "\n", + "$$\n", + "f(a) = \n", + "\\begin{cases}\n", + "a, & \\text{if } a \\geq 0 \\\\\n", + "0, & \\text{if } a < 0\n", + "\\end{cases}\n", + "$$\n", + "\n", + "The function for the backward pass (i.e., the gradient) is `df/da = 1` if `a >= 0`, and `df/da = 0` if `a < 0`.\n", + "\n", + "We are given a function $f(a) = \\max(0, a)$, where $a$ is a tensor. Our task is to find the derivative of this function with respect to $a$.\n", + "\n", + "By considering the definition of the ReLU function, we can write $f(a)$ as:\n", + "\n", + "$$\n", + "f(a) = \n", + "\\begin{cases}\n", + "a, & \\text{if } a \\geq 0 \\\\\n", + "0, & \\text{if } a < 0\n", + "\\end{cases}\n", + "$$\n", + "\n", + "Now, let's differentiate $f(a)$ with respect to $a$:\n", + "\n", + "$$\n", + "\\frac{df}{da} = \n", + "\\begin{cases}\n", + "1, & \\text{if } a \\geq 0 \\\\\n", + "0, & \\text{if } a < 0\n", + "\\end{cases}\n", + "$$\n", + "\n", + "Therefore, the gradient of $f(a)$ with respect to $a$ is $1$ if $a \\geq 0$, and $0$ if $a < 0$." 
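In other words, the backward pass multiplies the upstream gradient by a 0/1 mask. A minimal NumPy sketch of the forward value and that mask (illustrative only; note that this convention assigns gradient 1 at exactly `a = 0`, where ReLU is not differentiable):

```python
import numpy as np

a = np.array([-2.0, -0.5, 0.0, 0.5, 2.0])
forward = np.maximum(a, 0)          # ReLU forward pass
mask = (a >= 0).astype(float)       # local gradient: 1 where a >= 0, else 0
out_grad = np.ones_like(a)

print(forward)            # [0.  0.  0.  0.5 2. ]
print(out_grad * mask)    # [0. 0. 1. 1. 1.]
```

Depending on the NumPy version, `np.clip` expects both bounds to be supplied (one may be `None`), so `np.maximum(a, 0)` is a common equivalent way to express this forward pass.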
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f32b2ce4-7d9e-4290-90d9-a56bc59cfe2a", + "metadata": { + "language": "python" + }, + "outputs": [], + "source": [ + "class ReLU(TensorOp):\n", + " \"\"\"\n", + " Applies the ReLU (Rectified Linear Unit) activation function to the given tensor.\n", + " \n", + " Example:\n", + " >>> a = Tensor([1, -2, 3])\n", + " >>> op = ReLU()\n", + " >>> result = op.compute(a)\n", + " >>> print(result)\n", + " Tensor([1, 0, 3])\n", + " \"\"\"\n", + " \n", + " def compute(self, a: NDArray) -> NDArray:\n", + " \"\"\"\n", + " Computes the ReLU activation function on a tensor.\n", + "\n", + " Args:\n", + " - a: The tensor.\n", + "\n", + " Returns:\n", + " The result of applying ReLU to a.\n", + " \"\"\"\n", + " self.out = array_api.clip(a, a_min=0)\n", + " return self.out\n", + "\n", + " def gradient(self, out_grad: Tensor, node: Tensor) -> Tuple[Tensor,]:\n", + " \"\"\"\n", + " Computes the gradient of the ReLU operation.\n", + "\n", + " Args:\n", + " - out_grad: The gradient of the output of the operation.\n", + " - node: The node in the computational graph where the operation was performed.\n", + "\n", + " Returns:\n", + " The gradients with respect to the inputs.\n", + " \"\"\"\n", + " return (out_grad * Tensor(node.children[0] >= 0), )\n", + "\n", + "def relu(a: Tensor) -> Tensor:\n", + " \"\"\"\n", + " Applies the ReLU (Rectified Linear Unit) activation function to the given tensor.\n", + "\n", + " Args:\n", + " - a: The tensor.\n", + "\n", + " Returns:\n", + " The result of applying ReLU to a.\n", + " \n", + " Example:\n", + " >>> a = Tensor([1, -2, 3])\n", + " >>> result = relu(a)\n", + " >>> print(result)\n", + " Tensor([1, 0, 3])\n", + " \"\"\"\n", + " return ReLU()(a)" + ] + }, + { + "cell_type": "markdown", + "id": "8e9f074a-e23c-4e8a-8ab7-2cfba344c461", + "metadata": {}, + "source": [ + "## Power Scalar" + ] + }, + { + "cell_type": "markdown", + "id": "40d5e875-0981-4243-86b5-d6ca6b117d5e", + "metadata": {}, + "source": [ + "The derivative of the `PowerScalar` operator:\n", + "\n", + "Let's denote the scalar as `n` and `a` as the tensor being raised to the power of the scalar. The operation can be described as `f(a) = a^n`.\n", + "\n", + "The function for the backward pass (i.e., the gradient) is `df/da = n * a^(n-1)`.\n", + "\n", + "We are given a function $f(a) = a^n$, where $a$ is a tensor and $n$ is a scalar. Our task is to find the derivative of this function with respect to $a$.\n", + "\n", + "By differentiating the function $f(a)$ with respect to $a$, we find:\n", + "\n", + "\\begin{align*}\n", + "\\frac{df}{da} &= \\frac{d}{da} (a^n) \\\\\n", + "&= n \\cdot a^{n-1}\n", + "\\end{align*}\n", + "\n", + "Therefore, the gradient of $f(a)$ with respect to $a$ is $n \\cdot a^{n-1}$." 
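A quick element-wise check of the power rule in plain NumPy (illustrative only), matching the gradient used by the implementation in the next cell:

```python
import numpy as np

a = np.array([1.0, 2.0, 3.0])
n = 3
eps = 1e-6
fd = ((a + eps) ** n - (a - eps) ** n) / (2 * eps)   # element-wise finite difference
print(np.allclose(fd, n * a ** (n - 1), atol=1e-3))  # True: d(a^n)/da = n * a^(n-1)
```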
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "364103cb-615b-4359-8061-5a8bd1455367", + "metadata": { + "language": "python" + }, + "outputs": [], + "source": [ + "class PowerScalar(TensorOp):\n", + " \"\"\"\n", + " The PowerScalar operation raises a tensor to an (integer) power.\n", + "\n", + " Attributes:\n", + " scalar (int): The power to raise the tensor to.\n", + "\n", + " Example:\n", + " >>> import numpy as np\n", + " >>> tensor = Tensor(np.array([1, 2, 3]))\n", + " >>> pow_scalar = PowerScalar(2)\n", + " >>> result = pow_scalar.compute(tensor.data)\n", + " >>> print(result)\n", + " array([1, 4, 9])\n", + "\n", + " \"\"\"\n", + "\n", + " def __init__(self, scalar: int):\n", + " \"\"\"\n", + " Constructs the PowerScalar operation.\n", + "\n", + " Args:\n", + " scalar (int): The power to raise the tensor to.\n", + " \"\"\"\n", + " self.scalar = scalar\n", + "\n", + " def compute(self, a: NDArray) -> NDArray:\n", + " \"\"\"\n", + " Computes the power operation on the input tensor.\n", + "\n", + " Args:\n", + " a (NDArray): The input tensor.\n", + "\n", + " Returns:\n", + " NDArray: The resulting tensor after the power operation.\n", + " \"\"\"\n", + " return array_api.power(a, self.scalar)\n", + "\n", + " def gradient(self, out_grad: Tensor, node: Tensor) -> Tuple[Tensor, ]:\n", + " \"\"\"\n", + " Computes the gradient of the power operation.\n", + "\n", + " Args:\n", + " out_grad (Tensor): The gradient of the output tensor.\n", + " node (Tensor): The node in the computational graph where the operation was performed.\n", + "\n", + " Returns:\n", + " Tuple[Tensor, ]: The gradient with respect to the input tensor.\n", + " \"\"\"\n", + " a = node.children[0]\n", + " return (self.scalar * power_scalar(a, self.scalar - 1) * out_grad, )\n", + "\n", + "\n", + "def power_scalar(a: Tensor, scalar: int) -> Tensor:\n", + " \"\"\"\n", + " Raises a tensor to a power.\n", + "\n", + " Args:\n", + " a (Tensor): The input tensor.\n", + " scalar (int): The power to raise the tensor to.\n", + "\n", + " Returns:\n", + " Tensor: The resulting tensor after the power operation.\n", + "\n", + " Example:\n", + " >>> import numpy as np\n", + " >>> tensor = Tensor(np.array([1, 2, 3]))\n", + " >>> result = power_scalar(tensor, 2)\n", + " >>> print(result)\n", + " Tensor([1, 4, 9])\n", + " \"\"\"\n", + " return PowerScalar(scalar)(a)" + ] + }, + { + "cell_type": "markdown", + "id": "0cbb40d3-1ed4-4a7f-9d74-b39b70187860", + "metadata": {}, + "source": [ + "## Element Wise Divide" + ] + }, + { + "cell_type": "markdown", + "id": "24a7c7e2-d71d-49c0-9c45-ffc7f0830334", + "metadata": {}, + "source": [ + "The operation described here is an element-wise division of two tensors, `a` and `b`, where the operation can be described as `f(a, b) = a / b`. \n", + "\n", + "We'll compute the partial derivatives with respect to `a` and `b`:\n", + "\n", + "1. The partial derivative of `f(a, b)` with respect to `a` (`df/da`) is `1/b`.\n", + "\n", + "2. The partial derivative of `f(a, b)` with respect to `b` (`df/db`) is `-a / b^2`.\n", + "\n", + "We are given a function $f(a, b) = \\frac{a}{b}$, where $a$ and $b$ are tensors. 
Our task is to find the partial derivatives of this function with respect to $a$ and $b$.\n", + "\n", + "Let's start with $\\frac{\\partial f}{\\partial a}$:\n", + "\n", + "\\begin{align*}\n", + "\\frac{\\partial f}{\\partial a} &= \\frac{\\partial}{\\partial a} \\left(\\frac{a}{b}\\right) \\\\\n", + "&= \\frac{1}{b}\n", + "\\end{align*}\n", + "\n", + "Now, let's compute $\\frac{\\partial f}{\\partial b}$:\n", + "\n", + "\\begin{align*}\n", + "\\frac{\\partial f}{\\partial b} &= \\frac{\\partial}{\\partial b} \\left(\\frac{a}{b}\\right) \\\\\n", + "&= - \\frac{a}{b^{2}}\n", + "\\end{align*}\n", + "\n", + "Here is a detailed derivative:\n", + "\n", + "Given a function of the form $y = \\frac{u}{v}$, where both $u$ and $v$ are functions of $x$, the quotient rule of differentiation states:\n", + "\n", + "$$\\frac{dy}{dx} = \\frac{v \\cdot \\frac{du}{dx} - u \\cdot \\frac{dv}{dx}}{v^2}$$\n", + "\n", + "In our case, we're looking at the function $y = \\frac{a}{b}$, where $a$ and $b$ are tensors. We want to find the derivative with respect to $b$ (instead of $x$ in our general formula). So we have:\n", + "\n", + "$$\\frac{dy}{db} = \\frac{b \\cdot \\frac{da}{db} - a \\cdot \\frac{db}{db}}{b^2}$$\n", + "\n", + "Since $a$ does not depend on $b$, $\\frac{da}{db} = 0$, and since any variable is equal to itself, $\\frac{db}{db} = 1$. \n", + "\n", + "So the derivative $\\frac{dy}{db}$ simplifies to:\n", + "\n", + "$$\\frac{dy}{db} = \\frac{b \\cdot 0 - a \\cdot 1}{b^2}$$\n", + "\n", + "Therefore, the derivative of $y$ with respect to $b$ is $-\\frac{a}{b^2}$.\n", + "\n", + "Therefore, the gradient of $f(a, b)$ with respect to $a$ is $\\frac{1}{b}$, and the gradient of $f(a, b)$ with respect to $b$ is $- \\frac{a}{b^{2}}$." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d841c0bd-e4ad-4dd8-add8-0a4626a774fe", + "metadata": { + "language": "python" + }, + "outputs": [], + "source": [ + "class EWiseDiv(TensorOp):\n", + " \"\"\"\n", + " The EWiseDiv operation divides two tensors element-wise.\n", + "\n", + " Example:\n", + " >>> import numpy as np\n", + " >>> a = Tensor(np.array([1, 2, 3]))\n", + " >>> b = Tensor(np.array([4, 5, 6]))\n", + " >>> div = EWiseDiv()\n", + " >>> result = div.compute(a.data, b.data)\n", + " >>> print(result)\n", + " array([0.25, 0.4, 0.5])\n", + "\n", + " \"\"\"\n", + "\n", + " def compute(self, a: NDArray, b: NDArray) -> NDArray:\n", + " \"\"\"\n", + " Computes the element-wise division of two tensors.\n", + "\n", + " Args:\n", + " a (NDArray): The dividend tensor.\n", + " b (NDArray): The divisor tensor.\n", + "\n", + " Returns:\n", + " NDArray: The resulting tensor after element-wise division.\n", + " \"\"\"\n", + " return a / b\n", + "\n", + " def gradient(self, out_grad: Tensor, node: Tensor) -> Tuple[Tensor, Tensor]:\n", + " \"\"\"\n", + " Computes the gradient of the element-wise division operation.\n", + "\n", + " Args:\n", + " out_grad (Tensor): The gradient of the output tensor.\n", + " node (Tensor): The node in the computational graph where the operation was performed.\n", + "\n", + " Returns:\n", + " Tuple[Tensor, Tensor]: The gradients with respect to the dividend and divisor tensors.\n", + " \"\"\"\n", + " a, b = node.inputs\n", + " return divide(out_grad, b), out_grad * negate(divide(a, power_scalar(b, 2)))\n", + "\n", + "\n", + "def divide(a: Tensor, b: Tensor) -> Tensor:\n", + " \"\"\"\n", + " Divides two tensors element-wise.\n", + "\n", + " Args:\n", + " a (Tensor): The dividend tensor.\n", + " b (Tensor): The divisor tensor.\n", + 
"\n", + " Returns:\n", + " Tensor: The resulting tensor after element-wise division.\n", + "\n", + " Example:\n", + " >>> import numpy as np\n", + " >>> a = Tensor(np.array([1, 2, 3]))\n", + " >>> b = Tensor(np.array([4, 5, 6]))\n", + " >>> result = divide(a, b)\n", + " >>> print(result)\n", + " Tensor([0.25, 0.4, 0.5])\n", + " \"\"\"\n", + " return EWiseDiv()(a, b)" + ] + }, + { + "cell_type": "markdown", + "id": "2f7daf46-5e14-4bf7-9f7f-81947e7e7cd3", + "metadata": {}, + "source": [ + "## Divide Scalar" + ] + }, + { + "cell_type": "markdown", + "id": "87d98d2f-34a5-4744-994a-9c9264bfb4a9", + "metadata": {}, + "source": [ + "Let's denote the scalar as `c`, and `a` as the tensor being divided by the scalar. The operation can be described as `f(a) = a / c`.\n", + "\n", + "The function for the backward pass (i.e., the gradient) is `df/da = 1/c`.\n", + "\n", + "This is the derivative of `f(a)` with respect to `a`.\n", + "\n", + "We are given a function $f(a) = \\frac{a}{c}$, where $a$ is a tensor and $c$ is a scalar. Our task is to find the derivative of this function with respect to $a$.\n", + "\n", + "By using the power rule of differentiation, where the derivative of $a^n$ is $n \\cdot a^{n-1}$, we can rewrite $f(a)$ as $f(a) = c^{-1}a$. \n", + "\n", + "Now, we can differentiate this with respect to $a$:\n", + "\n", + "\\begin{align*}\n", + "\\frac{df}{da} &= \\frac{d}{da} (c^{-1}a) \\\\\n", + "&= c^{-1} \\frac{d}{da} (a) \\\\\n", + "&= c^{-1} \\\\\n", + "&= \\frac{1}{c}\n", + "\\end{align*}\n", + "\n", + "Therefore, the gradient of $f(a)$ with respect to $a$ is $\\frac{1}{c}$." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dbceaf89-d19a-43e2-9ce2-3a9c093612ca", + "metadata": { + "language": "python" + }, + "outputs": [], + "source": [ + "class DivScalar(TensorOp):\n", + " \"\"\"\n", + " The DivScalar operation divides a tensor by a scalar.\n", + "\n", + " Example:\n", + " >>> import numpy as np\n", + " >>> a = Tensor(np.array([1, 2, 3]))\n", + " >>> scalar = 2\n", + " >>> div_scalar = DivScalar(scalar)\n", + " >>> result = div_scalar.compute(a.data)\n", + " >>> print(result)\n", + " array([0.5, 1.0, 1.5])\n", + "\n", + " \"\"\"\n", + "\n", + " def __init__(self, scalar: Union[int, float]):\n", + " \"\"\"\n", + " Initialize the DivScalar operation with the scalar to divide by.\n", + "\n", + " Args:\n", + " scalar (int, float): The scalar to divide the tensor by.\n", + " \"\"\"\n", + " self.scalar = scalar\n", + "\n", + " def compute(self, a: NDArray) -> NDArray:\n", + " \"\"\"\n", + " Divides the tensor by the scalar.\n", + "\n", + " Args:\n", + " a (NDArray): The tensor to divide.\n", + "\n", + " Returns:\n", + " NDArray: The resulting tensor after division.\n", + " \"\"\"\n", + " return a / self.scalar\n", + "\n", + " def gradient(self, out_grad: Tensor, node: Tensor) -> Tuple[Tensor, ...]:\n", + " \"\"\"\n", + " Computes the gradient of the division operation.\n", + "\n", + " Args:\n", + " out_grad (Tensor): The gradient of the output tensor.\n", + " node (Tensor): The node in the computational graph where the operation was performed.\n", + "\n", + " Returns:\n", + " Tuple[Tensor, ...]: The gradient with respect to the tensor.\n", + " \"\"\"\n", + " return (out_grad / self.scalar, )\n", + "\n", + "def divide_scalar(a: Tensor, scalar: Union[int, float]) -> Tensor:\n", + " \"\"\"\n", + " Divides a tensor by a scalar.\n", + "\n", + " Args:\n", + " a (Tensor): The tensor to divide.\n", + " scalar (int, float): The scalar to divide the tensor by.\n", + "\n", + " 
Returns:\n", + " Tensor: The resulting tensor after division.\n", + "\n", + " Example:\n", + " >>> import numpy as np\n", + " >>> a = Tensor(np.array([1, 2, 3]))\n", + " >>> scalar = 2\n", + " >>> result = divide_scalar(a, scalar)\n", + " >>> print(result)\n", + " Tensor([0.5, 1.0, 1.5])\n", + " \"\"\"\n", + " return DivScalar(scalar)(a)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9fc235f7-c1f4-42b5-b3a1-d9cb909617d9", + "metadata": { + "language": "python" + }, + "outputs": [], + "source": [ + "import nbdev; nbdev.nbdev_export()" + ] } ], "metadata": { diff --git a/minima/_modidx.py b/minima/_modidx.py index 8a816e9..f471ccc 100644 --- a/minima/_modidx.py +++ b/minima/_modidx.py @@ -18,9 +18,14 @@ 'minima.autograd.Tensor': ('autograd.html#tensor', 'minima/autograd.py'), 'minima.autograd.Tensor.__add__': ('autograd.html#tensor.__add__', 'minima/autograd.py'), 'minima.autograd.Tensor.__init__': ('autograd.html#tensor.__init__', 'minima/autograd.py'), + 'minima.autograd.Tensor.__matmul__': ('autograd.html#tensor.__matmul__', 'minima/autograd.py'), 'minima.autograd.Tensor.__mul__': ('autograd.html#tensor.__mul__', 'minima/autograd.py'), + 'minima.autograd.Tensor.__pow__': ('autograd.html#tensor.__pow__', 'minima/autograd.py'), 'minima.autograd.Tensor.__repr__': ('autograd.html#tensor.__repr__', 'minima/autograd.py'), + 'minima.autograd.Tensor.__rtruediv__': ('autograd.html#tensor.__rtruediv__', 'minima/autograd.py'), 'minima.autograd.Tensor.__str__': ('autograd.html#tensor.__str__', 'minima/autograd.py'), + 'minima.autograd.Tensor.__sub__': ('autograd.html#tensor.__sub__', 'minima/autograd.py'), + 'minima.autograd.Tensor.__truediv__': ('autograd.html#tensor.__truediv__', 'minima/autograd.py'), 'minima.autograd.Tensor._array_from_numpy': ( 'autograd.html#tensor._array_from_numpy', 'minima/autograd.py'), 'minima.autograd.Tensor._backward': ('autograd.html#tensor._backward', 'minima/autograd.py'), @@ -53,6 +58,7 @@ 'minima.autograd.Value.__truediv__': ('autograd.html#value.__truediv__', 'minima/autograd.py'), 'minima.autograd.Value._topological_sort': ('autograd.html#value._topological_sort', 'minima/autograd.py'), 'minima.autograd.Value.backward': ('autograd.html#value.backward', 'minima/autograd.py'), + 'minima.autograd.Value.data': ('autograd.html#value.data', 'minima/autograd.py'), 'minima.autograd.Value.exp': ('autograd.html#value.exp', 'minima/autograd.py'), 'minima.autograd.Value.is_leaf': ('autograd.html#value.is_leaf', 'minima/autograd.py'), 'minima.autograd.Value.item': ('autograd.html#value.item', 'minima/autograd.py'), diff --git a/minima/autograd.py b/minima/autograd.py index 896cc16..34e3733 100644 --- a/minima/autograd.py +++ b/minima/autograd.py @@ -34,7 +34,7 @@ class Value: requires_grad (bool) """ def __init__(self, data, _children=(), _op='', label=''): - self.data = data + self._data = data self.grad = 0 self._prev = set(_children) self._op = _op @@ -42,12 +42,12 @@ def __init__(self, data, _children=(), _op='', label=''): def __add__(self, other): other = other if isinstance(other, Value) else Value(other) - out = Value(self.data + other.data, (self, other), '+') + out = Value(self._data + other.data, (self, other), '+') return out def __mul__(self, other): other = other if isinstance(other, Value) else Value(other) - out = Value(self.data * other.data, (self, other), '*') + out = Value(self._data * other.data, (self, other), '*') return out @@ -68,7 +68,7 @@ class Value: requires_grad (bool) """ def __init__(self, data, _children=(), _op='', 
label=''): - self.data = data + self._data = data self.grad = 0 self._backward = lambda: None self._prev = set(_children) @@ -77,7 +77,7 @@ def __init__(self, data, _children=(), _op='', label=''): def __add__(self, other): other = other if isinstance(other, Value) else Value(other) - out = Value(self.data + other.data, (self, other), '+') + out = Value(self._data + other.data, (self, other), '+') def _backward(): self.grad += out.grad @@ -88,17 +88,17 @@ def _backward(): def __mul__(self, other): other = other if isinstance(other, Value) else Value(other) - out = Value(self.data * other.data, (self, other), '*') + out = Value(self._data * other.data, (self, other), '*') def _backward(): self.grad += other.data * out.grad - other.grad += self.data * out.grad + other.grad += self._data * out.grad out._backward = _backward return out def __repr__(self): - return f"Value(data={self.data}, grad={self.grad})" + return f"Value(data={self._data}, grad={self.grad})" # %% ../nbs/00_autograd.ipynb 67 class Value: @@ -116,7 +116,7 @@ class Value: requires_grad (bool) """ def __init__(self, data, _children=(), _op='', label=''): - self.data = data + self._data = data self.grad = 0 self._backward = lambda: None self._prev = set(_children) @@ -125,7 +125,7 @@ def __init__(self, data, _children=(), _op='', label=''): def __add__(self, other): other = other if isinstance(other, Value) else Value(other) - out = Value(self.data + other.data, (self, other), '+') + out = Value(self._data + other.data, (self, other), '+') def _backward(): self.grad += out.grad @@ -136,17 +136,17 @@ def _backward(): def __mul__(self, other): other = other if isinstance(other, Value) else Value(other) - out = Value(self.data * other.data, (self, other), '*') + out = Value(self._data * other.data, (self, other), '*') def _backward(): self.grad += other.data * out.grad - other.grad += self.data * out.grad + other.grad += self._data * out.grad out._backward = _backward return out def __repr__(self): - return f"Value(data={self.data}, grad={self.grad})" + return f"Value(data={self._data}, grad={self.grad})" def backward(self): @@ -205,7 +205,7 @@ def __init__( - label (str): a label for this node, used for debugging and visualization purposes """ - self.data = data + self._data = data self._prev = set(children) self._op = op self.grad = 0.0 @@ -219,7 +219,7 @@ def __repr__(self): Returns: - str: a string representation of this Value object """ - return f"Value({self.data})" + return f"Value({self._data})" def __add__(self, other): """ @@ -234,7 +234,7 @@ def __add__(self, other): other = Value(other) if not isinstance(other, Value) else other - out = Value(self.data + other.data, children=(self, other), op='+') + out = Value(self._data + other.data, children=(self, other), op='+') def _backward(): self.grad += 1 * out.grad @@ -291,11 +291,11 @@ def __mul__(self, other): """ other = Value(other) if not isinstance(other, Value) else other - out = Value(self.data * other.data, children=(self, other), op='*') + out = Value(self._data * other.data, children=(self, other), op='*') def _backward(): self.grad += other.data * out.grad - other.grad += self.data * out.grad + other.grad += self._data * out.grad out._backward = _backward return out @@ -334,9 +334,9 @@ def __pow__(self, other): """ assert isinstance(other, (float, int)), "other must be a scalar" - out = Value(self.data ** other, children=(self, ), op='**') + out = Value(self._data ** other, children=(self, ), op='**') - def _backward(): self.grad += other * self.data ** (other - 
1) * out.grad + def _backward(): self.grad += other * self._data ** (other - 1) * out.grad out._backward = _backward return out @@ -359,10 +359,10 @@ def exp(self): - The _backward() function is assigned as an attribute to the output object for later use during backpropagation. """ - x = math.exp(self.data) + x = math.exp(self._data) out = Value(x, children=(self,), op='exp') - def _backward(): self.grad += x * out.grad # x = exp(self.data) so x' = x (derivative of exp(x) is exp(x)) + def _backward(): self.grad += x * out.grad # x = exp(self._data) so x' = x (derivative of exp(x) is exp(x)) out._backward = _backward return out @@ -372,7 +372,7 @@ def tanh(self): Applies the hyperbolic tangent function to the data of this `Value` object and returns a new `Value` object with the resulting data. This operation is an element-wise operation. """ - out = Value(torch.tanh(torch.tensor(self.data)), children=(self,), op='tanh') + out = Value(torch.tanh(torch.tensor(self._data)), children=(self,), op='tanh') def _backward(): self.grad += (1 - out.data ** 2) * out.grad out._backward = _backward @@ -383,13 +383,30 @@ def relu(self): Applies the rectified linear unit function to the data of this `Value` object and returns a new `Value` object with the resulting data. This operation is an element-wise operation. """ - out = Value(max(0, self.data), children=(self,), op='relu') + out = Value(max(0, self._data), children=(self,), op='relu') def _backward(): self.grad += (out.data > 0) * out.grad out._backward = _backward return out + @property + def data(self): + """ + Returns a tensor that shares the data with the current tensor but is detached from the computational graph. + + Example: + >>> t = Tensor([1, 2, 3], requires_grad=True) + >>> print(t.data) + Tensor([1, 2, 3]) + """ + return self._data + + @data.setter + def data(self, value): + self._data = value + + def item(self): """ Return the scalar value being stored in the current Value as a Python float. @@ -401,7 +418,7 @@ def item(self): float: The scalar value being stored in the current Value as a Python float. """ - return self.data + return self._data def is_leaf(self): return self.op is None @@ -573,7 +590,7 @@ def __init__( device = device if device else cpu() data = Tensor._array_from_numpy(array, device=device, dtype=dtype) - self._init(None, set(), data=data, requires_grad=requires_grad, ) + self._init(None, (), data=data, requires_grad=requires_grad, ) def __repr__(self): return "minima.Tensor(" + str(self.realize_data()) + ")" @@ -609,7 +626,7 @@ def _init( if requires_grad is None: requires_grad = any(child.requires_grad for child in children) self._op = op - self.data = data + self._data = data self.children = children self.num_outputs = num_outputs self.requires_grad = requires_grad @@ -623,9 +640,9 @@ def realize_data(self): The actual data of this tensor. """ - if self.data is None: - self.data = self._op.compute(*[child.realize_data() for child in self.children]) - return self.data + if self._data is None: + self._data = self._op.compute(*[child.realize_data() for child in self.children]) + return self._data @staticmethod def _array_from_numpy(numpy_array, device, dtype): @@ -754,12 +771,13 @@ def data(self, value): >>> print(t.data) Tensor([4, 5, 6]) """ + print(type(value), type(Tensor)) assert isinstance(value, Tensor) assert value.dtype == self.dtype, "The dtype of the given tensor (%s) is not the same as the dtype of the current tensor (%s)." 
% ( value.dtype, self.dtype, ) - self.data = value.realize_data() + self._data = value.realize_data() @property @@ -823,10 +841,39 @@ def __add__(self, other: Union['Tensor', int, float]) -> 'Tensor': return mi.operators.EWiseAdd()(self, other) elif isinstance(other, (int, float)): - return mi.operators.AddScalar(other)(self) + return mi.operators.AddScalar(scalar=other)(self) else: raise ValueError(f"Unsupported operand type for +: '{type(self).__name__}' and '{type(other).__name__}'") + + def __sub__(self, other: Union['Tensor', int, float]) -> 'Tensor': + """ + Implements the subtraction operation between two Tensors or a Tensor and a scalar. + + Args: + - other (Tensor or scalar): the other Tensor or scalar to subtract from this one + + Returns: + - Tensor: a new Tensor object representing the difference between this Tensor and the other one + + Raises: + - AssertionError: If the two Tensors don't have the same shape + - ValueError: If the other operand is neither a Tensor nor a scalar + """ + if isinstance(other, Tensor): + # Ensure both tensors have the same shape for subtraction + if self.shape != other.shape: + raise AssertionError(f"Tensors must be of the same shape for subtraction. Got {self.shape} and {other.shape}.") + + return mi.operators.EWiseAdd()(negate(self), other) + + elif isinstance(other, (int, float)): + return mi.operators.AddScalar(scalar=-other)(self) + + else: + raise ValueError(f"Unsupported operand type for -: '{type(self).__name__}' and '{type(other).__name__}'") + + def __mul__(self, other: Union['Tensor', int, float]) -> 'Tensor': """ @@ -846,12 +893,75 @@ def __mul__(self, other: Union['Tensor', int, float]) -> 'Tensor': return mi.operators.EWiseMul()(self, other) elif isinstance(other, (int, float)): - return mi.operators.MulScalar(other)(self) + return mi.operators.MulScalar(scalar=other)(self) else: raise ValueError(f"Unsupported operand type for *: '{type(self).__name__}' and '{type(other).__name__}'") - + def __pow__(self, other): + + if isinstance(other, Tensor): + raise NotImplementedError() + if isinstance(other, (int, float)): + return mi.operators.PowerScalar(scalar=other)(self) + else: + raise ValueError(f"Unsupported operand type for ^: '{type(self).__name__}' and '{type(other).__name__}'") + + def __truediv__(self, other: Union['Tensor', int, float]) -> 'Tensor': + """ + Implements the division operation between two Tensors or a Tensor and a scalar. + + Args: + - other (Tensor or scalar): the other Tensor or scalar to divide to this one + + Returns: + - Tensor: a new Tensor object representing the result of division of this Tensor and the other one + """ + if isinstance(other, Tensor): + # Ensure both tensors have the same shape for addition + if self.shape != other.shape: + raise AssertionError(f"Tensors must be of the same shape for addition. Got {self.shape} and {other.shape}.") + + return mi.operators.EWiseDiv()(self, other) + + elif isinstance(other, (int, float)): + return mi.operators.DivScalar(scalar=other)(self) + + else: + raise ValueError(f"Unsupported operand type for /: '{type(self).__name__}' and '{type(other).__name__}'") + + + def __rtruediv__(self, other): # other / self + """ + Implements the right division operation between a scalar or a Tensor and this Tensor. 
+ + Args: + - other (Tensor or scalar): the other Tensor or scalar to divide by this one + + Returns: + - Tensor: a new Tensor object representing the result of the division + + Example: + - If the method is called as `other.__rtruediv__(self)`, this corresponds to `other / self` in usual operations. + """ + return self.__pow__(-1).__mul__(other) + # other * self**-1 + + def __matmul__(self, other): + """ + Implements the matrix multiplication operation between this Tensor and another Tensor. + + Args: + - other (Tensor): the other Tensor to multiply with this one + + Returns: + - Tensor: a new Tensor object representing the result of the matrix multiplication + + Example: + - If the method is called as `self.__matmul__(other)`, this corresponds to `self @ other` in usual operations. + """ + return mi.ops.MatMul()(self, other) + def _backward(self, out_grad: 'Tensor') -> None: pass diff --git a/minima/operators.py b/minima/operators.py index 1d92f2b..fe8adab 100644 --- a/minima/operators.py +++ b/minima/operators.py @@ -18,7 +18,7 @@ # as the backend for our computations, this line will change in later homeworks import numpy as ARRAY_API -# %% ../nbs/01_operators.ipynb 3 +# %% ../nbs/01_operators.ipynb 7 class EWiseAdd(TensorOp): """ Performs element-wise addition of two tensors. @@ -31,6 +31,7 @@ class EWiseAdd(TensorOp): >>> print(result) Tensor([5, 7, 9]) """ + def compute(self, a: NDArray, b: NDArray) -> NDArray: """ Computes the element-wise sum of two tensors. @@ -55,7 +56,7 @@ def gradient(self, out_grad: Tensor, node: Tensor) -> Tuple[Tensor, Tensor]: Returns: The gradients with respect to the inputs. """ - return out_grad, out_grad + return (out_grad, out_grad) def add(a: Tensor, b: Tensor) -> Tensor: """ @@ -70,7 +71,7 @@ def add(a: Tensor, b: Tensor) -> Tensor: """ return EWiseAdd()(a, b) -# %% ../nbs/01_operators.ipynb 4 +# %% ../nbs/01_operators.ipynb 11 class AddScalar(TensorOp): """ Performs addition of a tensor and a scalar. @@ -129,7 +130,7 @@ def add_scalar(a: Tensor, scalar: Union[int, float]) -> Tensor: """ return AddScalar(scalar)(a) -# %% ../nbs/01_operators.ipynb 5 +# %% ../nbs/01_operators.ipynb 14 class EWiseMul(TensorOp): """ Performs element-wise multiplication of two tensors. @@ -166,8 +167,8 @@ def gradient(self, out_grad: Tensor, node: Tensor) -> Tuple[Tensor, Tensor]: Returns: The gradients with respect to the inputs. """ - lhs, rhs = node.inputs - return out_grad * rhs, out_grad * lhs + a, b = node.inputs + return out_grad * b, out_grad * a def multiply(a: Tensor, b: Tensor) -> Tensor: """ @@ -182,7 +183,7 @@ def multiply(a: Tensor, b: Tensor) -> Tensor: """ return EWiseMul()(a, b) -# %% ../nbs/01_operators.ipynb 6 +# %% ../nbs/01_operators.ipynb 17 class MulScalar(TensorOp): """ Performs multiplication of a tensor and a scalar. 
diff --git a/nbs/00_autograd.ipynb b/nbs/00_autograd.ipynb index 398c4f8..9ab60f8 100644 --- a/nbs/00_autograd.ipynb +++ b/nbs/00_autograd.ipynb @@ -11,10 +11,8 @@ }, { "cell_type": "code", - "execution_count": 1, - "metadata": { - "tags": [] - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "#| default_exp autograd" @@ -22,10 +20,8 @@ }, { "cell_type": "code", - "execution_count": 2, - "metadata": { - "tags": [] - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "#| hide\n", @@ -34,10 +30,8 @@ }, { "cell_type": "code", - "execution_count": 3, - "metadata": { - "tags": [] - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "#| export\n", @@ -116,22 +110,9 @@ }, { "cell_type": "code", - "execution_count": 4, - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/plain": [ - "3" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "a = 4\n", "b = -2\n", @@ -149,21 +130,9 @@ }, { "cell_type": "code", - "execution_count": 5, - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "d1: 3\n", - "d2: 2.9800000000000004\n", - "slope: -1.9999999999999574\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "def f_a(a,b,c):\n", " h = 0.01\n", @@ -194,21 +163,9 @@ }, { "cell_type": "code", - "execution_count": 6, - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "d1: 3\n", - "d2: 3.04\n", - "slope: 4.0000000000000036\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "def f_b(a,b,c):\n", " h = 0.01\n", @@ -270,10 +227,8 @@ }, { "cell_type": "code", - "execution_count": 7, - "metadata": { - "tags": [] - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "def trace(root):\n", @@ -325,7 +280,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -345,7 +300,7 @@ " requires_grad (bool)\n", " \"\"\"\n", " def __init__(self, data, _children=(), _op='', label=''):\n", - " self.data = data\n", + " self._data = data\n", " self.grad = 0\n", " self._prev = set(_children)\n", " self._op = _op\n", @@ -353,176 +308,21 @@ "\n", " def __add__(self, other):\n", " other = other if isinstance(other, Value) else Value(other)\n", - " out = Value(self.data + other.data, (self, other), '+')\n", + " out = Value(self._data + other.data, (self, other), '+')\n", " return out\n", "\n", " def __mul__(self, other):\n", " other = other if isinstance(other, Value) else Value(other)\n", - " out = Value(self.data * other.data, (self, other), '*')\n", + " out = Value(self._data * other.data, (self, other), '*')\n", " return out\n", "\n" ] }, { "cell_type": "code", - "execution_count": 9, - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "image/svg+xml": [ - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "4709503648\n", - "\n", - "d data 4.0000\n", - "\n", - "grad 0.0000\n", - "\n", - "\n", - "\n", - "4709506384*\n", - "\n", - "*\n", - "\n", - "\n", - "\n", - "4709503648->4709506384*\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "4709503648+\n", - "\n", - "+\n", - "\n", - "\n", - "\n", - "4709503648+->4709503648\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "4709503168\n", - "\n", - "a data 
2.0000\n", - "\n", - "grad 0.0000\n", - "\n", - "\n", - "\n", - "4709502400*\n", - "\n", - "*\n", - "\n", - "\n", - "\n", - "4709503168->4709502400*\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "4709503216\n", - "\n", - "c data 10.0000\n", - "\n", - "grad 0.0000\n", - "\n", - "\n", - "\n", - "4709503216->4709503648+\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "4709506384\n", - "\n", - "L data -8.0000\n", - "\n", - "grad 0.0000\n", - "\n", - "\n", - "\n", - "4709506384*->4709506384\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "4709499280\n", - "\n", - "f data -2.0000\n", - "\n", - "grad 0.0000\n", - "\n", - "\n", - "\n", - "4709499280->4709506384*\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "4709502400\n", - "\n", - "e data -6.0000\n", - "\n", - "grad 0.0000\n", - "\n", - "\n", - "\n", - "4709502400->4709503648+\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "4709502400*->4709502400\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "4709499376\n", - "\n", - "b data -3.0000\n", - "\n", - "grad 0.0000\n", - "\n", - "\n", - "\n", - "4709499376->4709502400*\n", - "\n", - "\n", - "\n", - "\n", - "\n" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "a = Value(2.0, label='a')\n", "b = Value(-3.0, label='b')\n", @@ -568,19 +368,9 @@ }, { "cell_type": "code", - "execution_count": 10, - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "grad: 1.000000000000334\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "def lol():\n", " h = 0.001\n", @@ -619,10 +409,8 @@ }, { "cell_type": "code", - "execution_count": 11, - "metadata": { - "tags": [] - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "L.grad = 1" @@ -644,19 +432,9 @@ }, { "cell_type": "code", - "execution_count": 12, - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "grad: 3.9999999999995595\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "def lol(label):\n", " def foo(v, label):\n", @@ -691,10 +469,8 @@ }, { "cell_type": "code", - "execution_count": 13, - "metadata": { - "tags": [] - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "f.grad = 4" @@ -702,29 +478,17 @@ }, { "cell_type": "code", - "execution_count": 14, - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "grad: -2.000000000000668\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "lol('d') " ] }, { "cell_type": "code", - "execution_count": 15, - "metadata": { - "tags": [] - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "d.grad = -2" @@ -739,164 +503,9 @@ }, { "cell_type": "code", - "execution_count": 16, - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "image/svg+xml": [ - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "4709503648\n", - "\n", - "d data 4.0000\n", - "\n", - "grad -2.0000\n", - "\n", - "\n", - "\n", - "4709506384*\n", - "\n", - "*\n", - "\n", - "\n", - "\n", - "4709503648->4709506384*\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "4709503648+\n", - "\n", - "+\n", - "\n", - "\n", - "\n", - "4709503648+->4709503648\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "4709503168\n", - "\n", - "a data 
2.0000\n", - "\n", - "grad 0.0000\n", - "\n", - "\n", - "\n", - "4709502400*\n", - "\n", - "*\n", - "\n", - "\n", - "\n", - "4709503168->4709502400*\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "4709503216\n", - "\n", - "c data 10.0000\n", - "\n", - "grad 0.0000\n", - "\n", - "\n", - "\n", - "4709503216->4709503648+\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "4709506384\n", - "\n", - "L data -8.0000\n", - "\n", - "grad 1.0000\n", - "\n", - "\n", - "\n", - "4709506384*->4709506384\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "4709499280\n", - "\n", - "f data -2.0000\n", - "\n", - "grad 4.0000\n", - "\n", - "\n", - "\n", - "4709499280->4709506384*\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "4709502400\n", - "\n", - "e data -6.0000\n", - "\n", - "grad 0.0000\n", - "\n", - "\n", - "\n", - "4709502400->4709503648+\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "4709502400*->4709502400\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "4709499376\n", - "\n", - "b data -3.0000\n", - "\n", - "grad 0.0000\n", - "\n", - "\n", - "\n", - "4709499376->4709502400*\n", - "\n", - "\n", - "\n", - "\n", - "\n" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "draw_dot(L)" ] @@ -960,29 +569,17 @@ }, { "cell_type": "code", - "execution_count": 17, - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "grad: -2.000000000000668\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "lol('e')" ] }, { "cell_type": "code", - "execution_count": 18, - "metadata": { - "tags": [] - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "e.grad = -2 # 1 * d.grad" @@ -990,29 +587,17 @@ }, { "cell_type": "code", - "execution_count": 19, - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "grad: -1.9999999999988916\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "lol('c')" ] }, { "cell_type": "code", - "execution_count": 20, - "metadata": { - "tags": [] - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "c.grad = -2 # 1 * d.grad" @@ -1020,193 +605,26 @@ }, { "cell_type": "code", - "execution_count": 21, - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "image/svg+xml": [ - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "4709503648\n", - "\n", - "d data 4.0000\n", - "\n", - "grad -2.0000\n", - "\n", - "\n", - "\n", - "4709506384*\n", - "\n", - "*\n", - "\n", - "\n", - "\n", - "4709503648->4709506384*\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "4709503648+\n", - "\n", - "+\n", - "\n", - "\n", - "\n", - "4709503648+->4709503648\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "4709503168\n", - "\n", - "a data 2.0000\n", - "\n", - "grad 0.0000\n", - "\n", - "\n", - "\n", - "4709502400*\n", - "\n", - "*\n", - "\n", - "\n", - "\n", - "4709503168->4709502400*\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "4709503216\n", - "\n", - "c data 10.0000\n", - "\n", - "grad -2.0000\n", - "\n", - "\n", - "\n", - "4709503216->4709503648+\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "4709506384\n", - "\n", - "L data -8.0000\n", - "\n", - "grad 1.0000\n", - "\n", - "\n", - "\n", - "4709506384*->4709506384\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "4709499280\n", - "\n", - "f data -2.0000\n", - "\n", - "grad 
4.0000\n", - "\n", - "\n", - "\n", - "4709499280->4709506384*\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "4709502400\n", - "\n", - "e data -6.0000\n", - "\n", - "grad -2.0000\n", - "\n", - "\n", - "\n", - "4709502400->4709503648+\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "4709502400*->4709502400\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "4709499376\n", - "\n", - "b data -3.0000\n", - "\n", - "grad 0.0000\n", - "\n", - "\n", - "\n", - "4709499376->4709502400*\n", - "\n", - "\n", - "\n", - "\n", - "\n" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 21, - "metadata": {}, - "output_type": "execute_result" - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "draw_dot(L)" ] }, { "cell_type": "code", - "execution_count": 22, - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "grad: 6.000000000000227\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "lol('a')" ] }, { "cell_type": "code", - "execution_count": 23, - "metadata": { - "tags": [] - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "a.grad = 6 # b * e.grad" @@ -1214,29 +632,17 @@ }, { "cell_type": "code", - "execution_count": 24, - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "grad: -3.9999999999995595\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "lol('b')" ] }, { "cell_type": "code", - "execution_count": 25, - "metadata": { - "tags": [] - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "b.grad = -4 # a * e.grad" @@ -1244,174 +650,17 @@ }, { "cell_type": "code", - "execution_count": 26, - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "image/svg+xml": [ - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "4709503648\n", - "\n", - "d data 4.0000\n", - "\n", - "grad -2.0000\n", - "\n", - "\n", - "\n", - "4709506384*\n", - "\n", - "*\n", - "\n", - "\n", - "\n", - "4709503648->4709506384*\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "4709503648+\n", - "\n", - "+\n", - "\n", - "\n", - "\n", - "4709503648+->4709503648\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "4709503168\n", - "\n", - "a data 2.0000\n", - "\n", - "grad 6.0000\n", - "\n", - "\n", - "\n", - "4709502400*\n", - "\n", - "*\n", - "\n", - "\n", - "\n", - "4709503168->4709502400*\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "4709503216\n", - "\n", - "c data 10.0000\n", - "\n", - "grad -2.0000\n", - "\n", - "\n", - "\n", - "4709503216->4709503648+\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "4709506384\n", - "\n", - "L data -8.0000\n", - "\n", - "grad 1.0000\n", - "\n", - "\n", - "\n", - "4709506384*->4709506384\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "4709499280\n", - "\n", - "f data -2.0000\n", - "\n", - "grad 4.0000\n", - "\n", - "\n", - "\n", - "4709499280->4709506384*\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "4709502400\n", - "\n", - "e data -6.0000\n", - "\n", - "grad -2.0000\n", - "\n", - "\n", - "\n", - "4709502400->4709503648+\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "4709502400*->4709502400\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "4709499376\n", - "\n", - "b data -3.0000\n", - "\n", - "grad -4.0000\n", - "\n", - "\n", - "\n", - "4709499376->4709502400*\n", - "\n", - "\n", - "\n", - "\n", - "\n" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 26, - "metadata": {}, - "output_type": 
"execute_result" - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "draw_dot(L)" ] }, { "cell_type": "code", - "execution_count": 27, - "metadata": { - "tags": [] - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "#| export\n", @@ -1430,7 +679,7 @@ " requires_grad (bool)\n", " \"\"\"\n", " def __init__(self, data, _children=(), _op='', label=''):\n", - " self.data = data\n", + " self._data = data\n", " self.grad = 0\n", " self._backward = lambda: None\n", " self._prev = set(_children)\n", @@ -1439,7 +688,7 @@ " \n", " def __add__(self, other):\n", " other = other if isinstance(other, Value) else Value(other)\n", - " out = Value(self.data + other.data, (self, other), '+')\n", + " out = Value(self._data + other.data, (self, other), '+')\n", "\n", " def _backward():\n", " self.grad += out.grad\n", @@ -1450,179 +699,24 @@ "\n", " def __mul__(self, other):\n", " other = other if isinstance(other, Value) else Value(other)\n", - " out = Value(self.data * other.data, (self, other), '*')\n", + " out = Value(self._data * other.data, (self, other), '*')\n", "\n", " def _backward():\n", " self.grad += other.data * out.grad\n", - " other.grad += self.data * out.grad\n", + " other.grad += self._data * out.grad\n", " out._backward = _backward\n", "\n", " return out\n", " \n", " def __repr__(self):\n", - " return f\"Value(data={self.data}, grad={self.grad})\"" + " return f\"Value(data={self._data}, grad={self.grad})\"" ] }, { "cell_type": "code", - "execution_count": 28, - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "image/svg+xml": [ - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "4709513248\n", - "\n", - "L data -8.0000\n", - "\n", - "grad 0.0000\n", - "\n", - "\n", - "\n", - "4709513248*\n", - "\n", - "*\n", - "\n", - "\n", - "\n", - "4709513248*->4709513248\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "4709502544\n", - "\n", - "d data 4.0000\n", - "\n", - "grad 0.0000\n", - "\n", - "\n", - "\n", - "4709502544->4709513248*\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "4709502544+\n", - "\n", - "+\n", - "\n", - "\n", - "\n", - "4709502544+->4709502544\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "4709511808\n", - "\n", - "a data 2.0000\n", - "\n", - "grad 0.0000\n", - "\n", - "\n", - "\n", - "4709514592*\n", - "\n", - "*\n", - "\n", - "\n", - "\n", - "4709511808->4709514592*\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "4709511856\n", - "\n", - "f data -2.0000\n", - "\n", - "grad 0.0000\n", - "\n", - "\n", - "\n", - "4709511856->4709513248*\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "4709514592\n", - "\n", - "e data -6.0000\n", - "\n", - "grad 0.0000\n", - "\n", - "\n", - "\n", - "4709514592->4709502544+\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "4709514592*->4709514592\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "4709515168\n", - "\n", - "b data -3.0000\n", - "\n", - "grad 0.0000\n", - "\n", - "\n", - "\n", - "4709515168->4709514592*\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "4709512144\n", - "\n", - "c data 10.0000\n", - "\n", - "grad 0.0000\n", - "\n", - "\n", - "\n", - "4709512144->4709502544+\n", - "\n", - "\n", - "\n", - "\n", - "\n" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 28, - "metadata": {}, - "output_type": "execute_result" - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "a = Value(2.0, label='a')\n", "b = Value(-3.0, label='b')\n", @@ -1637,10 +731,8 @@ }, { "cell_type": "code", - "execution_count": 29, - "metadata": 
{ - "tags": [] - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "L.grad = 1" @@ -1648,10 +740,8 @@ }, { "cell_type": "code", - "execution_count": 30, - "metadata": { - "tags": [] - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "L._backward()" @@ -1659,174 +749,17 @@ }, { "cell_type": "code", - "execution_count": 31, - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "image/svg+xml": [ - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "4709513248\n", - "\n", - "L data -8.0000\n", - "\n", - "grad 1.0000\n", - "\n", - "\n", - "\n", - "4709513248*\n", - "\n", - "*\n", - "\n", - "\n", - "\n", - "4709513248*->4709513248\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "4709502544\n", - "\n", - "d data 4.0000\n", - "\n", - "grad -2.0000\n", - "\n", - "\n", - "\n", - "4709502544->4709513248*\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "4709502544+\n", - "\n", - "+\n", - "\n", - "\n", - "\n", - "4709502544+->4709502544\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "4709511808\n", - "\n", - "a data 2.0000\n", - "\n", - "grad 0.0000\n", - "\n", - "\n", - "\n", - "4709514592*\n", - "\n", - "*\n", - "\n", - "\n", - "\n", - "4709511808->4709514592*\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "4709511856\n", - "\n", - "f data -2.0000\n", - "\n", - "grad 4.0000\n", - "\n", - "\n", - "\n", - "4709511856->4709513248*\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "4709514592\n", - "\n", - "e data -6.0000\n", - "\n", - "grad 0.0000\n", - "\n", - "\n", - "\n", - "4709514592->4709502544+\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "4709514592*->4709514592\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "4709515168\n", - "\n", - "b data -3.0000\n", - "\n", - "grad 0.0000\n", - "\n", - "\n", - "\n", - "4709515168->4709514592*\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "4709512144\n", - "\n", - "c data 10.0000\n", - "\n", - "grad 0.0000\n", - "\n", - "\n", - "\n", - "4709512144->4709502544+\n", - "\n", - "\n", - "\n", - "\n", - "\n" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 31, - "metadata": {}, - "output_type": "execute_result" - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "draw_dot(L)" ] }, { "cell_type": "code", - "execution_count": 32, - "metadata": { - "tags": [] - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "d._backward()" @@ -1834,174 +767,17 @@ }, { "cell_type": "code", - "execution_count": 33, - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "image/svg+xml": [ - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "4709513248\n", - "\n", - "L data -8.0000\n", - "\n", - "grad 1.0000\n", - "\n", - "\n", - "\n", - "4709513248*\n", - "\n", - "*\n", - "\n", - "\n", - "\n", - "4709513248*->4709513248\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "4709502544\n", - "\n", - "d data 4.0000\n", - "\n", - "grad -2.0000\n", - "\n", - "\n", - "\n", - "4709502544->4709513248*\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "4709502544+\n", - "\n", - "+\n", - "\n", - "\n", - "\n", - "4709502544+->4709502544\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "4709511808\n", - "\n", - "a data 2.0000\n", - "\n", - "grad 0.0000\n", - "\n", - "\n", - "\n", - "4709514592*\n", - "\n", - "*\n", - "\n", - "\n", - "\n", - "4709511808->4709514592*\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "4709511856\n", - "\n", - "f data -2.0000\n", - "\n", - "grad 4.0000\n", - "\n", - "\n", - "\n", - "4709511856->4709513248*\n", - 
"\n", - "\n", - "\n", - "\n", - "\n", - "4709514592\n", - "\n", - "e data -6.0000\n", - "\n", - "grad -2.0000\n", - "\n", - "\n", - "\n", - "4709514592->4709502544+\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "4709514592*->4709514592\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "4709515168\n", - "\n", - "b data -3.0000\n", - "\n", - "grad 0.0000\n", - "\n", - "\n", - "\n", - "4709515168->4709514592*\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "4709512144\n", - "\n", - "c data 10.0000\n", - "\n", - "grad -2.0000\n", - "\n", - "\n", - "\n", - "4709512144->4709502544+\n", - "\n", - "\n", - "\n", - "\n", - "\n" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 33, - "metadata": {}, - "output_type": "execute_result" - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "draw_dot(L)" ] }, { "cell_type": "code", - "execution_count": 34, - "metadata": { - "tags": [] - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "c._backward()" @@ -2016,174 +792,17 @@ }, { "cell_type": "code", - "execution_count": 35, - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "image/svg+xml": [ - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "4709513248\n", - "\n", - "L data -8.0000\n", - "\n", - "grad 1.0000\n", - "\n", - "\n", - "\n", - "4709513248*\n", - "\n", - "*\n", - "\n", - "\n", - "\n", - "4709513248*->4709513248\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "4709502544\n", - "\n", - "d data 4.0000\n", - "\n", - "grad -2.0000\n", - "\n", - "\n", - "\n", - "4709502544->4709513248*\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "4709502544+\n", - "\n", - "+\n", - "\n", - "\n", - "\n", - "4709502544+->4709502544\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "4709511808\n", - "\n", - "a data 2.0000\n", - "\n", - "grad 0.0000\n", - "\n", - "\n", - "\n", - "4709514592*\n", - "\n", - "*\n", - "\n", - "\n", - "\n", - "4709511808->4709514592*\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "4709511856\n", - "\n", - "f data -2.0000\n", - "\n", - "grad 4.0000\n", - "\n", - "\n", - "\n", - "4709511856->4709513248*\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "4709514592\n", - "\n", - "e data -6.0000\n", - "\n", - "grad -2.0000\n", - "\n", - "\n", - "\n", - "4709514592->4709502544+\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "4709514592*->4709514592\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "4709515168\n", - "\n", - "b data -3.0000\n", - "\n", - "grad 0.0000\n", - "\n", - "\n", - "\n", - "4709515168->4709514592*\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "4709512144\n", - "\n", - "c data 10.0000\n", - "\n", - "grad -2.0000\n", - "\n", - "\n", - "\n", - "4709512144->4709502544+\n", - "\n", - "\n", - "\n", - "\n", - "\n" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 35, - "metadata": {}, - "output_type": "execute_result" - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "draw_dot(L)" ] }, { "cell_type": "code", - "execution_count": 36, - "metadata": { - "tags": [] - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "e._backward()" @@ -2191,164 +810,9 @@ }, { "cell_type": "code", - "execution_count": 37, - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "image/svg+xml": [ - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "4709513248\n", - "\n", - "L data -8.0000\n", - "\n", - "grad 1.0000\n", - "\n", - "\n", - "\n", - "4709513248*\n", - "\n", - "*\n", - "\n", - "\n", - "\n", - 
"4709513248*->4709513248\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "4709502544\n", - "\n", - "d data 4.0000\n", - "\n", - "grad -2.0000\n", - "\n", - "\n", - "\n", - "4709502544->4709513248*\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "4709502544+\n", - "\n", - "+\n", - "\n", - "\n", - "\n", - "4709502544+->4709502544\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "4709511808\n", - "\n", - "a data 2.0000\n", - "\n", - "grad 6.0000\n", - "\n", - "\n", - "\n", - "4709514592*\n", - "\n", - "*\n", - "\n", - "\n", - "\n", - "4709511808->4709514592*\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "4709511856\n", - "\n", - "f data -2.0000\n", - "\n", - "grad 4.0000\n", - "\n", - "\n", - "\n", - "4709511856->4709513248*\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "4709514592\n", - "\n", - "e data -6.0000\n", - "\n", - "grad -2.0000\n", - "\n", - "\n", - "\n", - "4709514592->4709502544+\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "4709514592*->4709514592\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "4709515168\n", - "\n", - "b data -3.0000\n", - "\n", - "grad -4.0000\n", - "\n", - "\n", - "\n", - "4709515168->4709514592*\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "4709512144\n", - "\n", - "c data 10.0000\n", - "\n", - "grad -2.0000\n", - "\n", - "\n", - "\n", - "4709512144->4709502544+\n", - "\n", - "\n", - "\n", - "\n", - "\n" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 37, - "metadata": {}, - "output_type": "execute_result" - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "draw_dot(L)" ] @@ -2369,164 +833,9 @@ }, { "cell_type": "code", - "execution_count": 38, - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "image/svg+xml": [ - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "4709603344\n", - "\n", - "e data -6.0000\n", - "\n", - "grad 0.0000\n", - "\n", - "\n", - "\n", - "4709599024+\n", - "\n", - "+\n", - "\n", - "\n", - "\n", - "4709603344->4709599024+\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "4709603344*\n", - "\n", - "*\n", - "\n", - "\n", - "\n", - "4709603344*->4709603344\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "4709602432\n", - "\n", - "a data 2.0000\n", - "\n", - "grad 0.0000\n", - "\n", - "\n", - "\n", - "4709602432->4709603344*\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "4709598400\n", - "\n", - "L data -8.0000\n", - "\n", - "grad 0.0000\n", - "\n", - "\n", - "\n", - "4709598400*\n", - "\n", - "*\n", - "\n", - "\n", - "\n", - "4709598400*->4709598400\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "4709607712\n", - "\n", - "f data -2.0000\n", - "\n", - "grad 0.0000\n", - "\n", - "\n", - "\n", - "4709607712->4709598400*\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "4709599024\n", - "\n", - "d data 4.0000\n", - "\n", - "grad 0.0000\n", - "\n", - "\n", - "\n", - "4709599024->4709598400*\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "4709599024+->4709599024\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "4709605168\n", - "\n", - "b data -3.0000\n", - "\n", - "grad 0.0000\n", - "\n", - "\n", - "\n", - "4709605168->4709603344*\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "4709611840\n", - "\n", - "c data 10.0000\n", - "\n", - "grad 0.0000\n", - "\n", - "\n", - "\n", - "4709611840->4709599024+\n", - "\n", - "\n", - "\n", - "\n", - "\n" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 38, - "metadata": {}, - "output_type": "execute_result" - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "a = Value(2.0, label='a')\n", "b = Value(-3.0, 
label='b')\n", @@ -2541,10 +850,8 @@ }, { "cell_type": "code", - "execution_count": 39, - "metadata": { - "tags": [] - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "# topological order all of the children in the graph\n", @@ -2562,38 +869,17 @@ }, { "cell_type": "code", - "execution_count": 40, - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/plain": [ - "[Value(data=-2.0, grad=0),\n", - " Value(data=2.0, grad=0),\n", - " Value(data=-3.0, grad=0),\n", - " Value(data=-6.0, grad=0),\n", - " Value(data=10.0, grad=0),\n", - " Value(data=4.0, grad=0),\n", - " Value(data=-8.0, grad=0)]" - ] - }, - "execution_count": 40, - "metadata": {}, - "output_type": "execute_result" - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "topo" ] }, { "cell_type": "code", - "execution_count": 41, - "metadata": { - "tags": [] - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "# go one variable at a time and apply the chain rule to get its gradient\n", @@ -2604,164 +890,9 @@ }, { "cell_type": "code", - "execution_count": 42, - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "image/svg+xml": [ - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "4709603344\n", - "\n", - "e data -6.0000\n", - "\n", - "grad -2.0000\n", - "\n", - "\n", - "\n", - "4709599024+\n", - "\n", - "+\n", - "\n", - "\n", - "\n", - "4709603344->4709599024+\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "4709603344*\n", - "\n", - "*\n", - "\n", - "\n", - "\n", - "4709603344*->4709603344\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "4709602432\n", - "\n", - "a data 2.0000\n", - "\n", - "grad 6.0000\n", - "\n", - "\n", - "\n", - "4709602432->4709603344*\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "4709598400\n", - "\n", - "L data -8.0000\n", - "\n", - "grad 1.0000\n", - "\n", - "\n", - "\n", - "4709598400*\n", - "\n", - "*\n", - "\n", - "\n", - "\n", - "4709598400*->4709598400\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "4709607712\n", - "\n", - "f data -2.0000\n", - "\n", - "grad 4.0000\n", - "\n", - "\n", - "\n", - "4709607712->4709598400*\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "4709599024\n", - "\n", - "d data 4.0000\n", - "\n", - "grad -2.0000\n", - "\n", - "\n", - "\n", - "4709599024->4709598400*\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "4709599024+->4709599024\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "4709605168\n", - "\n", - "b data -3.0000\n", - "\n", - "grad -4.0000\n", - "\n", - "\n", - "\n", - "4709605168->4709603344*\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "4709611840\n", - "\n", - "c data 10.0000\n", - "\n", - "grad -2.0000\n", - "\n", - "\n", - "\n", - "4709611840->4709599024+\n", - "\n", - "\n", - "\n", - "\n", - "\n" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 42, - "metadata": {}, - "output_type": "execute_result" - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "draw_dot(L)" ] @@ -2775,10 +906,8 @@ }, { "cell_type": "code", - "execution_count": 43, - "metadata": { - "tags": [] - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "#| export\n", @@ -2797,7 +926,7 @@ " requires_grad (bool)\n", " \"\"\"\n", " def __init__(self, data, _children=(), _op='', label=''):\n", - " self.data = data\n", + " self._data = data\n", " self.grad = 0\n", " self._backward = lambda: None\n", " self._prev = set(_children)\n", @@ -2806,7 +935,7 @@ " \n", " def __add__(self, other):\n", " other = other if 
isinstance(other, Value) else Value(other)\n", - " out = Value(self.data + other.data, (self, other), '+')\n", + " out = Value(self._data + other.data, (self, other), '+')\n", "\n", " def _backward():\n", " self.grad += out.grad\n", @@ -2817,17 +946,17 @@ "\n", " def __mul__(self, other):\n", " other = other if isinstance(other, Value) else Value(other)\n", - " out = Value(self.data * other.data, (self, other), '*')\n", + " out = Value(self._data * other.data, (self, other), '*')\n", "\n", " def _backward():\n", " self.grad += other.data * out.grad\n", - " other.grad += self.data * out.grad\n", + " other.grad += self._data * out.grad\n", " out._backward = _backward\n", "\n", " return out\n", " \n", " def __repr__(self):\n", - " return f\"Value(data={self.data}, grad={self.grad})\"\n", + " return f\"Value(data={self._data}, grad={self.grad})\"\n", " \n", " def backward(self):\n", "\n", @@ -2850,164 +979,9 @@ }, { "cell_type": "code", - "execution_count": 44, - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "image/svg+xml": [ - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "4709993104\n", - "\n", - "d data 4.0000\n", - "\n", - "grad 0.0000\n", - "\n", - "\n", - "\n", - "4709998480*\n", - "\n", - "*\n", - "\n", - "\n", - "\n", - "4709993104->4709998480*\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "4709993104+\n", - "\n", - "+\n", - "\n", - "\n", - "\n", - "4709993104+->4709993104\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "4710005008\n", - "\n", - "a data 2.0000\n", - "\n", - "grad 0.0000\n", - "\n", - "\n", - "\n", - "4709990848*\n", - "\n", - "*\n", - "\n", - "\n", - "\n", - "4710005008->4709990848*\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "4709993920\n", - "\n", - "b data -3.0000\n", - "\n", - "grad 0.0000\n", - "\n", - "\n", - "\n", - "4709993920->4709990848*\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "4710004096\n", - "\n", - "c data 10.0000\n", - "\n", - "grad 0.0000\n", - "\n", - "\n", - "\n", - "4710004096->4709993104+\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "4709998480\n", - "\n", - "L data -8.0000\n", - "\n", - "grad 0.0000\n", - "\n", - "\n", - "\n", - "4709998480*->4709998480\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "4709993392\n", - "\n", - "f data -2.0000\n", - "\n", - "grad 0.0000\n", - "\n", - "\n", - "\n", - "4709993392->4709998480*\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "4709990848\n", - "\n", - "e data -6.0000\n", - "\n", - "grad 0.0000\n", - "\n", - "\n", - "\n", - "4709990848->4709993104+\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "4709990848*->4709990848\n", - "\n", - "\n", - "\n", - "\n", - "\n" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 44, - "metadata": {}, - "output_type": "execute_result" - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "a = Value(2.0, label='a')\n", "b = Value(-3.0, label='b')\n", @@ -3022,10 +996,8 @@ }, { "cell_type": "code", - "execution_count": 45, - "metadata": { - "tags": [] - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "L.backward()" @@ -3033,174 +1005,17 @@ }, { "cell_type": "code", - "execution_count": 46, - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "image/svg+xml": [ - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "4709993104\n", - "\n", - "d data 4.0000\n", - "\n", - "grad -2.0000\n", - "\n", - "\n", - "\n", - "4709998480*\n", - "\n", - "*\n", - "\n", - "\n", - "\n", - "4709993104->4709998480*\n", - "\n", - "\n", - "\n", - "\n", - 
"\n", - "4709993104+\n", - "\n", - "+\n", - "\n", - "\n", - "\n", - "4709993104+->4709993104\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "4710005008\n", - "\n", - "a data 2.0000\n", - "\n", - "grad 6.0000\n", - "\n", - "\n", - "\n", - "4709990848*\n", - "\n", - "*\n", - "\n", - "\n", - "\n", - "4710005008->4709990848*\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "4709993920\n", - "\n", - "b data -3.0000\n", - "\n", - "grad -4.0000\n", - "\n", - "\n", - "\n", - "4709993920->4709990848*\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "4710004096\n", - "\n", - "c data 10.0000\n", - "\n", - "grad -2.0000\n", - "\n", - "\n", - "\n", - "4710004096->4709993104+\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "4709998480\n", - "\n", - "L data -8.0000\n", - "\n", - "grad 1.0000\n", - "\n", - "\n", - "\n", - "4709998480*->4709998480\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "4709993392\n", - "\n", - "f data -2.0000\n", - "\n", - "grad 4.0000\n", - "\n", - "\n", - "\n", - "4709993392->4709998480*\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "4709990848\n", - "\n", - "e data -6.0000\n", - "\n", - "grad -2.0000\n", - "\n", - "\n", - "\n", - "4709990848->4709993104+\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "4709990848*->4709990848\n", - "\n", - "\n", - "\n", - "\n", - "\n" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 46, - "metadata": {}, - "output_type": "execute_result" - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "draw_dot(L)" ] }, { "cell_type": "code", - "execution_count": 47, - "metadata": { - "tags": [] - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "#| export\n", @@ -3242,7 +1057,7 @@ " - label (str): a label for this node, used for debugging and visualization purposes\n", " \"\"\"\n", " \n", - " self.data = data\n", + " self._data = data\n", " self._prev = set(children)\n", " self._op = op\n", " self.grad = 0.0\n", @@ -3256,7 +1071,7 @@ " Returns:\n", " - str: a string representation of this Value object\n", " \"\"\"\n", - " return f\"Value({self.data})\"\n", + " return f\"Value({self._data})\"\n", " \n", " def __add__(self, other):\n", " \"\"\"\n", @@ -3271,7 +1086,7 @@ " \n", " other = Value(other) if not isinstance(other, Value) else other\n", " \n", - " out = Value(self.data + other.data, children=(self, other), op='+')\n", + " out = Value(self._data + other.data, children=(self, other), op='+')\n", " \n", " def _backward():\n", " self.grad += 1 * out.grad\n", @@ -3328,11 +1143,11 @@ " \"\"\"\n", " \n", " other = Value(other) if not isinstance(other, Value) else other\n", - " out = Value(self.data * other.data, children=(self, other), op='*')\n", + " out = Value(self._data * other.data, children=(self, other), op='*')\n", " \n", " def _backward():\n", " self.grad += other.data * out.grad\n", - " other.grad += self.data * out.grad\n", + " other.grad += self._data * out.grad\n", " \n", " out._backward = _backward\n", " return out\n", @@ -3371,9 +1186,9 @@ " \"\"\"\n", " assert isinstance(other, (float, int)), \"other must be a scalar\"\n", " \n", - " out = Value(self.data ** other, children=(self, ), op='**')\n", + " out = Value(self._data ** other, children=(self, ), op='**')\n", " \n", - " def _backward(): self.grad += other * self.data ** (other - 1) * out.grad\n", + " def _backward(): self.grad += other * self._data ** (other - 1) * out.grad\n", " out._backward = _backward\n", " \n", " return out\n", @@ -3396,10 +1211,10 @@ " - The _backward() function is assigned as an attribute to the output object 
for later use during backpropagation.\n", " \"\"\"\n", " \n", - " x = math.exp(self.data)\n", + " x = math.exp(self._data)\n", " out = Value(x, children=(self,), op='exp')\n", " \n", - " def _backward(): self.grad += x * out.grad # x = exp(self.data) so x' = x (derivative of exp(x) is exp(x))\n", + " def _backward(): self.grad += x * out.grad # x = exp(self._data) so x' = x (derivative of exp(x) is exp(x))\n", " out._backward = _backward\n", " \n", " return out\n", @@ -3409,7 +1224,7 @@ " Applies the hyperbolic tangent function to the data of this `Value` object and returns a new `Value` object \n", " with the resulting data. This operation is an element-wise operation.\n", " \"\"\"\n", - " out = Value(torch.tanh(torch.tensor(self.data)), children=(self,), op='tanh')\n", + " out = Value(torch.tanh(torch.tensor(self._data)), children=(self,), op='tanh')\n", " def _backward(): self.grad += (1 - out.data ** 2) * out.grad\n", " out._backward = _backward\n", " \n", @@ -3420,13 +1235,30 @@ " Applies the rectified linear unit function to the data of this `Value` object and returns a new `Value` object \n", " with the resulting data. This operation is an element-wise operation.\n", " \"\"\"\n", - " out = Value(max(0, self.data), children=(self,), op='relu')\n", + " out = Value(max(0, self._data), children=(self,), op='relu')\n", " \n", " def _backward(): self.grad += (out.data > 0) * out.grad\n", " out._backward = _backward\n", " \n", " return out\n", " \n", + " @property\n", + " def data(self):\n", + " \"\"\"\n", + " Returns a tensor that shares the data with the current tensor but is detached from the computational graph.\n", + "\n", + " Example:\n", + " >>> t = Tensor([1, 2, 3], requires_grad=True)\n", + " >>> print(t.data)\n", + " Tensor([1, 2, 3])\n", + " \"\"\"\n", + " return self._data\n", + "\n", + " @data.setter\n", + " def data(self, value):\n", + " self._data = value\n", + "\n", + " \n", " def item(self):\n", " \"\"\"\n", " Return the scalar value being stored in the current Value as a Python float.\n", @@ -3438,7 +1270,7 @@ " float: The scalar value being stored in the current Value as a Python float.\n", " \n", " \"\"\"\n", - " return self.data\n", + " return self._data\n", " \n", " def is_leaf(self):\n", " return self.op is None\n", @@ -3498,10 +1330,8 @@ }, { "cell_type": "code", - "execution_count": 48, - "metadata": { - "tags": [] - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "#| export\n", @@ -3512,10 +1342,8 @@ }, { "cell_type": "code", - "execution_count": 49, - "metadata": { - "tags": [] - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "#| export\n", @@ -3549,10 +1377,8 @@ }, { "cell_type": "code", - "execution_count": 50, - "metadata": { - "tags": [] - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "#| export\n", @@ -3570,10 +1396,8 @@ }, { "cell_type": "code", - "execution_count": 51, - "metadata": { - "tags": [] - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "#| export\n", @@ -3586,9 +1410,7 @@ }, { "cell_type": "markdown", - "metadata": { - "tags": [] - }, + "metadata": {}, "source": [ "\n", "``` python\n", @@ -3622,10 +1444,8 @@ }, { "cell_type": "code", - "execution_count": 56, - "metadata": { - "tags": [] - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "#| export\n", @@ -3690,7 +1510,7 @@ " device = device if device else cpu()\n", " data = Tensor._array_from_numpy(array, device=device, dtype=dtype)\n", "\n", - " 
self._init(None, set(), data=data, requires_grad=requires_grad, )\n", + " self._init(None, (), data=data, requires_grad=requires_grad, )\n", " \n", " def __repr__(self):\n", " return \"minima.Tensor(\" + str(self.realize_data()) + \")\"\n", @@ -3726,7 +1546,7 @@ " if requires_grad is None:\n", " requires_grad = any(child.requires_grad for child in children)\n", " self._op = op\n", - " self.data = data\n", + " self._data = data\n", " self.children = children\n", " self.num_outputs = num_outputs\n", " self.requires_grad = requires_grad\n", @@ -3740,9 +1560,9 @@ " The actual data of this tensor.\n", " \"\"\"\n", " \n", - " if self.data is None:\n", - " self.data = self._op.compute(*[child.realize_data() for child in self.children])\n", - " return self.data\n", + " if self._data is None:\n", + " self._data = self._op.compute(*[child.realize_data() for child in self.children])\n", + " return self._data\n", " \n", " @staticmethod\n", " def _array_from_numpy(numpy_array, device, dtype):\n", @@ -3871,12 +1691,13 @@ " >>> print(t.data)\n", " Tensor([4, 5, 6])\n", " \"\"\"\n", + " print(type(value), type(Tensor))\n", " assert isinstance(value, Tensor)\n", " assert value.dtype == self.dtype, \"The dtype of the given tensor (%s) is not the same as the dtype of the current tensor (%s).\" % (\n", " value.dtype,\n", " self.dtype,\n", " )\n", - " self.data = value.realize_data()\n", + " self._data = value.realize_data()\n", "\n", " \n", " @property\n", @@ -3940,10 +1761,39 @@ " return mi.operators.EWiseAdd()(self, other)\n", "\n", " elif isinstance(other, (int, float)):\n", - " return mi.operators.AddScalar(other)(self)\n", + " return mi.operators.AddScalar(scalar=other)(self)\n", "\n", " else:\n", " raise ValueError(f\"Unsupported operand type for +: '{type(self).__name__}' and '{type(other).__name__}'\")\n", + " \n", + " def __sub__(self, other: Union['Tensor', int, float]) -> 'Tensor':\n", + " \"\"\"\n", + " Implements the subtraction operation between two Tensors or a Tensor and a scalar.\n", + "\n", + " Args:\n", + " - other (Tensor or scalar): the other Tensor or scalar to subtract from this one\n", + "\n", + " Returns:\n", + " - Tensor: a new Tensor object representing the difference between this Tensor and the other one\n", + "\n", + " Raises:\n", + " - AssertionError: If the two Tensors don't have the same shape\n", + " - ValueError: If the other operand is neither a Tensor nor a scalar\n", + " \"\"\"\n", + " if isinstance(other, Tensor):\n", + " # Ensure both tensors have the same shape for subtraction\n", + " if self.shape != other.shape:\n", + " raise AssertionError(f\"Tensors must be of the same shape for subtraction. 
Got {self.shape} and {other.shape}.\")\n", + "\n", + " return mi.operators.EWiseAdd()(negate(self), other)\n", + "\n", + " elif isinstance(other, (int, float)):\n", + " return mi.operators.AddScalar(scalar=-other)(self)\n", + "\n", + " else:\n", + " raise ValueError(f\"Unsupported operand type for -: '{type(self).__name__}' and '{type(other).__name__}'\")\n", + "\n", + "\n", " \n", " def __mul__(self, other: Union['Tensor', int, float]) -> 'Tensor':\n", " \"\"\"\n", @@ -3963,22 +1813,83 @@ " return mi.operators.EWiseMul()(self, other)\n", "\n", " elif isinstance(other, (int, float)):\n", - " return mi.operators.MulScalar(other)(self)\n", + " return mi.operators.MulScalar(scalar=other)(self)\n", "\n", " else:\n", " raise ValueError(f\"Unsupported operand type for *: '{type(self).__name__}' and '{type(other).__name__}'\")\n", " \n", - " \n", + " def __pow__(self, other):\n", + " \n", + " if isinstance(other, Tensor):\n", + " raise NotImplementedError() \n", + " if isinstance(other, (int, float)):\n", + " return mi.operators.PowerScalar(scalar=other)(self)\n", + " else:\n", + " raise ValueError(f\"Unsupported operand type for ^: '{type(self).__name__}' and '{type(other).__name__}'\")\n", + "\n", + " def __truediv__(self, other: Union['Tensor', int, float]) -> 'Tensor':\n", + " \"\"\"\n", + " Implements the division operation between two Tensors or a Tensor and a scalar.\n", + "\n", + " Args:\n", + " - other (Tensor or scalar): the other Tensor or scalar to divide to this one\n", + "\n", + " Returns:\n", + " - Tensor: a new Tensor object representing the result of division of this Tensor and the other one\n", + " \"\"\"\n", + " if isinstance(other, Tensor):\n", + " # Ensure both tensors have the same shape for addition\n", + " if self.shape != other.shape:\n", + " raise AssertionError(f\"Tensors must be of the same shape for addition. 
Got {self.shape} and {other.shape}.\")\n", + "\n", + " return mi.operators.EWiseDiv()(self, other)\n", + "\n", + " elif isinstance(other, (int, float)):\n", + " return mi.operators.DivScalar(scalar=other)(self)\n", + "\n", + " else:\n", + " raise ValueError(f\"Unsupported operand type for /: '{type(self).__name__}' and '{type(other).__name__}'\")\n", + "\n", + " \n", + " def __rtruediv__(self, other): # other / self\n", + " \"\"\"\n", + " Implements the right division operation between a scalar or a Tensor and this Tensor.\n", + "\n", + " Args:\n", + " - other (Tensor or scalar): the other Tensor or scalar to divide by this one\n", + "\n", + " Returns:\n", + " - Tensor: a new Tensor object representing the result of the division\n", + "\n", + " Example:\n", + " - If the method is called as `other.__rtruediv__(self)`, this corresponds to `other / self` in usual operations.\n", + " \"\"\"\n", + " return self.__pow__(-1).__mul__(other)\n", + " # other * self**-1\n", + "\n", + " def __matmul__(self, other):\n", + " \"\"\"\n", + " Implements the matrix multiplication operation between this Tensor and another Tensor.\n", + "\n", + " Args:\n", + " - other (Tensor): the other Tensor to multiply with this one\n", + "\n", + " Returns:\n", + " - Tensor: a new Tensor object representing the result of the matrix multiplication\n", + "\n", + " Example:\n", + " - If the method is called as `self.__matmul__(other)`, this corresponds to `self @ other` in usual operations.\n", + " \"\"\"\n", + " return mi.ops.MatMul()(self, other)\n", + "\n", " def _backward(self, out_grad: 'Tensor') -> None:\n", " pass\n" ] }, { "cell_type": "code", - "execution_count": 57, - "metadata": { - "tags": [] - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "#| hide\n", @@ -4053,7 +1964,259 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "import numpy as np\n", + "import unittest\n", + "from minima.autograd import Tensor\n", + "\n", + "class TestTensor(unittest.TestCase):\n", + " \n", + " def test_create_tensor(self):\n", + " t1 = Tensor([1, 2, 3])\n", + " self.assertTrue(np.array_equal(t1.realize_data(), np.array([1, 2, 3])))\n", + " self.assertEqual(t1.shape, (3,))\n", + " self.assertEqual(t1.dtype, np.float64)\n", + " \n", + " t2 = Tensor([[1, 2], [3, 4]])\n", + " self.assertTrue(np.array_equal(t2.realize_data(), np.array([[1, 2], [3, 4]])))\n", + " self.assertEqual(t2.shape, (2, 2))\n", + " self.assertEqual(t2.dtype, np.float64)\n", + " \n", + " t3 = Tensor(np.array([1, 2, 3]), dtype=np.int32)\n", + " self.assertTrue(np.array_equal(t3.realize_data(), np.array([1, 2, 3], dtype=np.int32)))\n", + " self.assertEqual(t3.shape, (3,))\n", + " self.assertEqual(t3.dtype, np.int32)\n", + " \n", + " def test_create_tensor_from_tensor(self):\n", + " t1 = Tensor([1, 2, 3])\n", + " t2 = Tensor(t1)\n", + " self.assertTrue(np.array_equal(t2.realize_data(), np.array([1, 2, 3])))\n", + " self.assertEqual(t2.shape, (3,))\n", + " self.assertEqual(t2.dtype, np.float64)\n", + " \n", + " t3 = Tensor(np.array([1, 2, 3]), dtype=np.int32)\n", + " t4 = Tensor(t3)\n", + " self.assertTrue(np.array_equal(t4.realize_data(), np.array([1, 2, 3], dtype=np.int32)))\n", + " self.assertEqual(t4.shape, (3,))\n", + " self.assertEqual(t4.dtype, np.int32)\n", + " \n", + " def test_create_tensor_with_device(self):\n", + " t1 = Tensor([1, 2, 3], device='cpu')\n", + " self.assertEqual(t1.device, 'cpu')\n", + " \n", + " t2 = Tensor([1, 2, 3], device='cuda')\n", + " self.assertEqual(t2.device, 
'cuda')\n", + " \n", + " def test_create_tensor_with_requires_grad(self):\n", + " t1 = Tensor([1, 2, 3], requires_grad=True)\n", + " self.assertTrue(t1.requires_grad)\n", + " \n", + " t2 = Tensor([1, 2, 3], requires_grad=False)\n", + " self.assertFalse(t2.requires_grad)\n", + " \n", + " def test_create_tensor_with_kwargs(self):\n", + " t1 = Tensor([1, 2, 3], device='cuda', dtype=np.float32, requires_grad=True)\n", + " self.assertEqual(t1.device, 'cuda')\n", + " self.assertEqual(t1.dtype, np.float32)\n", + " self.assertTrue(t1.requires_grad)\n", + " \n", + " def test_create_tensor_from_numpy(self):\n", + " np_array = np.array([1, 2, 3])\n", + " t1 = Tensor(np_array)\n", + " self.assertTrue(np.array_equal(t1.realize_data(), np_array))\n", + " self.assertEqual(t1.shape, (3,))\n", + " self.assertEqual(t1.dtype, np.float64)\n", + " \n", + " np_array = np.array([1, 2, 3], dtype=np.int32)\n", + " t2 = Tensor(np_array)\n", + " self.assertTrue(np.array_equal(t2.realize_data(), np_array))\n", + " self.assertEqual(t2.shape, (3,))\n", + " self.assertEqual(t2.dtype, np.int32)\n", + " \n", + " def test_create_tensor_from_numpy_with_device(self):\n", + " np_array = np.array([1, 2, 3])\n", + " t1 = Tensor(np_array, device='cuda')\n", + " self.assertEqual(t1.device, 'cuda')\n", + " \n", + " np_array = np.array([1, 2, 3], dtype=np.int32)\n", + " t2 = Tensor(np_array, device='cuda')\n", + " self.assertEqual(t2.device, 'cuda')\n", + " \n", + " def test_create_tensor_from_numpy_with_requires_grad(self):\n", + " np_array = np.array([1, 2, 3])\n", + " t1 = Tensor(np_array, requires_grad=True)\n", + " self.assertTrue(t1.requires_grad)\n", + " \n", + " np_array = np.array([1, 2, 3], dtype=np.int32)\n", + " t2 = Tensor(np_array, requires_grad=False)\n", + " self.assertFalse(t2.requires_grad)\n", + " \n", + " def test_create_tensor_from_numpy_with_kwargs(self):\n", + " np_array = np.array([1, 2, 3])\n", + " t1 = Tensor(np_array, device='cuda', dtype=np.float32, requires_grad=True)\n", + " self.assertEqual(t1.device, 'cuda')\n", + " self.assertEqual(t1.dtype, np.float32)\n", + " self.assertTrue(t1.requires_grad)\n", + " \n", + " def test_create_tensor_from_tensor_with_device(self):\n", + " t1 = Tensor([1, 2, 3], device='cpu')\n", + " t2 = Tensor(t1, device='cuda')\n", + " self.assertEqual(t2.device, 'cuda')\n", + " \n", + " def test_create_tensor_from_tensor_with_requires_grad(self):\n", + " t1 = Tensor([1, 2, 3], requires_grad=True)\n", + " t2 = Tensor(t1, requires_grad=False)\n", + " self.assertFalse(t2.requires_grad)\n", + " \n", + " def test_create_tensor_from_tensor_with_kwargs(self):\n", + " t1 = Tensor([1, 2, 3], device='cpu', dtype=np.float32, requires_grad=True)\n", + " t2 = Tensor(t1, device='cuda', dtype=np.float64, requires_grad=False)\n", + " self.assertEqual(t2.device, 'cuda')\n", + " self.assertEqual(t2.dtype, np.float64)\n", + " self.assertFalse(t2.requires_grad)\n", + " \n", + " def test_create_tensor_from_tensor_with_different_device_and_dtype(self):\n", + " t1 = Tensor([1, 2, 3], device='cpu', dtype=np.float32)\n", + " t2 = Tensor(t1, device='cuda', dtype=np.float64)\n", + " self.assertEqual(t2.device, 'cuda')\n", + " self.assertEqual(t2.dtype, np.float64)\n", + " self.assertTrue(np.array_equal(t2.realize_data(), np.array([1, 2, 3], dtype=np.float64)))\n", + " \n", + " def test_create_tensor_from_tensor_with_same_device_and_dtype(self):\n", + " t1 = Tensor([1, 2, 3], device='cpu', dtype=np.float32)\n", + " t2 = Tensor(t1, device='cpu', dtype=np.float32)\n", + " self.assertEqual(t2.device, 
'cpu')\n", + " self.assertEqual(t2.dtype, np.float32)\n", + " self.assertTrue(np.array_equal(t2.realize_data(), np.array([1, 2, 3], dtype=np.float32)))\n", + " \n", + " def test_create_tensor_from_tensor_with_same_device_and_dtype_and_requires_grad(self):\n", + " t1 = Tensor([1, 2, 3], device='cpu', dtype=np.float32, requires_grad=True)\n", + " t2 = Tensor(t1, device='cpu', dtype=np.float32, requires_grad=True)\n", + " self.assertEqual(t2.device, 'cpu')\n", + " self.assertEqual(t2.dtype, np.float32)\n", + " self.assertTrue(np.array_equal(t2.realize_data(), np.array([1, 2, 3], dtype=np.float32)))\n", + " self.assertTrue(t2.requires_grad)\n", + " \n", + " def test_create_tensor_from_tensor_with_same_device_and_dtype_and_requires_grad_false(self):\n", + " t1 = Tensor([1, 2, 3], device='cpu', dtype=np.float32, requires_grad=False)\n", + " t2 = Tensor(t1, device='cpu', dtype=np.float32, requires_grad=False)\n", + " self.assertEqual(t2.device, 'cpu')\n", + " self.assertEqual(t2.dtype, np.float32)\n", + " self.assertTrue(np.array_equal(t2.realize_data(), np.array([1, 2, 3], dtype=np.float32)))\n", + " self.assertFalse(t2.requires_grad)\n", + " \n", + " def test_create_tensor_from_tensor_with_same_device_and_dtype_and_requires_grad_true_false(self):\n", + " t1 = Tensor([1, 2, 3], device='cpu', dtype=np.float32, requires_grad=True)\n", + " t2 = Tensor(t1, device='cpu', dtype=np.float32, requires_grad=False)\n", + " self.assertEqual(t2.device, 'cpu')\n", + " self.assertEqual(t2.dtype, np.float32)\n", + " self.assertTrue(np.array_equal(t2.realize_data(), np.array([1, 2, 3], dtype=np.float32)))\n", + " self.assertFalse(t2.requires_grad)\n", + " \n", + " def test_create_tensor_from_tensor_with_same_device_and_dtype_and_requires_grad_false_true(self):\n", + " t1 = Tensor([1, 2, 3], device='cpu', dtype=np.float32, requires_grad=False)\n", + " t2 = Tensor(t1, device='cpu', dtype=np.float32, requires_grad=True)\n", + " self.assertEqual(t2.device, 'cpu')\n", + " self.assertEqual(t2.dtype, np.float32)\n", + " self.assertTrue(np.array_equal(t2.realize_data(), np.array([1, 2, 3], dtype=np.float32)))\n", + " self.assertTrue(t2.requires_grad)\n", + " \n", + " def test_create_tensor_from_tensor_with_different_device_and_dtype_and_requires_grad(self):\n", + " t1 = Tensor([1, 2, 3], device='cpu', dtype=np.float32, requires_grad=True)\n", + " t2 = Tensor(t1, device='cuda', dtype=np.float64, requires_grad=False)\n", + " self.assertEqual(t2.device, 'cuda')\n", + " self.assertEqual(t2.dtype, np.float64)\n", + " self.assertTrue(np.array_equal(t2.realize_data(), np.array([1, 2, 3], dtype=np.float64)))\n", + " self.assertFalse(t2.requires_grad)\n", + " \n", + " def test_create_tensor_from_tensor_with_different_device_and_dtype_and_requires_grad_false(self):\n", + " t1 = Tensor([1, 2, 3], device='cpu', dtype=np.float32, requires_grad=False)\n", + " t2 = Tensor(t1, device='cuda', dtype=np.float64, requires_grad=False)\n", + " self.assertEqual(t2.device, 'cuda')\n", + " self.assertEqual(t2.dtype, np.float64)\n", + " self.assertTrue(np.array_equal(t2.realize_data(), np.array([1, 2, 3], dtype=np.float64)))\n", + " self.assertFalse(t2.requires_grad)\n", + " \n", + " def test_create_tensor_from_tensor_with_different_device_and_dtype_and_requires_grad_true_false(self):\n", + " t1 = Tensor([1, 2, 3], device='cpu', dtype=np.float32, requires_grad=True)\n", + " t2 = Tensor(t1, device='cuda', dtype=np.float64, requires_grad=False)\n", + " self.assertEqual(t2.device, 'cuda')\n", + " self.assertEqual(t2.dtype, np.float64)\n", + " 
self.assertTrue(np.array_equal(t2.realize_data(), np.array([1, 2, 3], dtype=np.float64)))\n", + " self.assertFalse(t2.requires_grad)\n", + " \n", + " def test_create_tensor_from_tensor_with_different_device_and_dtype_and_requires_grad_false_true(self):\n", + " t1 = Tensor([1, 2, 3], device='cpu', dtype=np.float32, requires_grad=False)\n", + " t2 = Tensor(t1, device='cuda', dtype=np.float64, requires_grad=True)\n", + " self.assertEqual(t2.device, 'cuda')\n", + " self.assertEqual(t2.dtype, np.float64)\n", + " self.assertTrue(np.array_equal(t2.realize_data(), np.array([1, 2, 3], dtype=np.float64)))\n", + " self.assertTrue(t2.requires_grad)\n", + " \n", + " def test_create_tensor_from_tensor_with_same_device_and_dtype_and_requires_grad_true(self):\n", + " t1 = Tensor([1, 2, 3], device='cpu', dtype=np.float32, requires_grad=True)\n", + " t2 = Tensor(t1, device='cpu', dtype=np.float32, requires_grad=True)\n", + " self.assertEqual(t2.device, 'cpu')\n", + " self.assertEqual(t2.dtype, np.float32)\n", + " self.assertTrue(np.array_equal(t2.realize_data(), np.array([1, 2, 3], dtype=np.float32)))\n", + " self.assertTrue(t2.requires_grad)\n", + " \n", + " def test_create_tensor_from_tensor_with_same_device_and_dtype_and_requires_grad_false(self):\n", + " t1 = Tensor([1, 2, 3], device='cpu', dtype=np.float32, requires_grad=False)\n", + " t2 = Tensor(t1, device='cpu', dtype=np.float32, requires_grad=False)\n", + " self.assertEqual(t2.device, 'cpu')\n", + " self.assertEqual(t2.dtype, np.float32)\n", + " self.assertTrue(np.array_equal(t2.realize_data(), np.array([1, 2, 3], dtype=np.float32)))\n", + " self.assertFalse(t2.requires_grad)\n", + " \n", + " def test_create_tensor_from_tensor_with_same_device_and_dtype_and_requires_grad_true_false(self):\n", + " t1 = Tensor([1, 2, 3], device='cpu', dtype=np.float32, requires_grad=True)\n", + " t2 = Tensor(t1, device='cpu', dtype=np.float32, requires_grad=False)\n", + " self.assertEqual(t2.device, 'cpu')\n", + " self.assertEqual(t2.dtype, np.float32)\n", + " self.assertTrue(np.array_equal(t2.realize_data(), np.array([1, 2, 3], dtype=np.float32)))\n", + " self.assertFalse(t2.requires_grad)\n", + " \n", + " def test_create_tensor_from_tensor_with_same_device_and_dtype_and_requires_grad_false_true(self):\n", + " t1 = Tensor([1, 2, 3], device='cpu', dtype=np.float32, requires_grad=False)\n", + " t2 = Tensor(t1, device='cpu', dtype=np.float32, requires_grad=True)\n", + " self.assertEqual(t2.device, 'cpu')\n", + " self.assertEqual(t2.dtype, np.float32)\n", + " self.assertTrue(np.array_equal(t2.realize_data(), np.array([1, 2, 3], dtype=np.float32)))\n", + " self.assertTrue(t2.requires_grad)\n", + " \n", + " def test_create_tensor_from_tensor_with_different_device_and_dtype_and_requires_grad_true(self):\n", + " t1 = Tensor([1, 2, 3], device='cpu', dtype=np.float32, requires_grad=True)\n", + " t2 = Tensor(t1, device='cuda', dtype=np.float64, requires_grad=True)\n", + " self.assertEqual(t2.device, 'cuda')\n", + " self.assertEqual(t2.dtype, np.float64)\n", + " self.assertTrue(np.array_equal(t2.realize_data(), np.array([1, 2, 3], dtype=np.float64)))\n", + " self.assertTrue(t2.requires_grad)\n", + " \n", + " def test_create_tensor_from_tensor_with_different_device_and_dtype_and_requires_grad_false(self):\n", + " t1 = Tensor([1, 2, 3], device='cpu', dtype=np.float32, requires_grad=False)\n", + " t2 = Tensor(t1, device='cuda', dtype=np.float64, requires_grad=False)\n", + " self.assertEqual(t2.device, 'cuda')\n", + " self.assertEqual(t2.dtype, np.float64)\n", + " 
self.assertTrue(np.array_equal(t2.realize_data(), np.array([1, 2, 3], dtype=np.float64)))\n", + " self.assertFalse(t2.requires_grad)\n", + " \n", + " def test_create_tensor_from_tensor_with_different_device_and_dtype_and_requires_grad_true_false(self):\n", + " t1 = Tensor([1, 2, 3], device='cpu', dtype=np.float32, requires_grad=True)\n", + " t2 = Tensor(t1, device='cuda', dtype=np.float64, requires_grad=False)\n", + " self.assertEqual(t2.device, 'cuda')\n", + " self.assertEqual(t2.dtype, np.float64)\n", + " self.assertTrue(np.array_equal(t2.realize_data(), np.array([1, 2, 3], dtype=np.float64)))\n", + " self.assertFalse(t2.requires_grad)\n", + " \n", + " def test_create_tensor_from_tensor_with_different_device_and_dtype_and_requires_grad_false_true(self):\n", + " t1 = Tensor([1, 2, 3], device='cpu', dtype=np.float32, requires_grad=False)\n", + " t2 = Tensor(t1, device='cuda', dtype=np.float64, requires_grad=True)\n", + " self.assertEqual(t2.device, 'cuda')\n", + " self.assertEqual(t2.dtype, np.float64)\n", + " self.assertTrue(np.array_equal(t2.realize_data(), np.array([1, 2, 3], dtype=np.float64)))\n", + " self.assertTrue(t2.requires_grad)" + ] }, { "cell_type": "code", @@ -4092,10 +2255,8 @@ }, { "cell_type": "code", - "execution_count": 58, - "metadata": { - "tags": [] - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "import nbdev; nbdev.nbdev_export()" @@ -4111,21 +2272,9 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "python3", "language": "python", "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.8" } }, "nbformat": 4, diff --git a/nbs/01_operators.ipynb b/nbs/01_operators.ipynb index 197514c..2faf445 100644 --- a/nbs/01_operators.ipynb +++ b/nbs/01_operators.ipynb @@ -12,11 +12,9 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "06cf77c9-d5dc-473d-9624-6679191a3c6a", - "metadata": { - "tags": [] - }, + "metadata": {}, "outputs": [], "source": [ "#| default_exp operators" @@ -24,11 +22,9 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "id": "5485133d-9a30-4362-af02-d6724482f459", - "metadata": { - "tags": [] - }, + "metadata": {}, "outputs": [], "source": [ "#| export\n", @@ -47,13 +43,81 @@ "import numpy as ARRAY_API" ] }, + { + "cell_type": "markdown", + "id": "8c9a4956-5575-4950-9365-560225c3a715", + "metadata": {}, + "source": [ + "The `out_grad` parameter refers to the gradient of the loss function with respect to the output of the node. Multiplying this with the local gradient gives the gradient of the loss with respect to the input to the node, according to the chain rule of calculus, which is the basis for backpropagation in neural networks." + ] + }, + { + "cell_type": "markdown", + "id": "40c0867e-7744-4c76-95b6-da3868dc8625", + "metadata": {}, + "source": [ + "The chain rule is a fundamental concept in calculus that provides a method to compute the derivative of composite functions. 
In simple terms, the chain rule states that the derivative of a composite function is the derivative of the outer function multiplied by the derivative of the inner function.\n", + "\n", + "Given a composite function that is the composition of two functions, say, $f(g(x))$, the chain rule can be stated as follows:\n", + "\n", + "$$\\frac{df}{dx} = \\frac{df}{dg} \\cdot \\frac{dg}{dx}$$\n", + "\n", + "Where:\n", + "\n", + "- $\\frac{df}{dx}$ is the derivative of the composite function $f(g(x))$ with respect to $x$,\n", + "- $\\frac{df}{dg}$ is the derivative of the outer function $f$ with respect to its argument $g(x)$, and\n", + "- $\\frac{dg}{dx}$ is the derivative of the inner function $g(x)$ with respect to $x$.\n", + "\n", + "The chain rule can be extended to the case where we have more than two composite functions." + ] + }, + { + "cell_type": "markdown", + "id": "83cb2320-c471-426d-ad48-0197b1daecaa", + "metadata": {}, + "source": [ + "## Element Wise Addition" + ] + }, + { + "cell_type": "markdown", + "id": "feadcd15-44d4-4a3d-aef9-39b3b7f6fcd1", + "metadata": {}, + "source": [ + "Let's walk through the step-by-step derivative calculation for the `EWiseAdd` operation:\n", + "\n", + "We have the function `f(a, b) = a + b`, where `a` and `b` are tensors. Our goal is to compute the partial derivatives with respect to `a` and `b`.\n", + "\n", + "Let's start by calculating the derivative of `f` with respect to `a`, denoted as `df/da`:\n", + "\n", + "Step 1: Compute the derivative of `f` with respect to `a`.\n", + "\n", + "$\\frac{{\\partial f}}{{\\partial a}} = \\frac{{\\partial}}{{\\partial a}} (a + b)$\n", + "\n", + "Since `a` is the variable we are differentiating with respect to, the derivative of `a` with respect to itself is 1:\n", + "\n", + "$$\\frac{{\\partial f}}{{\\partial a}} = 1$$\n", + "\n", + "Therefore, $$\\frac{{\\partial f}}{{\\partial a}} = 1.$$\n", + "\n", + "Step 2: Compute the derivative of `f` with respect to `b`.\n", + "\n", + "$$\\frac{{\\partial f}}{{\\partial b}} = \\frac{{\\partial}}{{\\partial b}} (a + b)$$\n", + "\n", + "Again, since `b` is the variable we are differentiating with respect to, the derivative of `b` with respect to itself is 1:\n", + "\n", + "$$\\frac{{\\partial f}}{{\\partial b}} = 1$$\n", + "\n", + "Therefore, $$\\frac{{\\partial f}}{{\\partial b}} = 1$$\n", + "\n", + "Hence, the partial derivatives of `f(a, b) = a + b` with respect to `a` and `b` are both equal to 1." 
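To make the chain-rule bookkeeping concrete, here is a standalone NumPy sketch (independent of the minima `Tensor`/`TensorOp` classes) that checks the result numerically: for a toy loss `L = sum(w * (a + b))`, the vector `w` plays the role of `out_grad`, and the derivation above says the gradient flowing to both `a` and `b` is `out_grad * 1`.

```python
import numpy as np

# Toy loss L = sum(w * (a + b)); w stands in for out_grad (a hypothetical upstream gradient).
a = np.array([1.0, 2.0, 3.0])
b = np.array([4.0, 5.0, 6.0])
w = np.array([0.5, -1.0, 2.0])
loss = lambda a_, b_: np.sum(w * (a_ + b_))

eps = 1e-6
num_grad_a = np.array([(loss(a + eps * e, b) - loss(a - eps * e, b)) / (2 * eps)
                       for e in np.eye(3)])
num_grad_b = np.array([(loss(a, b + eps * e) - loss(a, b - eps * e)) / (2 * eps)
                       for e in np.eye(3)])

# Both numerical gradients match out_grad * 1 = w, as derived above.
np.testing.assert_allclose(num_grad_a, w, rtol=1e-5, atol=1e-6)
np.testing.assert_allclose(num_grad_b, w, rtol=1e-5, atol=1e-6)
```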
+ ] + }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "id": "bb4e9796-2da9-4668-ae19-cf547e25ddd1", - "metadata": { - "tags": [] - }, + "metadata": {}, "outputs": [], "source": [ "#| export\n", @@ -69,6 +133,7 @@ " >>> print(result)\n", " Tensor([5, 7, 9])\n", " \"\"\"\n", + " \n", " def compute(self, a: NDArray, b: NDArray) -> NDArray:\n", " \"\"\"\n", " Computes the element-wise sum of two tensors.\n", @@ -93,7 +158,7 @@ " Returns:\n", " The gradients with respect to the inputs.\n", " \"\"\"\n", - " return out_grad, out_grad\n", + " return (out_grad, out_grad)\n", "\n", "def add(a: Tensor, b: Tensor) -> Tensor:\n", " \"\"\"\n", @@ -111,11 +176,58 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, + "id": "358fcf64-9c44-4d44-8374-a0ef11668d6e", + "metadata": {}, + "outputs": [], + "source": [ + "# Create two 1-D tensors\n", + "a = Tensor([1, 2, 3])\n", + "b = Tensor([4, 5, 6])\n", + "\n", + "# Create an EWiseAdd operation\n", + "op = EWiseAdd()" + ] + }, + { + "cell_type": "markdown", + "id": "bd371e81-b6e4-43da-9987-30057c5c038a", + "metadata": {}, + "source": [ + "## Scalar Addition" + ] + }, + { + "cell_type": "markdown", + "id": "b2dbc8dc-25ae-4793-9cc8-cdbef59a8400", + "metadata": {}, + "source": [ + "Explanation for the derivative of the `AddScalar` operator:\n", + "\n", + "Let's denote the scalar as `c` and `a` as the tensor being added by the scalar. The operation can be described as `f(a) = a + c`.\n", + "\n", + "The function for the backward pass (i.e., the gradient) is `df/da = 1`, which means the derivative of `f(a)` with respect to `a` is simply `1`.\n", + "\n", + "We are given a function $f(a) = a + c$, where $a$ is a tensor and $c$ is a scalar. Our task is to find the derivative of this function with respect to $a$.\n", + "\n", + "By differentiating the function $f(a)$ with respect to $a$, we find:\n", + "\n", + "\\begin{align*}\n", + "\\frac{df}{da} &= \\frac{d}{da} (a + c) \\\\\n", + "&= 1\n", + "\\end{align*}\n", + "\n", + "Therefore, the gradient of $f(a)$ with respect to $a$ is $1$.\n", + "\n", + "\n", + "We starts by defining the function `f(a) = a + c`. It then explains that when we differentiate `f(a)` with respect to `a`, we find that the derivative is `1`. This means that the gradient of `f(a)` with respect to `a` is `1`, which matches the behavior of the `AddScalar` operator as provided in the `gradient` method." + ] + }, + { + "cell_type": "code", + "execution_count": null, "id": "3ebbb12f-03ea-4d3e-beb4-e15c0eb7fc88", - "metadata": { - "tags": [] - }, + "metadata": {}, "outputs": [], "source": [ "#| export\n", @@ -178,13 +290,52 @@ " return AddScalar(scalar)(a)" ] }, + { + "cell_type": "markdown", + "id": "f26ed99f-b3f2-4df1-9918-ff23fc99be74", + "metadata": {}, + "source": [ + "## Element Wise Multiplication" + ] + }, + { + "cell_type": "markdown", + "id": "1bf5a7d0-8e8a-47cd-a4b5-cdc12a03649c", + "metadata": {}, + "source": [ + "Explanation for the derivative of the `EWiseMul` (element-wise multiplication) operator:\n", + "\n", + "Let's denote the two input tensors as `a` and `b`. The operation can be described as `f(a, b) = a * b`, where `*` represents element-wise multiplication.\n", + "\n", + "The function for the backward pass (i.e., the gradient) is `df/da = b` and `df/db = a`. 
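As a quick sanity check, the following NumPy-only sketch (again independent of the minima API) confirms that the added constant drops out of the gradient, which is why the `gradient` method can simply pass `out_grad` through unchanged.

```python
import numpy as np

# d/da [sum(a + c)] should be a vector of ones, independent of the scalar c.
a = np.array([1.0, -2.0, 3.0])
c = 10.0                                  # arbitrary scalar shift
eps = 1e-6
num_grad = np.array([(np.sum(a + eps * e + c) - np.sum(a - eps * e + c)) / (2 * eps)
                     for e in np.eye(3)])

np.testing.assert_allclose(num_grad, np.ones_like(a), rtol=1e-6)
```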
This means that the derivative of `f(a, b)` with respect to `a` is `b`, and the derivative with respect to `b` is `a`.\n", + "\n", + "\n", + "We are given a function $f(a, b) = a \\odot b$, where $a$ and $b$ are tensors, and $\\odot$ represents element-wise multiplication. Our task is to find the derivatives of this function with respect to $a$ and $b$.\n", + "\n", + "By differentiating the function $f(a, b)$ with respect to $a$, we find:\n", + "\n", + "\\begin{align*}\n", + "\\frac{df}{da} &= \\frac{d}{da} (a \\odot b) \\\\\n", + "&= b\n", + "\\end{align*}\n", + "\n", + "Therefore, the gradient of $f(a, b)$ with respect to $a$ is $b$.\n", + "\n", + "Similarly, by differentiating the function $f(a, b)$ with respect to $b$, we find:\n", + "\n", + "\\begin{align*}\n", + "\\frac{df}{db} &= \\frac{d}{db} (a \\odot b) \\\\\n", + "&= a\n", + "\\end{align*}\n", + "\n", + "Therefore, the gradient of $f(a, b)$ with respect to $b$ is $a$." + ] + }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "id": "67e4faab-b6f8-4332-ba80-06564920e53c", - "metadata": { - "tags": [] - }, + "metadata": {}, "outputs": [], "source": [ "#| export\n", @@ -224,8 +375,8 @@ " Returns:\n", " The gradients with respect to the inputs.\n", " \"\"\"\n", - " lhs, rhs = node.inputs\n", - " return out_grad * rhs, out_grad * lhs\n", + " a, b = node.inputs\n", + " return out_grad * b, out_grad * a\n", "\n", "def multiply(a: Tensor, b: Tensor) -> Tensor:\n", " \"\"\"\n", @@ -241,13 +392,44 @@ " return EWiseMul()(a, b)" ] }, + { + "cell_type": "markdown", + "id": "cdb531c5-f22c-40c9-901e-e373b837a846", + "metadata": {}, + "source": [ + "## Scalar Multiplication" + ] + }, + { + "cell_type": "markdown", + "id": "5d0246b9-ce8f-4fab-991d-7ec43745c2ea", + "metadata": {}, + "source": [ + "Let's denote the scalar as `c` and `a` as the tensor being multiplied by the scalar. The operation can be described as `f(a) = a * c`.\n", + "\n", + "The function for the backward pass (i.e., the gradient) is `df/da = c`, which means the derivative of `f(a)` with respect to `a` is `c`.\n", + "\n", + "The LaTeX document will look as follows:\n", + "\n", + "We are given a function $f(a) = a \\cdot c$, where $a$ is a tensor and $c$ is a scalar. Our task is to find the derivative of this function with respect to $a$.\n", + "\n", + "By differentiating the function $f(a)$ with respect to $a$, we find:\n", + "\n", + "\\begin{align*}\n", + "\\frac{df}{da} &= \\frac{d}{da} (a \\cdot c) \\\\\n", + "&= c\n", + "\\end{align*}\n", + "\n", + "Therefore, the gradient of $f(a)$ with respect to $a$ is $c$.\n", + "\n", + "We starts by defining the function `f(a) = a * c`. It then explains that when we differentiate `f(a)` with respect to `a`, we find that the derivative is `c`. This means that the gradient of `f(a)` with respect to `a` is `c`, which matches the behavior of the `MulScalar` operator as provided in the `gradient` method." + ] + }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "id": "64a28132-02d4-4b27-bc07-5897f88a9cd2", - "metadata": { - "tags": [] - }, + "metadata": {}, "outputs": [], "source": [ "#| export\n", @@ -310,44 +492,681 @@ " return MulScalar(scalar)(a)" ] }, + { + "cell_type": "markdown", + "id": "6497d8ab-3003-4784-a330-ad5f862b9ca5", + "metadata": {}, + "source": [ + "## Negation" + ] + }, + { + "cell_type": "markdown", + "id": "04085053-1446-4343-b720-17e04a1c4ee1", + "metadata": {}, + "source": [ + "Let's denote `a` as the tensor being negated. 
The operation can be described as `f(a) = -a`.\n", + "\n", + "The function for the backward pass (i.e., the gradient) is `df/da = -1`.\n", + "\n", + "We are given a function $f(a) = -a$, where $a$ is a tensor. Our task is to find the derivative of this function with respect to $a$.\n", + "\n", + "By differentiating the function $f(a)$ with respect to $a$, we find:\n", + "\n", + "\\begin{align*}\n", + "\\frac{df}{da} &= \\frac{d}{da} (-a) \\\\\n", + "&= -1\n", + "\\end{align*}\n", + "\n", + "Therefore, the gradient of $f(a)$ with respect to $a$ is $-1$." + ] + }, { "cell_type": "code", - "execution_count": 7, - "id": "9fc235f7-c1f4-42b5-b3a1-d9cb909617d9", - "metadata": { - "tags": [] - }, + "execution_count": null, + "id": "defd870b-d2e6-4212-bd4b-b3333d271c9e", + "metadata": {}, "outputs": [], "source": [ - "import nbdev; nbdev.nbdev_export()" + "class Negate(TensorOp):\n", + " \"\"\"\n", + " Negates the given tensor.\n", + " \n", + " Example:\n", + " >>> a = Tensor([1, -2, 3])\n", + " >>> op = Negate()\n", + " >>> result = op.compute(a)\n", + " >>> print(result)\n", + " Tensor([-1, 2, -3])\n", + " \"\"\"\n", + " \n", + " def compute(self, a: NDArray) -> NDArray:\n", + " \"\"\"\n", + " Computes the negation of a tensor.\n", + "\n", + " Args:\n", + " - a: The tensor to negate.\n", + "\n", + " Returns:\n", + " The negation of a.\n", + " \"\"\"\n", + " return -1 * a\n", + "\n", + " def gradient(self, out_grad: Tensor, node: Tensor) -> Tuple[Tensor,]:\n", + " \"\"\"\n", + " Computes the gradient of the negation operation.\n", + "\n", + " Args:\n", + " - out_grad: The gradient of the output of the operation.\n", + " - node: The node in the computational graph where the operation was performed.\n", + "\n", + " Returns:\n", + " The gradients with respect to the inputs.\n", + " \"\"\"\n", + " return (negate(out_grad), )\n", + "\n", + "\n", + "def negate(a: Tensor) -> Tensor:\n", + " \"\"\"\n", + " Negates the given tensor.\n", + "\n", + " Args:\n", + " - a: The tensor to negate.\n", + "\n", + " Returns:\n", + " The negation of a.\n", + " \n", + " Example:\n", + " >>> a = Tensor([1, -2, 3])\n", + " >>> result = negate(a)\n", + " >>> print(result)\n", + " Tensor([-1, 2, -3])\n", + " \"\"\"\n", + " return Negate()(a)" + ] + }, + { + "cell_type": "markdown", + "id": "2e28e521-a653-45e0-a567-46c7b800d281", + "metadata": {}, + "source": [ + "## Exp" + ] + }, + { + "cell_type": "markdown", + "id": "af8bf43a-9a2a-4077-a54f-54df0c6c955d", + "metadata": {}, + "source": [ + "Explanation for the derivative of the `Exp` operator:\n", + "\n", + "Let's denote `a` as the tensor on which the exponential function is applied. The operation can be described as `f(a) = exp(a)`, where `exp` represents the exponential function.\n", + "\n", + "The function for the backward pass (i.e., the gradient) is `df/da = exp(a)`.\n", + "\n", + "We are given a function $f(a) = \\exp(a)$, where $a$ is a tensor. Our task is to find the derivative of this function with respect to $a$.\n", + "\n", + "By differentiating the function $f(a)$ with respect to $a$, we find:\n", + "\n", + "\\begin{align*}\n", + "\\frac{df}{da} &= \\frac{d}{da} (\\exp(a)) \\\\\n", + "&= \\exp(a)\n", + "\\end{align*}\n", + "\n", + "Therefore, the gradient of $f(a)$ with respect to $a$ is $\\exp(a)$." 
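A minimal NumPy-only check of this result (no minima classes involved): differentiate a toy loss `L = sum(exp(a))` by central differences and compare against `exp(a)` itself.

```python
import numpy as np

a = np.array([-1.0, 0.0, 1.5])
eps = 1e-6
num_grad = np.array([(np.sum(np.exp(a + eps * e)) - np.sum(np.exp(a - eps * e))) / (2 * eps)
                     for e in np.eye(3)])

# The local derivative equals the forward output, exp(a).
np.testing.assert_allclose(num_grad, np.exp(a), rtol=1e-6)
```

This is also why the implementation below caches the forward result in `self.out`: the backward pass can reuse `exp(a)` instead of recomputing it.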
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a810bf4e-1d48-412b-bf89-2a6fde789ae4", + "metadata": {}, + "outputs": [], + "source": [ + "class Exp(TensorOp):\n", + " \"\"\"\n", + " Calculates the exponential of the given tensor.\n", + " \n", + " Example:\n", + " >>> a = Tensor([1, 2, 3])\n", + " >>> op = Exp()\n", + " >>> result = op.compute(a)\n", + " >>> print(result)\n", + " Tensor([2.71828183, 7.3890561, 20.08553692])\n", + " \"\"\"\n", + " \n", + " def compute(self, a: NDArray) -> NDArray:\n", + " \"\"\"\n", + " Computes the exponential of a tensor.\n", + "\n", + " Args:\n", + " - a: The tensor.\n", + "\n", + " Returns:\n", + " The exponential of a.\n", + " \"\"\"\n", + " self.out = array_api.exp(a)\n", + " return self.out\n", + "\n", + " def gradient(self, out_grad: Tensor, node: Tensor) -> Tuple[Tensor,]:\n", + " \"\"\"\n", + " Computes the gradient of the exponential operation.\n", + "\n", + " Args:\n", + " - out_grad: The gradient of the output of the operation.\n", + " - node: The node in the computational graph where the operation was performed.\n", + "\n", + " Returns:\n", + " The gradients with respect to the inputs.\n", + " \"\"\"\n", + " return (out_grad * self.out, )\n", + "\n", + "def exp(a: Tensor) -> Tensor:\n", + " \"\"\"\n", + " Calculates the exponential of the given tensor.\n", + "\n", + " Args:\n", + " - a: The tensor.\n", + "\n", + " Returns:\n", + " The exponential of a.\n", + " \n", + " Example:\n", + " >>> a = Tensor([1, 2, 3])\n", + " >>> result = exp(a)\n", + " >>> print(result)\n", + " Tensor([2.71828183, 7.3890561, 20.08553692])\n", + " \"\"\"\n", + " return Exp()(a)" + ] + }, + { + "cell_type": "markdown", + "id": "45592e37-9a6d-42ec-8458-167a94394cc1", + "metadata": {}, + "source": [ + "## ReLU" + ] + }, + { + "cell_type": "markdown", + "id": "2c8aefd6-23e2-4df7-8627-f74813b8f0bc", + "metadata": {}, + "source": [ + "The derivative of the `ReLU` (Rectified Linear Unit) operator:\n", + "\n", + "Let's denote `a` as the tensor on which the ReLU function is applied. The ReLU function is defined as follows: \n", + "\n", + "$$\n", + "f(a) = \n", + "\\begin{cases}\n", + "a, & \\text{if } a \\geq 0 \\\\\n", + "0, & \\text{if } a < 0\n", + "\\end{cases}\n", + "$$\n", + "\n", + "The function for the backward pass (i.e., the gradient) is `df/da = 1` if `a >= 0`, and `df/da = 0` if `a < 0`.\n", + "\n", + "We are given a function $f(a) = \\max(0, a)$, where $a$ is a tensor. Our task is to find the derivative of this function with respect to $a$.\n", + "\n", + "By considering the definition of the ReLU function, we can write $f(a)$ as:\n", + "\n", + "$$\n", + "f(a) = \n", + "\\begin{cases}\n", + "a, & \\text{if } a \\geq 0 \\\\\n", + "0, & \\text{if } a < 0\n", + "\\end{cases}\n", + "$$\n", + "\n", + "Now, let's differentiate $f(a)$ with respect to $a$:\n", + "\n", + "$$\n", + "\\frac{df}{da} = \n", + "\\begin{cases}\n", + "1, & \\text{if } a \\geq 0 \\\\\n", + "0, & \\text{if } a < 0\n", + "\\end{cases}\n", + "$$\n", + "\n", + "Therefore, the gradient of $f(a)$ with respect to $a$ is $1$ if $a \\geq 0$, and $0$ if $a < 0$." 
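The piecewise gradient can be checked numerically as well; the sketch below uses plain NumPy (not the minima API) and deliberately avoids the point `a = 0`, where ReLU is not differentiable and the convention above assigns a gradient of 1.

```python
import numpy as np

relu = lambda x: np.maximum(x, 0.0)
a = np.array([-2.0, -0.5, 0.5, 3.0])      # keep entries away from exactly 0
eps = 1e-6
num_grad = np.array([(np.sum(relu(a + eps * e)) - np.sum(relu(a - eps * e))) / (2 * eps)
                     for e in np.eye(len(a))])

# Matches the indicator (a >= 0): zero for negative entries, one for positive ones.
np.testing.assert_allclose(num_grad, (a >= 0).astype(float), atol=1e-6)
```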
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f32b2ce4-7d9e-4290-90d9-a56bc59cfe2a", + "metadata": {}, + "outputs": [], + "source": [ + "class ReLU(TensorOp):\n", + " \"\"\"\n", + " Applies the ReLU (Rectified Linear Unit) activation function to the given tensor.\n", + " \n", + " Example:\n", + " >>> a = Tensor([1, -2, 3])\n", + " >>> op = ReLU()\n", + " >>> result = op.compute(a)\n", + " >>> print(result)\n", + " Tensor([1, 0, 3])\n", + " \"\"\"\n", + " \n", + " def compute(self, a: NDArray) -> NDArray:\n", + " \"\"\"\n", + " Computes the ReLU activation function on a tensor.\n", + "\n", + " Args:\n", + " - a: The tensor.\n", + "\n", + " Returns:\n", + " The result of applying ReLU to a.\n", + " \"\"\"\n", + " self.out = array_api.clip(a, a_min=0)\n", + " return self.out\n", + "\n", + " def gradient(self, out_grad: Tensor, node: Tensor) -> Tuple[Tensor,]:\n", + " \"\"\"\n", + " Computes the gradient of the ReLU operation.\n", + "\n", + " Args:\n", + " - out_grad: The gradient of the output of the operation.\n", + " - node: The node in the computational graph where the operation was performed.\n", + "\n", + " Returns:\n", + " The gradients with respect to the inputs.\n", + " \"\"\"\n", + " return (out_grad * Tensor(node.children[0] >= 0), )\n", + "\n", + "def relu(a: Tensor) -> Tensor:\n", + " \"\"\"\n", + " Applies the ReLU (Rectified Linear Unit) activation function to the given tensor.\n", + "\n", + " Args:\n", + " - a: The tensor.\n", + "\n", + " Returns:\n", + " The result of applying ReLU to a.\n", + " \n", + " Example:\n", + " >>> a = Tensor([1, -2, 3])\n", + " >>> result = relu(a)\n", + " >>> print(result)\n", + " Tensor([1, 0, 3])\n", + " \"\"\"\n", + " return ReLU()(a)\n" + ] + }, + { + "cell_type": "markdown", + "id": "8e9f074a-e23c-4e8a-8ab7-2cfba344c461", + "metadata": {}, + "source": [ + "## Power Scalar" + ] + }, + { + "cell_type": "markdown", + "id": "40d5e875-0981-4243-86b5-d6ca6b117d5e", + "metadata": {}, + "source": [ + "The derivative of the `PowerScalar` operator:\n", + "\n", + "Let's denote the scalar as `n` and `a` as the tensor being raised to the power of the scalar. The operation can be described as `f(a) = a^n`.\n", + "\n", + "The function for the backward pass (i.e., the gradient) is `df/da = n * a^(n-1)`.\n", + "\n", + "We are given a function $f(a) = a^n$, where $a$ is a tensor and $n$ is a scalar. Our task is to find the derivative of this function with respect to $a$.\n", + "\n", + "By differentiating the function $f(a)$ with respect to $a$, we find:\n", + "\n", + "\\begin{align*}\n", + "\\frac{df}{da} &= \\frac{d}{da} (a^n) \\\\\n", + "&= n \\cdot a^{n-1}\n", + "\\end{align*}\n", + "\n", + "Therefore, the gradient of $f(a)$ with respect to $a$ is $n \\cdot a^{n-1}$." 
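The same kind of NumPy-only finite-difference check (independent of the minima classes) confirms the power rule for a concrete exponent, here `n = 3`.

```python
import numpy as np

a = np.array([0.5, 1.0, 2.0])
n = 3
eps = 1e-6
num_grad = np.array([(np.sum((a + eps * e) ** n) - np.sum((a - eps * e) ** n)) / (2 * eps)
                     for e in np.eye(3)])

# Matches n * a**(n - 1), the local derivative used by PowerScalar.gradient below.
np.testing.assert_allclose(num_grad, n * a ** (n - 1), rtol=1e-6)
```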
] }, { "cell_type": "code", "execution_count": null, - "id": "b652622e-e158-43e7-a32a-aa92c13d85cb", + "id": "364103cb-615b-4359-8061-5a8bd1455367", "metadata": {}, "outputs": [], - "source": [] + "source": [ + "class PowerScalar(TensorOp):\n", + " \"\"\"\n", + " The PowerScalar operation raises a tensor to an (integer) power.\n", + "\n", + " Attributes:\n", + " scalar (int): The power to raise the tensor to.\n", + "\n", + " Example:\n", + " >>> import numpy as np\n", + " >>> tensor = Tensor(np.array([1, 2, 3]))\n", + " >>> pow_scalar = PowerScalar(2)\n", + " >>> result = pow_scalar.compute(tensor.data)\n", + " >>> print(result)\n", + " array([1, 4, 9])\n", + "\n", + " \"\"\"\n", + "\n", + " def __init__(self, scalar: int):\n", + " \"\"\"\n", + " Constructs the PowerScalar operation.\n", + "\n", + " Args:\n", + " scalar (int): The power to raise the tensor to.\n", + " \"\"\"\n", + " self.scalar = scalar\n", + "\n", + " def compute(self, a: NDArray) -> NDArray:\n", + " \"\"\"\n", + " Computes the power operation on the input tensor.\n", + "\n", + " Args:\n", + " a (NDArray): The input tensor.\n", + "\n", + " Returns:\n", + " NDArray: The resulting tensor after the power operation.\n", + " \"\"\"\n", + " return array_api.power(a, self.scalar)\n", + "\n", + " def gradient(self, out_grad: Tensor, node: Tensor) -> Tuple[Tensor, ]:\n", + " \"\"\"\n", + " Computes the gradient of the power operation.\n", + "\n", + " Args:\n", + " out_grad (Tensor): The gradient of the output tensor.\n", + " node (Tensor): The node in the computational graph where the operation was performed.\n", + "\n", + " Returns:\n", + " Tuple[Tensor, ]: The gradient with respect to the input tensor.\n", + " \"\"\"\n", + " a = node.children[0]\n", + " return (self.scalar * power_scalar(a, self.scalar - 1) * out_grad, )\n", + "\n", + "\n", + "def power_scalar(a: Tensor, scalar: int) -> Tensor:\n", + " \"\"\"\n", + " Raises a tensor to a power.\n", + "\n", + " Args:\n", + " a (Tensor): The input tensor.\n", + " scalar (int): The power to raise the tensor to.\n", + "\n", + " Returns:\n", + " Tensor: The resulting tensor after the power operation.\n", + "\n", + " Example:\n", + " >>> import numpy as np\n", + " >>> tensor = Tensor(np.array([1, 2, 3]))\n", + " >>> result = power_scalar(tensor, 2)\n", + " >>> print(result)\n", + " Tensor([1, 4, 9])\n", + " \"\"\"\n", + " return PowerScalar(scalar)(a)" + ] + }, + { + "cell_type": "markdown", + "id": "0cbb40d3-1ed4-4a7f-9d74-b39b70187860", + "metadata": {}, + "source": [ + "## Element Wise Divide" + ] + }, + { + "cell_type": "markdown", + "id": "24a7c7e2-d71d-49c0-9c45-ffc7f0830334", + "metadata": {}, + "source": [ + "The operation described here is an element-wise division of two tensors, `a` and `b`, where the operation can be described as `f(a, b) = a / b`. \n", + "\n", + "We'll compute the partial derivatives with respect to `a` and `b`:\n", + "\n", + "1. The partial derivative of `f(a, b)` with respect to `a` (`df/da`) is `1/b`.\n", + "\n", + "2. The partial derivative of `f(a, b)` with respect to `b` (`df/db`) is `-a / b^2`.\n", + "\n", + "We are given a function $f(a, b) = \\frac{a}{b}$, where $a$ and $b$ are tensors. 
Our task is to find the partial derivatives of this function with respect to $a$ and $b$.\n", + "\n", + "Let's start with $\\frac{\\partial f}{\\partial a}$:\n", + "\n", + "\\begin{align*}\n", + "\\frac{\\partial f}{\\partial a} &= \\frac{\\partial}{\\partial a} \\left(\\frac{a}{b}\\right) \\\\\n", + "&= \\frac{1}{b}\n", + "\\end{align*}\n", + "\n", + "Now, let's compute $\\frac{\\partial f}{\\partial b}$:\n", + "\n", + "\\begin{align*}\n", + "\\frac{\\partial f}{\\partial b} &= \\frac{\\partial}{\\partial b} \\left(\\frac{a}{b}\\right) \\\\\n", + "&= - \\frac{a}{b^{2}}\n", + "\\end{align*}\n", + "\n", + "Here is a detailed derivative:\n", + "\n", + "Given a function of the form $y = \\frac{u}{v}$, where both $u$ and $v$ are functions of $x$, the quotient rule of differentiation states:\n", + "\n", + "$$\\frac{dy}{dx} = \\frac{v \\cdot \\frac{du}{dx} - u \\cdot \\frac{dv}{dx}}{v^2}$$\n", + "\n", + "In our case, we're looking at the function $y = \\frac{a}{b}$, where $a$ and $b$ are tensors. We want to find the derivative with respect to $b$ (instead of $x$ in our general formula). So we have:\n", + "\n", + "$$\\frac{dy}{db} = \\frac{b \\cdot \\frac{da}{db} - a \\cdot \\frac{db}{db}}{b^2}$$\n", + "\n", + "Since $a$ does not depend on $b$, $\\frac{da}{db} = 0$, and since any variable is equal to itself, $\\frac{db}{db} = 1$. \n", + "\n", + "So the derivative $\\frac{dy}{db}$ simplifies to:\n", + "\n", + "$$\\frac{dy}{db} = \\frac{b \\cdot 0 - a \\cdot 1}{b^2}$$\n", + "\n", + "Therefore, the derivative of $y$ with respect to $b$ is $-\\frac{a}{b^2}$.\n", + "\n", + "Therefore, the gradient of $f(a, b)$ with respect to $a$ is $\\frac{1}{b}$, and the gradient of $f(a, b)$ with respect to $b$ is $- \\frac{a}{b^{2}}$." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d841c0bd-e4ad-4dd8-add8-0a4626a774fe", + "metadata": {}, + "outputs": [], + "source": [ + "class EWiseDiv(TensorOp):\n", + " \"\"\"\n", + " The EWiseDiv operation divides two tensors element-wise.\n", + "\n", + " Example:\n", + " >>> import numpy as np\n", + " >>> a = Tensor(np.array([1, 2, 3]))\n", + " >>> b = Tensor(np.array([4, 5, 6]))\n", + " >>> div = EWiseDiv()\n", + " >>> result = div.compute(a.data, b.data)\n", + " >>> print(result)\n", + " array([0.25, 0.4, 0.5])\n", + "\n", + " \"\"\"\n", + "\n", + " def compute(self, a: NDArray, b: NDArray) -> NDArray:\n", + " \"\"\"\n", + " Computes the element-wise division of two tensors.\n", + "\n", + " Args:\n", + " a (NDArray): The dividend tensor.\n", + " b (NDArray): The divisor tensor.\n", + "\n", + " Returns:\n", + " NDArray: The resulting tensor after element-wise division.\n", + " \"\"\"\n", + " return a / b\n", + "\n", + " def gradient(self, out_grad: Tensor, node: Tensor) -> Tuple[Tensor, Tensor]:\n", + " \"\"\"\n", + " Computes the gradient of the element-wise division operation.\n", + "\n", + " Args:\n", + " out_grad (Tensor): The gradient of the output tensor.\n", + " node (Tensor): The node in the computational graph where the operation was performed.\n", + "\n", + " Returns:\n", + " Tuple[Tensor, Tensor]: The gradients with respect to the dividend and divisor tensors.\n", + " \"\"\"\n", + " a, b = node.inputs\n", + " return divide(out_grad, b), out_grad * negate(divide(a, power_scalar(b, 2)))\n", + "\n", + "\n", + "def divide(a: Tensor, b: Tensor) -> Tensor:\n", + " \"\"\"\n", + " Divides two tensors element-wise.\n", + "\n", + " Args:\n", + " a (Tensor): The dividend tensor.\n", + " b (Tensor): The divisor tensor.\n", + "\n", + " Returns:\n", + " 
Tensor: The resulting tensor after element-wise division.\n", + "\n", + " Example:\n", + " >>> import numpy as np\n", + " >>> a = Tensor(np.array([1, 2, 3]))\n", + " >>> b = Tensor(np.array([4, 5, 6]))\n", + " >>> result = divide(a, b)\n", + " >>> print(result)\n", + " Tensor([0.25, 0.4, 0.5])\n", + " \"\"\"\n", + " return EWiseDiv()(a, b)\n" + ] + }, + { + "cell_type": "markdown", + "id": "2f7daf46-5e14-4bf7-9f7f-81947e7e7cd3", + "metadata": {}, + "source": [ + "## Divide Scalar" + ] + }, + { + "cell_type": "markdown", + "id": "87d98d2f-34a5-4744-994a-9c9264bfb4a9", + "metadata": {}, + "source": [ + "Let's denote the scalar as `c`, and `a` as the tensor being divided by the scalar. The operation can be described as `f(a) = a / c`.\n", + "\n", + "The function for the backward pass (i.e., the gradient) is `df/da = 1/c`.\n", + "\n", + "This is the derivative of `f(a)` with respect to `a`.\n", + "\n", + "We are given a function $f(a) = \\frac{a}{c}$, where $a$ is a tensor and $c$ is a scalar. Our task is to find the derivative of this function with respect to $a$.\n", + "\n", + "By using the power rule of differentiation, where the derivative of $a^n$ is $n \\cdot a^{n-1}$, we can rewrite $f(a)$ as $f(a) = c^{-1}a$. \n", + "\n", + "Now, we can differentiate this with respect to $a$:\n", + "\n", + "\\begin{align*}\n", + "\\frac{df}{da} &= \\frac{d}{da} (c^{-1}a) \\\\\n", + "&= c^{-1} \\frac{d}{da} (a) \\\\\n", + "&= c^{-1} \\\\\n", + "&= \\frac{1}{c}\n", + "\\end{align*}\n", + "\n", + "Therefore, the gradient of $f(a)$ with respect to $a$ is $\\frac{1}{c}$." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dbceaf89-d19a-43e2-9ce2-3a9c093612ca", + "metadata": {}, + "outputs": [], + "source": [ + "class DivScalar(TensorOp):\n", + " \"\"\"\n", + " The DivScalar operation divides a tensor by a scalar.\n", + "\n", + " Example:\n", + " >>> import numpy as np\n", + " >>> a = Tensor(np.array([1, 2, 3]))\n", + " >>> scalar = 2\n", + " >>> div_scalar = DivScalar(scalar)\n", + " >>> result = div_scalar.compute(a.data)\n", + " >>> print(result)\n", + " array([0.5, 1.0, 1.5])\n", + "\n", + " \"\"\"\n", + "\n", + " def __init__(self, scalar: Union[int, float]):\n", + " \"\"\"\n", + " Initialize the DivScalar operation with the scalar to divide by.\n", + "\n", + " Args:\n", + " scalar (int, float): The scalar to divide the tensor by.\n", + " \"\"\"\n", + " self.scalar = scalar\n", + "\n", + " def compute(self, a: NDArray) -> NDArray:\n", + " \"\"\"\n", + " Divides the tensor by the scalar.\n", + "\n", + " Args:\n", + " a (NDArray): The tensor to divide.\n", + "\n", + " Returns:\n", + " NDArray: The resulting tensor after division.\n", + " \"\"\"\n", + " return a / self.scalar\n", + "\n", + " def gradient(self, out_grad: Tensor, node: Tensor) -> Tuple[Tensor, ...]:\n", + " \"\"\"\n", + " Computes the gradient of the division operation.\n", + "\n", + " Args:\n", + " out_grad (Tensor): The gradient of the output tensor.\n", + " node (Tensor): The node in the computational graph where the operation was performed.\n", + "\n", + " Returns:\n", + " Tuple[Tensor, ...]: The gradient with respect to the tensor.\n", + " \"\"\"\n", + " return (out_grad / self.scalar, )\n", + "\n", + "def divide_scalar(a: Tensor, scalar: Union[int, float]) -> Tensor:\n", + " \"\"\"\n", + " Divides a tensor by a scalar.\n", + "\n", + " Args:\n", + " a (Tensor): The tensor to divide.\n", + " scalar (int, float): The scalar to divide the tensor by.\n", + "\n", + " Returns:\n", + " Tensor: The resulting tensor after 
division.\n", + "\n", + " Example:\n", + " >>> import numpy as np\n", + " >>> a = Tensor(np.array([1, 2, 3]))\n", + " >>> scalar = 2\n", + " >>> result = divide_scalar(a, scalar)\n", + " >>> print(result)\n", + " Tensor([0.5, 1.0, 1.5])\n", + " \"\"\"\n", + " return DivScalar(scalar)(a)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9fc235f7-c1f4-42b5-b3a1-d9cb909617d9", + "metadata": {}, + "outputs": [], + "source": [ + "import nbdev; nbdev.nbdev_export()" + ] } ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "python3", "language": "python", "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.8" } }, "nbformat": 4, diff --git a/ops.ipynb b/ops.ipynb new file mode 100644 index 0000000..3962fff --- /dev/null +++ b/ops.ipynb @@ -0,0 +1,2197 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "id": "30e46e7a-8cc6-4dd5-8536-e4f17a480d58", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "ename": "ImportError", + "evalue": "attempted relative import with no known parent package", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mImportError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[2], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mnumbers\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Number\n\u001b[1;32m 2\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtyping\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Optional, List\n\u001b[0;32m----> 3\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mautograd\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m NDArray\n\u001b[1;32m 4\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mautograd\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Op, Tensor, Value, TensorOp\n\u001b[1;32m 5\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mautograd\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m TensorTuple, TensorTupleOp\n", + "\u001b[0;31mImportError\u001b[0m: attempted relative import with no known parent package" + ] + } + ], + "source": [ + "from numbers import Number\n", + "from typing import Optional, List\n", + "from .autograd import NDArray\n", + "from .autograd import Op, Tensor, Value, TensorOp\n", + "from .autograd import TensorTuple, TensorTupleOp\n", + "import numpy\n", + "\n", + "# NOTE: we will import numpy as the array_api\n", + "# as the backend for our computations, this line will change in later homeworks\n", + "import numpy as array_api" + ] + }, + { + "cell_type": "markdown", + "id": "b7e1bd11-e64e-4403-812f-cbc0e81aa9b6", + "metadata": { + "tags": [] + }, + "source": [ + "## AutoGrad" + ] + }, + { + "cell_type": "code", + "execution_count": 110, + "id": "c3fd0804-dbc1-472e-a73d-70c2d58f8e28", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from typing import List, Optional, NamedTuple, Tuple, Union\n", + "from collections import namedtuple\n", + "import numpy\n", + "\n", + "# needle version\n", + "LAZY_MODE = False\n", + "TENSOR_COUNTER = 0\n", + "\n", + "# NOTE: we 
will import numpy as the array_api\n", + "# as the backend for our computations, this line will change in later homeworks\n", + "import numpy as array_api\n", + "NDArray = numpy.ndarray\n", + "\n", + "\n", + "class Device:\n", + " \"\"\"Indicates the device supporting an NDArray.\"\"\"\n", + "\n", + "\n", + "class CPUDevice(Device):\n", + " \"\"\"Represents data that sits in CPU\"\"\"\n", + "\n", + " def __repr__(self):\n", + " return \"needle.cpu()\"\n", + "\n", + " def __hash__(self):\n", + " return self.__repr__().__hash__()\n", + "\n", + " def __eq__(self, other):\n", + " return isinstance(other, CPUDevice)\n", + "\n", + " def enabled(self):\n", + " return True\n", + "\n", + "def cpu():\n", + " \"\"\"Return cpu device\"\"\"\n", + " return CPUDevice()\n", + "\n", + "def all_devices():\n", + " \"\"\"return a list of all available devices\"\"\"\n", + " return [cpu()]\n", + "\n", + "\n", + "class Op:\n", + " \"\"\"Operator definition.\"\"\"\n", + "\n", + " def __call__(self, *args):\n", + " raise NotImplementedError()\n", + "\n", + " def compute(self, *args: Tuple[NDArray]):\n", + " \"\"\"Calculate forward pass of operator.\n", + "\n", + " Parameters\n", + " ----------\n", + " input: np.ndarray\n", + " A list of input arrays to the function\n", + "\n", + " Returns\n", + " -------\n", + " output: nd.array\n", + " Array output of the operation\n", + "\n", + " \"\"\"\n", + " raise NotImplementedError()\n", + "\n", + " def gradient(\n", + " self, out_grad: \"Value\", node: \"Value\"\n", + " ) -> Union[\"Value\", Tuple[\"Value\"]]:\n", + " \"\"\"Compute partial adjoint for each input value for a given output adjoint.\n", + "\n", + " Parameters\n", + " ----------\n", + " out_grad: Value\n", + " The adjoint wrt to the output value.\n", + "\n", + " node: Value\n", + " The value node of forward evaluation.\n", + "\n", + " Returns\n", + " -------\n", + " input_grads: Value or Tuple[Value]\n", + " A list containing partial gradient adjoints to be propagated to\n", + " each of the input node.\n", + " \"\"\"\n", + " raise NotImplementedError()\n", + "\n", + " def gradient_as_tuple(self, out_grad: \"Value\", node: \"Value\") -> Tuple[\"Value\"]:\n", + " \"\"\" Convenience method to always return a tuple from gradient call\"\"\"\n", + " output = self.gradient(out_grad, node)\n", + " if isinstance(output, tuple):\n", + " return output\n", + " elif isinstance(output, list):\n", + " return tuple(output)\n", + " else:\n", + " return (output,)\n", + "\n", + "\n", + "class TensorOp(Op):\n", + " \"\"\" Op class specialized to output tensors, will be alternate subclasses for other structures \"\"\"\n", + "\n", + " def __call__(self, *args):\n", + " return Tensor.make_from_op(self, args)\n", + "\n", + "\n", + "class TensorTupleOp(Op):\n", + " \"\"\"Op class specialized to output TensorTuple\"\"\"\n", + "\n", + " def __call__(self, *args):\n", + " return TensorTuple.make_from_op(self, args)\n", + "\n", + "\n", + "class Value:\n", + " \"\"\"A value in the computational graph.\"\"\"\n", + "\n", + " # trace of computational graph\n", + " op: Optional[Op]\n", + " inputs: List[\"Value\"]\n", + " # The following fields are cached fields for\n", + " # dynamic computation\n", + " cached_data: NDArray\n", + " requires_grad: bool\n", + "\n", + " def realize_cached_data(self):\n", + " \"\"\"Run compute to realize the cached data\"\"\"\n", + " # avoid recomputation\n", + " if self.cached_data is not None:\n", + " return self.cached_data\n", + " # note: data implicitly calls realized cached data\n", + " self.cached_data = 
self.op.compute(\n", + " *[x.realize_cached_data() for x in self.inputs]\n", + " )\n", + " return self.cached_data\n", + "\n", + " def is_leaf(self):\n", + " return self.op is None\n", + "\n", + " def __del__(self):\n", + " global TENSOR_COUNTER\n", + " TENSOR_COUNTER -= 1\n", + "\n", + " def _init(\n", + " self,\n", + " op: Optional[Op],\n", + " inputs: List[\"Tensor\"],\n", + " *,\n", + " num_outputs: int = 1,\n", + " cached_data: List[object] = None,\n", + " requires_grad: Optional[bool] = None\n", + " ):\n", + " global TENSOR_COUNTER\n", + " TENSOR_COUNTER += 1\n", + " if requires_grad is None:\n", + " requires_grad = any(x.requires_grad for x in inputs)\n", + " self.op = op\n", + " self.inputs = inputs\n", + " self.num_outputs = num_outputs\n", + " self.cached_data = cached_data\n", + " self.requires_grad = requires_grad\n", + "\n", + " @classmethod\n", + " def make_const(cls, data, *, requires_grad=False):\n", + " value = cls.__new__(cls)\n", + " value._init(\n", + " None,\n", + " [],\n", + " cached_data=data,\n", + " requires_grad=requires_grad,\n", + " )\n", + " return value\n", + "\n", + " @classmethod\n", + " def make_from_op(cls, op: Op, inputs: List[\"Value\"]):\n", + " value = cls.__new__(cls)\n", + " value._init(op, inputs)\n", + "\n", + " if not LAZY_MODE:\n", + " if not value.requires_grad:\n", + " return value.detach()\n", + " value.realize_cached_data()\n", + " return value\n", + "\n", + "\n", + "### Not needed in HW1\n", + "class TensorTuple(Value):\n", + " \"\"\"Represent a tuple of tensors.\n", + "\n", + " To keep things simple, we do not support nested tuples.\n", + " \"\"\"\n", + "\n", + " def __len__(self):\n", + " cdata = self.realize_cached_data()\n", + " return len(cdata)\n", + "\n", + " def __getitem__(self, index: int):\n", + " return needle.ops.tuple_get_item(self, index)\n", + "\n", + " def tuple(self):\n", + " return tuple([x for x in self])\n", + "\n", + " def __repr__(self):\n", + " return \"needle.TensorTuple\" + str(self.tuple())\n", + "\n", + " def __str__(self):\n", + " return self.__repr__()\n", + "\n", + " def __add__(self, other):\n", + " assert isinstance(other, TensorTuple)\n", + " assert len(self) == len(other)\n", + " return needle.ops.make_tuple(*[self[i] + other[i] for i in range(len(self))])\n", + "\n", + " def detach(self):\n", + " \"\"\"Create a new tensor that shares the data but detaches from the graph.\"\"\"\n", + " return Tuple.make_const(self.realize_cached_data())\n", + "\n", + "\n", + "class Tensor(Value):\n", + " grad: \"Tensor\"\n", + "\n", + " def __init__(\n", + " self,\n", + " array,\n", + " *,\n", + " device: Optional[Device] = None,\n", + " dtype=None,\n", + " requires_grad=True,\n", + " **kwargs\n", + " ):\n", + " if isinstance(array, Tensor):\n", + " if device is None:\n", + " device = array.device\n", + " if dtype is None:\n", + " dtype = array.dtype\n", + " if device == array.device and dtype == array.dtype:\n", + " cached_data = array.realize_cached_data()\n", + " else:\n", + " # fall back, copy through numpy conversion\n", + " cached_data = Tensor._array_from_numpy(\n", + " array.numpy(), device=device, dtype=dtype\n", + " )\n", + " else:\n", + " device = device if device else cpu()\n", + " cached_data = Tensor._array_from_numpy(array, device=device, dtype=dtype)\n", + "\n", + " self._init(\n", + " None,\n", + " [],\n", + " cached_data=cached_data,\n", + " requires_grad=requires_grad,\n", + " )\n", + "\n", + " @staticmethod\n", + " def _array_from_numpy(numpy_array, device, dtype):\n", + " if array_api is numpy:\n", + " 
return numpy.array(numpy_array, dtype=dtype)\n", + " return array_api.array(numpy_array, device=device, dtype=dtype)\n", + "\n", + " @staticmethod\n", + " def make_from_op(op: Op, inputs: List[\"Value\"]):\n", + " tensor = Tensor.__new__(Tensor)\n", + " tensor._init(op, inputs)\n", + " if not LAZY_MODE:\n", + " tensor.realize_cached_data()\n", + " return tensor\n", + "\n", + " @staticmethod\n", + " def make_const(data, requires_grad=False):\n", + " tensor = Tensor.__new__(Tensor)\n", + " tensor._init(\n", + " None,\n", + " [],\n", + " cached_data=data\n", + " if not isinstance(data, Tensor)\n", + " else data.realize_cached_data(),\n", + " requires_grad=requires_grad,\n", + " )\n", + " return tensor\n", + "\n", + " @property\n", + " def data(self):\n", + " return self.detach()\n", + "\n", + " @data.setter\n", + " def data(self, value):\n", + " assert isinstance(value, Tensor)\n", + " assert value.dtype == self.dtype, \"%s %s\" % (\n", + " value.dtype,\n", + " self.dtype,\n", + " )\n", + " self.cached_data = value.realize_cached_data()\n", + "\n", + " def detach(self):\n", + " \"\"\"Create a new tensor that shares the data but detaches from the graph.\"\"\"\n", + " return Tensor.make_const(self.realize_cached_data())\n", + "\n", + " @property\n", + " def shape(self):\n", + " return self.realize_cached_data().shape\n", + "\n", + " @property\n", + " def dtype(self):\n", + " return self.realize_cached_data().dtype\n", + "\n", + " @property\n", + " def device(self):\n", + " data = self.realize_cached_data()\n", + " # numpy array always sits on cpu\n", + " if array_api is numpy:\n", + " return cpu()\n", + " return data.device\n", + "\n", + " def backward(self, out_grad=None):\n", + " out_grad = out_grad if out_grad else Tensor(numpy.ones(self.shape))\n", + " compute_gradient_of_variables(self, out_grad)\n", + "\n", + " def __repr__(self):\n", + " return \"needle.Tensor(\" + str(self.realize_cached_data()) + \")\"\n", + "\n", + " def __str__(self):\n", + " return self.realize_cached_data().__str__()\n", + "\n", + " def numpy(self):\n", + " data = self.realize_cached_data()\n", + " if array_api is numpy:\n", + " return data\n", + " return data.numpy()\n", + "\n", + " def __add__(self, other):\n", + " if isinstance(other, Tensor):\n", + " return needle.ops.EWiseAdd()(self, other)\n", + " else:\n", + " return needle.ops.AddScalar(other)(self)\n", + "\n", + " def __mul__(self, other):\n", + " if isinstance(other, Tensor):\n", + " return needle.ops.EWiseMul()(self, other)\n", + " else:\n", + " return needle.ops.MulScalar(other)(self)\n", + "\n", + " def __pow__(self, other):\n", + " if isinstance(other, Tensor):\n", + " raise NotImplementedError()\n", + " else:\n", + " return needle.ops.PowerScalar(other)(self)\n", + "\n", + " def __sub__(self, other):\n", + " if isinstance(other, Tensor):\n", + " return needle.ops.EWiseAdd()(self, needle.ops.Negate()(other))\n", + " else:\n", + " return needle.ops.AddScalar(-other)(self)\n", + "\n", + " def __truediv__(self, other):\n", + " if isinstance(other, Tensor):\n", + " return needle.ops.EWiseDiv()(self, other)\n", + " else:\n", + " return needle.ops.DivScalar(other)(self)\n", + "\n", + " def __matmul__(self, other):\n", + " return needle.ops.MatMul()(self, other)\n", + "\n", + " def matmul(self, other):\n", + " return needle.ops.MatMul()(self, other)\n", + "\n", + " def sum(self, axes=None):\n", + " return needle.ops.Summation(axes)(self)\n", + "\n", + " def broadcast_to(self, shape):\n", + " return needle.ops.BroadcastTo(shape)(self)\n", + "\n", + " def 
reshape(self, shape):\n", + " return needle.ops.Reshape(shape)(self)\n", + "\n", + " def __neg__(self):\n", + " return needle.ops.Negate()(self)\n", + "\n", + " def transpose(self, axes=None):\n", + " return needle.ops.Transpose(axes)(self)\n", + "\n", + " __radd__ = __add__\n", + " __rmul__ = __mul__\n", + " __rsub__ = __sub__\n", + " __rmatmul__ = __matmul__\n", + "\n", + "\n", + "def compute_gradient_of_variables(output_tensor, out_grad):\n", + " \"\"\"Take gradient of output node with respect to each node in node_list.\n", + "\n", + " Store the computed result in the grad field of each Variable.\n", + " \"\"\"\n", + " # a map from node to a list of gradient contributions from each output node\n", + " node_to_output_grads_list: Dict[Tensor, List[Tensor]] = {}\n", + " # Special note on initializing gradient of\n", + " # We are really taking a derivative of the scalar reduce_sum(output_node)\n", + " # instead of the vector output_node. But this is the common case for loss function.\n", + " node_to_output_grads_list[output_tensor] = [out_grad]\n", + "\n", + " # Traverse graph in reverse topological order given the output_node that we are taking gradient wrt.\n", + " reverse_topo_order = list(reversed(find_topo_sort([output_tensor])))\n", + "\n", + " ### BEGIN YOUR SOLUTION\n", + " \n", + " for node in reverse_topo_order:\n", + " node.gradient(out_grad, output_tensor)\n", + " ### END YOUR SOLUTION\n", + "\n", + "\n", + "def find_topo_sort(node_list: List[Value]) -> List[Value]:\n", + " \"\"\"Given a list of nodes, return a topological sort list of nodes ending in them.\n", + "\n", + " A simple algorithm is to do a post-order DFS traversal on the given nodes,\n", + " going backwards based on input edges. Since a node is added to the ordering\n", + " after all its predecessors are traversed due to post-order DFS, we get a topological\n", + " sort.\n", + " \"\"\"\n", + " topo = []\n", + " visited = set()\n", + "\n", + " for node in node.inputs:\n", + " if node not in visited: topo_sort_dfs(node, visited, topo)\n", + " \n", + " topo_order.reverse() # Reverse the list to get the correct topological order\n", + " return topo_order\n", + "\n", + "\n", + "def topo_sort_dfs(node, visited, topo_order):\n", + " \"\"\"Post-order DFS\"\"\"\n", + " ### BEGIN YOUR SOLUTION\n", + " \n", + " visited.add(node)\n", + " for child in node.inputs:\n", + " if child not in visited:\n", + " topo_sort_dfs(child, visited, topo_order)\n", + " topo_order.append(node)\n", + " \n", + " ### END YOUR SOLUTION\n", + "\n", + "\n", + "##############################\n", + "####### Helper Methods #######\n", + "##############################\n", + "\n", + "\n", + "def sum_node_list(node_list):\n", + " \"\"\"Custom sum function in order to avoid create redundant nodes in Python sum implementation.\"\"\"\n", + " from operator import add\n", + " from functools import reduce\n", + "\n", + " return reduce(add, node_list)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 112, + "id": "d496c44c-c6b5-45eb-9f2d-1f5c213bf5b8", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'ndl' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[112], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m# Test case 1\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m a1, b1 
\u001b[38;5;241m=\u001b[39m \u001b[43mndl\u001b[49m\u001b[38;5;241m.\u001b[39mTensor(np\u001b[38;5;241m.\u001b[39masarray([[\u001b[38;5;241m0.88282157\u001b[39m]])), ndl\u001b[38;5;241m.\u001b[39mTensor(np\u001b[38;5;241m.\u001b[39masarray([[\u001b[38;5;241m0.90170084\u001b[39m]]))\n\u001b[1;32m 3\u001b[0m c1 \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m3\u001b[39m\u001b[38;5;241m*\u001b[39ma1\u001b[38;5;241m*\u001b[39ma1 \u001b[38;5;241m+\u001b[39m \u001b[38;5;241m4\u001b[39m\u001b[38;5;241m*\u001b[39mb1\u001b[38;5;241m*\u001b[39ma1 \u001b[38;5;241m-\u001b[39m a1\n\u001b[1;32m 5\u001b[0m soln \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39marray([np\u001b[38;5;241m.\u001b[39marray([[\u001b[38;5;241m0.88282157\u001b[39m]]),\n\u001b[1;32m 6\u001b[0m np\u001b[38;5;241m.\u001b[39marray([[\u001b[38;5;241m2.64846471\u001b[39m]]),\n\u001b[1;32m 7\u001b[0m np\u001b[38;5;241m.\u001b[39marray([[\u001b[38;5;241m2.33812177\u001b[39m]]),\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 12\u001b[0m np\u001b[38;5;241m.\u001b[39marray([[\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m0.88282157\u001b[39m]]),\n\u001b[1;32m 13\u001b[0m np\u001b[38;5;241m.\u001b[39marray([[\u001b[38;5;241m4.63946401\u001b[39m]])])\n", + "\u001b[0;31mNameError\u001b[0m: name 'ndl' is not defined" + ] + } + ], + "source": [ + "# Test case 1\n", + "a1, b1 = ndl.Tensor(np.asarray([[0.88282157]])), ndl.Tensor(np.asarray([[0.90170084]]))\n", + "c1 = 3*a1*a1 + 4*b1*a1 - a1\n", + "\n", + "soln = np.array([np.array([[0.88282157]]),\n", + " np.array([[2.64846471]]),\n", + " np.array([[2.33812177]]),\n", + " np.array([[0.90170084]]),\n", + " np.array([[3.60680336]]),\n", + " np.array([[3.1841638]]),\n", + " np.array([[5.52228558]]),\n", + " np.array([[-0.88282157]]),\n", + " np.array([[4.63946401]])])\n", + "\n", + "topo_order = np.array([x.numpy() for x in ndl.autograd.find_topo_sort([c1])])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3d734fcb-e1d8-4f1f-a08e-6c33d781b917", + "metadata": {}, + "outputs": [], + "source": [ + "assert len(soln) == len(topo_order)\n", + "np.testing.assert_allclose(topo_order, soln, rtol=1e-06, atol=1e-06)" + ] + }, + { + "cell_type": "markdown", + "id": "4756c7c0-cb8e-454e-99c4-c0d8702d3dfc", + "metadata": { + "jp-MarkdownHeadingCollapsed": true, + "tags": [] + }, + "source": [ + "## Element Wise Addition" + ] + }, + { + "cell_type": "markdown", + "id": "c1a04d16-43de-4de2-9df5-512a56f8be2f", + "metadata": {}, + "source": [ + "Let's walk through the step-by-step derivative calculation for the `EWiseAdd` operation:\n", + "\n", + "We have the function `f(a, b) = a + b`, where `a` and `b` are tensors. 
Our goal is to compute the partial derivatives with respect to `a` and `b`.\n", + "\n", + "Let's start by calculating the derivative of `f` with respect to `a`, denoted as `df/da`:\n", + "\n", + "Step 1: Compute the derivative of `f` with respect to `a`.\n", + "\n", + "$\\frac{{\\partial f}}{{\\partial a}} = \\frac{{\\partial}}{{\\partial a}} (a + b)$\n", + "\n", + "Since `a` is the variable we are differentiating with respect to, the derivative of `a` with respect to itself is 1:\n", + "\n", + "$$\\frac{{\\partial f}}{{\\partial a}} = 1$$\n", + "\n", + "Therefore, $$\\frac{{\\partial f}}{{\\partial a}} = 1.$$\n", + "\n", + "Step 2: Compute the derivative of `f` with respect to `b`.\n", + "\n", + "$$\\frac{{\\partial f}}{{\\partial b}} = \\frac{{\\partial}}{{\\partial b}} (a + b)$$\n", + "\n", + "Again, since `b` is the variable we are differentiating with respect to, the derivative of `b` with respect to itself is 1:\n", + "\n", + "$$\\frac{{\\partial f}}{{\\partial b}} = 1$$\n", + "\n", + "Therefore, $$\\frac{{\\partial f}}{{\\partial b}} = 1$$\n", + "\n", + "Hence, the partial derivatives of `f(a, b) = a + b` with respect to `a` and `b` are both equal to 1." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "448fccc7-03d9-4996-a2f2-62b182adaa0e", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "class EWiseAdd(TensorOp):\n", + " def compute(self, a: NDArray, b: NDArray):\n", + " return a + b\n", + "\n", + " def gradient(self, out_grad: Tensor, node: Tensor):\n", + " return out_grad, out_grad\n", + "\n", + "\n", + "def add(a, b):\n", + " return EWiseAdd()(a, b)" + ] + }, + { + "cell_type": "markdown", + "id": "ef42c9e2-8e29-4199-bffa-7967d7820eed", + "metadata": { + "tags": [] + }, + "source": [ + "## Scalar Addition" + ] + }, + { + "cell_type": "markdown", + "id": "64d90354-3519-49b9-9d07-538896f66ef4", + "metadata": {}, + "source": [ + "Certainly! Here's the proof and explanation for the derivative of the `AddScalar` operator:\n", + "\n", + "Let's denote the scalar as `c` and `a` as the tensor being added by the scalar. The operation can be described as `f(a) = a + c`.\n", + "\n", + "The function for the backward pass (i.e., the gradient) is `df/da = 1`, which means the derivative of `f(a)` with respect to `a` is simply `1`.\n", + "\n", + "The LaTeX document will look as follows:\n", + "\n", + "We are given a function $f(a) = a + c$, where $a$ is a tensor and $c$ is a scalar. Our task is to find the derivative of this function with respect to $a$.\n", + "\n", + "By differentiating the function $f(a)$ with respect to $a$, we find:\n", + "\n", + "\\begin{align*}\n", + "\\frac{df}{da} &= \\frac{d}{da} (a + c) \\\\\n", + "&= 1\n", + "\\end{align*}\n", + "\n", + "Therefore, the gradient of $f(a)$ with respect to $a$ is $1$.\n", + "\n", + "\n", + "We starts by defining the function `f(a) = a + c`. It then explains that when we differentiate `f(a)` with respect to `a`, we find that the derivative is `1`. This means that the gradient of `f(a)` with respect to `a` is `1`, which matches the behavior of the `AddScalar` operator as provided in the `gradient` method." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "9afe2bcd-25b6-48df-b98a-143f5baccc6e", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "class AddScalar(TensorOp):\n", + " def __init__(self, scalar):\n", + " self.scalar = scalar\n", + "\n", + " def compute(self, a: NDArray):\n", + " return a + self.scalar\n", + "\n", + " def gradient(self, out_grad: Tensor, node: Tensor):\n", + " return out_grad\n", + "\n", + "\n", + "def add_scalar(a, scalar):\n", + " return AddScalar(scalar)(a)" + ] + }, + { + "cell_type": "markdown", + "id": "e89a5335-0b34-41c1-9011-129983cb9ff8", + "metadata": { + "tags": [] + }, + "source": [ + "## Element Wise Mult" + ] + }, + { + "cell_type": "markdown", + "id": "ad98e363-13c8-46ea-a1c3-f4c27d87d617", + "metadata": {}, + "source": [ + "Certainly! Here's the proof and explanation for the derivative of the `EWiseMul` (element-wise multiplication) operator:\n", + "\n", + "Let's denote the two input tensors as `a` and `b`. The operation can be described as `f(a, b) = a * b`, where `*` represents element-wise multiplication.\n", + "\n", + "The function for the backward pass (i.e., the gradient) is `df/da = b` and `df/db = a`. This means that the derivative of `f(a, b)` with respect to `a` is `b`, and the derivative with respect to `b` is `a`.\n", + "\n", + "The LaTeX document will look as follows:\n", + "\n", + "\n", + "We are given a function $f(a, b) = a \\odot b$, where $a$ and $b$ are tensors, and $\\odot$ represents element-wise multiplication. Our task is to find the derivatives of this function with respect to $a$ and $b$.\n", + "\n", + "By differentiating the function $f(a, b)$ with respect to $a$, we find:\n", + "\n", + "\\begin{align*}\n", + "\\frac{df}{da} &= \\frac{d}{da} (a \\odot b) \\\\\n", + "&= b\n", + "\\end{align*}\n", + "\n", + "Therefore, the gradient of $f(a, b)$ with respect to $a$ is $b$.\n", + "\n", + "Similarly, by differentiating the function $f(a, b)$ with respect to $b$, we find:\n", + "\n", + "\\begin{align*}\n", + "\\frac{df}{db} &= \\frac{d}{db} (a \\odot b) \\\\\n", + "&= a\n", + "\\end{align*}\n", + "\n", + "Therefore, the gradient of $f(a, b)$ with respect to $b$ is $a$." + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "d7164447-a850-4047-b89b-b94c1ca77487", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "class EWiseMul(TensorOp):\n", + " def compute(self, a: NDArray, b: NDArray):\n", + " return a * b\n", + "\n", + " def gradient(self, out_grad: Tensor, node: Tensor):\n", + " lhs, rhs = node.inputs\n", + " return out_grad * rhs, out_grad * lhs\n", + "\n", + "\n", + "def multiply(a, b):\n", + " return EWiseMul()(a, b)" + ] + }, + { + "cell_type": "markdown", + "id": "a19e45b7-0fca-485e-9771-aefef3cf6d09", + "metadata": { + "tags": [] + }, + "source": [ + "## Scalar Mult" + ] + }, + { + "cell_type": "markdown", + "id": "7f549c16-93ba-48b1-b87f-7507a35923c7", + "metadata": {}, + "source": [ + "Certainly! Here's the proof and explanation for the derivative of the `MulScalar` operator:\n", + "\n", + "Let's denote the scalar as `c` and `a` as the tensor being multiplied by the scalar. The operation can be described as `f(a) = a * c`.\n", + "\n", + "The function for the backward pass (i.e., the gradient) is `df/da = c`, which means the derivative of `f(a)` with respect to `a` is `c`.\n", + "\n", + "The LaTeX document will look as follows:\n", + "\n", + "We are given a function $f(a) = a \\cdot c$, where $a$ is a tensor and $c$ is a scalar. 
Our task is to find the derivative of this function with respect to $a$.\n", + "\n", + "By differentiating the function $f(a)$ with respect to $a$, we find:\n", + "\n", + "\\begin{align*}\n", + "\\frac{df}{da} &= \\frac{d}{da} (a \\cdot c) \\\\\n", + "&= c\n", + "\\end{align*}\n", + "\n", + "Therefore, the gradient of $f(a)$ with respect to $a$ is $c$.\n", + "\n", + "We starts by defining the function `f(a) = a * c`. It then explains that when we differentiate `f(a)` with respect to `a`, we find that the derivative is `c`. This means that the gradient of `f(a)` with respect to `a` is `c`, which matches the behavior of the `MulScalar` operator as provided in the `gradient` method." + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "b6228d02-df05-489d-a2af-e645fe38562b", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "class MulScalar(TensorOp):\n", + " def __init__(self, scalar):\n", + " self.scalar = scalar\n", + "\n", + " def compute(self, a: NDArray):\n", + " return a * self.scalar\n", + "\n", + " def gradient(self, out_grad: Tensor, node: Tensor):\n", + " return (out_grad * self.scalar,)\n", + "\n", + "\n", + "def mul_scalar(a, scalar):\n", + " return MulScalar(scalar)(a)" + ] + }, + { + "cell_type": "markdown", + "id": "9e262d60-8c17-4cd2-83e5-43cf668f0a3b", + "metadata": { + "tags": [] + }, + "source": [ + "## Power Scalar" + ] + }, + { + "cell_type": "markdown", + "id": "507873e9-8c2f-49be-8e3f-b7eba5954ff6", + "metadata": {}, + "source": [ + "Certainly! Here's the proof and explanation for the derivative of the `PowerScalar` operator:\n", + "\n", + "Let's denote the scalar as `n` and `a` as the tensor being raised to the power of the scalar. The operation can be described as `f(a) = a^n`.\n", + "\n", + "The function for the backward pass (i.e., the gradient) is `df/da = n * a^(n-1)`.\n", + "\n", + "The LaTeX document will look as follows:\n", + "\n", + "\n", + "We are given a function $f(a) = a^n$, where $a$ is a tensor and $n$ is a scalar. Our task is to find the derivative of this function with respect to $a$.\n", + "\n", + "By differentiating the function $f(a)$ with respect to $a$, we find:\n", + "\n", + "\\begin{align*}\n", + "\\frac{df}{da} &= \\frac{d}{da} (a^n) \\\\\n", + "&= n \\cdot a^{n-1}\n", + "\\end{align*}\n", + "\n", + "Therefore, the gradient of $f(a)$ with respect to $a$ is $n \\cdot a^{n-1}$.\n", + "\n", + "We starts by defining the function `f(a) = a^n`, where `^` represents exponentiation. It then explains that when we differentiate `f(a)` with respect to `a`, we find that the derivative is `n * a^(n-1)`. This means that the gradient of `f(a)` with respect to `a` is `n * a^(n-1)`, which matches the behavior of the `PowerScalar` operator as provided in the `gradient` method." 
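As a quick numerical sanity check of the power rule, here is a sketch using plain NumPy at an arbitrary sample point, independent of the `PowerScalar` op defined next:

```python
import numpy as np

# Central-difference check that d(a^n)/da = n * a^(n-1).
a, n, eps = 1.3, 4, 1e-6
numeric = ((a + eps) ** n - (a - eps) ** n) / (2 * eps)
analytic = n * a ** (n - 1)

print(numeric, analytic)
assert np.isclose(numeric, analytic, rtol=1e-4)
```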
+ ] + }, + { + "cell_type": "code", + "execution_count": 42, + "id": "83d17d97-f9a7-4398-ba64-ba7c8b83348a", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "class PowerScalar(TensorOp):\n", + " \"\"\"Op raise a tensor to an (integer) power.\"\"\"\n", + "\n", + " def __init__(self, scalar: int):\n", + " self.scalar = scalar\n", + "\n", + " def compute(self, a: NDArray) -> NDArray:\n", + " ### BEGIN YOUR SOLUTION\n", + " return array_api.power(a, self.scalar)\n", + " ### END YOUR SOLUTION\n", + "\n", + " def gradient(self, out_grad, node):\n", + " ### BEGIN YOUR SOLUTION\n", + " return (self.scalar * power_scalar(node, self.scalar - 1) * out_grad, )\n", + " ### END YOUR SOLUTION\n", + "\n", + "\n", + "def power_scalar(a, scalar):\n", + " return PowerScalar(scalar)(a)" + ] + }, + { + "cell_type": "markdown", + "id": "9eecaccd-1dae-447b-b76b-c8f8f0a339a8", + "metadata": { + "tags": [] + }, + "source": [ + "## Element Wise Divide" + ] + }, + { + "cell_type": "markdown", + "id": "aac973c7-595c-43be-a0c6-27793df1094c", + "metadata": {}, + "source": [ + "The operation described here is an element-wise division of two tensors, `a` and `b`, where the operation can be described as `f(a, b) = a / b`. \n", + "\n", + "We'll compute the partial derivatives with respect to `a` and `b`:\n", + "\n", + "1. The partial derivative of `f(a, b)` with respect to `a` (`df/da`) is `1/b`.\n", + "\n", + "2. The partial derivative of `f(a, b)` with respect to `b` (`df/db`) is `-a / b^2`.\n", + "\n", + "These results align with the backward function you've provided.\n", + "\n", + "Now, let's translate this into LaTeX:\n", + "\n", + "\n", + "We are given a function $f(a, b) = \\frac{a}{b}$, where $a$ and $b$ are tensors. Our task is to find the partial derivatives of this function with respect to $a$ and $b$.\n", + "\n", + "Let's start with $\\frac{\\partial f}{\\partial a}$:\n", + "\n", + "\\begin{align*}\n", + "\\frac{\\partial f}{\\partial a} &= \\frac{\\partial}{\\partial a} \\left(\\frac{a}{b}\\right) \\\\\n", + "&= \\frac{1}{b}\n", + "\\end{align*}\n", + "\n", + "Now, let's compute $\\frac{\\partial f}{\\partial b}$:\n", + "\n", + "\\begin{align*}\n", + "\\frac{\\partial f}{\\partial b} &= \\frac{\\partial}{\\partial b} \\left(\\frac{a}{b}\\right) \\\\\n", + "&= - \\frac{a}{b^{2}}\n", + "\\end{align*}\n", + "\n", + "Here is a detailed derivative:\n", + "\n", + "Given a function of the form $y = \\frac{u}{v}$, where both $u$ and $v$ are functions of $x$, the quotient rule of differentiation states:\n", + "\n", + "$$\\frac{dy}{dx} = \\frac{v \\cdot \\frac{du}{dx} - u \\cdot \\frac{dv}{dx}}{v^2}$$\n", + "\n", + "In our case, we're looking at the function $y = \\frac{a}{b}$, where $a$ and $b$ are tensors. We want to find the derivative with respect to $b$ (instead of $x$ in our general formula). So we have:\n", + "\n", + "$$\\frac{dy}{db} = \\frac{b \\cdot \\frac{da}{db} - a \\cdot \\frac{db}{db}}{b^2}$$\n", + "\n", + "Since $a$ does not depend on $b$, $\\frac{da}{db} = 0$, and since any variable is equal to itself, $\\frac{db}{db} = 1$. \n", + "\n", + "So the derivative $\\frac{dy}{db}$ simplifies to:\n", + "\n", + "$$\\frac{dy}{db} = \\frac{b \\cdot 0 - a \\cdot 1}{b^2}$$\n", + "\n", + "Therefore, the derivative of $y$ with respect to $b$ is $-\\frac{a}{b^2}$.\n", + "\n", + "Therefore, the gradient of $f(a, b)$ with respect to $a$ is $\\frac{1}{b}$, and the gradient of $f(a, b)$ with respect to $b$ is $- \\frac{a}{b^{2}}$." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 105, + "id": "64b849b5-8523-4bb8-9dd2-dbc0e8c7c8a1", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "class EWiseDiv(TensorOp):\n", + " \"\"\"Op to element-wise divide two nodes.\"\"\"\n", + "\n", + " def compute(self, a, b):\n", + " ### BEGIN YOUR SOLUTION\n", + " return a / b\n", + " ### END YOUR SOLUTION\n", + "\n", + " def gradient(self, out_grad, node):\n", + " ### BEGIN YOUR SOLUTION\n", + " a, b = node.inputs\n", + " return divide(out_grad, b), out_grad * negate(divide(a, power_scalar(b, 2)))\n", + " ### END YOUR SOLUTION\n", + "\n", + "\n", + "def divide(a, b):\n", + " return EWiseDiv()(a, b)" + ] + }, + { + "cell_type": "markdown", + "id": "1c775097-ab17-4b05-8557-d3dfd56f7869", + "metadata": { + "tags": [] + }, + "source": [ + "## Divide Scalar" + ] + }, + { + "cell_type": "markdown", + "id": "2618d629-f6ce-4039-b6aa-e13acf3f348b", + "metadata": {}, + "source": [ + "Let's denote the scalar as `c`, and `a` as the tensor being divided by the scalar. The operation can be described as `f(a) = a / c`.\n", + "\n", + "The function for the backward pass (i.e., the gradient) is `df/da = 1/c`.\n", + "\n", + "This is the derivative of `f(a)` with respect to `a`.\n", + "\n", + "The LaTeX document will look as follows:\n", + "\n", + "We are given a function $f(a) = \\frac{a}{c}$, where $a$ is a tensor and $c$ is a scalar. Our task is to find the derivative of this function with respect to $a$.\n", + "\n", + "By using the power rule of differentiation, where the derivative of $a^n$ is $n \\cdot a^{n-1}$, we can rewrite $f(a)$ as $f(a) = c^{-1}a$. \n", + "\n", + "Now, we can differentiate this with respect to $a$:\n", + "\n", + "\\begin{align*}\n", + "\\frac{df}{da} &= \\frac{d}{da} (c^{-1}a) \\\\\n", + "&= c^{-1} \\frac{d}{da} (a) \\\\\n", + "&= c^{-1} \\\\\n", + "&= \\frac{1}{c}\n", + "\\end{align*}\n", + "\n", + "Therefore, the gradient of $f(a)$ with respect to $a$ is $\\frac{1}{c}$." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "0130023a-6178-4120-a6c9-3a0f81da0cdc", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "class DivScalar(TensorOp):\n", + " def __init__(self, scalar):\n", + " self.scalar = scalar\n", + "\n", + " def compute(self, a):\n", + " ### BEGIN YOUR SOLUTION\n", + " return a / self.scalar\n", + " ### END YOUR SOLUTION\n", + "\n", + " def gradient(self, out_grad, node):\n", + " ### BEGIN YOUR SOLUTION\n", + " return out_grad * (1/self.scalar)\n", + " ### END YOUR SOLUTION\n", + "\n", + "\n", + "def divide_scalar(a, scalar):\n", + " return DivScalar(scalar)(a)" + ] + }, + { + "cell_type": "markdown", + "id": "1c8d63e6-346b-4c6b-b770-6fabf6f85c80", + "metadata": { + "tags": [] + }, + "source": [ + "## Transpose" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "80d36def-d0b1-473b-bbcb-077877562b87", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "class Transpose(TensorOp):\n", + " def __init__(self, axes: Optional[tuple] = None):\n", + " self.axes = axes\n", + "\n", + " def compute(self, a):\n", + " # Swap the last two dimensions of the input tensor\n", + " if self.axes:\n", + " a = a.swapaxes(self.axes[0], self.axes[1])\n", + " else:\n", + " a = a.swapaxes(-2, -1)\n", + " return a\n", + "\n", + "\n", + " def gradient(self, out_grad, node):\n", + " ### BEGIN YOUR SOLUTION\n", + " return transpose(out_grad,axes=self.axes)\n", + " ### END YOUR SOLUTION\n", + "\n", + "\n", + "def transpose(a, axes=None):\n", + " return Transpose(axes)(a)" + ] + }, + { + "cell_type": "markdown", + "id": "1a569dad-0144-47ba-86e5-6d40aef3b4ef", + "metadata": { + "tags": [] + }, + "source": [ + "## Reshape" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "44c95c81-abd5-447b-a9fc-5fee081af854", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "class Reshape(TensorOp):\n", + " def __init__(self, shape):\n", + " self.shape = shape\n", + "\n", + " def compute(self, a):\n", + " ### BEGIN YOUR SOLUTION\n", + " self.input_shape = a.shape\n", + " return array_api.reshape(a, newshape=self.shape)\n", + " ### END YOUR SOLUTION\n", + "\n", + " def gradient(self, out_grad, node):\n", + " ### BEGIN YOUR SOLUTION\n", + " # reshape gradient to match input shape\n", + " return reshape(out_grad, self.input_shape), \n", + " ### END YOUR SOLUTION\n", + "\n", + "\n", + "def reshape(a, shape):\n", + " return Reshape(shape)(a)" + ] + }, + { + "cell_type": "markdown", + "id": "ee9293dd-c605-46b6-bc54-28e92ccca957", + "metadata": { + "tags": [] + }, + "source": [ + "## Broadcast" + ] + }, + { + "cell_type": "markdown", + "id": "974da9d9-1024-460e-b1ef-0060aa2d2900", + "metadata": { + "tags": [] + }, + "source": [ + "Here's the requested information with the appropriate formatting:\n", + "\n", + "#### Implementing Backward Pass for Broadcasting Operation\n", + "\n", + "Broadcasting is an operation that expands a tensor to a given shape by replicating its values along the new dimensions. The backward pass of broadcasting needs to \"undo\" this operation and sum up the gradients that have been duplicated due to the broadcasting operation.\n", + "\n", + "In the `gradient` function for the broadcasting operation, `out_grad` is the gradient tensor flowing back from further down the computational graph. This tensor has the same shape as the output of the broadcasting operation, i.e., the expanded or broadcasted shape. 
\n", + "\n", + "We need to sum up the gradients in `out_grad` along the dimensions that have been extended by the broadcasting operation, and return a gradient tensor that has the same shape as the original input tensor.\n", + "\n", + "To achieve this, the `gradient` function does the following:\n", + "\n", + "1. **Identify the singleton dimensions:** Singleton dimensions are dimensions of size 1 that were either expanded due to broadcasting or added when broadcasting to a higher rank tensor. We need to identify these dimensions so we know which axes to sum over in the backward pass.\n", + "\n", + "2. **Sum the gradients over singleton dimensions:** We sum the `out_grad` tensor over all the singleton dimensions. This effectively reverses the broadcasting operation, as it adds up all the gradients that were copied due to the broadcasting.\n", + "\n", + "3. **Reshape the gradient tensor:** We then reshape the resulting gradient tensor to ensure that it has the same shape as the input tensor. This is important because the gradient tensor must match the shape of the input tensor for it to be propagated correctly in the backward pass. \n", + "\n", + "The `gradient` function for the broadcasting operation can be implemented as follows:\n", + "\n", + "```python\n", + "def gradient(self, out_grad, node):\n", + " # Shape of the input tensor\n", + " in_shape = node.inputs[0].shape\n", + "\n", + " # Compute the difference in tensor ranks (dimensions) between the output and the input\n", + " rank_diff = len(self.shape) - len(in_shape)\n", + "\n", + " # Indices of singleton dimensions added by broadcasting to higher-rank tensor\n", + " new_singleton_dims = list(range(rank_diff))\n", + "\n", + " # Indices of singleton dimensions in the input tensor that were expanded by broadcasting\n", + " expanded_singleton_dims = [i + rank_diff for i, size in enumerate(in_shape) if size == 1]\n", + "\n", + " # Combine all indices of singleton dimensions\n", + " singleton_dims = new_singleton_dims + expanded_singleton_dims\n", + "\n", + " # Sum the out_grad tensor over all singleton dimensions to \"undo\" the broadcasting\n", + " grad = summation(out_grad, axes=tuple(singleton_dims))\n", + "\n", + " # Reshape the resulting gradient tensor to match the shape of the input tensor\n", + " grad = reshape(grad, in_shape)\n", + "\n", + " # Return the gradient tensor\n", + " return (grad,)\n", + "```\n", + "\n", + "Remember to validate the correctness of this implementation by performing gradient checking as suggested." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 74, + "id": "e0c9ddf1-f95c-47f8-827e-1ef24a54a5ec", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(tensor([[3., 4.],\n", + " [5., 6.]]),\n", + " torch.Size([2, 2]))" + ] + }, + "execution_count": 74, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import torch\n", + "\n", + "# Initialize a 2x2 tensor\n", + "A = torch.tensor([[1., 2.], [3., 4.]])\n", + "\n", + "# Initialize a scalar\n", + "b = torch.tensor([2.])\n", + "\n", + "# Perform the broadcasting operation\n", + "C = A + b\n", + "C, C.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "id": "456adee2-5f2b-49c0-ad55-6be6605272c2", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "tensor([10.])" + ] + }, + "execution_count": 78, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Make sure to enable gradient computation for b\n", + "b = torch.tensor([2.], requires_grad=True)\n", + "\n", + "# Perform the broadcasting operation\n", + "C = A + b\n", + "\n", + "# Now let's assume some gradient coming from the next layer during backpropagation\n", + "grad_next = torch.tensor([[1., 2.], [3., 4.]])\n", + "\n", + "# Perform the backward pass\n", + "C.backward(grad_next)\n", + "\n", + "# Check the gradient for b\n", + "b.grad" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "30b02a2d-8771-4d07-92d6-8d7a8c91581e", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "class BroadcastTo(TensorOp):\n", + " def __init__(self, shape):\n", + " self.shape = shape\n", + "\n", + " def compute(self, a):\n", + " self.input_shape = a.shape\n", + " return array_api.broadcast_to(a, self.shape)\n", + "\n", + " def gradient(self, out_grad, node):\n", + " # Get the difference in ranks between the broadcasted tensor and the input tensor\n", + " rank_diff = len(self.shape) - len(self.input_shape)\n", + "\n", + " # Identify the new singleton dimensions that were added due to broadcasting\n", + " # to a higher rank tensor. 
These are dimensions that do not exist in the input tensor.\n", + " new_singleton_dims = list(range(rank_diff))\n", + "\n", + " # Identify the singleton dimensions in the input tensor that were expanded by broadcasting.\n", + " # We count from the end of the shape (-1 refers to the last dimension, -2 the second to last, and so on).\n", + " # This way, we correctly handle the cases where the input tensor and the broadcasted shape\n", + " # differ both in rank and in size along some dimensions.\n", + " expanded_singleton_dims = [i + rank_diff for i, size in enumerate(self.input_shape[::-1]) if size == 1]\n", + "\n", + " # Combine all indices of singleton dimensions\n", + " singleton_dims = new_singleton_dims + expanded_singleton_dims[::-1]\n", + " return (reshape(summation(out_grad,axes=tuple(singleton_dims)), self.input_shape),) # a deliberate tuple\n", + "\n", + "def broadcast_to(a, shape):\n", + " return BroadcastTo(shape)(a)" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "id": "6147e927-8cdb-4daf-b36d-e0d70c3f9bdb", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor([1., 1., 1.])\n" + ] + } + ], + "source": [ + "import torch\n", + "\n", + "# Create a tensor and set requires_grad to True so that PyTorch will\n", + "# know to compute gradients with respect to this tensor.\n", + "x = torch.tensor([1.0, 2.0, 3.0], requires_grad=True)\n", + "\n", + "# Compute a sum.\n", + "y = x.sum()\n", + "\n", + "# Compute gradients.\n", + "y.backward()\n", + "\n", + "# The gradient of the sum with respect to x is a tensor of ones.\n", + "print(x.grad) # Outputs: tensor([1., 1., 1.])\n" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "id": "e8278167-e0cc-4add-963b-6625a69a165c", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "x:\n", + " tensor([[1., 2., 3.],\n", + " [4., 5., 6.]], requires_grad=True)\n", + "y:\n", + " tensor([5., 7., 9.], grad_fn=)\n", + "y.shape:\n", + " torch.Size([3])\n", + "loss:\n", + " tensor(21., grad_fn=)\n", + "Gradients:\n", + " tensor([[1., 1., 1.],\n", + " [1., 1., 1.]])\n" + ] + } + ], + "source": [ + "import torch\n", + "\n", + "# Create a 2D tensor\n", + "x = torch.tensor([[1., 2., 3.], [4., 5., 6.]], requires_grad=True)\n", + "print(\"x:\\n\", x)\n", + "\n", + "# Sum the tensor along axis 0\n", + "y = x.sum(axis=0)\n", + "print(\"y:\\n\", y)\n", + "print(\"y.shape:\\n\", y.shape)\n", + "\n", + "# Let's define a scalar loss as the sum of all elements in y\n", + "loss = y.sum()\n", + "print(\"loss:\\n\", loss)\n", + "\n", + "# Backward pass\n", + "loss.backward()\n", + "\n", + "# Display gradients\n", + "print(\"Gradients:\\n\", x.grad)" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "id": "b392889a-5429-4d3a-85b5-a9f85ef83c43", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[2, 3]" + ] + }, + "execution_count": 61, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "new_shape = list(x.shape)\n", + "new_shape" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "id": "e53aaa97-8b5a-4531-b783-76974283195e", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[1, 3]" + ] + }, + "execution_count": 62, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "for axis in (0,): new_shape[axis] = 1\n", + "new_shape" + ] + }, + { + "cell_type": "markdown", + "id": 
"ada258a4-3602-4775-918c-7cf42664009d", + "metadata": {}, + "source": [ + "#### illustrate the reshaping and broadcasting of the gradient in the backward pass." + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "id": "2948bd0e-243b-4bc9-bed0-fa83e7b87f7e", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "tensor([[1., 2., 3.],\n", + " [4., 5., 6.]], requires_grad=True)" + ] + }, + "execution_count": 64, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import torch\n", + "\n", + "# Create a 2D tensor\n", + "x = torch.tensor([[1., 2., 3.], [4., 5., 6.]], requires_grad=True)\n", + "x" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "id": "e6ab630f-615d-4e9e-a8e4-40e7fee119ad", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "tensor([5., 7., 9.], grad_fn=)" + ] + }, + "execution_count": 65, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Sum the tensor along axis 0\n", + "y = x.sum(axis=0)\n", + "y" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "id": "e0b8b8ff-62c0-4e8d-bf38-e35af0626c73", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(tensor([1., 1., 1.]), torch.Size([3]))" + ] + }, + "execution_count": 71, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Compute a dummy gradient for y\n", + "grad_y = torch.ones_like(y)\n", + "grad_y, grad_y.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "id": "b75ffbce-b275-4065-87a4-44084c949717", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[1, 3]" + ] + }, + "execution_count": 68, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Reshape grad_y to match the dimensionality of x\n", + "new_shape = [1 if axis == 0 else size for axis, size in enumerate(x.shape)]\n", + "new_shape" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "id": "6be202e7-d014-4e13-8fbc-081838d5cc71", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(tensor([[1., 1., 1.]]), torch.Size([1, 3]))" + ] + }, + "execution_count": 70, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "reshaped_grad_y = grad_y.reshape(new_shape)\n", + "reshaped_grad_y, reshaped_grad_y.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 73, + "id": "c284089b-d156-498b-86e5-e4e362374c38", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(tensor([[1., 1., 1.],\n", + " [1., 1., 1.]]),\n", + " torch.Size([2, 3]))" + ] + }, + "execution_count": 73, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Broadcast the reshaped grad_y to match the shape of x\n", + "broadcasted_grad_y = reshaped_grad_y.expand_as(x)\n", + "broadcasted_grad_y, broadcasted_grad_y.shape" + ] + }, + { + "cell_type": "markdown", + "id": "c9577c15-db7c-49f7-8186-a26bd8f86ac8", + "metadata": {}, + "source": [ + "The `Summation` operation, when provided with the `axes` argument, sums over these axes and thereby reduces the rank of the tensor by the number of axes summed over. 
The backward pass needs to take this into account, as it needs to return a gradient tensor of the same shape as the input.\n", + "\n", + "Here is a simplified implementation of the correct version:\n", + "\n", + "The forward pass (`compute` method) is straightforward - it just computes the sum over the specified axes.\n", + "\n", + "In the backward pass (`gradient` method), the goal is to compute the gradient of the sum operation. Since every element of the input tensor contributes equally to the sum, the derivative of the sum with respect to each element is 1. However, since the sum operation may reduce the dimensionality of the tensor (when `axes` is not `None`), we need to account for this when computing the gradient.\n", + "\n", + "To do this, we first create a new shape, where the dimensions specified by `axes` are replaced by 1. We then reshape `out_grad` to this new shape. This essentially \"undoes\" the dimensionality reduction performed by the sum operation. Finally, we use `broadcast_to` to make the reshaped gradient tensor the same shape as the input tensor.\n", + "\n", + "This ensures that the gradient tensor is the correct shape, and that the gradient with respect to each element of the input tensor is correctly computed as 1." + ] + }, + { + "cell_type": "markdown", + "id": "ef27b570-77eb-4bd0-b49d-54a86a68d9e5", + "metadata": { + "tags": [] + }, + "source": [ + "## Summation" + ] + }, + { + "cell_type": "code", + "execution_count": 106, + "id": "d74fcff2-2f12-4996-a6b2-2afdb5ccbbfc", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "class Summation(TensorOp):\n", + " def __init__(self, axes: Optional[tuple] = None):\n", + " self.axes = axes\n", + "\n", + " def compute(self, a):\n", + " # Forward pass just computes the sum over the specified axes\n", + " return array_api.sum(a, self.axes)\n", + "\n", + " def gradient(self, out_grad, node):\n", + " # out_grad is the gradient of the output of this operation\n", + " # We need to \"undo\" the dimensionality reduction performed in the forward pass\n", + " # That's why we create a new shape, replacing the dimensions specified by self.axes with 1\n", + "\n", + " # Initialize new shape to be the same as the input shape\n", + " new_shape = list(node.inputs[0].shape)\n", + "\n", + " # If axes were specified, set those dimensions to 1 in the new shape\n", + " if self.axes:\n", + " for axis in self.axes: new_shape[axis] = 1\n", + " \n", + " # Reshape out_grad to the new shape\n", + " reshaped_grad = reshape(out_grad, new_shape)\n", + "\n", + " # Broadcast the reshaped out_grad to match the input shape\n", + " broadcasted_grad = broadcast_to(reshaped_grad, node.inputs[0].shape)\n", + "\n", + " # The gradient method needs to return a tuple, even though there's only one input\n", + " return (broadcasted_grad,)\n", + "\n", + "def summation(a, axes=None):\n", + " return Summation(axes)(a)" + ] + }, + { + "cell_type": "markdown", + "id": "908a1c6c-3c64-4520-a5f3-58c07df34559", + "metadata": { + "tags": [] + }, + "source": [ + "## Matrix Multiplication" + ] + }, + { + "cell_type": "markdown", + "id": "e1228339-e77c-4d71-b7a8-e095776dd2df", + "metadata": {}, + "source": [ + "Matrix multiplication, often denoted by \"matmul\" in some programming languages, refers to the process of multiplying two matrices together. However, in the context of calculus, it's more common to talk about the derivative of a function. 
\n", + "\n", + "When dealing with matrices, instead of talking about derivatives, we often discuss the Jacobian, which is a matrix of partial derivatives. If you have a function that takes a matrix as input and produces a scalar output, you could compute a gradient, which would be a matrix of the same shape as the input matrix.\n", + "\n", + "However, in the context of deep learning and backpropagation, you might be asking about the derivative of a matrix multiplication operation with respect to its inputs. This is often needed when you're training a neural network, because you need to compute gradients to update the weights.\n", + "\n", + "Let's denote the matrices as `A` and `B`, where `A` is a matrix of dimension `m x n` and `B` is a matrix of dimension `n x p`, and the result of the multiplication `C = A * B` is a matrix of dimension `m x p`.\n", + "\n", + "If we are to compute the derivative of `C` with respect to `A` (i.e., ∂C/∂A), each element in `A` affects all elements in its corresponding row in `C`. Thus, the derivative of `C` with respect to `A` is a four-dimensional tensor. In practice, it is common to work with the gradients in a reshaped or unrolled form to perform the necessary update steps in backpropagation.\n", + "\n", + "Similarly, if we are to compute the derivative of `C` with respect to `B` (i.e., ∂C/∂B), each element in `B` affects all elements in its corresponding column in `C`. Again, the derivative will be a four-dimensional tensor.\n", + "\n", + "In actual computation, if we have a scalar-valued loss function `L`, we would compute the gradient of `L` with respect to `A` (denoted as ∂L/∂A), which is the same shape as `A`. To compute this, we need to know the gradient of `L` with respect to `C` (denoted as ∂L/∂C), then:\n", + "\n", + "∂L/∂A = (∂L/∂C) * B^T (where * denotes matrix multiplication and B^T is the transpose of B)\n", + "\n", + "Similarly, to compute the gradient of `L` with respect to `B` (denoted as ∂L/∂B):\n", + "\n", + "∂L/∂B = A^T * (∂L/∂C)\n", + "\n", + "The details of this process can be quite involved and understanding it fully requires a good understanding of linear algebra and calculus. For more in-depth understanding, it would be beneficial to refer to a textbook or detailed resource on the subject, such as \"The Matrix Calculus You Need For Deep Learning\" by Terence Parr and Jeremy Howard.\n" + ] + }, + { + "cell_type": "markdown", + "id": "281024ba-899f-429e-9bf2-b683de2c0d2f", + "metadata": {}, + "source": [ + "The line `axes_to_sum_over = tuple(range(len(out_shape) - len(lhs_shape)))` is calculating which axes (dimensions) of the output gradient tensor (`out_grad`) need to be summed over when computing the gradient with respect to the left-hand side (`lhs`) input tensor.\n", + "\n", + "This is necessary when the rank (number of dimensions) of `out_grad` is larger than the rank of `lhs`. This can happen, for instance, when `lhs` is a matrix (2D tensor) and `out_grad` is a 3D tensor (which can result from batched matrix multiplication).\n", + "\n", + "The `range` function generates a sequence of integers from 0 up to (but not including) `len(out_shape) - len(lhs_shape)`. The `tuple` function then takes this sequence and turns it into a tuple. The result is a tuple of integers representing the axes to sum over.\n", + "\n", + "Here is a concrete example:\n", + "\n", + "Suppose we have a batched matrix multiplication where `lhs` is a matrix of shape `(m, n)`, and `out_grad` is a 3D tensor of shape `(b, m, n)`, where `b` is the batch size. 
\n", + "\n", + "In this case, `len(out_shape) - len(lhs_shape)` equals `1`, so `range(len(out_shape) - len(lhs_shape))` generates a sequence of integers from `0` to `1` (not inclusive), which is just `[0]`.\n", + "\n", + "So `axes_to_sum_over` will be `(0,)`, indicating that we need to sum over the first axis (the batch axis) of `out_grad` when computing the gradient with respect to `lhs`.\n", + "\n", + "This summing operation effectively accumulates the individual gradients for each item in the batch into a single gradient for the `lhs` matrix." + ] + }, + { + "cell_type": "code", + "execution_count": 82, + "id": "fd7dbe38-2479-454e-8162-4fc7d2c329a2", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import torch\n", + "\n", + "# Suppose we have the following shapes for `lhs` and `out_grad`\n", + "m, n, b = 5, 7, 3\n", + "\n", + "# Let's create some tensors with these shapes\n", + "lhs = torch.randn(m, n) # lhs is a 2D tensor (matrix) of shape (m, n)\n", + "out_grad = torch.randn(b, m, n) # out_grad is a 3D tensor of shape (b, m, n)\n", + "\n", + "# Let's say `rhs` is another matrix that was involved in computing out_grad\n", + "rhs = torch.randn(n, m)\n", + "\n", + "# Now we want to compute the gradient of the loss with respect to `lhs`\n", + "# First, we transpose `rhs` and perform batched matrix multiplication with `out_grad`\n", + "# grad_product = torch.matmul(out_grad, rhs.t())\n", + "\n", + "# # Now we need to sum over the batch dimension\n", + "# axes_to_sum_over = (0,) # in PyTorch, we can also just use 0 instead of a tuple\n", + "# grad_wrt_lhs = grad_product.sum(dim=axes_to_sum_over)\n", + "\n", + "# # grad_wrt_lhs is now a tensor of shape (m, n), same as `lhs`\n", + "# print(grad_wrt_lhs.shape) # prints: torch.Size([5, 7])\n" + ] + }, + { + "cell_type": "code", + "execution_count": 94, + "id": "5324df91-93bf-451b-b5fe-b6952758ba35", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(torch.Size([3, 5, 7]), torch.Size([5, 7]), torch.Size([7, 5]))" + ] + }, + "execution_count": 94, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "out_shape, lhs_shape, rsh_shape = out_grad.shape, lhs.shape, rhs.shape\n", + "out_shape, lhs_shape, rsh_shape" + ] + }, + { + "cell_type": "code", + "execution_count": 101, + "id": "9b7fee3d-a548-4117-bfb3-96fc5a896784", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(3, 2)" + ] + }, + "execution_count": 101, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(out_shape), len(lhs_shape)" + ] + }, + { + "cell_type": "code", + "execution_count": 103, + "id": "d2ce16be-cd69-4686-8362-69e1ac7b9d53", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "range(0, 1)" + ] + }, + "execution_count": 103, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "rng = range(len(out_shape) - len(lhs_shape))\n", + "rng" + ] + }, + { + "cell_type": "code", + "execution_count": 104, + "id": "c0751837-8d2e-454b-a0a0-26d7d82b3255", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(0,)" + ] + }, + "execution_count": 104, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tuple(rng)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b2ce762c-16fa-4cdc-b5cb-2a00d6998032", + "metadata": {}, + "outputs": [], + "source": [ + "axes_to_sum_over = tuple(range(len(out_shape) - 
len(lhs_shape)))\n", + "axes_to_sum_over" + ] + }, + { + "cell_type": "code", + "execution_count": 107, + "id": "1a9849fc-7bdd-4f67-8e55-6e3216c08d46", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "class MatMul(TensorOp):\n", + " def compute(self, a, b):\n", + " return array_api.matmul(a, b)\n", + "\n", + " def gradient(self, out_grad, node):\n", + " lhs, rhs = node.inputs\n", + " out_shape, lhs_shape, rhs_shape = out_grad.shape, lhs.shape, rhs.shape\n", + " \n", + " # compute gradient with respect to lhs\n", + " if len(lhs_shape) == len(out_shape):\n", + " grad_wrt_lhs = matmul(out_grad, transpose(rhs))\n", + " else:\n", + " axes_to_sum_over = tuple(range(len(out_shape) - len(lhs_shape)))\n", + " grad_wrt_lhs = summation(matmul(out_grad, transpose(rhs)), axes=axes_to_sum_over)\n", + " \n", + " # compute gradient with respect to rhs\n", + " if len(rhs_shape) == len(out_shape):\n", + " grad_wrt_rhs = matmul(transpose(lhs), out_grad)\n", + " else:\n", + " axes_to_sum_over = tuple(range(len(out_shape) - len(rhs_shape)))\n", + " grad_wrt_rhs = summation(matmul(transpose(lhs), out_grad), axes=axes_to_sum_over)\n", + " \n", + " return grad_wrt_lhs, grad_wrt_rhs\n", + "\n", + "def matmul(a, b):\n", + " return MatMul()(a, b)" + ] + }, + { + "cell_type": "markdown", + "id": "f221494c-7d4b-4f9c-bc7f-244a587c9e2a", + "metadata": { + "tags": [] + }, + "source": [ + "## Negation" + ] + }, + { + "cell_type": "markdown", + "id": "f09826ab-2835-45c1-ab55-92eeeb1d02e4", + "metadata": {}, + "source": [ + "Certainly! Here's the proof and explanation for the derivative of the `Negate` operator:\n", + "\n", + "Let's denote `a` as the tensor being negated. The operation can be described as `f(a) = -a`.\n", + "\n", + "The function for the backward pass (i.e., the gradient) is `df/da = -1`.\n", + "\n", + "The LaTeX document will look as follows:\n", + "\n", + "\n", + "We are given a function $f(a) = -a$, where $a$ is a tensor. Our task is to find the derivative of this function with respect to $a$.\n", + "\n", + "By differentiating the function $f(a)$ with respect to $a$, we find:\n", + "\n", + "\\begin{align*}\n", + "\\frac{df}{da} &= \\frac{d}{da} (-a) \\\\\n", + "&= -1\n", + "\\end{align*}\n", + "\n", + "Therefore, the gradient of $f(a)$ with respect to $a$ is $-1$.\n", + "\n", + "WE starts by defining the function `f(a) = -a`, where `-` represents the negation operation. It then explains that when we differentiate `f(a)` with respect to `a`, we find that the derivative is `-1`. This means that the gradient of `f(a)` with respect to `a` is `-1`, which matches the behavior of the `Negate` operator." + ] + }, + { + "cell_type": "code", + "execution_count": 108, + "id": "fe81ac6f-7b2f-43b3-b5a7-d20818b40607", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "class Negate(TensorOp):\n", + " def compute(self, a):\n", + " ### BEGIN YOUR SOLUTION\n", + " return -1 * a\n", + " ### END YOUR SOLUTION\n", + "\n", + " def gradient(self, out_grad, node):\n", + " ### BEGIN YOUR SOLUTION\n", + " return negate(out_grad), \n", + " ### END YOUR SOLUTION\n", + "\n", + "def negate(a):\n", + " return Negate()(a)" + ] + }, + { + "cell_type": "markdown", + "id": "2fc0094e-5c2f-44a7-a766-d3190cac9b96", + "metadata": { + "tags": [] + }, + "source": [ + "## Log" + ] + }, + { + "cell_type": "markdown", + "id": "56c56d27-5596-4487-801d-d0d3dfac2b93", + "metadata": {}, + "source": [ + "Certainly! 
Here is the derivation of the gradient of the `Log` operator:\n", + "\n", + "Let's denote `a` as the tensor on which the logarithm is applied. The operation can be described as `f(a) = \\log(a)`, where `\\log` represents the natural logarithm, so the backward pass uses `df/da = 1/a`.\n", + "\n", + "We are given a function $f(a) = \\log(a)$, where $a$ is a tensor. Our task is to find the derivative of this function with respect to $a$.\n", + "\n", + "By differentiating the function $f(a)$ with respect to $a$, we find:\n", + "\n", + "\\begin{align*}\n", + "\\frac{df}{da} &= \\frac{d}{da} (\\log(a)) \\\\\n", + "&= \\frac{1}{a}\n", + "\\end{align*}\n", + "\n", + "Therefore, the gradient of $f(a)$ with respect to $a$ is $\\frac{1}{a}$, which is exactly what the `gradient` method below returns as `out_grad / a`." + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "6fb52c72-9bb5-4d01-bd08-59cb37acfa3d", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "class Log(TensorOp):\n", + " def compute(self, a):\n", + " ### BEGIN YOUR SOLUTION\n", + " # Forward pass: element-wise natural logarithm (must `return`, not `raise`)\n", + " return array_api.log(a)\n", + " ### END YOUR SOLUTION\n", + "\n", + " def gradient(self, out_grad, node):\n", + " ### BEGIN YOUR SOLUTION\n", + " # d(log a)/da = 1/a, so divide the incoming gradient by the input\n", + " return (out_grad / node.inputs[0], )\n", + " ### END YOUR SOLUTION\n", + "\n", + "\n", + "def log(a):\n", + " return Log()(a)" + ] + }, + { + "cell_type": "markdown", + "id": "86fb3f80-08e8-46d1-8f45-f7f4a65a5f5a", + "metadata": { + "tags": [] + }, + "source": [ + "## Exp" + ] + }, + { + "cell_type": "markdown", + "id": "1baaecd5-acfc-4037-83a2-f7d1a56a19b5", + "metadata": {}, + "source": [ + "Here is the derivation of the gradient of the `Exp` operator:\n", + "\n", + "Let's denote `a` as the tensor on which the exponential function is applied. The operation can be described as `f(a) = \\exp(a)`, so the backward pass uses `df/da = \\exp(a)`.\n", + "\n", + "We are given a function $f(a) = \\exp(a)$, where $a$ is a tensor. Our task is to find the derivative of this function with respect to $a$.\n", + "\n", + "By differentiating the function $f(a)$ with respect to $a$, we find:\n", + "\n", + "\\begin{align*}\n", + "\\frac{df}{da} &= \\frac{d}{da} (\\exp(a)) \\\\\n", + "&= \\exp(a)\n", + "\\end{align*}\n", + "\n", + "Therefore, the gradient of $f(a)$ with respect to $a$ is $\\exp(a)$: the exponential is its own derivative, so the `gradient` method simply scales `out_grad` by `exp(a)`."
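A quick numerical check of this identity (a NumPy sketch at an arbitrary point, separate from the `Exp` op below):

```python
import numpy as np

# Central-difference check that d(exp(a))/da = exp(a).
a, eps = 0.7, 1e-6
numeric = (np.exp(a + eps) - np.exp(a - eps)) / (2 * eps)

print(numeric, np.exp(a))
assert np.isclose(numeric, np.exp(a))
```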
+ ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "7e3bb271-3961-409a-b450-64652a42345f", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "class Exp(TensorOp):\n", + " def compute(self, a):\n", + " ### BEGIN YOUR SOLUTION\n", + " self.out = array_api.exp(a)\n", + " return self.out\n", + " ### END YOUR SOLUTION\n", + "\n", + " def gradient(self, out_grad, node):\n", + " ### BEGIN YOUR SOLUTION\n", + " return out_grad * (exp(node.inputs[0])), \n", + " ### END YOUR SOLUTION\n", + "\n", + "\n", + "def exp(a):\n", + " return Exp()(a)" + ] + }, + { + "cell_type": "markdown", + "id": "ef92862d-2b89-44e6-b684-955e747bc7f9", + "metadata": {}, + "source": [ + "Certainly! Here's the proof and explanation for the derivative of the `ReLU` (Rectified Linear Unit) operator:\n", + "\n", + "Let's denote `a` as the tensor on which the ReLU function is applied. The ReLU function is defined as follows: \n", + "\n", + "\\[\n", + "f(a) = \n", + "\\begin{cases}\n", + "a, & \\text{if } a \\geq 0 \\\\\n", + "0, & \\text{if } a < 0\n", + "\\end{cases}\n", + "\\]\n", + "\n", + "The function for the backward pass (i.e., the gradient) is `df/da = 1` if `a >= 0`, and `df/da = 0` if `a < 0`.\n", + "\n", + "The LaTeX document will look as follows:\n", + "\n", + "\n", + "We are given a function $f(a) = \\max(0, a)$, where $a$ is a tensor. Our task is to find the derivative of this function with respect to $a$.\n", + "\n", + "By considering the definition of the ReLU function, we can write $f(a)$ as:\n", + "\n", + "$$\n", + "f(a) = \n", + "\\begin{cases}\n", + "a, & \\text{if } a \\geq 0 \\\\\n", + "0, & \\text{if } a < 0\n", + "\\end{cases}\n", + "$$\n", + "\n", + "Now, let's differentiate $f(a)$ with respect to $a$:\n", + "\n", + "$$\n", + "\\frac{df}{da} = \n", + "\\begin{cases}\n", + "1, & \\text{if } a \\geq 0 \\\\\n", + "0, & \\text{if } a < 0\n", + "\\end{cases}\n", + "$$\n", + "\n", + "Therefore, the gradient of $f(a)$ with respect to $a$ is $1$ if $a \\geq 0$, and $0$ if $a < 0$.\n", + "\n", + "\\end{document}\n", + "```\n", + "\n", + "This document starts by defining the function `f(a) = \\max(0, a)`, which represents the ReLU function. It then explains that when we differentiate `f(a)` with respect to `a`, we find that the derivative is `1` if `a >= 0`, and `0` if `a < 0`. This means that the gradient of `f(a)` with respect to `a` is `1` for positive values of `a` and `0` for negative values of `a`.\n", + "\n", + "Please note that the `gradient` method of the `ReLU` operator is not implemented in the provided code, as indicated by `NotImplementedError()`." 
+ ] + }, + { + "cell_type": "markdown", + "id": "385a5e66-8a02-4d0a-8ffb-29cd421dfcd7", + "metadata": { + "tags": [] + }, + "source": [ + "## ReLU" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "a9d69c61-2298-4942-aabb-12bf3d048cf3", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# TODO\n", + "class ReLU(TensorOp):\n", + " def compute(self, a):\n", + " ### BEGIN YOUR SOLUTION\n", + " self.out = array_api.clip(a, a_min=0)\n", + " return self.out\n", + " ### END YOUR SOLUTION\n", + "\n", + " def gradient(self, out_grad, node):\n", + " ### BEGIN YOUR SOLUTION\n", + " return 1 * self.out\n", + " ### END YOUR SOLUTION\n", + "\n", + "\n", + "def relu(a):\n", + " return ReLU()(a)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.8" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}