diff --git a/minima/operators.py b/minima/operators.py
index e7b4e5a..0e947bf 100644
--- a/minima/operators.py
+++ b/minima/operators.py
@@ -535,7 +535,7 @@ def gradient(self, out_grad: Tensor, node: Tensor) -> Tuple[Tensor,]:
             The gradients with respect to the inputs.
         """
         a = node.children[0].compute_cached_data()
-        return out_grad * Tensor(a > 0)
+        return (out_grad * Tensor(a > 0), )
 
 def relu(a: Tensor) -> Tensor:
     """
@@ -754,7 +754,7 @@ def gradient(self, out_grad: Tensor, node: Tensor) -> Tuple[Tensor, ...]:
             Tuple[Tensor, ...]: The gradient with respect to the input tensor.
         """
         input_shape = node.children[0].shape
-        return reshape(out_grad, input_shape),
+        return (reshape(out_grad, input_shape), )
 
 def reshape(a: Tensor, shape: Tuple[int, ...]) -> Tensor:
     """
@@ -932,7 +932,7 @@ def gradient(self, out_grad: Tensor, node: Tensor) -> Tuple[Tensor]:
         broadcasted_grad = broadcast_to(reshaped_grad, node.children[0].shape)
 
         # The gradient method needs to return a tuple, even though there's only one input
-        return (broadcasted_grad,)
+        return (broadcasted_grad, )
 
 
 def summation(a: Tensor, axes: Optional[tuple] = None) -> Tensor:
@@ -1004,7 +1004,7 @@ def gradient(self, out_grad: Tensor, node: Tensor) -> Tuple[Tensor]:
         sum_over = tuple([idx for idx in range(len(self.shape)) if self.shape[idx] != shape[idx]])
 
         # Finally, we reshape the gradient after summing over the appropriate dimensions to match `a`'s shape.
-        return reshape(summation(out_grad, sum_over), a_shape)
+        return (reshape(summation(out_grad, sum_over), a_shape), )
 
 def broadcast_to(a: Tensor, shape: Tuple[int, ...]) -> Tensor:
     """
diff --git a/nbs/01_operators.ipynb b/nbs/01_operators.ipynb
index f04c63f..f07370c 100644
--- a/nbs/01_operators.ipynb
+++ b/nbs/01_operators.ipynb
@@ -1216,7 +1216,7 @@
     "            The gradients with respect to the inputs.\n",
     "        \"\"\"\n",
     "        a = node.children[0].compute_cached_data()\n",
-    "        return out_grad * Tensor(a > 0)\n",
+    "        return (out_grad * Tensor(a > 0), )\n",
     "\n",
     "def relu(a: Tensor) -> Tensor:\n",
     "    \"\"\"\n",
@@ -1675,7 +1675,7 @@
     "            Tuple[Tensor, ...]: The gradient with respect to the input tensor.\n",
     "        \"\"\"\n",
     "        input_shape = node.children[0].shape\n",
-    "        return reshape(out_grad, input_shape), \n",
+    "        return (reshape(out_grad, input_shape), )\n",
    "\n",
     "def reshape(a: Tensor, shape: Tuple[int, ...]) -> Tensor:\n",
     "    \"\"\"\n",
@@ -1892,11 +1892,11 @@
    {
     "data": {
      "text/plain": [
-      "tensor([[  0.7752,  -6.1767,  -5.3929,  -8.5221,   5.0633],\n",
-      "        [-10.6650,   8.4653,   3.0637,   5.5141,   4.5129],\n",
-      "        [  3.4482,  -5.3584,  -2.6963,   1.9794,   1.4696],\n",
-      "        [  7.3074,   0.4407,   2.0073,   1.7052, -11.3009],\n",
-      "        [  3.4645,  -3.7268,  -3.1002,   1.4796,  -2.3030]])"
+      "tensor([[-0.1309, -1.9203, -4.4179,  2.8422, -0.4453],\n",
+      "        [-1.5883, -8.1020, -6.7316, -1.3045,  0.6170],\n",
+      "        [-0.5317,  2.3444,  1.6038, -3.5786, -0.1689],\n",
+      "        [ 1.0831, -1.3743,  0.8485, -3.0593,  2.2023],\n",
+      "        [ 0.3071,  1.8321, -3.6827, -9.4409, -1.1884]])"
     ]
    },
    "execution_count": null,
@@ -2194,7 +2194,7 @@
     "        broadcasted_grad = broadcast_to(reshaped_grad, node.children[0].shape)\n",
     "\n",
     "        # The gradient method needs to return a tuple, even though there's only one input\n",
-    "        return (broadcasted_grad,)\n",
+    "        return (broadcasted_grad, )\n",
     "\n",
     "\n",
     "def summation(a: Tensor, axes: Optional[tuple] = None) -> Tensor:\n",
     "    \"\"\"\n",
@@ -2485,7 +2485,7 @@
     "        sum_over = tuple([idx for idx in range(len(self.shape)) if self.shape[idx] != shape[idx]])\n",
     "\n",
     "        # Finally, we reshape the gradient after summing over the appropriate dimensions to match `a`'s shape.\n",
-    "        return reshape(summation(out_grad, sum_over), a_shape)\n",
+    "        return (reshape(summation(out_grad, sum_over), a_shape), )\n",
     "\n",
     "def broadcast_to(a: Tensor, shape: Tuple[int, ...]) -> Tensor:\n",
     "    \"\"\"\n",