In [1]:
import torch
import numpy as np

In [2]:
arr = np.array([1, 2, 3])

tensor = torch.tensor(arr)

print(tensor)

tensor([1, 2, 3], dtype=torch.int32)


In [3]:
arr = np.array([[1, 2, 3],[4,5,6]])

tensor = torch.tensor(arr)

print(tensor)

tensor([[1, 2, 3],
        [4, 5, 6]], dtype=torch.int32)


In [4]:
tensor = torch.zeros((3,4))
print(tensor)

tensor([[0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.]])


In [5]:
tensor = torch.ones((3,4))
print(tensor)

tensor([[1., 1., 1., 1.],
        [1., 1., 1., 1.],
        [1., 1., 1., 1.]])


In [6]:
# default dtype is float32
tensor = torch.zeros((3,4),dtype=torch.float16)
print(tensor, tensor.dtype)

# we can also manually set the dtype
tensor = torch.zeros((3,4), dtype=torch.float16)
print(tensor)
tensor = torch.ones((3,4), dtype=torch.int32)
print(tensor)
tensor = torch.rand((3,4), dtype = torch.double)
print(tensor)

tensor([[0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.]], dtype=torch.float16) torch.float16
tensor([[0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.]], dtype=torch.float16)
tensor([[1, 1, 1, 1],
        [1, 1, 1, 1],
        [1, 1, 1, 1]], dtype=torch.int32)
tensor([[0.2108, 0.5876, 0.9157, 0.7846],
        [0.8629, 0.8118, 0.6189, 0.0231],
        [0.6050, 0.9401, 0.4860, 0.5094]], dtype=torch.float64)


### Shape, Size, and Reshaping

In [7]:
a = torch.rand(2, 3)

# Shape
print(a.shape)          # torch.Size([2, 3])
print(a.size())         # same

# Number of elements
print(a.numel())        # 6

# Reshape
print(a.view(3, 2))
print(a.reshape(3, 2))  # safer than view

# Add new dimension
print(a.unsqueeze(0).shape)   # (1, 2, 3)
print(a.unsqueeze(1).shape)   # (2, 1, 3)

# Remove dimension
b = a.unsqueeze(0)         # shape (1, 2, 3)
print(b.squeeze().shape)   # (2, 3)


torch.Size([2, 3])
torch.Size([2, 3])
6
tensor([[0.5881, 0.8420],
        [0.8792, 0.6873],
        [0.2656, 0.0879]])
tensor([[0.5881, 0.8420],
        [0.8792, 0.6873],
        [0.2656, 0.0879]])
torch.Size([1, 2, 3])
torch.Size([2, 1, 3])
torch.Size([2, 3])


### Basic Math Operations

In [8]:
a = torch.rand(2, 3)
b = torch.ones(2, 3)

# Addition
c = a + b
c = torch.add(a, b)

# Subtraction
d = a - b
d = torch.sub(a, b)

# Multiplication (element wise)
e = a * b
e = torch.mul(a, b)

# Division
f = a / b
f = torch.div(a, b)

# Power
p = a ** 2
p = torch.pow(a, 2)


### In-place Operations (modifies tensor)

All in-place functions end with `_`.

‚ö† In-place ops modify data that may be required for autograd‚Äîuse carefully.

In [9]:
a = torch.rand(2, 3)

a.add_(1)
a.mul_(2)
a.tanh_()


tensor([[0.9711, 0.9956, 0.9975],
        [0.9967, 0.9917, 0.9936]])

### Reduction Operations

In [10]:
a = torch.rand(2, 3)

print(a.sum())                # all elements
print(a.sum(dim=0))           # column-wise
print(a.sum(dim=1))           # row-wise

print(a.mean())               # global mean
print(a.mean(dim=0))          # column mean

print(a.std(dim=1))           # row std dev

print(a.min(), a.max())       # global min, max
print(a.min(dim=1))           # returns (values, indices)


tensor(3.4857)
tensor([1.4753, 0.8457, 1.1647])
tensor([2.3273, 1.1584])
tensor(0.5809)
tensor([0.7377, 0.4229, 0.5823])
tensor([0.2078, 0.4821])
tensor(0.0120) tensor(0.9484)
torch.return_types.min(
values=tensor([0.5451, 0.0120]),
indices=tensor([0, 1]))


### Matrix / Linear Algebra Ops

In [11]:
A = torch.rand(2, 3)
B = torch.rand(3, 4)

# Matrix multiplication
C = torch.matmul(A, B)
C = A @ B

# Transpose
print(A.t())         # swap last 2 dims

# Matrix determinant (square matrices)
M = torch.rand(3, 3)
print(torch.det(M))

# Matrix inverse
print(torch.inverse(M))


tensor([[0.8458, 0.3698],
        [0.9661, 0.7869],
        [0.5213, 0.0477]])
tensor(0.0048)
tensor([[  3.0583,  -9.7016,  23.2013],
        [  5.5415,  -7.3568,  11.0373],
        [ -8.0961,  16.4368, -26.6580]])


### Concatenation & Stacking

In [12]:
x = torch.rand(2, 3)
y = torch.rand(2, 3)

# Concatenate along rows (dim=0)
print(torch.cat((x, y), dim=0))

# Concatenate along columns (dim=1)
print(torch.cat((x, y), dim=1))

# Stack (creates new dimension)
print(torch.stack((x, y), dim=0))  # shape (2, 2, 3)


tensor([[0.2986, 0.4258, 0.3560],
        [0.3110, 0.1778, 0.4455],
        [0.8671, 0.2082, 0.1517],
        [0.3229, 0.7200, 0.1858]])
tensor([[0.2986, 0.4258, 0.3560, 0.8671, 0.2082, 0.1517],
        [0.3110, 0.1778, 0.4455, 0.3229, 0.7200, 0.1858]])
tensor([[[0.2986, 0.4258, 0.3560],
         [0.3110, 0.1778, 0.4455]],

        [[0.8671, 0.2082, 0.1517],
         [0.3229, 0.7200, 0.1858]]])


### Type Casting

In [13]:
a = torch.rand(3, 3)

print(a.float())
print(a.double())
print(a.int())
print(a.bool())


tensor([[0.1338, 0.8186, 0.8138],
        [0.2892, 0.6650, 0.3367],
        [0.6130, 0.3995, 0.5689]])
tensor([[0.1338, 0.8186, 0.8138],
        [0.2892, 0.6650, 0.3367],
        [0.6130, 0.3995, 0.5689]], dtype=torch.float64)
tensor([[0, 0, 0],
        [0, 0, 0],
        [0, 0, 0]], dtype=torch.int32)
tensor([[True, True, True],
        [True, True, True],
        [True, True, True]])


### Device Movement (CPU/GPU)

In [None]:
a = torch.rand(2, 3)

# Move to GPU
# a_gpu = a.to("cuda")

# Move back
# a_cpu = a_gpu.to("cpu")


In [27]:
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F

### What is a *Variable* in PyTorch?

Before PyTorch 0.4, you had to wrap tensors in `Variable` to compute gradients:

```python
from torch.autograd import Variable
```

But **now `Tensor` itself acts like a Variable**.

üëâ So in modern PyTorch:

**Every tensor with `requires_grad=True` is a Variable.**

Meaning:

* PyTorch starts tracking all operations done on the tensor
* It can compute gradients via backpropagation (`.backward()`)

---

### What is `requires_grad`?

`requires_grad=True` tells PyTorch:

> ‚ÄúTrack everything done on this tensor so I can compute gradients later.‚Äù

Example:

In [28]:
x = torch.tensor(2.0, requires_grad=True)
y = x * 3
print(y)  # tensor(6., grad_fn=<MulBackward0>)

tensor(6., grad_fn=<MulBackward0>)


Notice the `grad_fn` ‚Äî it means the tensor is part of a computation graph.

---

### What is a Gradient?

The **gradient** is the **derivative** of your output w.r.t your input.

üëâ Used for training neural networks
üëâ Used by optimizers like Adam, SGD
üëâ Used for gradient descent to update weights

If `y = x * 3` then:

```
dy/dx = 3
```

#### `backward()` Method
The backward() method in Pytorch is used to calculate the gradient during the backward pass in the neural network. 
- If we <ins>do not call this backward()</ins> method then <ins>gradients are not calculated</ins> for the tensors.
- The gradient of a tensor is calculated for the one having requires_grad is set to True. 
- We can access the gradients using .grad. 
- If we do not call the backward() method or even for the tensors whose requires_grad is set to False, the result is None.

We can see this using PyTorch:

In [29]:
x = torch.tensor(2.0, requires_grad=True)
y = x * 3

# Compute gradient
y.backward()

print(x.grad)   # tensor(3.)

tensor(3.)


‚úî PyTorch automatically calculated dy/dx = 3

---

## Simple Example: y = x¬≤

Mathematically:

```
y = x^2
dy/dx = 2x
```

Let‚Äôs check:

In [30]:
x = torch.tensor(4.0, requires_grad=True)
y = x ** 2

y.backward()
print(x.grad)   # tensor(8.)

tensor(8.)


‚úî Correct ‚Üí 2 √ó 4 = 8

---

## Multiple Operations Example

In [31]:
x = torch.tensor(3.0, requires_grad=True)

y = x * 2          # 6
z = y + 5          # 11
out = z ** 2       # 121

out.backward()
print(x.grad)

tensor(44.)


Math:

```
y = 2x
z = y + 5 = 2x + 5
out = (2x + 5)^2

d(out)/dx = 2(2x+5)*2 = 4(2x+5)
At x=3 ‚Üí 4(11) = 44
```

PyTorch prints:

```
tensor(44.)
```

---

## Gradients only accumulate ‚Äî they don‚Äôt reset automatically

In [32]:
x = torch.tensor(2.0, requires_grad=True)
y = x * 3
y.backward()
print(x.grad)   # 3

# Calling backward again without zeroing
y = x * 3
y.backward()
print(x.grad)   # now 6 (accumulated)

tensor(3.)
tensor(6.)


üî• Always clear gradients before next backward:

In [33]:
x.grad.zero_()

tensor(0.)

### `torch.no_grad()` ‚Äî turn off gradient tracking

Used during:

* inference
* evaluation
* updating tensors without tracking

Example:

In [34]:
x = torch.tensor(5.0, requires_grad=True)

with torch.no_grad():
    x += 3

print(x)         # tensor(8.)
print(x.requires_grad)    # True

tensor(8., requires_grad=True)
True


Even though `x` has requires_grad=True, the operation inside `no_grad()` does not track history.

---

### Example with a small ‚Äúmodel weight‚Äù

Imagine a single weight **w**:

In [35]:
w = torch.tensor(1.0, requires_grad=True)

# simple prediction
y_pred = w * 5    # y = 5w

loss = (y_pred - 10)**2   # MSE loss

loss.backward()

print("Loss:", loss.item())
print("Gradient:", w.grad)  # tells us how to update w

Loss: 25.0
Gradient: tensor(-50.)


This is exactly what happens inside every neural network layer.

---

### Manual gradient descent

In [36]:
w = torch.tensor(1.0, requires_grad=True)

for i in range(5):
    y_pred = w * 5
    loss = (y_pred - 10)**2
    
    loss.backward()
    
    with torch.no_grad():
        w -= 0.1 * w.grad   # gradient descent update
    
    w.grad.zero_()
    
    print(i, w.item())

0 6.0
1 -14.0
2 66.0
3 -254.0
4 1026.0


This is the basic idea behind every optimizer like `SGD` or `Adam`.

---
`torch.no_grad()` is a **context manager** that tells PyTorch:

**‚ÄúDisable gradient tracking inside this block.‚Äù**

---

### Why use `torch.no_grad()`?

Because PyTorch normally tracks **every operation** on tensors with `requires_grad=True` to build a computation graph for backpropagation.

But during:

* **inference / prediction**
* **evaluation**
* **updating values manually**
* **copying tensors**
* **freezing model parameters**

we **don‚Äôt want gradients** and **don‚Äôt want a computation graph**.

So we wrap code in:

```python
with torch.no_grad():
    # code here will NOT track gradients
```

---

### Simple Example

##### Without `no_grad()`

```python
import torch

x = torch.tensor(5.0, requires_grad=True)
y = x * 3
print(y.grad_fn)   # <MulBackward0>
```

üîπ PyTorch tracked the operation
üîπ A computation graph was created
üîπ Memory is used
üîπ Gradients will be computed during backward

---

##### With `torch.no_grad()`

```python
import torch

x = torch.tensor(5.0, requires_grad=True)

with torch.no_grad():
    y = x * 3

print(y.grad_fn)   # None
```

‚úî No computation graph
‚úî No gradient tracking
‚úî Much less memory used
‚úî Faster

---

### Why is it SUPER important?

##### 1. **Inference is faster**

No gradients ‚Üí no graph building ‚Üí lower memory & faster computation.

##### 2. **Prevents unwanted gradient updates**

When you update weights manually:

```python
with torch.no_grad():
    w -= lr * w.grad
```

This ensures PyTorch **does not track** this update as part of the graph.

##### 3. **Safe evaluation of validation data**

```python
model.eval()
with torch.no_grad():
    preds = model(X_val)
```

---

### One-line definition

**torch.no_grad() temporarily turns off autograd so operations inside it do not create gradients or computation graphs.**


---

### Summary (Very Simple)

| Concept                            | Meaning                                  |
| ---------------------------------- | ---------------------------------------- |
| **Tensor with requires_grad=True** | A variable whose gradient is tracked     |
| **grad**                           | Stores ‚àÇoutput/‚àÇinput                    |
| **backward()**                     | Performs backpropagation                 |
| **grad_fn**                        | Shows which operation created the tensor |
| **no_grad()**                      | Turn off gradient tracking (inference)   |

