# PyTorch Basics

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

import torchvision
import numpy as np

from collections import OrderedDict

## Tensor Basics

### CUDA

This is for running calculations on GPU with PyTorch.

In [2]:
t = torch.tensor([1,2,3]) # Create a tensor
t

tensor([1, 2, 3])

In [3]:
# Move the tensor to CUDA and reassign to t. On a machine without a GPU the
# tensor is left on the CPU so the notebook still runs from top to bottom.
t = t.cuda() if torch.cuda.is_available() else t
t

tensor([1, 2, 3], device='cuda:0')

### Indexing

Indexing tensors is just like indexing NumPy arrays.

In [4]:
t = torch.tensor([1,2,3])
t[0]

tensor(1)

In [5]:
t = torch.tensor([
    [1,2,3],
    [4,5,6],
    [7,8,9],
])

In [6]:
t[1,0]

tensor(4)

In [7]:
t[1]

tensor([4, 5, 6])

In [8]:
t[0:2,0:2]

tensor([[1, 2],
        [4, 5]])

In [9]:
t[:,1]

tensor([2, 5, 8])

### Theoretical Information

A tensor is a multi-dimensional data structure.
A tensor is an abstraction of all the following objects: 
1. Scalars/numbers
2. Vectors/arrays
3. Matrices/2D-arrays
4. nd-arrays

There are three main tensor attributes:
1. Rank - The number of dimensions
2. Axes - A specific dimension of the tensor (the last axis contains the numbers, all other axes contain nd-arrays)
3. Shape - The length of each axis in the tensor in order

### PyTorch Tensor Attributes

The PyTorch tensor object has the following attributes.

In [10]:
t = torch.Tensor()
type(t)

torch.Tensor

In [11]:
print(t.dtype) # Data-type stored in the tensor
print(t.device) # Device where the tensor exists
print(t.layout) # How memory is stored internally (default is strided)

torch.float32
cpu
torch.strided


### Creating Tensors

#### Without Data

In [12]:
torch.eye(3) # Identity tensor

tensor([[1., 0., 0.],
        [0., 1., 0.],
        [0., 0., 1.]])

In [13]:
torch.zeros(2,3) # Zero tensor

tensor([[0., 0., 0.],
        [0., 0., 0.]])

In [14]:
torch.ones(2,2,2) # Ones tensor

tensor([[[1., 1.],
         [1., 1.]],

        [[1., 1.],
         [1., 1.]]])

In [15]:
torch.rand(1,5) # Random tensor

tensor([[0.0193, 0.6596, 0.7168, 0.1136, 0.6257]])

#### With Data

In [16]:
data = np.array([1,2,3])

In [17]:
t1  = torch.Tensor(data) # Constructor: copies the data and converts it to the default float dtype
t2 = torch.tensor(data) # Factory function: copies the data and keeps the NumPy dtype
t3 = torch.as_tensor(data) # Factory function: keeps the NumPy dtype and shares memory with `data`
t4 = torch.from_numpy(data) # Factory function: keeps the NumPy dtype and shares memory with `data`

In [18]:
print(t1) # Does not preserve original dtype
print(t2) # Preserves original dtype
print(t3) # Preserves original dtype
print(t4) # Preserves original dtype

tensor([1., 2., 3.])
tensor([1, 2, 3], dtype=torch.int32)
tensor([1, 2, 3], dtype=torch.int32)
tensor([1, 2, 3], dtype=torch.int32)


In [19]:
data[0] = 0

In [20]:
print(t1) # Does not share memory with original data
print(t2) # Does not share memory with original data
print(t3) # Shares memory with original data
print(t4) # Shares memory with original data

tensor([1., 2., 3.])
tensor([1, 2, 3], dtype=torch.int32)
tensor([0, 2, 3], dtype=torch.int32)
tensor([0, 2, 3], dtype=torch.int32)


#### Best Options

In [21]:
t = torch.tensor([1,2,3], dtype=torch.float32) # The go-to option is the torch.tensor factory function (specify the dtype if necessary)
t = torch.as_tensor(data) # Use as_tensor if you want to take advantage of memory sharing for performance

## Tensor Operations

Tensor operations can be divided into 4 broad categories:
1. Reshaping Operations
2. Element-wise Operations
3. Reduction Operations
4. Access Operations

### Reshaping Operations

A reshaping operation modifies the shape of a tensor, but the data is preserved.

In [22]:
t = torch.tensor([
    [1,2,3,4],
    [5,6,7,8],
    [9,10,11,12],
])

#### Shape Information

In [23]:
t.shape

torch.Size([3, 4])

In [24]:
t.size() # Same as shape

torch.Size([3, 4])

In [25]:
len(t.shape) # Rank i.e number of dimensions

2

In [26]:
t.numel() # Number of elements

12

#### Reshape

In [27]:
t.reshape(1,12)

tensor([[ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12]])

In [28]:
t.reshape(2,6)

tensor([[ 1,  2,  3,  4,  5,  6],
        [ 7,  8,  9, 10, 11, 12]])

In [29]:
t.reshape(4,3)

tensor([[ 1,  2,  3],
        [ 4,  5,  6],
        [ 7,  8,  9],
        [10, 11, 12]])

In [30]:
t.reshape(6,-1) # -1 makes it infer that dimension from the number of elements

tensor([[ 1,  2],
        [ 3,  4],
        [ 5,  6],
        [ 7,  8],
        [ 9, 10],
        [11, 12]])

In [31]:
t.reshape(2,2,3) # 2D to 3D

tensor([[[ 1,  2,  3],
         [ 4,  5,  6]],

        [[ 7,  8,  9],
         [10, 11, 12]]])

In [32]:
t.reshape(12) # 2D to 1D

tensor([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12])

#### Squeeze and Unsqueeze

In [33]:
t = torch.tensor([1,2,3,4,5]).reshape(1,5)
t.shape

torch.Size([1, 5])

In [34]:
t = t.squeeze() # Removes any dimensions of length 1
t.shape

torch.Size([5])

In [35]:
t = t.unsqueeze(axis=0) # Adds back a dimension of length 1 at position 0 in the shape
t.shape

torch.Size([1, 5])

In [36]:
t = t.unsqueeze(axis=2)
t.shape

torch.Size([1, 5, 1])

In [37]:
t

tensor([[[1],
         [2],
         [3],
         [4],
         [5]]])

#### Flatten

In [38]:
t = torch.tensor([
    [1,2,3],
    [4,5,6],
    [7,8,9],
])

In [39]:
t.flatten() # Flattens the tensor into a 1D tensor of length numel

tensor([1, 2, 3, 4, 5, 6, 7, 8, 9])

#### Concatenation and Stacking

In [40]:
t1 = torch.tensor([
    [1,2],
    [3,4]
])
t2 = torch.tensor([
    [5,6],
    [7,8]
])

In [41]:
torch.cat((t1, t2), axis=0) # Concatenates tensors along some existing axis

tensor([[1, 2],
        [3, 4],
        [5, 6],
        [7, 8]])

In [42]:
torch.cat((t1,t2), dim=1) # axis and dim are interchangeable (almost everywhere!)

tensor([[1, 2, 5, 6],
        [3, 4, 7, 8]])

In [43]:
torch.stack((t1,t2)) # Creates a new axis to stack the tensors (tensors must all have same dimensions to begin with)

tensor([[[1, 2],
         [3, 4]],

        [[5, 6],
         [7, 8]]])

### Element-wise Operations (with Broadcasting)

An element-wise operation operates on corresponding elements between tensors. These only work when both the tensors are of the same type and on the same device.

In [44]:
t1 = torch.tensor([
    [1,2],
    [3,4]
])
t2 = torch.tensor([
    [9,8],
    [7,6]
])

In [45]:
t1+t2, t1*t2 # Arithmetic operations

(tensor([[10, 10],
         [10, 10]]),
 tensor([[ 9, 16],
         [21, 24]]))

In [46]:
t1-5, t2%2, t1+torch.tensor([1,2]) # Using broadcasting

(tensor([[-4, -3],
         [-2, -1]]),
 tensor([[1, 0],
         [1, 0]]),
 tensor([[2, 4],
         [4, 6]]))

In [47]:
# The last broadcasting example (t1 + torch.tensor([1,2])), step by step
print(np.broadcast_to([1,2], t1.shape)) # The 1D tensor is broadcast to a 2D tensor with the shape of t1
t1 + torch.tensor(np.broadcast_to([1,2], t1.shape)) # The broadcast addition gives the same result as this explicit version

[[1 2]
 [1 2]]


tensor([[2, 4],
        [4, 6]])

In [48]:
t1<2, t2>6, t1%2==0, t1<=t2 # Comparison

(tensor([[ True, False],
         [False, False]]),
 tensor([[ True,  True],
         [ True, False]]),
 tensor([[False,  True],
         [False,  True]]),
 tensor([[True, True],
         [True, True]]))

In [49]:
t1.abs(), t1.sqrt(), t1.neg(), t1.sigmoid() # Using functions

(tensor([[1, 2],
         [3, 4]]),
 tensor([[1.0000, 1.4142],
         [1.7321, 2.0000]]),
 tensor([[-1, -2],
         [-3, -4]]),
 tensor([[0.7311, 0.8808],
         [0.9526, 0.9820]]))

### Reduction Operations

A reduction operation on a tensor is an operation that reduces the number of elements contained within the tensor.

In [50]:
t = torch.tensor([
    [1,2,3,4],
    [5,6,7,8],
    [9,10,11,12],
], dtype=torch.float32)

#### Reduction Functions

In [51]:
t.sum(), t.prod(), t.mean(), t.std(), t.var()

(tensor(78.), tensor(4.7900e+08), tensor(6.5000), tensor(3.6056), tensor(13.))

In [52]:
t.sum(axis=1) # Sums along axis 1; the reduced axis is dropped from the result

tensor([10., 26., 42.])

In [53]:
t.sum(axis=1, keepdim=True) # Maintains the original number of dimensions

tensor([[10.],
        [26.],
        [42.]])

In [54]:
t.sum(axis=0)

tensor([15., 18., 21., 24.])

In [55]:
t.sum(axis=0, keepdim=True)

tensor([[15., 18., 21., 24.]])

#### Max and Argmax

In [56]:
t.max()

tensor(12.)

In [57]:
t.argmax() # Position of max value when flattened

tensor(11)

In [58]:
t.max(axis=1) # Maximum over a specific axis (returns max values and positions within that axis)

torch.return_types.max(
values=tensor([ 4.,  8., 12.]),
indices=tensor([3, 3, 3]))

In [59]:
t.argmax(axis=1)

tensor([3, 3, 3])

### Access Operations

An access operation is used to take the data in the tensor out of PyTorch for external use.

In [60]:
t = torch.tensor([
    [1,2,3],
    [4,5,6],
    [7,8,9]
])

In [61]:
t.sum()

tensor(45)

In [62]:
t.sum().item() # Converts the tensor of length 1 into a number

45

In [63]:
t.tolist() # Converts the tensor into a list

[[1, 2, 3], [4, 5, 6], [7, 8, 9]]

In [64]:
t.numpy() # Converts the tensor into a numpy array

array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9]], dtype=int64)

## Building Networks

We can build networks in PyTorch by using the `torch.nn` package. There are two methods to build a network: 
1. As a class that extends `nn.Module`
2. Using the `nn.Sequential` class

In [65]:
in_features = 512
out_features = 256
out_classes = 8

### Extending `nn.Module`

In PyTorch, neural networks are implemented using object oriented programming. Layers in a neural network (and even entire networks themselves) are implemented as objects that extend the ``nn.Module`` class. The transformations are done using the methods and the weights are stored as attributes within the class.

Note that individual layers and entire networks both extend the same ``nn.Module`` class, since a network can be thought of as just one big layer.

When we pass a tensor to our network as input, the tensor flows forward through each layer transformation until the tensor reaches the output layer. This process of a tensor flowing forward through the network is known as a forward pass.

Each layer has its own transformation (specified in the code) and the tensor passes forward through each layer. The composition of all the individual layer forward passes defines the overall forward pass transformation for the network.

The goal of the overall transformation is to transform or map the input to the correct prediction output class, and during the training process, the layer weights (data) are updated in such a way that causes the mapping to adjust to make the output closer to the correct prediction.

What this all means is that, every PyTorch ``nn.Module`` has a ``forward()`` method, and so when we are building layers and networks, we must provide an implementation of the ``forward()`` method. The forward method is the actual transformation. When implementing the forward method, we typically use functions from the ``nn.functional`` package.

In [66]:
class Network(nn.Module):
    """Minimal fully connected classifier written as an ``nn.Module`` subclass.

    Each sample is flattened and passed through two linear layers
    (flatten -> fc -> out). No activation is applied between the layers.

    Args:
        n_inputs: Number of features per sample after flattening. Defaults to
            the notebook-level ``in_features`` constant.
        n_hidden: Width of the hidden layer. Defaults to the notebook-level
            ``out_features`` constant.
        n_classes: Number of output scores per sample. Defaults to the
            notebook-level ``out_classes`` constant.
    """

    def __init__(self, n_inputs=None, n_hidden=None, n_classes=None):
        super().__init__()
        # Resolve sizes at construction time: ``Network()`` keeps using the
        # notebook constants, while explicit sizes make the class reusable
        # without editing any globals.
        n_inputs = in_features if n_inputs is None else n_inputs
        n_hidden = out_features if n_hidden is None else n_hidden
        n_classes = out_classes if n_classes is None else n_classes

        # PyTorch automatically keeps track of the learnable parameters in layers defined as attributes
        self.flatten = nn.Flatten(start_dim=1)
        self.fc = nn.Linear(in_features=n_inputs, out_features=n_hidden)
        self.out = nn.Linear(in_features=n_hidden, out_features=n_classes)

    def forward(self, t):
        """Run the forward pass and return one row of output scores per sample."""
        t = self.flatten(t)  # keep the batch axis, merge all remaining axes
        t = self.fc(t)
        t = self.out(t)
        return t

In [67]:
network = Network()
network

Network(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (fc): Linear(in_features=512, out_features=256, bias=True)
  (out): Linear(in_features=256, out_features=8, bias=True)
)

### Using `nn.Sequential`

The Sequential class allows us to build PyTorch neural networks on-the-fly without having to build an explicit class. This makes it much easier to rapidly build networks and allows us to skip over the step where we implement the `forward()` method. When we use the sequential way of building a PyTorch network, we construct the `forward()` method implicitly by defining our network's architecture sequentially.

A sequential module is a container or wrapper class that extends the `nn.Module` base class and allows us to compose modules together. We can compose any `nn.Module` within any other ``nn.Module``.

This means that we can compose layers to make networks, and since networks are also ``nn.Module`` instances, we can also compose networks with one another. Additionally, since the Sequential class is also a `nn.Module` itself, we can even compose Sequential modules with one another.

At this point, we may be wondering about other required functions and operations, like pooling operations or activation functions. It turns out that all of the functions and operations in the `nn.functional` API have been wrapped up into `nn.Module` classes. This allows us to pass things like activation functions to Sequential wrappers to fully build out our networks in a sequential way. There are 3 ways to build a network with the `nn.Sequential` class, shown below.

In [68]:
network = nn.Sequential(
    nn.Flatten(start_dim=1), # Flatten the individual images but keep batch dimension
    nn.Linear(in_features, out_features),
    nn.Linear(out_features, out_classes),
)
network

Sequential(
  (0): Flatten(start_dim=1, end_dim=-1)
  (1): Linear(in_features=512, out_features=256, bias=True)
  (2): Linear(in_features=256, out_features=8, bias=True)
)

In [69]:
layers = OrderedDict([
    ('flat', nn.Flatten(start_dim=1)),
    ('hidden', nn.Linear(in_features, out_features)),
    ('output', nn.Linear(out_features, out_classes)),
])
network = nn.Sequential(layers)
network

Sequential(
  (flat): Flatten(start_dim=1, end_dim=-1)
  (hidden): Linear(in_features=512, out_features=256, bias=True)
  (output): Linear(in_features=256, out_features=8, bias=True)
)

In [70]:
network = nn.Sequential()
network.add_module('flat', nn.Flatten(start_dim=1))
network.add_module('hidden', nn.Linear(in_features, out_features))
network.add_module('output', nn.Linear(out_features, out_classes))
network

Sequential(
  (flat): Flatten(start_dim=1, end_dim=-1)
  (hidden): Linear(in_features=512, out_features=256, bias=True)
  (output): Linear(in_features=256, out_features=8, bias=True)
)

## Using GPU

Training a neural network involves a lot of computations that can be done in parallel, hence using a GPU (wherever available) can allow for dramatic decreases in training time.

### Tensors

In [71]:
t = torch.tensor([
    [1,2],
    [3,4]
])

In [72]:
t.device # Default device is cpu

device(type='cpu')

In [73]:
# Moves tensor to cuda when a GPU is present; on CPU-only machines the tensor
# stays on the cpu instead of raising, so the notebook still runs end to end
t = t.cuda() if torch.cuda.is_available() else t
t.device

device(type='cuda', index=0)

In [74]:
device = 'cuda' if torch.cuda.is_available() else 'cpu' # device is a string corresponding to the best available device
device

'cuda'

In [75]:
t = t.to(device) # Moves the tensor to whatever is stored in the `device` string
t.device

device(type='cuda', index=0)

Operations between tensors can only be done when both tensors are on the same device.

### Networks

In [76]:
# Three linear layers (64 -> 32 -> 16 -> 10) with a ReLU after each hidden layer
network = nn.Sequential(
    nn.Linear(64, 32), nn.ReLU(),
    nn.Linear(32, 16), nn.ReLU(),
    nn.Linear(16, 10),
)
network  # last expression of the cell: displays the module summary

Sequential(
  (0): Linear(in_features=64, out_features=32, bias=True)
  (1): ReLU()
  (2): Linear(in_features=32, out_features=16, bias=True)
  (3): ReLU()
  (4): Linear(in_features=16, out_features=10, bias=True)
)

In [77]:
for name, param in network.named_parameters():
    print(f'{name}  \t  {param.device}  \t  {param.shape}')

0.weight  	  cpu  	  torch.Size([32, 64])
0.bias  	  cpu  	  torch.Size([32])
2.weight  	  cpu  	  torch.Size([16, 32])
2.bias  	  cpu  	  torch.Size([16])
4.weight  	  cpu  	  torch.Size([10, 16])
4.bias  	  cpu  	  torch.Size([10])


In [78]:
# Pick the GPU when one is available, otherwise fall back to the CPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'
network.to(device) # Shifts all the parameter tensors to the device (the module is updated in place, so no reassignment is needed)

# List each parameter again to confirm which device it now lives on
for name, param in network.named_parameters():
    print(f'{name}  \t  {param.device}  \t  {param.shape}')

0.weight  	  cuda:0  	  torch.Size([32, 64])
0.bias  	  cuda:0  	  torch.Size([32])
2.weight  	  cuda:0  	  torch.Size([16, 32])
2.bias  	  cuda:0  	  torch.Size([16])
4.weight  	  cuda:0  	  torch.Size([10, 16])
4.bias  	  cuda:0  	  torch.Size([10])


A tensor can only be passed through a network when the tensor and the parameter tensors of the network are all on the same device.