__Deep learning with PyTorch: A 60 minute blitz__

1. [Load tools](#Load-tools)
1. [What is PyTorch?](#What-is-PyTorch?)
    1. [Tensors](#Tensors)
    1. [Operations](#Operations)
    1. [Numpy bridge](#Numpy-bridge)    
    1. [CUDA tensors](#CUDA-tensors)
1. [Autograd - automatic differentiation](#Autograd-automatic-differentiation)
1. [Neural networks](#Neural-networks)
    1. [Define the network](#Define-the-network)
    1. [Loss function](#Loss-function)
    1. [Backprop](#Backprop)
    1. [Update the weights](#Update-the-weights)
1. [Training a classifier](#Training-a-classifier)
    1. [Loading and normalizing CIFAR10](#Loading-and-normalizing-CIFAR10)
    1. [Define a CNN](#Define-a-CNN)
    1. [Define a loss function and optimizer](#Define-a-loss-function-and-optimizer)
    1. [Train the network](#Train-the-network)
    1. [Test the network on the test data](#Test-the-network-on-the-test-data)
    1. [Training on a GPU](#Training-on-a-GPU)
    
1. [Data parallelism](#Data-parallelism)
    1. [](#)
    1. [](#)

# Load tools

<a id = 'Load-tools'></a>

In [None]:
# Standard library and settings
import os
import sys
import warnings

warnings.simplefilter("ignore")
from IPython.core.display import display, HTML

display(HTML("<style>.container { width:95% !important; }</style>"))

# Data extensions and settings
import numpy as np

np.set_printoptions(threshold=np.inf, suppress=True)
import pandas as pd

pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 500)
pd.options.display.float_format = "{:,.6f}".format

# import PyTorch
import torch
from torch.utils.data import Dataset, DataLoader
import torch.autograd as autograd
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.jit import script, trace
import torchvision
import torchvision.transforms as transforms

# Visualization extensions and settings
import seaborn as sns
import matplotlib.pyplot as plt

# Magic functions
%matplotlib inline

# What is PyTorch?

<a id = 'What-is-PyTorch?'></a>

## Tensors

<a id = 'Tensors'></a>

In [None]:
# uninitialized 5 by 3 matrix
x = torch.empty(5, 3)
x

In [None]:
# randomly initialized matrix
x = torch.rand(5, 3)
x

In [None]:
# initialize matrix filled with zeros and of data type long
x = torch.zeros(5, 3, dtype=torch.long)
x

In [None]:
# construct tensor directly from input data
x = torch.tensor([5.5, 3])
x

In [None]:
# construct tensor based on existing tensor
# reuses properties of input tensor such as datatype, unless overridden
x = x.new_ones(5, 5, dtype=torch.double)
x

In [None]:
# create a random tensor with the same size as x, overriding the dtype to float
x = torch.randn_like(x, dtype=torch.float)
x

In [None]:
# display the size of x; the returned torch.Size is a tuple subclass
x.size()

## Operations

<a id = 'Operations'></a>

In [None]:
# add two tensors
x = torch.rand(5, 3)
y = torch.rand(5, 3)
print(x + y)

In [None]:
# alternative addition syntax
print(torch.add(x, y))

In [None]:
# provide an output tensor as an argument
result = torch.empty(5, 3)
torch.add(x, y, out=result)
result

In [None]:
# in-place addition. in-place operators have an underscore suffix
y.add_(x)
y

In [None]:
# numpy-esque slicing
y[:, 1]

In [None]:
# resize/reshape
x = torch.randn(4, 4)
y = x.view(16)
z = x.view(-1, 8)  # -1 is inferred from other dimensions

print("4 by 4\n")
print(x)
print("\n1 by 16\n")
print(y)
print("\n2 by 8\n")
print(z)

In [None]:
# access data point in one element tensor
x = torch.randn(1)
print(x)
print(x.item())

## Numpy bridge

<a id = 'Numpy-bridge'></a>

In [None]:
# convert torch tensor to numpy array
a = torch.ones(5)
print(a)

b = a.numpy()
print(b)

In [None]:
# value update reflected in both the tensor and array
a.add_(1)
print(a)
print(b)

In [None]:
# convert numpy array to torch tensor
a = np.ones(5)
b = torch.from_numpy(a)
np.add(a, 1, out=a)
print(a)
print(b)

## CUDA tensors

<a id = 'CUDA-tensors'></a>

In [None]:
# move tensors on to the GPU (when one is available) with the .to() method
if torch.cuda.is_available():
    device = torch.device("cuda")  # a CUDA device object
    y = torch.ones_like(x, device=device)  # create a tensor directly on the GPU
    x = x.to(device)  # move x to the GPU so both operands live on the same device
    z = x + y
    print(z)
    print(z.to("cpu", torch.double))  # .to() can change the dtype in the same call
else:
    print("No CUDA available")

# Autograd - automatic differentiation

Setting requires_grad = True in the creation of a tensor ensures that all operations on that tensor will be tracked. The gradients are accumulated into the .grad attribute.

To stop a tensor from tracking history, call .detach() to detach it from the computation history and prevent future computation from being tracked.

To prevent tracking history (and memory consumption), a code block can be wrapped in a 'with torch.no_grad()' block.

Tensors also have an attribute called .grad_fn, which references a Function that has created the tensor. This does not apply to user-created tensors. For these, grad_fn is None.

To compute derivatives, the .backward() method needs to be called. If the tensor is a scalar, no argument is needed. If the tensor has more than one element, a gradient argument must be passed to the backward function. The gradient needs to match the shape of the tensor.


<a id = 'Autograd-automatic-differentiation'></a>

In [None]:
# user-created tensor x
x = torch.ones(2, 2, requires_grad=True)
x

In [None]:
# create tensor y through a torch operation
y = x + 2
y

In [None]:
# y was created as a result of an operation, so it has a grad_fn
y.grad_fn

In [None]:
# more operations on y to create z
z = y * y * 3
out = z.mean()
print(z, out)

In [None]:
# the method requires_grad_() changes an existing tensor's requires_grad flag inplace
a = torch.randn(2, 2)
a = (a * 3) / (a - 1)
print(a.requires_grad)

a.requires_grad_(True)
print(a.requires_grad)

b = (a * a).sum()
print(b.grad_fn)

In [None]:
# backprop on variable 'out' which represents a scalar. no argument needed in backward()
out.backward()

In [None]:
# display gradient
x.grad

In [None]:
# vector-Jacobian product
x = torch.randn(3, requires_grad=True)
print(x)

y = x * 2
while y.data.norm() < 1000:
    #     print(y)
    y = y * 2

print(y)

In [None]:
# print x.grad (displays nothing)
x.grad

In [None]:
# pass a vector to the backward method as an argument
v = torch.tensor([0.1, 1.0, 0.0001], dtype=torch.float)
y.backward(v)
print(x.grad)

In [None]:
# turn off gradient history tracking
print(x.requires_grad)
with torch.no_grad():
    print((x ** 2).requires_grad)

# Neural networks

torch.nn constructs neural networks, and it relies on autograd to define models and differentiate the data within. A typical model inherits from nn.Module, defines its layers, and implements a forward method, which takes an input and returns an output. A typical workflow is:

- Define the network layers and initialize learnable parameters (weights)
- iterate over a dataset of inputs
- pass input through the network layers
- compute the loss
- propagate gradients back through the network's parameters
- update the weight of the network by $w = w - learning\_rate \times gradient$

torch.nn only supports mini-batches, not single samples. For example, nn.Conv2d expects a 4D tensor of n_samples by n_channels by height by width. If a single sample is being passed in, input.unsqueeze(0) can be used to add a dummy batch dimension.

A recap of key points so far:

- torch.Tensor - a class that creates a multi-dimensional array that supports autograd operations such as backward(). It also holds the gradient wrt the tensor
- nn.Module - the neural network module that should be inherited by custom networks. Encapsulates parameters and has function for moving tensors to the GPU, exporting, saving, loading, etc.
- nn.Parameter - a kind of torch.Tensor. This is automatically registered as a parameter when assigned as an attribute to a nn.Module
- autograd.Function - this implements the forward and backward definitions of an autograd operation. Each Tensor operation creates at least one Function node that connects back to the functions that created a Tensor and encodes its history.

<a id = 'Neural-networks'></a>

## Define the network

<a id = 'Define-the-network'></a>

In [None]:
# define the network
class Net(nn.Module):
    """Small convolutional network: two conv/pool stages followed by three linear layers."""

    def __init__(self):
        super().__init__()
        # convolutional stages: 1 input image channel -> 6 channels -> 16 channels,
        # both with a 5 by 5 kernel and a stride of 1
        self.conv1 = nn.Conv2d(1, 6, kernel_size=(5, 5), stride=(1, 1))
        self.conv2 = nn.Conv2d(6, 16, kernel_size=(5, 5), stride=(1, 1))
        # fully connected stages, ending in 10 outputs
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        """Run a batch through the network and return the output of the last linear layer."""
        # first stage: convolution, ReLU, then max pooling over a (2, 2) window
        activated = F.relu(self.conv1(x))
        pooled = F.max_pool2d(activated, (2, 2))

        # second stage: a single number is enough when the pooling window is square
        activated = F.relu(self.conv2(pooled))
        pooled = F.max_pool2d(activated, 2)

        # flatten everything except the batch dimension before the linear layers
        flat = pooled.view(-1, self.num_flat_features(pooled))
        hidden = F.relu(self.fc1(flat))
        hidden = F.relu(self.fc2(hidden))
        return self.fc3(hidden)

    def num_flat_features(self, x):
        """Return the product of all dimensions of x except the first (batch) one."""
        per_sample_dims = x.size()[1:]
        count = 1
        for dim in per_sample_dims:
            count *= dim
        return count


net = Net()
print(net)

In [None]:
# the learnable parameters of a model
params = list(net.parameters())
print(len(params))
print(params[0].size())

In [None]:
# insert a random 32 by 32 input
input = torch.randn(1, 1, 32, 32)  # 1 batch, 1 channel, 32 rows, 32 columns
out = net(input)
print(out)

In [None]:
# zero the gradient buffers of all parameters and backprops with random gradients
net.zero_grad()
out.backward(torch.randn(1, 10))  # pass in a 1 by 10 random tensor as the gradient
out

## Loss function

A loss function takes (output, target) as a pair of inputs and computes a value that estimates how far the output is from the target. There are many different loss functions in the nn package.

<a id = 'Loss-function'></a>

In [None]:
# compute the mean-squared error between the network output and a dummy target
output = net(input)
target = torch.randn(10)  # dummy target
target = target.view(1, -1)  # reshape the 10 values into a single row (1 by 10)
criterion = nn.MSELoss()

loss = criterion(output, target)
print(loss)

In [None]:
# when loss.backward() is called, the entire graph is differentiated wrt the loss, and all Tensors in
# the graph that have requires_grad = True will have their .grad accumulated with the gradient
# review a few steps backwards
print(loss.grad_fn)
print(loss.grad_fn.next_functions[0][0])
print(loss.grad_fn.next_functions[0][0].next_functions[0][0])
print(loss.grad_fn.next_functions[0][0].next_functions[0][0].variable)

## Backprop

Calling loss.backward() backpropagates the error and computes the gradient of the loss with respect to each parameter; the weights themselves are only changed in the update step that follows. It is important to clear the existing gradients first, otherwise the new gradients will be accumulated into the existing gradients.

<a id = 'Backprop'></a>

In [None]:
# zero the gradient buffers, then compare conv1's bias gradient before and after backprop
net.zero_grad()

print("conv1.bias.grad before backward")
print(net.conv1.bias.grad)

loss.backward()

print("conv1.bias.grad after backward")
print(net.conv1.bias.grad)

## Update the weights

<a id = 'Update-the-weights'></a>

In [None]:
# plain stochastic gradient descent: weight = weight - learning_rate * gradient
learning_rate = 0.001
for param in net.parameters():
    step = param.grad.data * learning_rate
    param.data.sub_(step)

In [None]:
# utilize torch.optim to implement various optimization methods
# create the optimizer
optimizer = optim.SGD(net.parameters(), lr=0.01)

# goes in the training loop
optimizer.zero_grad()  # clear the gradients
output = net(input)
loss = criterion(output, target)
loss.backward()
optimizer.step()  # apply the update

# Training a classifier

This section will implement the following steps

1. Load and normalize the CIFAR10 dataset
2. Define a CNN
3. Define a loss function
4. Train the network on the training data
5. Test the network on the test data

<a id = 'Training-a-classifier'></a>

## Loading and normalizing CIFAR10

<a id = 'Loading-and-normalizing-CIFAR10'></a>

In [None]:
# transform pipeline
transform = transforms.Compose(
    [transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]
)

In [None]:
# download data and create DataLoaders for train/test data sets
trainset = torchvision.datasets.CIFAR10(
    root="~/Desktop/data",
    train=True,
    download=True,
    transform=transform,
)
trainloader = torch.utils.data.DataLoader(
    trainset, batch_size=4, shuffle=True, num_workers=2
)
testset = torchvision.datasets.CIFAR10(
    root="~/Desktop/data",
    train=False,
    download=True,
    transform=transform,
)
testloader = torch.utils.data.DataLoader(
    testset, batch_size=4, shuffle=False, num_workers=2
)
classes = (
    "plane",
    "car",
    "bird",
    "cat",
    "deer",
    "dog",
    "frog",
    "horse",
    "ship",
    "truck",
)

In [None]:
# view sample training images


def imshow(img):
    """Plot an image tensor with matplotlib.

    The tensor is rescaled with ``img / 2 + 0.5`` to unnormalize it, converted
    to a numpy array, and its axes are reordered to ``(1, 2, 0)`` before it is
    handed to ``plt.imshow``.
    """
    unnormalized = img / 2 + 0.5
    pixels = unnormalized.numpy()
    reordered = np.transpose(pixels, (1, 2, 0))
    plt.imshow(reordered)
    plt.show()


# get random training images
dataIter = iter(trainloader)
# use the built-in next(); DataLoader iterators do not provide a .next() method
images, labels = next(dataIter)

# show
imshow(torchvision.utils.make_grid(images))

# print one label per image in the batch (no hard-coded batch size)
print(" ".join("%5s" % classes[labels[j]] for j in range(len(labels))))

## Define a CNN

<a id = 'Define-a-CNN'></a>

In [None]:
# define the network
class Net(nn.Module):
    """Convolutional classifier taking 3-channel images and producing 10 outputs.

    Two convolution + ReLU + max-pool stages are followed by three linear
    layers. fc1 expects 16 * 5 * 5 features per sample, which is what a
    32 by 32 input yields after the two conv/pool stages.
    """

    def __init__(self):
        super().__init__()
        # 3 input image channels, 6 output channels, 5 by 5 convolution
        self.conv1 = nn.Conv2d(3, 6, kernel_size=(5, 5), stride=(1, 1))
        # one pooling layer (2 by 2 window, stride 2) is shared by both conv stages
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, kernel_size=(5, 5), stride=(1, 1))
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        """Return the raw (un-normalized) scores of the last linear layer for a batch x."""
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        # flatten all but the batch dimension; the width is derived from the
        # tensor instead of being hard-coded a second time
        x = x.view(-1, self.num_flat_features(x))
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

    def num_flat_features(self, x):
        """Return the number of elements per sample, i.e. the product of all
        dimensions of x except the first (batch) one."""
        size = x.size()[1:]  # omit batch dimension
        num_features = 1
        for s in size:
            num_features *= s
        return num_features


net = Net()
print(net)

## Define a loss function and optimizer

<a id = 'Define-a-loss-function-and-optimizer'></a>

In [None]:
# define a classification cross-entropy loss function and SGD with momentum
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

## Train the network

<a id = 'Train-the-network'></a>

In [None]:
# two passes over the training data: forward, backward and optimizer step per mini-batch
for epoch in range(2):
    running_loss = 0.0
    for i, (inputs, labels) in enumerate(trainloader):
        # clear the gradients left over from the previous mini-batch
        optimizer.zero_grad()

        # forward pass and loss, then backward pass and parameter update
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # accumulate the loss and report its average every 2000 mini-batches
        running_loss += loss.item()
        if (i + 1) % 2000 == 0:
            print("[%d, %5d] loss: %.3f" % (epoch + 1, i + 1, running_loss / 2000))
            running_loss = 0.0

print("Finished training")

## Test the network on the test data

<a id = 'Test-the-network-on-the-test-data'></a>

In [None]:
# display test set image
dataIter = iter(testloader)
# use the built-in next(); DataLoader iterators do not provide a .next() method
images, labels = next(dataIter)

# print images (make_grid tiles the batch into a single image)
imshow(torchvision.utils.make_grid(images))
print("True label: ", "".join("%5s" % classes[labels[j]] for j in range(len(labels))))

In [None]:
# create predictions using learned model
outputs = net(images)

In [None]:
# the index of the largest output along dimension 1 is used as the predicted class
_, predicted = torch.max(outputs, 1)
print("Predicted: ", "".join("%5s" % classes[predicted[j]] for j in range(4)))

In [None]:
# make predictions on entire dataset
correct = 0
total = 0
with torch.no_grad():
    for data in testloader:
        images, labels = data
        outputs = net(images)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print("accuracy of network on all 10,000 test images: %d %%" % (100 * correct / total))

In [None]:
# which classes performed well, not well
class_correct = [0.0] * len(classes)
class_total = [0.0] * len(classes)
with torch.no_grad():
    # unpack each batch directly so every iteration scores its own images and labels
    for images, labels in testloader:
        outputs = net(images)
        _, predicted = torch.max(outputs, 1)
        hits = predicted == labels

        # tally per class; zip copes with a final batch smaller than the batch size
        for label, hit in zip(labels.tolist(), hits.tolist()):
            class_correct[label] += hit
            class_total[label] += 1

for i, name in enumerate(classes):
    if class_total[i]:
        print(
            "Accuracy of %5s: %2d %%"
            % (name, 100 * class_correct[i] / class_total[i])
        )
    else:
        # avoid a ZeroDivisionError for a class that never appeared in the test data
        print("Accuracy of %5s: n/a (no test samples)" % name)

## Training on a GPU

<a id = 'Training-on-a-GPU'></a>

In [None]:
# use the first CUDA device when one is available, otherwise fall back to the CPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

In [None]:
# send inputs and labels to the device
# NOTE: the network must live on the same device (e.g. net.to(device)) before it is called with these tensors
inputs, labels = inputs.to(device), labels.to(device)

# Data parallelism

It is easy to use multiple GPUs with PyTorch to distribute the workload

<a id = 'Data-parallelism'></a>

In [None]:
# put model on GPU
# device = torch.device('cuda:0')
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

copy tensor to GPU - this puts a new copy of the tensor on to a GPU

```python
mytensor = my_tensor.to(device)
```

set up parallel processing
```python
model = nn.DataParallel(model)
```

In [None]:
# params
input_size = 5
output_size = 2
batch_size = 30
data_size = 100

In [None]:
# dummy dataset function
class RandomDataset(Dataset):
    """Dataset of `length` random samples, each a vector of `size` values."""

    def __init__(self, size, length):
        # all samples are drawn once, up front, with torch.randn
        self.data = torch.randn(length, size)
        self.len = length

    def __len__(self):
        """Return the number of samples."""
        return self.len

    def __getitem__(self, index):
        """Return the sample stored at position `index`."""
        sample = self.data[index]
        return sample


rand_loader = DataLoader(
    dataset=RandomDataset(input_size, data_size), batch_size=batch_size, shuffle=True
)

In [None]:
# simple model
class Model(nn.Module):
    """Single linear layer that prints the tensor sizes it sees on every forward pass."""

    def __init__(self, input_size, output_size):
        super().__init__()
        self.fc = nn.Linear(input_size, output_size)

    def forward(self, input):
        """Apply the linear layer, print the input/output sizes and return the output."""
        result = self.fc(input)
        in_size, out_size = input.size(), result.size()
        print("\tIn model: input size", in_size, "output size", out_size)
        return result

In [None]:
# instantiate model and wrap it in DataParallel when more than one GPU is present
model = Model(input_size, output_size)
if torch.cuda.device_count() > 1:
    model = nn.DataParallel(model)

# move the model to the device regardless of the GPU count, so it always sits
# on the same device as the inputs that are sent there before the forward pass
model.to(device)

In [None]:
# run the model. if GPU count is <2, inputs will be = to batch_size
for data in rand_loader:
    input = data.to(device)
    output = model(input)
    print("\tOut: input size", input.size(), "output size", output.size())