__Learning PyTorch with examples__

1. [Load tools](#Load-tools)
1. [Tensors](#Tensors)
    1. [Numpy tensors](#Numpy-tensors)
    1. [PyTorch tensors](#PyTorch-tensors)
1. [Autograd](#Autograd)
    1. [PyTorch: tensors and autograd](#PyTorch:-tensors-and-autograd)
    1. [PyTorch: defining new autograd functions](#PyTorch:-defining-new-autograd-functions)
1. [torch.nn module](#torch.nn-module)
1. [torch.optim module](#torch.optim-module)
1. [PyTorch: custom torch.nn modules](#PyTorch:-custom-torch.nn-modules)

# Load tools

<a id = 'Load-tools'></a>

In [60]:
# Standard libary and settings
import os
import sys
import warnings

warnings.simplefilter("ignore")
from IPython.core.display import display, HTML

display(HTML("<style>.container { width:95% !important; }</style>"))

# Data extensions and settings
import numpy as np

np.set_printoptions(threshold=np.inf, suppress=True)
import pandas as pd

pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 500)
pd.options.display.float_format = "{:,.6f}".format

# import PyTorch
import torch
from torch.utils.data import Dataset, DataLoader
import torch.autograd as autograd
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.jit import script, trace
import torchvision
import torchvision.transforms as transforms

# Visualization extensions and settings
import seaborn as sns
import matplotlib.pyplot as plt

# Magic functions
%matplotlib inline

# Tensors

<a id = 'Tensors'></a>

## Numpy tensors

Numpy isn't intended to do anything related to computational graphs, deep learning or gradients, but we can easily use numpy to create a neural network anyway. this example demonstrates how to create and fit a two-layer network by manually implementing the forward and backward passes through the network using only numpy operations

<a id = 'Numpy-tensors'></a>

In [61]:
# batch size, input dimension, hidden layer dimension, output dimension
N, D_in, H, D_out = 64, 1000, 100, 10

# create random input data and labels
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

# randomly initialize weights
w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)

learning_rate = 1e-6
for t in range(500):
    # forward pass: compute prediction for y
    h = x.dot(w1)  # 64 x 1000 multiplied by 1000 x 10 yields 64 x 100
    h_relu = np.maximum(h, 0)  # set everyth subzero element to 0
    y_pred = h_relu.dot(w2)

    # compute loss
    loss = np.square(y_pred - y).sum()
    if t % 25 == 0:
        print("Iter: {}, loss = {:.5f}".format(t, loss))

    # backpropagate to compute gradients of w1 and w2 with respect to loss
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = h_relu.T.dot(grad_y_pred)
    grad_h_relu = grad_y_pred.dot(w2.T)
    grad_h = grad_h_relu.copy()
    grad_h[h < 0] = 0
    grad_w1 = x.T.dot(grad_h)

    # update weights
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

Iter: 0, loss = 43601548.37041
Iter: 25, loss = 133819.75098
Iter: 50, loss = 16471.53566
Iter: 75, loss = 3594.05673
Iter: 100, loss = 983.49389
Iter: 125, loss = 298.90034
Iter: 150, loss = 96.25113
Iter: 175, loss = 32.04538
Iter: 200, loss = 10.91174
Iter: 225, loss = 3.77718
Iter: 250, loss = 1.32445
Iter: 275, loss = 0.46940
Iter: 300, loss = 0.16790
Iter: 325, loss = 0.06056
Iter: 350, loss = 0.02201
Iter: 375, loss = 0.00806
Iter: 400, loss = 0.00297
Iter: 425, loss = 0.00110
Iter: 450, loss = 0.00041
Iter: 475, loss = 0.00015


## PyTorch tensors

PyTorch makes many, many improvements toward the goal of executing neural network. It can utilize GPYs, which provide speedups of 50x or grater compare to numpy or similar libraries.

The code block below executes the same two layer network created above in the numpy example, but this time using PyTorch.

<a id = 'PyTorch-tensors'></a>

In [62]:
# batch size, input dimension, hidden layer dimension, output dimension
N, D_in, H, D_out = 64, 1000, 100, 10

dtype = torch.float
device = torch.device("cpu")

# create random input data and labels
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)


# randomly initialize weights
w1 = torch.randn(D_in, H, device=device, dtype=dtype)
w2 = torch.randn(H, D_out, device=device, dtype=dtype)

learning_rate = 1e-6
for t in range(500):
    # forward pass: compute prediction for y
    h = x.mm(w1)  # 64 x 1000 multiplied by 1000 x 10 yields 64 x 100
    h_relu = h.clamp(min=0)  # set every subzero element to 0
    y_pred = h_relu.mm(w2)

    # y_pred = F.relu(x.mm(w1)).mm(x2) # OR this works

    # compute loss
    loss = (y_pred - y).pow(2).sum().item()
    if t % 25 == 0:
        print("Iter: {}, loss = {:.5f}".format(t, loss))

    # backpropagate to compute gradients of w1 and w2 with respect to loss
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = h_relu.t().mm(grad_y_pred)
    grad_h_relu = grad_y_pred.mm(w2.t())
    grad_h = grad_h_relu.clone()
    grad_h[h < 0] = 0
    grad_w1 = x.t().mm(grad_h)

    # update weights
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

Iter: 0, loss = 39673300.00000
Iter: 25, loss = 128253.67188
Iter: 50, loss = 13413.47461
Iter: 75, loss = 2648.63599
Iter: 100, loss = 700.27631
Iter: 125, loss = 211.36861
Iter: 150, loss = 68.03728
Iter: 175, loss = 22.69126
Iter: 200, loss = 7.73286
Iter: 225, loss = 2.67220
Iter: 250, loss = 0.93227
Iter: 275, loss = 0.32749
Iter: 300, loss = 0.11565
Iter: 325, loss = 0.04108
Iter: 350, loss = 0.01475
Iter: 375, loss = 0.00545
Iter: 400, loss = 0.00215
Iter: 425, loss = 0.00094
Iter: 450, loss = 0.00046
Iter: 475, loss = 0.00026


# Autograd

<a id = 'Autograd'></a>

## PyTorch: tensors and autograd

<a id = 'PyTorch:-tensors-and-autograd'></a>

In [63]:
# batch size, input dimension, hidden layer dimension, output dimension
N, D_in, H, D_out = 64, 1000, 100, 10

dtype = torch.float
device = torch.device("cpu")

# create random input data and labels
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)


# randomly initialize weights
# requires_grad = True ensures that gradients are computed with respect to these Tensors during the backward pass
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

learning_rate = 1e-6
for t in range(500):
    # forward pass: exactly the same ass the operation above but we don't need to manually
    # incorporate intermediate steps
    y_pred = x.mm(w1).clamp(min=0).mm(w2)

    # compute loss on tensors, which is a Tensor of shape (1,). loss.item() retrieves the scalar
    loss = (y_pred - y).pow(2).sum()
    if t % 25 == 0:
        print("Iter: {}, loss = {:.5f}".format(t, loss.item()))

    # use autograd to perform the backward pass. this computes the gradient of loss wrt all Tensors with requires_grad = True
    # following this call w1.grad and w2.grad will hold the gradient of the loss wrt w1 and w2
    loss.backward()

    # manual update weights by gradient descent. wrapping in torch.no_grad() keeps the operation from being
    # tracked in the weight Tensors that have requires_grad = True. This doesn't need to be tracked in autograd
    # this method is a less efficient way that utilizes weight.data and weight.grad.data
    # torch.optim.SGD can also achieve this but more efficiently
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad

        # manually zero out gradients
        w1.grad.zero_()
        w2.grad.zero_()

Iter: 0, loss = 38466576.00000
Iter: 25, loss = 136429.07812
Iter: 50, loss = 13017.18164
Iter: 75, loss = 1974.59363
Iter: 100, loss = 383.76691
Iter: 125, loss = 86.46795
Iter: 150, loss = 21.27151
Iter: 175, loss = 5.52313
Iter: 200, loss = 1.48440
Iter: 225, loss = 0.40827
Iter: 250, loss = 0.11414
Iter: 275, loss = 0.03236
Iter: 300, loss = 0.00939
Iter: 325, loss = 0.00290
Iter: 350, loss = 0.00103
Iter: 375, loss = 0.00044
Iter: 400, loss = 0.00022
Iter: 425, loss = 0.00013
Iter: 450, loss = 0.00008
Iter: 475, loss = 0.00005


## PyTorch: defining new autograd functions

Each builtin autograd operator contains two function that operate on Tensors:
- forward - compures the output Tensors from the input Tensors
- backward - receives the gradient of the output Tensors wrt the ground truth and computes the gradient of the input Tensors wrt the ground truth.

Custom autograd operators can be created and incorporated into networks by defining a subclass of torch.autograd.Function and creating the required forward and backward functions. To use the custom autograd operator, we need to create an instance of the operator and call it like a function, to which we pass Tensors containing the input data.

<a id = 'PyTorch:-defining-new-autograd-functions'></a>

In [64]:
#
class MyRelu(torch.autograd.Function):
    @staticmethod
    def forward(ctx, input):
        # ctx is a context object that can be used to stash data for backpropagation
        ctx.save_for_backward(input)
        return input.clamp(min=0)

    @staticmethod
    def backward(ctx, grad_output):
        # receives a Tensor containing the gradient of the los wrt the output. we need to compute
        # the gradient of the loss wrt the input
        input, = ctx.saved_tensors
        grad_input = grad_output.clone()
        grad_input[input < 0] = 0
        return grad_input


# batch size, input dimension, hidden layer dimension, output dimension
N, D_in, H, D_out = 64, 1000, 100, 10

dtype = torch.float
device = torch.device("cpu")

# create random input data and labels
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)


# randomly initialize weights
# requires_grad = True ensures that gradients are computed with respect to these Tensors during the backward pass
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

learning_rate = 1e-6
for t in range(500):
    # apply operator using Function.apply
    relu = MyRelu.apply
    y_pred = relu(x.mm(w1)).mm(w2)

    # compute loss
    loss = (y_pred - y).pow(2).sum()
    if t % 25 == 0:
        print("Iter: {}, loss = {:.5f}".format(t, loss.item()))

    # use autograd to perform the backward pass.
    loss.backward()

    # manual update weights by gradient descent
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad

        # manually zero out gradients
        w1.grad.zero_()
        w2.grad.zero_()

Iter: 0, loss = 31708310.00000
Iter: 25, loss = 138472.06250
Iter: 50, loss = 14044.58008
Iter: 75, loss = 2221.72803
Iter: 100, loss = 422.07016
Iter: 125, loss = 89.12749
Iter: 150, loss = 20.24022
Iter: 175, loss = 4.85380
Iter: 200, loss = 1.21483
Iter: 225, loss = 0.31466
Iter: 250, loss = 0.08379
Iter: 275, loss = 0.02290
Iter: 300, loss = 0.00650
Iter: 325, loss = 0.00203
Iter: 350, loss = 0.00074
Iter: 375, loss = 0.00033
Iter: 400, loss = 0.00017
Iter: 425, loss = 0.00010
Iter: 450, loss = 0.00006
Iter: 475, loss = 0.00004


# torch.nn module

Computational graphs and autograd in the manner implemented above can be too low-level when fit large neural network. the torch.nn package serves the purpose of providing higher-level abstractions that more efficiently handle autograd operations. torch.nn defines a set of Modules which in general can be thought of as types of neural network layers. A Module receives input Tensors and computes output Tensors, and can also hold an internal state which can manage Tensors that contain learnable parameters. torch.nn also contains many loss functions

<a id = 'torch.nn-module'></a>

In [65]:
# batch size, input dimension, hidden layer dimension, output dimension
N, D_in, H, D_out = 64, 1000, 100, 10

# create random input data and labels
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# use torch.nn to define the model as a sequence of layers. nn.Sequential is a Module that contains other Modules, and these
# are applied in the specified order to produce an output.
model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H), torch.nn.ReLU(), torch.nn.Linear(H, D_out)
)

# define torch.nn loss function
loss_fn = torch.nn.MSELoss(reduction="sum")

learning_rate = 1e-6
for t in range(500):
    # perform forward pass by passing the input  to the model
    y_pred = model(x)

    # compute loss. this returns a Tensor containing the loss
    loss = loss_fn(y_pred, y)
    if t % 25 == 0:
        print("Iter: {}, loss = {:.5f}".format(t, loss.item()))

    # perform backward pass by computing the gradient of the loss wrt to all the learnable parameters in the model. internally
    # the parameters of each module are stored in Tensors where requires_grad = True. This will compute gradients for all
    # learnable parameters in the model
    loss.backward()

    # update weights using gradient descent.
    with torch.no_grad():
        for param in model.parameters():
            param -= learning_rate * param.grad

Iter: 0, loss = 712.22937
Iter: 25, loss = 552.48029
Iter: 50, loss = 313.91757
Iter: 75, loss = 179.24721
Iter: 100, loss = 122.44864
Iter: 125, loss = 180.76990
Iter: 150, loss = 233.33000
Iter: 175, loss = 136.27899
Iter: 200, loss = 165.84132
Iter: 225, loss = 276.81699
Iter: 250, loss = 193.24693
Iter: 275, loss = 232.00133
Iter: 300, loss = 214.97557
Iter: 325, loss = 164.68013
Iter: 350, loss = 222.17741
Iter: 375, loss = 205.73183
Iter: 400, loss = 153.95300
Iter: 425, loss = 228.58423
Iter: 450, loss = 199.14041
Iter: 475, loss = 128.64659


# torch.optim module

Weight updates up to this point have been updated by manually adjusting the Tensors holding learnable parameters. The torch.optim package offers a more efficient way to update the parameters, and includes implementations for sophisticated optimizers like AdaGrad, RMSProp, Adam, etc.

<a id = 'torch.optim-module'></a>

In [66]:
# batch size, input dimension, hidden layer dimension, output dimension
N, D_in, H, D_out = 64, 1000, 100, 10

# create random input data and labels
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# use torch.nn to define the model as a sequence of layers. nn.Sequential is a Module that contains other Modules, and these
# are applied in the specified order to produce an output.
model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H), torch.nn.ReLU(), torch.nn.Linear(H, D_out)
)

# define torch.nn loss function
loss_fn = torch.nn.MSELoss(reduction="sum")

# define an optimizer. this will update the model weights. the first parameter tells the constructor which Tensors to update
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

for t in range(500):
    # perform forward pass by passing the input  to the model
    y_pred = model(x)

    # compute loss. this returns a Tensor containing the loss
    loss = loss_fn(y_pred, y)
    if t % 25 == 0:
        print("Iter: {}, loss = {:.5f}".format(t, loss.item()))

    # before each backward pass all of the gradients need to be zeroed out. otherwise gradients are accumulated
    optimizer.zero_grad()

    # perform backward pass
    loss.backward()

    # call the step function to perform the weight updates
    optimizer.step()

Iter: 0, loss = 681.32642
Iter: 25, loss = 370.14603
Iter: 50, loss = 208.11354
Iter: 75, loss = 110.35217
Iter: 100, loss = 52.02501
Iter: 125, loss = 21.29395
Iter: 150, loss = 7.57197
Iter: 175, loss = 2.38376
Iter: 200, loss = 0.68513
Iter: 225, loss = 0.18359
Iter: 250, loss = 0.04648
Iter: 275, loss = 0.01133
Iter: 300, loss = 0.00269
Iter: 325, loss = 0.00062
Iter: 350, loss = 0.00014
Iter: 375, loss = 0.00003
Iter: 400, loss = 0.00001
Iter: 425, loss = 0.00000
Iter: 450, loss = 0.00000
Iter: 475, loss = 0.00000


# PyTorch: custom torch.nn modules

Custom modules can be created by subclassing nn.Module and defining a forward function.

<a id = 'PyTorch:-custom-torch.nn-modules'></a>

In [67]:
#
class TwoLayerNet(torch.nn.Module):
    def __init__(self, D_in, H, D_out):
        super(TwoLayerNet, self).__init__()
        self.linear1 = torch.nn.Linear(D_in, H)
        self.linear2 = torch.nn.Linear(H, D_out)

    def forward(self, x):
        h_relu = self.linear1(x).clamp(min=0)
        y_pred = self.linear2(h_relu)
        return y_pred


# batch size, input dimension, hidden layer dimension, output dimension
N, D_in, H, D_out = 64, 1000, 100, 10

# create random input data and labels
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# construct the model by instantiating the custom class
model = TwoLayerNet(D_in, H, D_out)

# define torch.nn loss function
criterion = torch.nn.MSELoss(reduction="sum")

# define an optimizer. this will update the model weights. the first parameter tells the constructor which Tensors to update
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

for t in range(500):
    # perform forward pass by passing the input  to the model
    y_pred = model(x)

    # compute loss. this returns a Tensor containing the loss
    loss = criterion(y_pred, y)
    if t % 25 == 0:
        print("Iter: {}, loss = {:.5f}".format(t, loss.item()))

    # before each backward pass all of the gradients need to be zeroed out. otherwise gradients are accumulated
    optimizer.zero_grad()

    # perform backward pass
    loss.backward()

    # call the step function to perform the weight updates
    optimizer.step()

Iter: 0, loss = 702.34802
Iter: 25, loss = 386.69614
Iter: 50, loss = 214.55013
Iter: 75, loss = 110.25887
Iter: 100, loss = 49.78801
Iter: 125, loss = 19.40651
Iter: 150, loss = 6.43203
Iter: 175, loss = 1.89738
Iter: 200, loss = 0.54688
Iter: 225, loss = 0.16783
Iter: 250, loss = 0.05717
Iter: 275, loss = 0.02141
Iter: 300, loss = 0.00844
Iter: 325, loss = 0.00333
Iter: 350, loss = 0.00128
Iter: 375, loss = 0.00047
Iter: 400, loss = 0.00016
Iter: 425, loss = 0.00006
Iter: 450, loss = 0.00002
Iter: 475, loss = 0.00001
